Commit 2abb31a9 authored by Jungshik Shin's avatar Jungshik Shin Committed by Commit Bot

Use ICU to validate and canonicalize lang tag

- Get rid of an unnecessary call to uloc_canonicalize in js-locale.
- Do not use regex, but rely on ICU for the structural validity check
with Chrome's ICU or ICU 63 or newer. Otherwise, continue to use regex.

This became possible thanks to a couple of bug fixes in ICU ToT that
were cherry-picked for Chromium's ICU.

Not yet done is to change js-locale to use CanonicalizeLocale().
That will make a few more tests pass.

Bug: v8:8135
Test: test262/intl402/Intl/getCanonicalLocales/*
Test: test262/intl402/Locale/*
Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng
Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng
Change-Id: I45c10b298fb041e0b39a4d96309c68a7966f91c2
Reviewed-on: https://chromium-review.googlesource.com/c/1215223
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Sathya Gunasekaran <gsathya@chromium.org>
Cr-Commit-Position: refs/heads/master@{#56399}
parent 2729ce8b
......@@ -2626,9 +2626,11 @@ Isolate::Isolate()
host_initialize_import_meta_object_callback_(nullptr),
load_start_time_ms_(0),
#ifdef V8_INTL_SUPPORT
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
language_singleton_regexp_matcher_(nullptr),
language_tag_regexp_matcher_(nullptr),
language_variant_regexp_matcher_(nullptr),
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
default_locale_(""),
#endif // V8_INTL_SUPPORT
serializer_enabled_(false),
......@@ -2865,6 +2867,7 @@ Isolate::~Isolate() {
date_cache_ = nullptr;
#ifdef V8_INTL_SUPPORT
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
delete language_singleton_regexp_matcher_;
language_singleton_regexp_matcher_ = nullptr;
......@@ -2873,6 +2876,7 @@ Isolate::~Isolate() {
delete language_variant_regexp_matcher_;
language_variant_regexp_matcher_ = nullptr;
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
#endif // V8_INTL_SUPPORT
delete regexp_stack_;
......
......@@ -1182,6 +1182,7 @@ class Isolate : private HiddenFactory {
}
#ifdef V8_INTL_SUPPORT
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
// Returns the ICU regex matcher for language-tag singletons held by this
// isolate. The pointer is whatever was last stored in the member; it is
// nullptr until a matcher has been installed. Only compiled when the regex
// fallback is in use (non-Chromium ICU older than 63).
icu::RegexMatcher* language_singleton_regexp_matcher() {
  icu::RegexMatcher* const matcher = language_singleton_regexp_matcher_;
  return matcher;
}
......@@ -1193,6 +1194,7 @@ class Isolate : private HiddenFactory {
// Returns the ICU regex matcher for language-tag variants held by this
// isolate. The pointer is whatever was last stored in the member; it is
// nullptr until a matcher has been installed. Only compiled when the regex
// fallback is in use (non-Chromium ICU older than 63).
icu::RegexMatcher* language_variant_regexp_matcher() {
  icu::RegexMatcher* const matcher = language_variant_regexp_matcher_;
  return matcher;
}
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
const std::string& default_locale() { return default_locale_; }
......@@ -1201,6 +1203,7 @@ class Isolate : private HiddenFactory {
default_locale_ = locale;
}
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
void set_language_tag_regexp_matchers(
icu::RegexMatcher* language_singleton_regexp_matcher,
icu::RegexMatcher* language_tag_regexp_matcher,
......@@ -1212,6 +1215,7 @@ class Isolate : private HiddenFactory {
language_tag_regexp_matcher_ = language_tag_regexp_matcher;
language_variant_regexp_matcher_ = language_variant_regexp_matcher;
}
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
#endif // V8_INTL_SUPPORT
static const int kProtectorValid = 1;
......@@ -1730,9 +1734,11 @@ class Isolate : private HiddenFactory {
double load_start_time_ms_;
#ifdef V8_INTL_SUPPORT
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
icu::RegexMatcher* language_singleton_regexp_matcher_;
icu::RegexMatcher* language_tag_regexp_matcher_;
icu::RegexMatcher* language_variant_regexp_matcher_;
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
std::string default_locale_;
#endif // V8_INTL_SUPPORT
......
......@@ -39,10 +39,6 @@
#include "unicode/uvernum.h"
#include "unicode/uversion.h"
#if U_ICU_VERSION_MAJOR_NUM >= 59
#include "unicode/char16ptr.h"
#endif
namespace v8 {
namespace internal {
......@@ -87,14 +83,18 @@ icu::Locale Intl::CreateICULocale(Isolate* isolate,
// Convert BCP47 into ICU locale format.
UErrorCode status = U_ZERO_ERROR;
char icu_result[ULOC_FULLNAME_CAPACITY];
int icu_length = 0;
int parsed_length = 0;
// bcp47_locale_str should be a canonicalized language tag, which
// means this shouldn't fail.
uloc_forLanguageTag(*bcp47_locale, icu_result, ULOC_FULLNAME_CAPACITY,
&icu_length, &status);
&parsed_length, &status);
CHECK(U_SUCCESS(status));
CHECK_LT(0, icu_length);
// bcp47_locale is already checked for its structural validity
// so that it should be parsed completely.
int bcp47length = bcp47_locale.length();
CHECK_EQ(bcp47length, parsed_length);
icu::Locale icu_locale(icu_result);
if (icu_locale.isBogus()) {
......@@ -381,6 +381,7 @@ MaybeHandle<Object> Intl::LegacyUnwrapReceiver(Isolate* isolate,
namespace {
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
// Define general regexp macros.
// Note "(?:" means the regexp group a non-capture group.
#define REGEX_ALPHA "[a-z]"
......@@ -492,6 +493,7 @@ icu::RegexMatcher* GetLanguageVariantRegexMatcher(Isolate* isolate) {
}
return language_variant_regexp_matcher;
}
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
} // anonymous namespace
......@@ -617,6 +619,7 @@ char AsciiToLower(char c) {
return c | (1 << 5);
}
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
/**
* Check the structural Validity of the language tag per ECMA 402 6.2.2:
* - Well-formed per RFC 5646 2.1
......@@ -628,7 +631,7 @@ char AsciiToLower(char c) {
* primary/extended language, script, region, variant are not checked
* against the IANA language subtag registry.
*
* ICU is too permissible and lets invalid tags, like
* ICU 62 or earlier is too permissive and lets invalid tags, like
* hant-cmn-cn, through.
*
* Returns false if the language tag is invalid.
......@@ -719,6 +722,7 @@ bool IsStructurallyValidLanguageTag(Isolate* isolate,
return true;
}
#endif // USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
// Returns true iff |c| is an ASCII lowercase letter, i.e. in the closed
// range ['a', 'z']. Both bounds are inclusive so that 'z' is classified
// as lowercase like every other letter.
bool IsLowerAscii(char c) { return c >= 'a' && c <= 'z'; }
......@@ -770,6 +774,14 @@ Maybe<std::string> Intl::CanonicalizeLanguageTag(Isolate* isolate,
}
std::string locale(locale_str->ToCString().get());
if (locale.length() == 0 ||
!String::IsAscii(locale.data(), static_cast<int>(locale.length()))) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,
NewRangeError(MessageTemplate::kInvalidLanguageTag, locale_str),
Nothing<std::string>());
}
// Optimize for the most common case: a 2-letter language code in the
// canonical form/lowercase that is not one of the deprecated codes
// (in, iw, ji, jw). Don't check for ~70 of 3-letter deprecated language
......@@ -783,12 +795,15 @@ Maybe<std::string> Intl::CanonicalizeLanguageTag(Isolate* isolate,
// Because per BCP 47 2.1.1 language tags are case-insensitive, lowercase
// the input before any more check.
std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);
#if USE_CHROMIUM_ICU == 0 && U_ICU_VERSION_MAJOR_NUM < 63
if (!IsStructurallyValidLanguageTag(isolate, locale)) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,
NewRangeError(MessageTemplate::kInvalidLanguageTag, locale_str),
Nothing<std::string>());
}
#endif
// ICU maps a few grandfathered tags to what looks like a regular language
// tag even though IANA language tag registry does not have a preferred
......@@ -806,11 +821,18 @@ Maybe<std::string> Intl::CanonicalizeLanguageTag(Isolate* isolate,
// https://unicode-org.atlassian.net/browse/ICU-13417
UErrorCode error = U_ZERO_ERROR;
char icu_result[ULOC_FULLNAME_CAPACITY];
// uloc_forLanguageTag checks the structural validity. If the input BCP47
// language tag is parsed all the way to the end, it indicates that the input
// is structurally valid. Due to a couple of bugs, we can't rely on it
// without Chromium patches or ICU 63 or newer.
int parsed_length;
uloc_forLanguageTag(locale.c_str(), icu_result, ULOC_FULLNAME_CAPACITY,
nullptr, &error);
if (U_FAILURE(error) || error == U_STRING_NOT_TERMINATED_WARNING) {
// TODO(jshin): This should not happen because the structural validity
// is already checked. If that's the case, remove this.
&parsed_length, &error);
if (U_FAILURE(error) ||
#if USE_CHROMIUM_ICU == 1 || U_ICU_VERSION_MAJOR_NUM >= 63
static_cast<size_t>(parsed_length) < locale.length() ||
#endif
error == U_STRING_NOT_TERMINATED_WARNING) {
THROW_NEW_ERROR_RETURN_VALUE(
isolate,
NewRangeError(MessageTemplate::kInvalidLanguageTag, locale_str),
......
......@@ -173,7 +173,6 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
// Get ICU locale format, and canonicalize it.
char icu_result[ULOC_FULLNAME_CAPACITY];
char icu_canonical[ULOC_FULLNAME_CAPACITY];
if (locale->length() == 0) {
THROW_NEW_ERROR(isolate, NewRangeError(MessageTemplate::kLocaleNotEmpty),
......@@ -184,11 +183,14 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
CHECK_LT(0, bcp47_locale.length());
CHECK_NOT_NULL(*bcp47_locale);
int icu_length = uloc_forLanguageTag(
*bcp47_locale, icu_result, ULOC_FULLNAME_CAPACITY, nullptr, &status);
int parsed_length = 0;
int icu_length =
uloc_forLanguageTag(*bcp47_locale, icu_result, ULOC_FULLNAME_CAPACITY,
&parsed_length, &status);
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING ||
icu_length == 0) {
if (U_FAILURE(status) ||
parsed_length < static_cast<int>(bcp47_locale.length()) ||
status == U_STRING_NOT_TERMINATED_WARNING || icu_length == 0) {
THROW_NEW_ERROR(
isolate,
NewRangeError(MessageTemplate::kLocaleBadParameters,
......@@ -211,18 +213,7 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
}
DCHECK(error.FromJust());
uloc_canonicalize(icu_result, icu_canonical, ULOC_FULLNAME_CAPACITY, &status);
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
THROW_NEW_ERROR(
isolate,
NewRangeError(MessageTemplate::kLocaleBadParameters,
isolate->factory()->NewStringFromAsciiChecked(kMethod),
locale_holder),
JSLocale);
return MaybeHandle<JSLocale>();
}
if (!PopulateLocaleWithUnicodeTags(isolate, icu_canonical, locale_holder)) {
if (!PopulateLocaleWithUnicodeTags(isolate, icu_result, locale_holder)) {
THROW_NEW_ERROR(
isolate,
NewRangeError(MessageTemplate::kLocaleBadParameters,
......@@ -234,13 +225,13 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
// Extract language, script and region parts.
char icu_language[ULOC_LANG_CAPACITY];
uloc_getLanguage(icu_canonical, icu_language, ULOC_LANG_CAPACITY, &status);
uloc_getLanguage(icu_result, icu_language, ULOC_LANG_CAPACITY, &status);
char icu_script[ULOC_SCRIPT_CAPACITY];
uloc_getScript(icu_canonical, icu_script, ULOC_SCRIPT_CAPACITY, &status);
uloc_getScript(icu_result, icu_script, ULOC_SCRIPT_CAPACITY, &status);
char icu_region[ULOC_COUNTRY_CAPACITY];
uloc_getCountry(icu_canonical, icu_region, ULOC_COUNTRY_CAPACITY, &status);
uloc_getCountry(icu_result, icu_region, ULOC_COUNTRY_CAPACITY, &status);
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
THROW_NEW_ERROR(
......@@ -271,8 +262,7 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
}
char icu_base_name[ULOC_FULLNAME_CAPACITY];
uloc_getBaseName(icu_canonical, icu_base_name, ULOC_FULLNAME_CAPACITY,
&status);
uloc_getBaseName(icu_result, icu_base_name, ULOC_FULLNAME_CAPACITY, &status);
// We need to convert it back to BCP47.
char bcp47_result[ULOC_FULLNAME_CAPACITY];
uloc_toLanguageTag(icu_base_name, bcp47_result, ULOC_FULLNAME_CAPACITY, true,
......@@ -290,7 +280,7 @@ MaybeHandle<JSLocale> JSLocale::Initialize(Isolate* isolate,
locale_holder->set_base_name(*base_name);
// Produce final representation of the locale string, for toString().
uloc_toLanguageTag(icu_canonical, bcp47_result, ULOC_FULLNAME_CAPACITY, true,
uloc_toLanguageTag(icu_result, bcp47_result, ULOC_FULLNAME_CAPACITY, true,
&status);
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
THROW_NEW_ERROR(
......
......@@ -7,9 +7,10 @@ assertDoesNotThrow(() => Intl.getCanonicalLocales("foobar-foobar"));
// Ignore duplicate subtags in different namespaces; eg, 'a' vs 'u'.
assertDoesNotThrow(() => Intl.getCanonicalLocales("en-a-ca-Chinese-u-ca-Chinese"));
// Ignore duplicate subtags in U-extension as well. Only the first count.
// See RFC 6067 for details.
assertDoesNotThrow(() => Intl.getCanonicalLocales("en-u-ca-gregory-ca-chinese"));
assertEquals("en-u-ca-gregory", Intl.getCanonicalLocales("en-u-ca-gregory-ca-chinese")[0]);
// Check duplicate subtags (after the first tag) are detected.
assertThrows(() => Intl.getCanonicalLocales("en-foobar-foobar"), RangeError);
// Duplicate subtags are valid as per the ECMA262 spec.
assertDoesNotThrow(() => Intl.getCanonicalLocales("en-u-ca-gregory-ca-chinese"));
......@@ -23,5 +23,5 @@
].forEach(([inputLocale, expectedLocale]) => {
const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
assertEquals(canonicalLocales.length, 1);
assertEquals(canonicalLocales[0], expectedLocale);
assertEquals(expectedLocale, canonicalLocales[0]);
})
......@@ -29,6 +29,6 @@
["aam-u-ca-gregory", "aas-u-ca-gregory"],
].forEach(([inputLocale, expectedLocale]) => {
const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
assertEquals(canonicalLocales.length, 1);
assertEquals(canonicalLocales[0], expectedLocale);
assertEquals(1, canonicalLocales.length);
assertEquals(expectedLocale, canonicalLocales[0]);
})
......@@ -584,7 +584,6 @@
'intl402/Locale/constructor-options-region-valid': [FAIL],
'intl402/Locale/constructor-options-script-invalid': [FAIL],
'intl402/Locale/constructor-options-script-valid': [FAIL],
'intl402/Locale/constructor-unicode-ext-invalid': [FAIL],
'intl402/Locale/getters': [FAIL],
'intl402/Locale/invalid-tag-throws': [FAIL],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment