Commit f24b575d authored by Jungshik Shin, committed by Commit Bot

Fix canonicalization of grandfathered tags

ICU maps a few grandfathered tags to made-up values even when there
is no preferred value entry in the IANA language tag registry. [1]

1. Check for grandfathered tags without preferred value upfront
   and return them as they are.
2. Lowercase the input before structural validity check to simplify
   check for grandfathered tag without preferred value as well
   as regexps used in the structural validity check.

intl/general/grandfathered_tags_without_preferred_value is added and
intl/general/language_tags_with_preferred_values is changed to check
for case-insensitive matching of grandfathered tags.

[1] https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry

Bug: v8:7669
Test: test262/intl402/Intl/getCanonicalLocales/preferred-grandfathered
Test: intl/general/grandfathered_tags_without_preferred_value
Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng
Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng
Change-Id: Ie0520de8712928300fd71fe152909789483ec256
Reviewed-on: https://chromium-review.googlesource.com/1156529
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Sathya Gunasekaran <gsathya@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54829}
parent cd4b7228
......@@ -1351,25 +1351,28 @@ namespace {
// Define general regexp macros.
// Note "(?:" means the regexp group is a non-capture group.
#define REGEX_ALPHA "[a-zA-Z]"
#define REGEX_ALPHA "[a-z]"
#define REGEX_DIGIT "[0-9]"
#define REGEX_ALPHANUM "(?:" REGEX_ALPHA "|" REGEX_DIGIT ")"
void BuildLanguageTagRegexps(Isolate* isolate) {
// Define the language tag regexp macros.
// For info on BCP 47 see https://tools.ietf.org/html/bcp47
// For info on BCP 47 see https://tools.ietf.org/html/bcp47 .
// Because language tags are case insensitive per BCP 47 2.1.1 and the regexps
// defined below will always be used after lowercasing the input, uppercase
// ranges in BCP 47 2.1 are dropped and grandfathered tags are all lowercased.
// clang-format off
#define BCP47_REGULAR \
"(?:art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|" \
"zh-min|zh-min-nan|zh-xiang)"
#define BCP47_IRREGULAR \
"(?:en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" \
"(?:en-gb-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|" \
"i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|" \
"i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)"
"i-tsu|sgn-be-fr|sgn-be-nl|sgn-ch-de)"
#define BCP47_GRANDFATHERED "(?:" BCP47_IRREGULAR "|" BCP47_REGULAR ")"
#define BCP47_PRIVATE_USE "(?:x(?:-" REGEX_ALPHANUM "{1,8})+)"
#define BCP47_SINGLETON "(?:" REGEX_DIGIT "|" "[A-WY-Za-wy-z])"
#define BCP47_SINGLETON "(?:" REGEX_DIGIT "|" "[a-wy-z])"
#define BCP47_EXTENSION "(?:" BCP47_SINGLETON "(?:-" REGEX_ALPHANUM "{2,8})+)"
#define BCP47_VARIANT \
......@@ -1603,8 +1606,6 @@ bool IsStructurallyValidLanguageTag(Isolate* isolate,
return false;
}
std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);
// Just return if it's a x- form. It's all private.
if (locale.find("x-") == 0) {
return true;
......@@ -1684,6 +1685,18 @@ bool IsDeprecatedLanguage(const std::string& locale) {
return locale == "in" || locale == "iw" || locale == "ji" || locale == "jw";
}
// Returns true if |locale| is exactly one of the grandfathered tags that have
// no preferred value in the IANA language subtag registry: "cel-gaulish",
// "i-default", "i-enochian", "i-mingo" or "zh-min". The comparison is exact
// and matches only the lowercase spellings.
// Reference:
// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
bool IsGrandfatheredTagWithoutPreferredVaule(const std::string& locale) {
  if (V8_UNLIKELY(locale == "zh-min" || locale == "cel-gaulish")) return true;

  // Every remaining candidate has the form "i-<name>". The shortest of them,
  // "i-mingo", is 7 characters long, so anything shorter cannot match.
  if (locale.length() <= 6) return false;

  if (V8_UNLIKELY(locale[0] == 'i' && locale[1] == '-')) {
    const std::string name = locale.substr(2);
    return name == "default" || name == "enochian" || name == "mingo";
  }
  return false;
}
} // anonymous namespace
MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
......@@ -1710,6 +1723,9 @@ MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
return locale_str;
}
// Because per BCP 47 2.1.1 language tags are case-insensitive, lowercase
// the input before any more check.
std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);
if (!IsStructurallyValidLanguageTag(isolate, locale)) {
THROW_NEW_ERROR(
isolate,
......@@ -1717,6 +1733,12 @@ MaybeHandle<String> Intl::CanonicalizeLanguageTag(Isolate* isolate,
String);
}
// ICU maps a few grandfathered tags to what looks like a regular language
// tag even though the IANA language tag registry does not have a preferred
// value entry for them. Return them as they are, in lowercase.
if (IsGrandfatheredTagWithoutPreferredVaule(locale))
return isolate->factory()->NewStringFromAsciiChecked(locale.data());
// // ECMA 402 6.2.3
// TODO(jshin): uloc_{for,to}LanguageTag can fail even for a structurally valid
// language tag if it's too long (much longer than 100 chars). Even if we
......
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Table of [input tag, expected canonical tag] pairs, checked one by one
// against Intl.getCanonicalLocales below.
const grandfatheredTagCases = [
  // Grandfathered tags without a preferred value in the IANA language
  // tag registry. Nonetheless, ICU cooks up a value when canonicalizing.
  // v8 works around that ICU issue.
  // See https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
  ["cel-gaulish", "cel-gaulish"],
  ["i-default", "i-default"],
  ["i-mingo", "i-mingo"],
  ["i-enochian", "i-enochian"],
  ["zh-min", "zh-min"],

  // Matching should be case-insensitive.
  ["I-default", "i-default"],
  ["i-DEFAULT", "i-default"],
  ["I-DEFAULT", "i-default"],
  ["i-DEfauLT", "i-default"],
  ["zh-Min", "zh-min"],
  ["Zh-min", "zh-min"],
];

// Each input must canonicalize to exactly one locale equal to the expectation.
for (const [inputLocale, expectedLocale] of grandfatheredTagCases) {
  const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
  assertEquals(canonicalLocales.length, 1);
  assertEquals(canonicalLocales[0], expectedLocale);
}
......@@ -7,6 +7,11 @@
["sgn-de", "gsg"],
["sgn-de-u-co-phonebk", "gsg-u-co-phonebk"],
// Matching should be case-insensitive.
["sgn-De", "gsg"],
["sgn-BE-FR", "sfb"],
["Sgn-bE-Fr", "sfb"],
// deprecated region tag
["und-Latn-dd", "und-Latn-DE"],
["und-dd-u-co-phonebk", "und-DE-u-co-phonebk"],
......@@ -22,8 +27,8 @@
["jw", "jv"],
["aam", "aas"],
["aam-u-ca-gregory", "aas-u-ca-gregory"],
].forEach(function (entry) {
const canonicalLocales = Intl.getCanonicalLocales(entry[0]);
].forEach(([inputLocale, expectedLocale]) => {
const canonicalLocales = Intl.getCanonicalLocales(inputLocale);
assertEquals(canonicalLocales.length, 1);
assertEquals(canonicalLocales[0], entry[1]);
assertEquals(canonicalLocales[0], expectedLocale);
})
......@@ -435,7 +435,6 @@
# https://bugs.chromium.org/p/v8/issues/detail?id=7669
'intl402/Intl/getCanonicalLocales/canonicalized-tags': [FAIL],
'intl402/Intl/getCanonicalLocales/preferred-grandfathered': [FAIL],
# Tests assume that the sort order of "same elements" (comparator returns 0)
# is deterministic.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment