Commit d58f40b6 authored by Sathya Gunasekaran, committed by Commit Bot

[Intl] Refactor LookupSupportedLocales

Fix spec non-compliance by trimming only the Unicode locale extension
sequences rather than all extensions.

Remove regexp and just use straightforward string manipulation.

Bug: v8:5751
Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng
Change-Id: Ie95828a8f62834daf8cde189f408e95a14e796fe
Reviewed-on: https://chromium-review.googlesource.com/c/1255556
Commit-Queue: Sathya Gunasekaran <gsathya@chromium.org>
Reviewed-by: Adam Klein <adamk@chromium.org>
Cr-Commit-Position: refs/heads/master@{#56458}
parent 766ab3a5
...@@ -1198,96 +1198,108 @@ Maybe<bool> Intl::SetNumberFormatDigitOptions(Isolate* isolate, ...@@ -1198,96 +1198,108 @@ Maybe<bool> Intl::SetNumberFormatDigitOptions(Isolate* isolate,
namespace { namespace {
// ECMA 402 9.2.2 BestAvailableLocale(availableLocales, locale) // ecma402/#sec-bestavailablelocale
// https://tc39.github.io/ecma402/#sec-bestavailablelocale std::string BestAvailableLocale(const std::set<std::string>& available_locales,
std::string BestAvailableLocale(std::set<std::string> available_locales, const std::string& locale) {
std::string locale) {
const char separator = '-';
// 1. Let candidate be locale. // 1. Let candidate be locale.
std::string candidate = locale;
// 2. Repeat, // 2. Repeat,
do { while (true) {
// 2.a. If availableLocales contains an element equal to candidate, return // 2.a. If availableLocales contains an element equal to candidate, return
// candidate. // candidate.
if (available_locales.find(locale) != available_locales.end()) { if (available_locales.find(candidate) != available_locales.end()) {
return locale; return candidate;
} }
// 2.b. Let pos be the character index of the last occurrence of "-" // 2.b. Let pos be the character index of the last occurrence of "-"
// (U+002D) within candidate. If that character does not occur, return // (U+002D) within candidate. If that character does not occur, return
// undefined. // undefined.
size_t pos = locale.rfind(separator); size_t pos = candidate.rfind('-');
if (pos == std::string::npos) { if (pos == std::string::npos) {
return ""; return std::string();
} }
// 2.c. If pos ≥ 2 and the character "-" occurs at index pos-2 of candidate, // 2.c. If pos ≥ 2 and the character "-" occurs at index pos-2 of candidate,
// decrease pos by 2. // decrease pos by 2.
if (pos >= 2 && locale[pos - 2] == separator) { if (pos >= 2 && candidate[pos - 2] == '-') {
pos -= 2; pos -= 2;
} }
// 2.d. Let candidate be the substring of candidate from position 0, // 2.d. Let candidate be the substring of candidate from position 0,
// inclusive, to position pos, exclusive. // inclusive, to position pos, exclusive.
locale = locale.substr(0, pos); candidate = candidate.substr(0, pos);
} while (true); }
} }
#define ANY_EXTENSION_REGEXP "-[a-z0-9]{1}-.*" // Removes unicode extensions from a given bcp47 language tag.
// For example, converts 'en-US-u-co-emoji' to 'en-US'.
std::string RemoveUnicodeExtensions(const std::string& locale) {
size_t length = locale.length();
// Privateuse or grandfathered locales have no extension sequences.
if ((length > 1) && (locale[1] == '-')) {
// Check to make sure that this really is a grandfathered or
// privateuse extension. ICU can sometimes mess up the
// canonicalization.
CHECK(locale[0] == 'x' || locale[0] == 'i');
return locale;
}
size_t unicode_extension_start = locale.find("-u-");
std::unique_ptr<icu::RegexMatcher> GetAnyExtensionRegexpMatcher() { // No unicode extensions found.
UErrorCode status = U_ZERO_ERROR; if (unicode_extension_start == std::string::npos) return locale;
std::unique_ptr<icu::RegexMatcher> matcher(new icu::RegexMatcher(
icu::UnicodeString(ANY_EXTENSION_REGEXP, -1, US_INV), 0, status)); size_t private_extension_start = locale.find("-x-");
DCHECK(U_SUCCESS(status));
return matcher; // Unicode extensions found within privateuse subtags don't count.
} if (private_extension_start != std::string::npos &&
private_extension_start < unicode_extension_start) {
return locale;
}
#undef ANY_EXTENSION_REGEXP const std::string beginning = locale.substr(0, unicode_extension_start);
size_t unicode_extension_end = length;
DCHECK_GT(length, 2);
// ECMA 402 9.2.7 LookupSupportedLocales(availableLocales, requestedLocales) // Find the end of the extension production as per the bcp47 grammar
// https://tc39.github.io/ecma402/#sec-lookupsupportedlocales // by looking for '-' followed by 2 chars and then another '-'.
for (size_t i = unicode_extension_start + 1; i < length - 2; i++) {
if (locale[i] != '-') continue;
if (locale[i + 2] == '-') {
unicode_extension_end = i;
break;
}
i += 2;
}
const std::string end = locale.substr(unicode_extension_end);
return beginning + end;
}
// ecma402/#sec-lookupsupportedlocales
std::vector<std::string> LookupSupportedLocales( std::vector<std::string> LookupSupportedLocales(
const std::set<std::string>& available_locales, const std::set<std::string>& available_locales,
const std::vector<std::string>& requested_locales) { const std::vector<std::string>& requested_locales) {
std::unique_ptr<icu::RegexMatcher> matcher = GetAnyExtensionRegexpMatcher();
// 1. Let subset be a new empty List. // 1. Let subset be a new empty List.
std::vector<std::string> subset; std::vector<std::string> subset;
// 2. For each element locale of requestedLocales in List order, do // 2. For each element locale of requestedLocales in List order, do
for (const auto& locale : requested_locales) { for (const std::string& locale : requested_locales) {
// 2.a. Let noExtensionsLocale be the String value that is locale with all // 2. a. Let noExtensionsLocale be the String value that is locale
// Unicode locale extension sequences removed. // with all Unicode locale extension sequences removed.
icu::UnicodeString locale_uni(locale.c_str(), -1, US_INV); std::string no_extension_locale = RemoveUnicodeExtensions(locale);
// TODO(bstell): look at using uloc_forLanguageTag to convert the language
// tag to locale id // 2. b. Let availableLocale be
// TODO(bstell): look at using uloc_getBaseName to just get the name without // BestAvailableLocale(availableLocales, noExtensionsLocale).
// all the keywords
matcher->reset(locale_uni);
UErrorCode status = U_ZERO_ERROR;
// TODO(bstell): need to determine if this is the correct behavior.
// This matches the JS implementation but might not match the spec.
// According to
// https://tc39.github.io/ecma402/#sec-unicode-locale-extension-sequences:
//
// This standard uses the term "Unicode locale extension sequence" for
// any substring of a language tag that is not part of a private use
// subtag sequence, starts with a separator "-" and the singleton "u",
// and includes the maximum sequence of following non-singleton subtags
// and their preceding "-" separators.
//
// According to the spec a locale "en-t-aaa-u-bbb-v-ccc-x-u-ddd", should
// remove only the "-u-bbb" part, and keep everything else, whereas this
// regexp matcher would leave only the "en".
icu::UnicodeString no_extensions_locale_uni =
matcher->replaceAll("", status);
DCHECK(U_SUCCESS(status));
std::string no_extensions_locale;
no_extensions_locale_uni.toUTF8String(no_extensions_locale);
// 2.b. Let availableLocale be BestAvailableLocale(availableLocales,
// noExtensionsLocale).
std::string available_locale = std::string available_locale =
BestAvailableLocale(available_locales, no_extensions_locale); BestAvailableLocale(available_locales, no_extension_locale);
// 2.c. If availableLocale is not undefined, append locale to the end of
// subset. // 2. c. If availableLocale is not undefined, append locale to the
// end of subset.
if (!available_locale.empty()) { if (!available_locale.empty()) {
subset.push_back(locale); subset.push_back(locale);
} }
......
...@@ -27,55 +27,67 @@ ...@@ -27,55 +27,67 @@
// Tests supportedLocalesOf method. // Tests supportedLocalesOf method.
var undef = Intl.DateTimeFormat.supportedLocalesOf(); var services = [
assertEquals([], undef); Intl.DateTimeFormat,
Intl.Collator,
Intl.NumberFormat,
Intl.PluralRules
];
var empty = Intl.DateTimeFormat.supportedLocalesOf([]); for (const service of services) {
assertEquals([], empty); let undef = service.supportedLocalesOf();
assertEquals([], undef);
var strLocale = Intl.DateTimeFormat.supportedLocalesOf('sr'); let empty = service.supportedLocalesOf([]);
assertEquals('sr', strLocale[0]); assertEquals([], empty);
var multiLocale = let strLocale = service.supportedLocalesOf("sr");
Intl.DateTimeFormat.supportedLocalesOf(['sr-Thai-RS', 'de', 'zh-CN']); assertEquals("sr", strLocale[0]);
assertEquals('sr-Thai-RS', multiLocale[0]);
assertEquals('de', multiLocale[1]);
assertEquals('zh-CN', multiLocale[2]);
collatorUndef = Intl.Collator.supportedLocalesOf(); var locales = ["sr-Thai-RS", "de", "zh-CN"];
assertEquals([], collatorUndef); let multiLocale = service.supportedLocalesOf(locales);
assertEquals("sr-Thai-RS", multiLocale[0]);
assertEquals("de", multiLocale[1]);
assertEquals("zh-CN", multiLocale[2]);
collatorEmpty = Intl.Collator.supportedLocalesOf([]); let numLocale = service.supportedLocalesOf(1);
assertEquals([], collatorEmpty); assertEquals([], numLocale);
assertThrows(function() {
numLocale = Intl.Collator.supportedLocalesOf([1]);
}, TypeError);
collatorStrLocale = Intl.Collator.supportedLocalesOf('sr'); extensionLocale = service.supportedLocalesOf("id-u-co-pinyin");
assertEquals('sr', collatorStrLocale[0]); assertEquals("id-u-co-pinyin", extensionLocale[0]);
collatorMultiLocale = bestFitLocale = service.supportedLocalesOf("de", {
Intl.Collator.supportedLocalesOf(['sr-Thai-RS', 'de', 'zh-CN']); localeMatcher: "best fit"
assertEquals('sr-Thai-RS', collatorMultiLocale[0]); });
assertEquals('de', collatorMultiLocale[1]); assertEquals("de", bestFitLocale[0]);
assertEquals('zh-CN', collatorMultiLocale[2]);
numLocale = Intl.Collator.supportedLocalesOf(1); // Need a better test for "lookup" once it differs from "best fit".
assertEquals([], numLocale); lookupLocale = service.supportedLocalesOf("zh-CN", {
localeMatcher: "lookup"
});
assertEquals("zh-CN", lookupLocale[0]);
assertThrows(function() { assertThrows(function() {
numLocale = Intl.Collator.supportedLocalesOf([1]); service.supportedLocalesOf("id-u-co-pinyin", { localeMatcher: "xyz" });
}, TypeError); }, RangeError);
extensionLocale = Intl.Collator.supportedLocalesOf('id-u-co-pinyin'); privateuseLocale = service.supportedLocalesOf("en-US-x-twain");
assertEquals('id-u-co-pinyin', extensionLocale[0]); assertEquals("en-US-x-twain", privateuseLocale[0]);
bestFitLocale = privateuseLocale2 = service.supportedLocalesOf("x-twain");
Intl.Collator.supportedLocalesOf('de', {localeMatcher: 'best fit'}); assertEquals(undefined, privateuseLocale2[0]);
assertEquals('de', bestFitLocale[0]);
// Need a better test for "lookup" once it differs from "best fit". grandfatheredLocale = service.supportedLocalesOf("art-lojban");
lookupLocale = assertEquals(undefined, grandfatheredLocale[0]);
Intl.Collator.supportedLocalesOf('zh-CN', {localeMatcher: 'lookup'});
assertEquals('zh-CN', lookupLocale[0]);
assertThrows(function() { grandfatheredLocale2 = service.supportedLocalesOf("i-pwn");
Intl.Collator.supportedLocalesOf('id-u-co-pinyin', {localeMatcher: 'xyz'}); assertEquals(undefined, grandfatheredLocale2[0]);
}, RangeError);
unicodeInPrivateuseLocale = service.supportedLocalesOf(
"en-US-x-u-co-phonebk"
);
assertEquals("en-US-x-u-co-phonebk", unicodeInPrivateuseLocale[0]);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment