Commit e3a5b1e4 authored by Brian Stell, committed by Commit Bot

Add IsStructurallyValidLanguageTag() routine.

Bug: v8:5751


Cq-Include-Trybots: luci.v8.try:v8_linux_noi18n_rel_ng
Change-Id: If71ab553f7f70dd148fb90a18ccd9b1c69791323
Reviewed-on: https://chromium-review.googlesource.com/1119103
Reviewed-by: Sathya Gunasekaran <gsathya@chromium.org>
Reviewed-by: Jungshik Shin <jshin@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Brian Stell <bstell@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54499}
parent 46a78fbe
......@@ -18,6 +18,7 @@
#include "src/isolate.h"
#include "src/objects-inl.h"
#include "src/objects/managed.h"
#include "src/objects/string.h"
#include "src/property-descriptor.h"
#include "unicode/brkiter.h"
#include "unicode/bytestream.h"
......@@ -1308,11 +1309,7 @@ MaybeHandle<Object> NumberFormat::FormatNumber(
reinterpret_cast<const uint16_t*>(result.getBuffer()), result.length()));
}
// TODO(bstell): enable this anonymous namespace once these routines are called:
// * GetLanguageSingletonRegexMatcher,
// * GetLanguageTagRegexMatcher
// * GetLanguageVariantRegexMatcher
// namespace {
namespace {
// TODO(bstell): Make all these a constexpr on the Intl class.
void BuildLanguageTagRegexps(Isolate* isolate) {
......@@ -1353,13 +1350,13 @@ void BuildLanguageTagRegexps(Isolate* isolate) {
UErrorCode status = U_ZERO_ERROR;
icu::RegexMatcher* language_singleton_regexp_matcher = new icu::RegexMatcher(
icu::UnicodeString::fromUTF8(language_singleton_regexp), 0, status);
CHECK(U_SUCCESS(status));
icu::RegexMatcher* language_tag_regexp_matcher = new icu::RegexMatcher(
icu::UnicodeString::fromUTF8(language_tag_regexp), 0, status);
CHECK(U_SUCCESS(status));
icu::RegexMatcher* language_variant_regexp_matcher = new icu::RegexMatcher(
icu::UnicodeString::fromUTF8(language_variant_regexp), 0, status);
if (!U_SUCCESS(status)) {
return;
}
CHECK(U_SUCCESS(status));
isolate->set_language_tag_regexp_matchers(language_singleton_regexp_matcher,
language_tag_regexp_matcher,
......@@ -1398,7 +1395,7 @@ icu::RegexMatcher* GetLanguageVariantRegexMatcher(Isolate* isolate) {
return language_variant_regexp_matcher;
}
// } // anonymous namespace
} // anonymous namespace
MaybeHandle<JSObject> Intl::ResolveLocale(Isolate* isolate, const char* service,
Handle<Object> requestedLocales,
......@@ -1497,5 +1494,122 @@ V8_WARN_UNUSED_RESULT Maybe<bool> Intl::GetBoolOption(
return Just(false);
}
// TODO(bstell): enable this anonymous namespace once
// IsStructurallyValidLanguageTag is called.
// namespace {
// Lower-cases a single ASCII letter. Any character outside 'A'..'Z' is
// returned unchanged, so the result does not depend on the current locale.
char AsciiToLower(char c) {
  const bool is_ascii_upper = (c >= 'A') && (c <= 'Z');
  // In ASCII the lower-case letter differs from its upper-case form only in
  // bit 5 (0x20), so setting that bit performs the conversion.
  return is_ascii_upper ? static_cast<char>(c | 0x20) : c;
}
/**
 * Check the structural validity of the language tag per ECMA 402 6.2.2:
 *   - Well-formed per RFC 5646 2.1
 *   - There are no duplicate variant subtags
 *   - There are no duplicate singleton (extension) subtags
 *
 * One extra check is done (from RFC 5646 2.2.9): the tag is compared
 * against the list of grandfathered tags. However, subtags for
 * primary/extended language, script, region, variant are not checked
 * against the IANA language subtag registry.
 *
 * ICU is too permissive and lets invalid tags, like
 * hant-cmn-cn, through.
 *
 * The comparison of subtags is case-insensitive: the tag is lower-cased
 * (ASCII only) before duplicates are searched for.
 *
 * Returns false if the language tag is invalid, is not pure ASCII, or if
 * ICU reports an error while matching.
 */
bool IsStructurallyValidLanguageTag(Isolate* isolate,
                                    const std::string& locale_in) {
  if (!String::IsAscii(locale_in.c_str(),
                       static_cast<int>(locale_in.length()))) {
    return false;
  }
  std::string locale(locale_in);
  icu::RegexMatcher* language_tag_regexp_matcher =
      GetLanguageTagRegexMatcher(isolate);

  // Check if it's well-formed, including grandfathered tags.
  language_tag_regexp_matcher->reset(
      icu::UnicodeString(locale.c_str(), -1, US_INV));
  UErrorCode status = U_ZERO_ERROR;
  bool is_valid_lang_tag = language_tag_regexp_matcher->matches(status);
  if (!is_valid_lang_tag || V8_UNLIKELY(U_FAILURE(status))) {
    return false;
  }

  std::transform(locale.begin(), locale.end(), locale.begin(), AsciiToLower);

  // Just return if it's a x- form. It's all private.
  if (locale.find("x-") == 0) {
    return true;
  }

  // Check if there are any duplicate variants or singletons (extensions).

  // Remove private use section.
  locale = locale.substr(0, locale.find("-x-"));

  // Split the remaining tag into its '-' separated subtags.
  std::vector<std::string> parts;
  size_t start = 0;
  size_t dash = 0;
  while ((dash = locale.find('-', start)) != std::string::npos) {
    parts.push_back(locale.substr(start, dash - start));
    start = dash + 1;
  }
  if (start < locale.length()) {
    parts.push_back(locale.substr(start));
  }

  icu::RegexMatcher* language_variant_regexp_matcher =
      GetLanguageVariantRegexMatcher(isolate);
  icu::RegexMatcher* language_singleton_regexp_matcher =
      GetLanguageSingletonRegexMatcher(isolate);

  std::vector<std::string> variants;
  std::vector<std::string> extensions;
  // Skip language since it can match variant regex, so we start from 1.
  // Without the skip, a well-formed tag whose 5-8 letter primary language
  // is repeated as a variant would be rejected as a duplicate variant.
  // We are matching i-klingon here, but that's ok, since i-klingon-klingon
  // is not valid and would fail LANGUAGE_TAG_RE test.
  for (size_t i = 1; i < parts.size(); ++i) {
    const std::string& value = parts[i];

    language_variant_regexp_matcher->reset(
        icu::UnicodeString::fromUTF8(value.c_str()));
    bool is_language_variant = language_variant_regexp_matcher->matches(status);
    if (V8_UNLIKELY(U_FAILURE(status))) {
      return false;
    }
    // Variants are only tracked until the first singleton (extension) has
    // been seen; subtags after it belong to the extension.
    if (is_language_variant && extensions.size() == 0) {
      if (std::find(variants.begin(), variants.end(), value) ==
          variants.end()) {
        variants.push_back(value);
      } else {
        return false;
      }
    }

    language_singleton_regexp_matcher->reset(
        icu::UnicodeString(value.c_str(), -1, US_INV));
    bool is_language_singleton =
        language_singleton_regexp_matcher->matches(status);
    if (V8_UNLIKELY(U_FAILURE(status))) {
      return false;
    }
    if (is_language_singleton) {
      if (std::find(extensions.begin(), extensions.end(), value) ==
          extensions.end()) {
        extensions.push_back(value);
      } else {
        return false;
      }
    }
  }

  return true;
}
// } // anonymous namespace
} // namespace internal
} // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment