Commit b348d47b authored by jshin's avatar jshin Committed by Commit bot

Use ICU case conversion/transliterator for case conversion

When I18N is enabled, use ICU's case conversion API and transliteration
API [1] to implement String.prototype.to{Upper,Lower}Case and
String.prototype.toLocale{Upper,Lower}Case.

* ICU-based case conversion was implemented in runtime-i18n.cc/i18n.js
* The above 4 functions are overridden with those in i18n.js when
  --icu_case_mapping flag is turned on. To control the override by the flag,
  they're overridden in icu-case-mapping.js

Previously, toLocale{U,L}Case just called to{U,L}Case so that they didn't
support locale-sensitive case conversion for Turkic languages (az, tr),
Greek (el) and Lithuanian (lt).

Before ICU APIs for the most general case are called, a fast-path for Latin-1
is tried. It's taken from Blink and adapted as necessary. This fast path
is always tried for to{U,L}Case. For toLocale{U,L}Case, it's only taken
when a locale (explicitly specified or default) is not in {az, el, lt, tr}.

With these changes, a build with --icu_case_mapping=true passes a bunch
of tests in test262/intl402/Strings/* and intl/* that failed before.

Handling of pure ASCII strings (aligned at word boundary) is not as fast
as Unibrow's implementation that uses word-by-word case conversion. OTOH,
Latin-1 input handling is faster than Unibrow. General Unicode input
handling is slower but more accurate.

See https://docs.google.com/spreadsheets/d/1KJCJxKc1FxFXjwmYqABS0_2cNdPetvnd8gY8_HGSbrg/edit?usp=sharing for the benchmark.

This CL started with http://crrev.com/1544023002#ps200001 by littledan@,
but has changed significantly since.

[1] See why transliteration API is needed for uppercasing in Greek.
    http://bugs.icu-project.org/trac/ticket/10582

R=yangguo
BUG=v8:4476,v8:4477
LOG=Y
TEST=test262/{built-ins,intl402}/Strings/*, webkit/fast/js/*, mjsunit/string-case,
     intl/general/case*

Review-Url: https://codereview.chromium.org/1812673005
Cr-Commit-Position: refs/heads/master@{#36187}
parent adcc5119
......@@ -428,6 +428,10 @@ action("js2c_experimental") {
"$target_gen_dir/experimental-libraries.cc",
]
if (v8_enable_i18n_support) {
sources += [ "src/js/icu-case-mapping.js" ]
}
args = [
rebase_path("$target_gen_dir/experimental-libraries.cc",
root_build_dir),
......
......@@ -2478,6 +2478,9 @@ EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_instanceof)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_restrictive_declarations)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_exponentiation_operator)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_string_padding)
#ifdef V8_I18N_SUPPORT
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(icu_case_mapping)
#endif
void InstallPublicSymbol(Factory* factory, Handle<Context> native_context,
const char* name, Handle<Symbol> value) {
......@@ -3046,6 +3049,10 @@ bool Genesis::InstallExperimentalNatives() {
static const char* harmony_exponentiation_operator_natives[] = {nullptr};
static const char* harmony_string_padding_natives[] = {
"native harmony-string-padding.js", nullptr};
#ifdef V8_I18N_SUPPORT
static const char* icu_case_mapping_natives[] = {"native icu-case-mapping.js",
nullptr};
#endif
for (int i = ExperimentalNatives::GetDebuggerCount();
i < ExperimentalNatives::GetBuiltinsCount(); i++) {
......
......@@ -193,12 +193,22 @@ DEFINE_IMPLICATION(es_staging, harmony_regexp_lookbehind)
DEFINE_IMPLICATION(es_staging, move_object_start)
// Features that are still work in progress (behind individual flags).
// HARMONY_INPROGRESS_BASE holds the entries that exist in every build.
// HARMONY_INPROGRESS appends the entries that need ICU when i18n support is
// compiled in, so the common entries are written once and cannot drift apart
// between the two configurations.
#define HARMONY_INPROGRESS_BASE(V)                                      \
  V(harmony_function_sent, "harmony function.sent")                     \
  V(harmony_sharedarraybuffer, "harmony sharedarraybuffer")             \
  V(harmony_simd, "harmony simd")                                       \
  V(harmony_do_expressions, "harmony do-expressions")                   \
  V(harmony_regexp_property, "harmony unicode regexp property classes")

#ifdef V8_I18N_SUPPORT
#define HARMONY_INPROGRESS(V) \
  HARMONY_INPROGRESS_BASE(V)  \
  V(icu_case_mapping, "case mapping with ICU rather than Unibrow")
#else
#define HARMONY_INPROGRESS(V) HARMONY_INPROGRESS_BASE(V)
#endif
// Features that are complete (but still behind --harmony/es-staging flag).
#define HARMONY_STAGED(V) \
......
......@@ -142,6 +142,13 @@ var AVAILABLE_LOCALES = {
*/
var DEFAULT_ICU_LOCALE = UNDEFINED;
function GetDefaultICULocaleJS() {
if (IS_UNDEFINED(DEFAULT_ICU_LOCALE)) {
DEFAULT_ICU_LOCALE = %GetDefaultICULocale();
}
return DEFAULT_ICU_LOCALE;
}
/**
* Unicode extension regular expression.
*/
......@@ -446,11 +453,7 @@ function lookupMatcher(service, requestedLocales) {
}
// Didn't find a match, return default.
if (IS_UNDEFINED(DEFAULT_ICU_LOCALE)) {
DEFAULT_ICU_LOCALE = %GetDefaultICULocale();
}
return {'locale': DEFAULT_ICU_LOCALE, 'extension': '', 'position': -1};
return {'locale': GetDefaultICULocaleJS(), 'extension': '', 'position': -1};
}
......@@ -722,21 +725,24 @@ function toTitleCaseTimezoneLocation(location) {
*/
function canonicalizeLanguageTag(localeID) {
// null is typeof 'object' so we have to do extra check.
if (typeof localeID !== 'string' && typeof localeID !== 'object' ||
if ((!IS_STRING(localeID) && !IS_RECEIVER(localeID)) ||
IS_NULL(localeID)) {
throw MakeTypeError(kLanguageID);
}
// Optimize for the most common case; a language code alone in
// the canonical form/lowercase (e.g. "en", "fil").
if (IS_STRING(localeID) &&
!IS_NULL(InternalRegExpMatch(/^[a-z]{2,3}$/, localeID))) {
return localeID;
}
var localeString = GlobalString(localeID);
if (isValidLanguageTag(localeString) === false) {
throw MakeRangeError(kInvalidLanguageTag, localeString);
}
// This call will strip -kn but not -kn-true extensions.
// ICU bug filed - http://bugs.icu-project.org/trac/ticket/9265.
// TODO(cira): check if -u-kn-true-kc-true-kh-true still throws after
// upgrade to ICU 4.9.
var tag = %CanonicalizeLanguageTag(localeString);
if (tag === 'invalid-tag') {
throw MakeRangeError(kInvalidLanguageTag, localeString);
......@@ -1989,6 +1995,37 @@ function cachedOrNewService(service, locales, options, defaults) {
return new savedObjects[service](locales, useOptions);
}
function LocaleConvertCase(s, locales, isToUpper) {
// ECMA 402 section 13.1.2 steps 1 through 12.
var language;
// Optimize for the most common two cases. initializeLocaleList() can handle
// them as well, but it's rather slow accounting for over 60% of
// toLocale{U,L}Case() and about 40% of toLocale{U,L}Case("<locale>").
if (IS_UNDEFINED(locales)) {
language = GetDefaultICULocaleJS();
} else if (IS_STRING(locales)) {
language = canonicalizeLanguageTag(locales);
} else {
var locales = initializeLocaleList(locales);
language = locales.length > 0 ? locales[0] : GetDefaultICULocaleJS();
}
// StringSplit is slower than this.
var pos = %_Call(StringIndexOf, language, '-');
if (pos != -1) {
language = %_Call(StringSubstring, language, 0, pos);
}
var CUSTOM_CASE_LANGUAGES = ['az', 'el', 'lt', 'tr'];
var langIndex = %_Call(ArrayIndexOf, CUSTOM_CASE_LANGUAGES, language);
if (langIndex == -1) {
// language-independent case conversion.
return isToUpper ? %StringToUpperCaseI18N(s) : %StringToLowerCaseI18N(s);
}
return %StringLocaleConvertCase(s, isToUpper,
CUSTOM_CASE_LANGUAGES[langIndex]);
}
/**
* Compares this and that, and returns less than 0, 0 or greater than 0 value.
* Overrides the built-in method.
......@@ -2041,6 +2078,56 @@ OverrideFunction(GlobalString.prototype, 'normalize', function() {
}
);
function ToLowerCaseI18N() {
if (!IS_UNDEFINED(new.target)) {
throw MakeTypeError(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLowerCase");
var s = TO_STRING(this);
return %StringToLowerCaseI18N(s);
}
function ToUpperCaseI18N() {
if (!IS_UNDEFINED(new.target)) {
throw MakeTypeError(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toUpperCase");
var s = TO_STRING(this);
return %StringToUpperCaseI18N(s);
}
function ToLocaleLowerCaseI18N(locales) {
if (!IS_UNDEFINED(new.target)) {
throw MakeTypeError(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleLowerCase");
return LocaleConvertCase(TO_STRING(this), locales, false);
}
%FunctionSetLength(ToLocaleLowerCaseI18N, 0);
function ToLocaleUpperCaseI18N(locales) {
if (!IS_UNDEFINED(new.target)) {
throw MakeTypeError(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleUpperCase");
return LocaleConvertCase(TO_STRING(this), locales, true);
}
%FunctionSetLength(ToLocaleUpperCaseI18N, 0);
// NOTE(review): judging by the intrinsic's name, these calls strip the
// 'prototype' property from the four functions - confirm against the runtime.
%FunctionRemovePrototype(ToLowerCaseI18N);
%FunctionRemovePrototype(ToUpperCaseI18N);
%FunctionRemovePrototype(ToLocaleLowerCaseI18N);
%FunctionRemovePrototype(ToLocaleUpperCaseI18N);
// Publish the four functions under their own names so that another native
// script can fetch them (icu-case-mapping.js does so with utils.ImportNow).
utils.Export(function(to) {
to.ToLowerCaseI18N = ToLowerCaseI18N;
to.ToUpperCaseI18N = ToUpperCaseI18N;
to.ToLocaleLowerCaseI18N = ToLocaleLowerCaseI18N;
to.ToLocaleUpperCaseI18N = ToLocaleUpperCaseI18N;
});
/**
* Formats a Number object (this) using locale and options values.
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Experimental native script: replaces the four case-conversion methods on
// String.prototype with the ICU-backed implementations exported through
// utils.
(function(global, utils) {
"use strict";
%CheckIsBootstrapping();
var GlobalString = global.String;
var OverrideFunction = utils.OverrideFunction;
// Fetch the ICU-backed implementations by their exported names.
var ToLowerCaseI18N = utils.ImportNow("ToLowerCaseI18N");
var ToUpperCaseI18N = utils.ImportNow("ToUpperCaseI18N");
var ToLocaleLowerCaseI18N = utils.ImportNow("ToLocaleLowerCaseI18N");
var ToLocaleUpperCaseI18N = utils.ImportNow("ToLocaleUpperCaseI18N");
// NOTE(review): the meaning of the trailing 'true' argument is defined by
// utils.OverrideFunction, which is not visible here - confirm.
OverrideFunction(GlobalString.prototype, 'toLowerCase', ToLowerCaseI18N, true);
OverrideFunction(GlobalString.prototype, 'toUpperCase', ToUpperCaseI18N, true);
OverrideFunction(GlobalString.prototype, 'toLocaleLowerCase',
ToLocaleLowerCaseI18N, true);
OverrideFunction(GlobalString.prototype, 'toLocaleUpperCase',
ToLocaleUpperCaseI18N, true);
})
......@@ -208,7 +208,11 @@ function PostNatives(utils) {
"SetIteratorNext",
"SetValues",
"SymbolToString",
"ToLocaleLowerCaseI18N",
"ToLocaleUpperCaseI18N",
"ToLowerCaseI18N",
"ToPositiveInteger",
"ToUpperCaseI18N",
// From runtime:
"is_concat_spreadable_symbol",
"iterator_symbol",
......
......@@ -8645,26 +8645,26 @@ class String: public Name {
class FlatContent {
public:
// Returns true if the string is flat and this structure contains content.
bool IsFlat() { return state_ != NON_FLAT; }
bool IsFlat() const { return state_ != NON_FLAT; }
// Returns true if the structure contains one-byte content.
bool IsOneByte() { return state_ == ONE_BYTE; }
bool IsOneByte() const { return state_ == ONE_BYTE; }
// Returns true if the structure contains two-byte content.
bool IsTwoByte() { return state_ == TWO_BYTE; }
bool IsTwoByte() const { return state_ == TWO_BYTE; }
// Return the one byte content of the string. Only use if IsOneByte()
// returns true.
Vector<const uint8_t> ToOneByteVector() {
Vector<const uint8_t> ToOneByteVector() const {
DCHECK_EQ(ONE_BYTE, state_);
return Vector<const uint8_t>(onebyte_start, length_);
}
// Return the two-byte content of the string. Only use if IsTwoByte()
// returns true.
Vector<const uc16> ToUC16Vector() {
Vector<const uc16> ToUC16Vector() const {
DCHECK_EQ(TWO_BYTE, state_);
return Vector<const uc16>(twobyte_start, length_);
}
uc16 Get(int i) {
uc16 Get(int i) const {
DCHECK(i < length_);
DCHECK(state_ != NON_FLAT);
if (state_ == ONE_BYTE) return onebyte_start[i];
......
......@@ -29,10 +29,12 @@
#include "unicode/rbbi.h"
#include "unicode/smpdtfmt.h"
#include "unicode/timezone.h"
#include "unicode/translit.h"
#include "unicode/uchar.h"
#include "unicode/ucol.h"
#include "unicode/ucurr.h"
#include "unicode/uloc.h"
#include "unicode/unistr.h"
#include "unicode/unum.h"
#include "unicode/uversion.h"
......@@ -749,6 +751,360 @@ RUNTIME_FUNCTION(Runtime_BreakIteratorBreakType) {
return *isolate->factory()->NewStringFromStaticChars("unknown");
}
}
namespace {
// Runs the ICU transliterator identified by |transliterator_id| (forward
// direction) over |input| in place. |input| is left untouched when ICU
// reports a failure while creating the transliterator.
void ConvertCaseWithTransliterator(icu::UnicodeString* input,
                                   const char* transliterator_id) {
  UErrorCode error = U_ZERO_ERROR;
  base::SmartPointer<icu::Transliterator> transliterator(
      icu::Transliterator::createInstance(
          icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,
          error));
  if (U_SUCCESS(error)) {
    transliterator->transliterate(*input);
  }
}
// Returns a UTF-16 view of the first |length| characters of |flat|.
// Two-byte content is returned directly. One-byte content is widened into
// |dest|, but only if |dest| is still empty, so that repeated calls with the
// same |dest| reuse the copy made by the first call.
const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
                                    base::SmartArrayPointer<uc16>* dest,
                                    int32_t length) {
  DCHECK(flat.IsFlat());
  if (!flat.IsOneByte()) {
    return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
  }
  if (dest->is_empty()) {
    dest->Reset(NewArray<uc16>(length));
    CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
  }
  return reinterpret_cast<const UChar*>(dest->get());
}
// Converts |s| to upper case (|is_to_upper| true) or lower case using ICU
// with the case-mapping rules of |lang|; callers in this file pass "" for the
// language-independent conversion and flatten |s| beforehand.
// Returns |s| itself when the Greek transliteration leaves the text unchanged
// or when ICU reports a failure; otherwise returns a new two-byte string.
MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
bool is_to_upper, const char* lang) {
int32_t src_length = s->length();
// Greek uppercasing has to be done via transliteration.
// TODO(jshin): Drop this special-casing once ICU's regular case conversion
// API supports Greek uppercasing. See
// http://bugs.icu-project.org/trac/ticket/10582 .
// In the meantime, if there's no Greek character in |s|, call this
// function again with the root locale (lang="").
// NOTE(review): the branch below always transliterates; no re-dispatch to
// the root locale is visible here - the sentence above may be stale.
// ICU's C API for transliteration is nasty and we just use C++ API.
if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {
icu::UnicodeString converted;
base::SmartArrayPointer<uc16> sap;
{
DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent();
const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
// Starts with the source string (read-only alias with copy-on-write
// semantics) and will be modified to contain the converted result.
// Using read-only alias at first saves one copy operation if
// transliteration does not change the input, which is rather rare.
// Moreover, transliteration takes rather long so that saving one copy
// helps only a little bit.
converted.setTo(false, src, src_length);
ConvertCaseWithTransliterator(&converted, "el-Upper");
// If no change is made, just return |s|.
// (Detected by |converted| still pointing at the aliased source buffer.)
if (converted.getBuffer() == src) return *s;
}
Handle<String> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result,
isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
reinterpret_cast<const uint16_t*>(converted.getBuffer()),
converted.length())));
return *result;
}
// Pick the ICU C function once; both share the same signature.
auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
// First guess: the result is as long as the source.
int32_t dest_length = src_length;
UErrorCode status;
Handle<SeqTwoByteString> result;
base::SmartArrayPointer<uc16> sap;
// This is not a real loop. It'll be executed only once (no overflow) or
// twice (overflow).
for (int i = 0; i < 2; ++i) {
result =
isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();
// The raw character pointers taken below must not be invalidated by a GC.
DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent();
// |sap| is filled at most once, so a second pass reuses the widened copy.
const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
status = U_ZERO_ERROR;
// The return value is the length ICU needs for the full result; on
// overflow it sizes the buffer allocated by the second pass.
dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
dest_length, src, src_length, lang, &status);
if (status != U_BUFFER_OVERFLOW_ERROR) break;
}
// In most cases, the output will fill the destination buffer completely
// leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
// Only in rare cases, it'll be shorter than the destination buffer and
// |result| has to be truncated.
DCHECK(U_SUCCESS(status));
if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
DCHECK(dest_length == result->length());
return *result;
}
if (U_SUCCESS(status)) {
DCHECK(dest_length < result->length());
return *Handle<SeqTwoByteString>::cast(
SeqString::Truncate(result, dest_length));
}
// ICU failed: fall back to returning the unconverted input.
return *s;
}
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  return 'A' <= ch && ch <= 'Z';
}
// Lower-case mapping for every Latin-1 code point (index = code point).
// 'A'..'Z' map to 'a'..'z' and U+00C0..U+00DE map to U+00E0..U+00FE, except
// U+00D7 (MULTIPLICATION SIGN), which maps to itself like all other entries.
const uint8_t kToLower[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
    0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
    0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
    0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
    0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
    0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
    0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
    0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
    0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
    0xFC, 0xFD, 0xFE, 0xFF,
};

// Returns the lower-case form of the Latin-1 character |ch|.
// |ch| must be in the range U+0000..U+00FF: it indexes the 256-entry table
// directly, so anything larger would read past its end.
inline uint16_t ToLatin1Lower(uint16_t ch) {
  DCHECK(ch <= 0xFF);
  return static_cast<uint16_t>(kToLower[ch]);
}
// Upper-cases |ch| if it is an ASCII small letter and returns every other
// value unchanged. Branch-free: the letter test yields 0 or 1, which is
// shifted into bit 5 (0x20), the bit that separates 'a'..'z' from 'A'..'Z'.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  return ch & ~(is_ascii_lower << 5);
}
// Upper-cases a Latin-1 character whose upper-case form is also a single
// Latin-1 character.
// Does not work for U+00DF (sharp-s), U+00B5 (micro sign), U+00FF: their
// upper-case forms are either two characters or outside Latin-1, so callers
// must handle them before calling this function.
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  // U+00E0..U+00FE are small letters whose capitals sit 0x20 below them,
  // with one exception: U+00F7 (DIVISION SIGN) has no case and must stay
  // as is. U+00E7 (c with cedilla) is a regular letter and maps to U+00C7.
  const bool is_latin1_lower = (ch & 0xE0) == 0xE0 && ch != 0xF7;
  // Clear bit 5 (0x20) only for characters that have a capital form.
  return ch & ~((is_ascii_lower || is_latin1_lower) << 5);
}
template <typename Char>
bool ToUpperFastASCII(const Vector<const Char>& src,
Handle<SeqOneByteString> result) {
// Do a faster loop for the case where all the characters are ASCII.
uint16_t ored = 0;
int32_t index = 0;
for (auto it = src.begin(); it != src.end(); ++it) {
uint16_t ch = static_cast<uint16_t>(*it);
ored |= ch;
result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
}
return !(ored & ~0x7F);
}
const uint16_t sharp_s = 0xDF;
template <typename Char>
bool ToUpperOneByte(const Vector<const Char>& src,
Handle<SeqOneByteString> result, int* sharp_s_count) {
// Still pretty-fast path for the input with non-ASCII Latin-1 characters.
// There are two special cases.
// 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
// 2. Lower case sharp-S converts to "SS" (two characters)
*sharp_s_count = 0;
int32_t index = 0;
for (auto it = src.begin(); it != src.end(); ++it) {
uint16_t ch = static_cast<uint16_t>(*it);
if (V8_UNLIKELY(ch == sharp_s)) {
++(*sharp_s_count);
continue;
}
if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
// Since this upper-cased character does not fit in an 8-bit string, we
// need to take the 16-bit path.
return false;
}
result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
}
return true;
}
template <typename Char>
void ToUpperWithSharpS(const Vector<const Char>& src,
Handle<SeqOneByteString> result) {
int32_t dest_index = 0;
for (auto it = src.begin(); it != src.end(); ++it) {
uint16_t ch = static_cast<uint16_t>(*it);
if (ch == sharp_s) {
result->SeqOneByteStringSet(dest_index++, 'S');
result->SeqOneByteStringSet(dest_index++, 'S');
} else {
result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
}
}
}
} // namespace
// Language-independent lower-casing of the single String argument.
// Strings reported as containing only one-byte characters are handled here
// with the kToLower table; everything else goes to ICU through
// LocaleConvertCase with the root locale ("").
RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
HandleScope scope(isolate);
DCHECK_EQ(args.length(), 1);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
int length = s->length();
s = String::Flatten(s);
// First scan the string for uppercase and non-ASCII characters:
if (s->HasOnlyOneByteChars()) {
// Stays equal to |length| when the scan finds nothing to convert.
unsigned first_index_to_lower = length;
for (int index = 0; index < length; ++index) {
// Blink specializes this path for one-byte strings, so it
// does not need to do a generic get, but can do the equivalent
// of SeqOneByteStringGet.
uint16_t ch = s->Get(index);
if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
first_index_to_lower = index;
break;
}
}
// Nothing to do if the string is all ASCII with no uppercase.
if (first_index_to_lower == length) return *s;
// We depend here on the invariant that the length of a Latin1
// string is invariant under ToLowerCase, and the result always
// fits in the Latin1 range in the *root locale*. It does not hold
// for ToUpperCase even in the root locale.
Handle<SeqOneByteString> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewRawOneByteString(length));
DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent();
// Both branches copy the already-lowercase ASCII prefix verbatim and
// convert the remainder through the table; they differ only in the width
// of the source characters.
if (flat.IsOneByte()) {
const uint8_t* src = flat.ToOneByteVector().start();
CopyChars(result->GetChars(), src, first_index_to_lower);
for (int index = first_index_to_lower; index < length; ++index) {
uint16_t ch = static_cast<uint16_t>(src[index]);
result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
}
} else {
const uint16_t* src = flat.ToUC16Vector().start();
CopyChars(result->GetChars(), src, first_index_to_lower);
for (int index = first_index_to_lower; index < length; ++index) {
uint16_t ch = src[index];
result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
}
}
return *result;
}
// Blink had an additional case here for ASCII 2-byte strings, but
// that is subsumed by the above code (assuming there isn't a false
// negative for HasOnlyOneByteChars).
// Do a slower implementation for cases that include non-ASCII characters.
return LocaleConvertCase(s, isolate, false, "");
}
// Language-independent upper-casing of the single String argument.
// For strings reported as containing only one-byte characters it tries, in
// order: the ASCII-only pass, the Latin-1 pass, and the Latin-1 pass with
// sharp-s expansion. If a character's upper-case form leaves Latin-1, or the
// string is not one-byte only, ICU is used through LocaleConvertCase with
// the root locale ("").
RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
HandleScope scope(isolate);
DCHECK_EQ(args.length(), 1);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
// This function could be optimized for no-op cases the way lowercase
// counterpart is, but in empirical testing, few actual calls to upper()
// are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
int32_t length = s->length();
s = String::Flatten(s);
if (s->HasOnlyOneByteChars()) {
Handle<SeqOneByteString> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewRawOneByteString(length));
int sharp_s_count;
bool is_result_single_byte;
{
// Scoped so that heap allocation is allowed again for the second
// NewRawOneByteString call further down.
DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent();
// If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
// could be removed because ToUpperOneByte is pretty fast now (it
// does not call ICU API any more.).
if (flat.IsOneByte()) {
Vector<const uint8_t> src = flat.ToOneByteVector();
if (ToUpperFastASCII(src, result)) return *result;
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
} else {
DCHECK(flat.IsTwoByte());
Vector<const uint16_t> src = flat.ToUC16Vector();
if (ToUpperFastASCII(src, result)) return *result;
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
}
}
// Go to the full Unicode path if there are characters whose uppercase
// is beyond the Latin-1 range (cannot be represented in OneByteString).
if (V8_UNLIKELY(!is_result_single_byte)) {
return LocaleConvertCase(s, isolate, true, "");
}
if (sharp_s_count == 0) return *result;
// We have sharp_s_count sharp-s characters, but the result is still
// in the Latin-1 range.
// Each sharp-s becomes "SS", so the result grows by one character per
// sharp-s and is rebuilt from scratch in a string of that size.
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result,
isolate->factory()->NewRawOneByteString(length + sharp_s_count));
DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent();
if (flat.IsOneByte()) {
ToUpperWithSharpS(flat.ToOneByteVector(), result);
} else {
ToUpperWithSharpS(flat.ToUC16Vector(), result);
}
return *result;
}
return LocaleConvertCase(s, isolate, true, "");
}
// Locale-sensitive case conversion.
// Arguments: (0) the String to convert, (1) a boolean that is true for
// upper-casing, (2) the two-letter language code as a SeqOneByteString.
RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {
HandleScope scope(isolate);
DCHECK_EQ(args.length(), 3);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);
CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);
// All the languages requiring special handling ("az", "el", "lt", "tr")
// have a 2-letter language code.
DCHECK(lang->length() == 2);
// Copy the code into a NUL-terminated buffer, the form LocaleConvertCase
// takes its |lang| argument in.
uint8_t lang_str[3];
memcpy(lang_str, lang->GetChars(), 2);
lang_str[2] = 0;
s = String::Flatten(s);
// TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath
// in the root locale needs to be adjusted for az, lt and tr because even case
// mapping of ASCII range characters are different in those locales.
// Greek (el) does not require any adjustment, though.
return LocaleConvertCase(s, isolate, is_upper,
reinterpret_cast<const char*>(lang_str));
}
} // namespace internal
} // namespace v8
......
......@@ -1077,7 +1077,7 @@ MUST_USE_RESULT static Object* ConvertCase(
RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
HandleScope scope(isolate);
DCHECK(args.length() == 1);
DCHECK_EQ(args.length(), 1);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());
}
......@@ -1085,7 +1085,7 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
HandleScope scope(isolate);
DCHECK(args.length() == 1);
DCHECK_EQ(args.length(), 1);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping());
}
......
......@@ -262,7 +262,10 @@ namespace internal {
F(BreakIteratorFirst, 1, 1) \
F(BreakIteratorNext, 1, 1) \
F(BreakIteratorCurrent, 1, 1) \
F(BreakIteratorBreakType, 1, 1)
F(BreakIteratorBreakType, 1, 1) \
F(StringToLowerCaseI18N, 1, 1) \
F(StringToUpperCaseI18N, 1, 1) \
F(StringLocaleConvertCase, 3, 1)
#else
#define FOR_EACH_INTRINSIC_I18N(F)
#endif
......
......@@ -1988,17 +1988,6 @@
}, {
'toolsets': ['target'],
}],
['v8_enable_i18n_support==1', {
'variables': {
'i18n_library_files': [
'js/i18n.js',
],
},
}, {
'variables': {
'i18n_library_files': [],
},
}],
],
'variables': {
'library_files': [
......@@ -2048,6 +2037,12 @@
'libraries_experimental_bin_file': '<(SHARED_INTERMEDIATE_DIR)/libraries-experimental.bin',
'libraries_extras_bin_file': '<(SHARED_INTERMEDIATE_DIR)/libraries-extras.bin',
'libraries_experimental_extras_bin_file': '<(SHARED_INTERMEDIATE_DIR)/libraries-experimental-extras.bin',
'conditions': [
['v8_enable_i18n_support==1', {
'library_files': ['js/i18n.js'],
'experimental_library_files': ['js/icu-case-mapping.js'],
}],
],
},
'actions': [
{
......@@ -2055,7 +2050,6 @@
'inputs': [
'../tools/js2c.py',
'<@(library_files)',
'<@(i18n_library_files)'
],
'outputs': ['<(SHARED_INTERMEDIATE_DIR)/libraries.cc'],
'action': [
......@@ -2064,7 +2058,6 @@
'<(SHARED_INTERMEDIATE_DIR)/libraries.cc',
'CORE',
'<@(library_files)',
'<@(i18n_library_files)'
],
},
{
......@@ -2072,7 +2065,6 @@
'inputs': [
'../tools/js2c.py',
'<@(library_files)',
'<@(i18n_library_files)'
],
'outputs': ['<@(libraries_bin_file)'],
'action': [
......@@ -2081,7 +2073,6 @@
'<(SHARED_INTERMEDIATE_DIR)/libraries.cc',
'CORE',
'<@(library_files)',
'<@(i18n_library_files)',
'--startup_blob', '<@(libraries_bin_file)',
'--nojs',
],
......@@ -2098,7 +2089,7 @@
'../tools/js2c.py',
'<(SHARED_INTERMEDIATE_DIR)/experimental-libraries.cc',
'EXPERIMENTAL',
'<@(experimental_library_files)'
'<@(experimental_library_files)',
],
},
{
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --icu_case_mapping
// Some edge cases that unibrow got wrong
assertEquals("𐐘", "𐑀".toUpperCase());
assertEquals("𐑀", "𐐘".toLowerCase());
assertEquals("σ", "Σ".toLowerCase());
// Some different paths in the ICU case conversion fastpath
assertEquals("σς", "\u03A3\u03A3".toLowerCase());
// Expand sharp s in latin1 fastpath
assertEquals("ASSB", "A\u00DFB".toUpperCase());
assertEquals("AB", "Ab".toUpperCase());
// Find first upper case in fastpath
assertEquals("ab", "aB".toLowerCase());
assertEquals("AÜ", "aü".toUpperCase());
assertEquals("AÜ", "AÜ".toUpperCase());
assertEquals("aü", "aü".toLowerCase());
assertEquals("aü", "AÜ".toLowerCase());
assertEquals("aü", "AÜ".toLowerCase());
// Starts with fastpath, but switches to full Unicode path
// U+00FF is uppercased to U+0178.
assertEquals("AŸ", "aÿ".toUpperCase());
// U+00B5 (µ) is uppercased to U+039C (Μ)
assertEquals("AΜ", "aµ".toUpperCase());
// Buffer size increase
assertEquals("CSSBẶ", "cßbặ".toUpperCase());
assertEquals("FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04".toUpperCase());
// OneByte input with buffer size increase: non-fast path
assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr"));
// More comprehensive tests for "tr", "az" and "lt" are in
// test262/intl402/Strings/*
// Buffer size decrease with a single locale or locale list.
// In Turkic (tr, az), U+0307 preceded by Capital Letter I is dropped.
assertEquals("abci", "aBcI\u0307".toLocaleLowerCase("tr"));
assertEquals("abci", "aBcI\u0307".toLocaleLowerCase("az"));
assertEquals("abci", "aBcI\u0307".toLocaleLowerCase(["tr", "en"]));
// Cons string
assertEquals("abcijkl", ("aBcI" + "\u0307jkl").toLocaleLowerCase("tr"));
assertEquals("abcijkl",
("aB" + "cI" + "\u0307j" + "kl").toLocaleLowerCase("tr"));
assertEquals("abci\u0307jkl", ("aBcI" + "\u0307jkl").toLocaleLowerCase("en"));
assertEquals("abci\u0307jkl",
("aB" + "cI" + "\u0307j" + "kl").toLocaleLowerCase("en"));
assertEquals("abci\u0307jkl", ("aBcI" + "\u0307jkl").toLowerCase());
assertEquals("abci\u0307jkl",
("aB" + "cI" + "\u0307j" + "kl").toLowerCase());
// "tr" and "az" should behave identically.
assertEquals("aBcI\u0307".toLocaleLowerCase("tr"),
"aBcI\u0307".toLocaleLowerCase("az"));
// What matters is the first locale in the locale list.
assertEquals("aBcI\u0307".toLocaleLowerCase(["tr", "en", "fr"]),
"aBcI\u0307".toLocaleLowerCase("tr"));
assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
"aBcI\u0307".toLocaleLowerCase("en"));
assertEquals("aBcI\u0307".toLocaleLowerCase(["en", "tr", "az"]),
"aBcI\u0307".toLowerCase());
// An empty locale list is the same as the default locale. Try these tests
// under Turkish and Greek locale.
assertEquals("aBcI\u0307".toLocaleLowerCase([]),
"aBcI\u0307".toLocaleLowerCase());
assertEquals("aBcI\u0307".toLocaleLowerCase([]),
"aBcI\u0307".toLocaleLowerCase(Intl.GetDefaultLocale));
assertEquals("άόύώ".toLocaleUpperCase([]), "άόύώ".toLocaleUpperCase());
assertEquals("άόύώ".toLocaleUpperCase([]),
"άόύώ".toLocaleUpperCase(Intl.GetDefaultLocale));
// English/root locale keeps U+0307 (combining dot above).
assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase("en"));
assertEquals("abci\u0307", "aBcI\u0307".toLocaleLowerCase(["en", "tr"]));
assertEquals("abci\u0307", "aBcI\u0307".toLowerCase());
// Greek uppercasing: not covered by intl402/String/*, yet. Tonos (U+0301) and
// other diacritic marks are dropped. This rule is based on the current CLDR's
// el-Upper transformation, but Greek uppercasing rules are more sophisticated
// than this. See http://bugs.icu-project.org/trac/ticket/10582 and
// http://unicode.org/cldr/trac/ticket/7905 .
// The result depends only on the language subtag: region and script subtags
// after "el" make no difference.
assertEquals("Α", "α\u0301".toLocaleUpperCase("el"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-GR"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek"));
assertEquals("Α", "α\u0301".toLocaleUpperCase("el-Grek-GR"));
assertEquals("Α", "ά".toLocaleUpperCase("el"));
assertEquals("ΑΟΥΩ", "άόύώ".toLocaleUpperCase("el"));
assertEquals("ΑΟΥΩ", "α\u0301ο\u0301υ\u0301ω\u0301".toLocaleUpperCase("el"));
assertEquals("ΑΟΥΩ", "άόύώ".toLocaleUpperCase("el"));
// Input that is already (partly) upper case, precomposed and decomposed.
assertEquals("ΟΕ", "Ό\u1f15".toLocaleUpperCase("el"));
assertEquals("ΟΕ", "Ο\u0301ε\u0314\u0301".toLocaleUpperCase("el"));
// Input and output are identical.
assertEquals("αβγδε", "αβγδε".toLocaleLowerCase("el"));
assertEquals("ΑΒΓΔΕ", "ΑΒΓΔΕ".toLocaleUpperCase("el"));
assertEquals("ΑΒΓΔΕАБ𝐀𝐁", "ΑΒΓΔΕАБ𝐀𝐁".toLocaleUpperCase("el"));
assertEquals("ABCDEÂÓḴ123", "ABCDEÂÓḴ123".toLocaleUpperCase("el"));
// ASCII-only or Latin-1 only: 1-byte
assertEquals("ABCDE123", "ABCDE123".toLocaleUpperCase("el"));
assertEquals("ABCDEÂÓ123", "ABCDEÂÓ123".toLocaleUpperCase("el"));
// To make sure that the input string is not overwritten in place.
var strings = ["abCdef", "αβγδε", "άόύώ", "аб"];
for (var s of strings) {
// Snapshot the receiver's code units before converting.
var backupAsArray = s.split("");
// The converted value itself is not inspected; only the receiver is
// re-checked below.
var uppered = s.toLocaleUpperCase("el");
// The receiver must still match the snapshot taken before the conversion.
assertEquals(s, backupAsArray.join(""));
}
// In other locales, U+0301 is preserved.
// Each base letter is uppercased and its combining mark stays attached, both
// for an explicit non-Greek locale and for the locale-insensitive method.
assertEquals("Α\u0301Ο\u0301Υ\u0301Ω\u0301",
             "α\u0301ο\u0301υ\u0301ω\u0301".toLocaleUpperCase("en"));
assertEquals("Α\u0301Ο\u0301Υ\u0301Ω\u0301",
             "α\u0301ο\u0301υ\u0301ω\u0301".toUpperCase());
// Plane 1; Deseret and Warang Citi Script.
// U+10400/U+10428 and U+118A0/U+118C0 are expected to map to each other as
// upper/lower pairs; all four need surrogate pairs in UTF-16.
assertEquals("\u{10400}\u{118A0}", "\u{10428}\u{118C0}".toUpperCase());
assertEquals("\u{10428}\u{118C0}", "\u{10400}\u{118A0}".toLowerCase());
// Mathematical Bold {Capital, Small} Letter A do not change.
assertEquals("\u{1D400}\u{1D41A}", "\u{1D400}\u{1D41A}".toUpperCase());
assertEquals("\u{1D400}\u{1D41A}", "\u{1D400}\u{1D41A}".toLowerCase());
// Plane 1; New characters in Unicode 8.0
// U+10C80 / U+10CC0 are checked as an upper/lower pair in both directions
// through each entry point: locale-insensitive, default locale, and an
// explicit locale list (["tr"]).
assertEquals("\u{10C80}", "\u{10CC0}".toUpperCase());
assertEquals("\u{10CC0}", "\u{10C80}".toLowerCase());
assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase());
assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase());
assertEquals("\u{10C80}", "\u{10CC0}".toLocaleUpperCase(["tr"]));
assertEquals("\u{10CC0}", "\u{10C80}".toLocaleLowerCase(["tr"]));
......@@ -26,10 +26,12 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import re
from testrunner.local import testsuite
from testrunner.objects import testcase
# Matches a "// Flags: ..." comment in a test's source (at least one whitespace
# character after the slashes) and captures the rest of that line after the
# colon.
FLAGS_PATTERN = re.compile(r"//\s+Flags:(.*)")
class IntlTestSuite(testsuite.TestSuite):
......@@ -55,7 +57,11 @@ class IntlTestSuite(testsuite.TestSuite):
return tests
def GetFlagsForTestCase(self, testcase, context):
source = self.GetSourceForTest(testcase)
flags = ["--allow-natives-syntax"] + context.mode_flags
flags_match = re.findall(FLAGS_PATTERN, source)
for match in flags_match:
flags += match.strip().split()
files = []
files.append(os.path.join(self.root, "assert.js"))
......@@ -71,6 +77,10 @@ class IntlTestSuite(testsuite.TestSuite):
return testcase.flags + flags
def GetSourceForTest(self, testcase):
  """Returns the full text of the file backing `testcase`.

  The file name is built by joining the suite root with the test case's
  path plus the suite's file suffix.
  """
  filename = os.path.join(self.root, testcase.path + self.suffix())
  # The context manager closes the file even if read() raises.
  with open(filename) as f:
    return f.read()
def GetSuite(name, root):
  """Creates the IntlTestSuite for the given suite name and root directory."""
  return IntlTestSuite(name, root)
......@@ -139,14 +139,16 @@
'intl402/NumberFormat/11.1.1_1': [FAIL],
# https://code.google.com/p/v8/issues/detail?id=4476
'built-ins/String/prototype/toLocaleLowerCase/special_casing_conditional': [FAIL],
'built-ins/String/prototype/toLocaleLowerCase/supplementary_plane': [FAIL],
# The bug is fixed but behind a flag, --icu_case_mapping.
'built-ins/String/prototype/toLowerCase/special_casing_conditional': [FAIL],
'built-ins/String/prototype/toLowerCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toLocaleUpperCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toUpperCase/supplementary_plane': [FAIL],
# https://code.google.com/p/v8/issues/detail?id=4477
# The bug is fixed but behind a flag, --icu_case_mapping.
'built-ins/String/prototype/toLocaleUpperCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toLocaleLowerCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toLocaleLowerCase/special_casing_conditional': [FAIL],
'intl402/String/prototype/toLocaleLowerCase/special_casing_Azeri': [FAIL],
'intl402/String/prototype/toLocaleLowerCase/special_casing_Lithuanian': [FAIL],
'intl402/String/prototype/toLocaleLowerCase/special_casing_Turkish': [FAIL],
......@@ -423,6 +425,22 @@
'built-ins/String/prototype/normalize/return-normalized-string': [SKIP],
'built-ins/String/prototype/normalize/return-normalized-string-from-coerced-form': [SKIP],
'built-ins/String/prototype/normalize/return-normalized-string-using-default-parameter': [SKIP],
# Case-conversion is not fully compliant to the Unicode spec with i18n off.
'built-ins/String/prototype/toLocaleLowerCase/special_casing_conditional': [FAIL],
'built-ins/String/prototype/toLocaleLowerCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toLowerCase/special_casing_conditional': [FAIL],
'built-ins/String/prototype/toLowerCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toLocaleUpperCase/supplementary_plane': [FAIL],
'built-ins/String/prototype/toUpperCase/supplementary_plane': [FAIL],
# Locale-sensitive case-conversion is not available with i18n off.
'intl402/String/prototype/toLocaleLowerCase/special_casing_Azeri': [FAIL],
'intl402/String/prototype/toLocaleLowerCase/special_casing_Lithuanian': [FAIL],
'intl402/String/prototype/toLocaleLowerCase/special_casing_Turkish': [FAIL],
'intl402/String/prototype/toLocaleUpperCase/special_casing_Azeri': [FAIL],
'intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian': [FAIL],
'intl402/String/prototype/toLocaleUpperCase/special_casing_Turkish': [FAIL],
}], # no_i18n == True
['arch == arm or arch == mipsel or arch == mips or arch == arm64 or arch == mips64 or arch == mips64el', {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment