Commit af38272d authored by jshin, committed by Commit bot

Optimize case conversion with icu_case_mapping

Use FastAsciiConvert (as used by Unibrow) for i18n-aware
case conversion with --icu_case_mapping.

Move FastAsciiConvert to src/string-case.cc so that it can be used
by both runtime-{string,i18n}.

Add more tests.

BUG=v8:4477,v8:4476
TEST=intl/general/case*

Review-Url: https://codereview.chromium.org/2533983006
Cr-Commit-Position: refs/heads/master@{#41821}
parent 4c640be1
...@@ -1691,6 +1691,8 @@ v8_source_set("v8_base") { ...@@ -1691,6 +1691,8 @@ v8_source_set("v8_base") {
"src/startup-data-util.h", "src/startup-data-util.h",
"src/string-builder.cc", "src/string-builder.cc",
"src/string-builder.h", "src/string-builder.h",
"src/string-case.cc",
"src/string-case.h",
"src/string-search.h", "src/string-search.h",
"src/string-stream.cc", "src/string-stream.cc",
"src/string-stream.h", "src/string-stream.h",
......
...@@ -2121,27 +2121,16 @@ OverrideFunction(GlobalString.prototype, 'normalize', function() { ...@@ -2121,27 +2121,16 @@ OverrideFunction(GlobalString.prototype, 'normalize', function() {
); );
function ToLowerCaseI18N() { function ToLowerCaseI18N() {
if (!IS_UNDEFINED(new.target)) {
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLowerCase"); CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLowerCase");
var s = TO_STRING(this); return %StringToLowerCaseI18N(TO_STRING(this));
return %StringToLowerCaseI18N(s);
} }
function ToUpperCaseI18N() { function ToUpperCaseI18N() {
if (!IS_UNDEFINED(new.target)) {
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toUpperCase"); CHECK_OBJECT_COERCIBLE(this, "String.prototype.toUpperCase");
var s = TO_STRING(this); return %StringToUpperCaseI18N(TO_STRING(this));
return %StringToUpperCaseI18N(s);
} }
function ToLocaleLowerCaseI18N(locales) { function ToLocaleLowerCaseI18N(locales) {
if (!IS_UNDEFINED(new.target)) {
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleLowerCase"); CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleLowerCase");
return LocaleConvertCase(TO_STRING(this), locales, false); return LocaleConvertCase(TO_STRING(this), locales, false);
} }
...@@ -2149,9 +2138,6 @@ function ToLocaleLowerCaseI18N(locales) { ...@@ -2149,9 +2138,6 @@ function ToLocaleLowerCaseI18N(locales) {
%FunctionSetLength(ToLocaleLowerCaseI18N, 0); %FunctionSetLength(ToLocaleLowerCaseI18N, 0);
function ToLocaleUpperCaseI18N(locales) { function ToLocaleUpperCaseI18N(locales) {
if (!IS_UNDEFINED(new.target)) {
throw %make_type_error(kOrdinaryFunctionCalledAsConstructor);
}
CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleUpperCase"); CHECK_OBJECT_COERCIBLE(this, "String.prototype.toLocaleUpperCase");
return LocaleConvertCase(TO_STRING(this), locales, true); return LocaleConvertCase(TO_STRING(this), locales, true);
} }
......
...@@ -8,13 +8,15 @@ ...@@ -8,13 +8,15 @@
#include <memory> #include <memory>
#include "src/api.h"
#include "src/api-natives.h" #include "src/api-natives.h"
#include "src/api.h"
#include "src/arguments.h" #include "src/arguments.h"
#include "src/factory.h" #include "src/factory.h"
#include "src/i18n.h" #include "src/i18n.h"
#include "src/isolate-inl.h" #include "src/isolate-inl.h"
#include "src/messages.h" #include "src/messages.h"
#include "src/string-case.h"
#include "src/utils.h"
#include "unicode/brkiter.h" #include "unicode/brkiter.h"
#include "unicode/calendar.h" #include "unicode/calendar.h"
...@@ -1041,15 +1043,14 @@ bool ToUpperFastASCII(const Vector<const Char>& src, ...@@ -1041,15 +1043,14 @@ bool ToUpperFastASCII(const Vector<const Char>& src,
const uint16_t sharp_s = 0xDF; const uint16_t sharp_s = 0xDF;
template <typename Char> template <typename Char>
bool ToUpperOneByte(const Vector<const Char>& src, bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
Handle<SeqOneByteString> result, int* sharp_s_count) { int* sharp_s_count) {
// Still pretty-fast path for the input with non-ASCII Latin-1 characters. // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
// There are two special cases. // There are two special cases.
// 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF. // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
// 2. Lower case sharp-S converts to "SS" (two characters) // 2. Lower case sharp-S converts to "SS" (two characters)
*sharp_s_count = 0; *sharp_s_count = 0;
int32_t index = 0;
for (auto it = src.begin(); it != src.end(); ++it) { for (auto it = src.begin(); it != src.end(); ++it) {
uint16_t ch = static_cast<uint16_t>(*it); uint16_t ch = static_cast<uint16_t>(*it);
if (V8_UNLIKELY(ch == sharp_s)) { if (V8_UNLIKELY(ch == sharp_s)) {
...@@ -1061,7 +1062,7 @@ bool ToUpperOneByte(const Vector<const Char>& src, ...@@ -1061,7 +1062,7 @@ bool ToUpperOneByte(const Vector<const Char>& src,
// need to take the 16-bit path. // need to take the 16-bit path.
return false; return false;
} }
result->SeqOneByteStringSet(index++, ToLatin1Upper(ch)); *dest++ = ToLatin1Upper(ch);
} }
return true; return true;
...@@ -1082,6 +1083,16 @@ void ToUpperWithSharpS(const Vector<const Char>& src, ...@@ -1082,6 +1083,16 @@ void ToUpperWithSharpS(const Vector<const Char>& src,
} }
} }
// Scans the first |length| characters of |s| and returns the index of the
// first one that is either an ASCII uppercase letter or lies outside the
// 7-bit ASCII range. Returns |length| when no such character exists.
inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
  int pos = 0;
  while (pos < length) {
    const uint16_t unit = s->Get(pos);
    // Any bit above the low seven means the character is not ASCII.
    if (V8_UNLIKELY(IsASCIIUpper(unit) || unit & ~0x7F)) break;
    ++pos;
  }
  return pos;
}
} // namespace } // namespace
RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
...@@ -1091,60 +1102,65 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { ...@@ -1091,60 +1102,65 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
int length = s->length(); int length = s->length();
s = String::Flatten(s); s = String::Flatten(s);
// First scan the string for uppercase and non-ASCII characters:
if (s->HasOnlyOneByteChars()) {
int first_index_to_lower = length;
for (int index = 0; index < length; ++index) {
// Blink specializes this path for one-byte strings, so it
// does not need to do a generic get, but can do the equivalent
// of SeqOneByteStringGet.
uint16_t ch = s->Get(index);
if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
first_index_to_lower = index;
break;
}
}
if (!s->HasOnlyOneByteChars()) {
// Use a slower implementation for strings with characters beyond U+00FF.
return LocaleConvertCase(s, isolate, false, "");
}
// We depend here on the invariant that the length of a Latin1
// string is invariant under ToLowerCase, and the result always
// fits in the Latin1 range in the *root locale*. It does not hold
// for ToUpperCase even in the root locale.
// Scan the string for uppercase and non-ASCII characters for strings
// shorter than a machine-word without any memory allocation overhead.
// TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
// to two parts, one for scanning the prefix with no change and the other for
// handling ASCII-only characters.
int index_to_first_unprocessed = length;
const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
if (is_short) {
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
// Nothing to do if the string is all ASCII with no uppercase. // Nothing to do if the string is all ASCII with no uppercase.
if (first_index_to_lower == length) return *s; if (index_to_first_unprocessed == length) return *s;
}
// We depend here on the invariant that the length of a Latin1 Handle<SeqOneByteString> result =
// string is invariant under ToLowerCase, and the result always isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
// fits in the Latin1 range in the *root locale*. It does not hold
// for ToUpperCase even in the root locale.
Handle<SeqOneByteString> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewRawOneByteString(length));
DisallowHeapAllocation no_gc; DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent(); String::FlatContent flat = s->GetFlatContent();
if (flat.IsOneByte()) { uint8_t* dest = result->GetChars();
const uint8_t* src = flat.ToOneByteVector().start(); if (flat.IsOneByte()) {
CopyChars(result->GetChars(), src, const uint8_t* src = flat.ToOneByteVector().start();
static_cast<size_t>(first_index_to_lower)); bool has_changed_character = false;
for (int index = first_index_to_lower; index < length; ++index) { index_to_first_unprocessed = FastAsciiConvert<true>(
uint16_t ch = static_cast<uint16_t>(src[index]); reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); length, &has_changed_character);
} // If not ASCII, we keep the result up to index_to_first_unprocessed and
} else { // process the rest.
const uint16_t* src = flat.ToUC16Vector().start(); if (index_to_first_unprocessed == length)
CopyChars(result->GetChars(), src, return has_changed_character ? *result : *s;
static_cast<size_t>(first_index_to_lower));
for (int index = first_index_to_lower; index < length; ++index) { for (int index = index_to_first_unprocessed; index < length; ++index) {
uint16_t ch = src[index]; dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); }
} } else {
if (index_to_first_unprocessed == length) {
DCHECK(!is_short);
index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
}
// Nothing to do if the string is all ASCII with no uppercase.
if (index_to_first_unprocessed == length) return *s;
const uint16_t* src = flat.ToUC16Vector().start();
CopyChars(dest, src, index_to_first_unprocessed);
for (int index = index_to_first_unprocessed; index < length; ++index) {
dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
} }
return *result;
} }
// Blink had an additional case here for ASCII 2-byte strings, but return *result;
// that is subsumed by the above code (assuming there isn't a false
// negative for HasOnlyOneByteChars).
// Do a slower implementation for cases that include non-ASCII characters.
return LocaleConvertCase(s, isolate, false, "");
} }
RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
...@@ -1152,35 +1168,38 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { ...@@ -1152,35 +1168,38 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
DCHECK_EQ(args.length(), 1); DCHECK_EQ(args.length(), 1);
CONVERT_ARG_HANDLE_CHECKED(String, s, 0); CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
// This function could be optimized for no-op cases the way lowercase
// counterpart is, but in empirical testing, few actual calls to upper()
// are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
int32_t length = s->length(); int32_t length = s->length();
s = String::Flatten(s); s = String::Flatten(s);
if (s->HasOnlyOneByteChars()) { if (s->HasOnlyOneByteChars()) {
Handle<SeqOneByteString> result; Handle<SeqOneByteString> result =
ASSIGN_RETURN_FAILURE_ON_EXCEPTION( isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
isolate, result, isolate->factory()->NewRawOneByteString(length));
int sharp_s_count; int sharp_s_count;
bool is_result_single_byte; bool is_result_single_byte;
{ {
DisallowHeapAllocation no_gc; DisallowHeapAllocation no_gc;
String::FlatContent flat = s->GetFlatContent(); String::FlatContent flat = s->GetFlatContent();
// If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII uint8_t* dest = result->GetChars();
// could be removed because ToUpperOneByte is pretty fast now (it
// does not call ICU API any more.).
if (flat.IsOneByte()) { if (flat.IsOneByte()) {
Vector<const uint8_t> src = flat.ToOneByteVector(); Vector<const uint8_t> src = flat.ToOneByteVector();
if (ToUpperFastASCII(src, result)) return *result; bool has_changed_character = false;
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); int index_to_first_unprocessed =
FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
reinterpret_cast<const char*>(src.start()),
length, &has_changed_character);
if (index_to_first_unprocessed == length)
return has_changed_character ? *result : *s;
// If not ASCII, we keep the result up to index_to_first_unprocessed and
// process the rest.
is_result_single_byte =
ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
dest + index_to_first_unprocessed, &sharp_s_count);
} else { } else {
DCHECK(flat.IsTwoByte()); DCHECK(flat.IsTwoByte());
Vector<const uint16_t> src = flat.ToUC16Vector(); Vector<const uint16_t> src = flat.ToUC16Vector();
if (ToUpperFastASCII(src, result)) return *result; if (ToUpperFastASCII(src, result)) return *result;
is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
} }
} }
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "src/arguments.h" #include "src/arguments.h"
#include "src/regexp/jsregexp-inl.h" #include "src/regexp/jsregexp-inl.h"
#include "src/string-builder.h" #include "src/string-builder.h"
#include "src/string-case.h"
#include "src/string-search.h" #include "src/string-search.h"
namespace v8 { namespace v8 {
...@@ -694,122 +695,6 @@ MUST_USE_RESULT static Object* ConvertCaseHelper( ...@@ -694,122 +695,6 @@ MUST_USE_RESULT static Object* ConvertCaseHelper(
} }
} }
// Word with the lowest bit of every byte set (0x0101...01), assuming
// kUintptrAllBitsSet is the all-ones machine word its name describes.
static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
// Word with the highest bit of every byte set; a word sharing any bit with
// this mask contains at least one byte above 0x7F.
static const uintptr_t kAsciiMask = kOneInEveryByte << 7;

// Byte-parallel range test. For every byte b of |w|, the corresponding byte
// of the result is 0x80 when m < b < n (both bounds exclusive) and 0x00
// otherwise.
// Only worthwhile when the call is inlined and |m| and |n| are compile-time
// constants.
// Requires: every byte of |w| and both bounds must be ASCII (at most 0x7F);
// otherwise the per-byte additions/subtractions carry or borrow into
// neighbouring bytes.
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
  // The inequalities are deliberately strict: the degenerate cases admit a
  // simpler formula and are not meant to go through this function.
  DCHECK(0 < m && m < n);
  // Per byte, (0x7F + n) - b has its high bit set exactly when b < n.
  const uintptr_t below_upper = kOneInEveryByte * (0x7F + n) - w;
  // Per byte, b + (0x7F - m) has its high bit set exactly when b > m.
  const uintptr_t above_lower = w + kOneInEveryByte * (0x7F - m);
  // Keep only the high bit of each byte where both conditions hold.
  const uintptr_t high_bits = kOneInEveryByte * 0x80;
  return below_upper & above_lower & high_bits;
}
#ifdef DEBUG
// Debug-only verifier for FastAsciiConvert. Walks |length| bytes of |src| and
// |dst|; every byte that differs must be an ASCII letter of the source case
// whose counterpart in |dst| is the same letter in the target case
// (|is_to_lower| selects the direction). Returns true iff |changed| agrees
// with whether at least one byte actually differs.
static bool CheckFastAsciiConvert(char* dst, const char* src, int length,
                                  bool changed, bool is_to_lower) {
  const int kCaseDelta = 'a' - 'A';
  bool saw_difference = false;
  for (int pos = 0; pos < length; ++pos) {
    const char before = src[pos];
    const char after = dst[pos];
    if (after == before) continue;
    saw_difference = true;
    if (is_to_lower) {
      DCHECK('A' <= before && before <= 'Z');
      DCHECK(after == before + kCaseDelta);
    } else {
      DCHECK('a' <= before && before <= 'z');
      DCHECK(after == before - kCaseDelta);
    }
  }
  return saw_difference == changed;
}
#endif
// Converts the |length| bytes at |src| to lower case (when
// Converter::kIsToLower is true) or upper case (otherwise), writing the
// result to |dst|. Only ASCII letters are handled.
//
// Returns true when every input byte was ASCII; in that case |*changed_out|
// says whether any byte was actually modified. Returns false as soon as the
// whole input has been scanned and a non-ASCII byte was seen; in that case
// the contents of |dst| must be discarded (AsciiRangeMask was applied to
// words violating its ASCII precondition) and |*changed_out| is not written.
//
// |dst| must be aligned to sizeof(uintptr_t).
template <class Converter>
static bool FastAsciiConvert(char* dst, const char* src, int length,
                             bool* changed_out) {
#ifdef DEBUG
  char* saved_dst = dst;
  const char* saved_src = src;
#endif
  DisallowHeapAllocation no_gc;
  // We rely on the distance between upper and lower case letters
  // being a known power of 2.
  DCHECK('a' - 'A' == (1 << 5));
  // Boundaries for the range of input characters that require conversion.
  static const char lo = Converter::kIsToLower ? 'A' - 1 : 'a' - 1;
  static const char hi = Converter::kIsToLower ? 'Z' + 1 : 'z' + 1;
  bool changed = false;
  uintptr_t or_acc = 0;
  const char* const limit = src + length;

  // dst is newly allocated and always aligned.
  DCHECK(IsAligned(reinterpret_cast<intptr_t>(dst), sizeof(uintptr_t)));
  // Only attempt processing one word at a time if src is also aligned.
  if (IsAligned(reinterpret_cast<intptr_t>(src), sizeof(uintptr_t))) {
    // Process the prefix of the input that requires no conversion one aligned
    // (machine) word at a time.
    // The loop tests the number of remaining bytes instead of computing
    // `limit - sizeof(uintptr_t)`: for inputs shorter than a word that
    // expression would point before the start of the buffer, which is
    // undefined pointer arithmetic.
    while (static_cast<size_t>(limit - src) >= sizeof(uintptr_t)) {
      const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
      or_acc |= w;
      if (AsciiRangeMask(w, lo, hi) != 0) {
        changed = true;
        break;
      }
      *reinterpret_cast<uintptr_t*>(dst) = w;
      src += sizeof(uintptr_t);
      dst += sizeof(uintptr_t);
    }
    // Process the remainder of the input performing conversion when
    // required one word at a time.
    while (static_cast<size_t>(limit - src) >= sizeof(uintptr_t)) {
      const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
      or_acc |= w;
      uintptr_t m = AsciiRangeMask(w, lo, hi);
      // The mask has high (7th) bit set in every byte that needs
      // conversion and we know that the distance between cases is
      // 1 << 5.
      *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
      src += sizeof(uintptr_t);
      dst += sizeof(uintptr_t);
    }
  }
  // Process the last few bytes of the input (or the whole input if
  // unaligned access is not supported).
  while (src < limit) {
    char c = *src;
    or_acc |= c;
    if (lo < c && c < hi) {
      c ^= (1 << 5);
      changed = true;
    }
    *dst = c;
    ++src;
    ++dst;
  }

  // Any high bit accumulated from the input means a non-ASCII byte was seen.
  if ((or_acc & kAsciiMask) != 0) return false;

  DCHECK(CheckFastAsciiConvert(saved_dst, saved_src, length, changed,
                               Converter::kIsToLower));

  *changed_out = changed;
  return true;
}
template <class Converter> template <class Converter>
MUST_USE_RESULT static Object* ConvertCase( MUST_USE_RESULT static Object* ConvertCase(
Handle<String> s, Isolate* isolate, Handle<String> s, Isolate* isolate,
...@@ -833,12 +718,13 @@ MUST_USE_RESULT static Object* ConvertCase( ...@@ -833,12 +718,13 @@ MUST_USE_RESULT static Object* ConvertCase(
String::FlatContent flat_content = s->GetFlatContent(); String::FlatContent flat_content = s->GetFlatContent();
DCHECK(flat_content.IsFlat()); DCHECK(flat_content.IsFlat());
bool has_changed_character = false; bool has_changed_character = false;
bool is_ascii = FastAsciiConvert<Converter>( int index_to_first_unprocessed = FastAsciiConvert<Converter::kIsToLower>(
reinterpret_cast<char*>(result->GetChars()), reinterpret_cast<char*>(result->GetChars()),
reinterpret_cast<const char*>(flat_content.ToOneByteVector().start()), reinterpret_cast<const char*>(flat_content.ToOneByteVector().start()),
length, &has_changed_character); length, &has_changed_character);
// If not ASCII, we discard the result and take the 2 byte path. // If not ASCII, we discard the result and take the 2 byte path.
if (is_ascii) return has_changed_character ? *result : *s; if (index_to_first_unprocessed == length)
return has_changed_character ? *result : *s;
} }
Handle<SeqString> result; // Same length as input. Handle<SeqString> result; // Same length as input.
...@@ -872,7 +758,6 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCase) { ...@@ -872,7 +758,6 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping()); return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());
} }
RUNTIME_FUNCTION(Runtime_StringToUpperCase) { RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
HandleScope scope(isolate); HandleScope scope(isolate);
DCHECK_EQ(args.length(), 1); DCHECK_EQ(args.length(), 1);
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define V8_RUNTIME_RUNTIME_UTILS_H_ #define V8_RUNTIME_RUNTIME_UTILS_H_
#include "src/base/logging.h" #include "src/base/logging.h"
#include "src/globals.h"
#include "src/runtime/runtime.h" #include "src/runtime/runtime.h"
namespace v8 { namespace v8 {
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/string-case.h"
#include "src/assert-scope.h"
#include "src/base/logging.h"
#include "src/globals.h"
#include "src/utils.h"
namespace v8 {
namespace internal {
#ifdef DEBUG
// Debug-only verifier for FastAsciiConvert. Walks |length| bytes of |src| and
// |dst|; every byte that differs must be an ASCII letter of the source case
// whose counterpart in |dst| is the same letter in the target case
// (|is_to_lower| selects the direction). Returns true iff |changed| agrees
// with whether at least one byte actually differs.
bool CheckFastAsciiConvert(char* dst, const char* src, int length, bool changed,
                           bool is_to_lower) {
  const int kCaseDelta = 'a' - 'A';
  bool saw_difference = false;
  for (int pos = 0; pos < length; ++pos) {
    const char before = src[pos];
    const char after = dst[pos];
    if (after == before) continue;
    saw_difference = true;
    if (is_to_lower) {
      DCHECK('A' <= before && before <= 'Z');
      DCHECK(after == before + kCaseDelta);
    } else {
      DCHECK('a' <= before && before <= 'z');
      DCHECK(after == before - kCaseDelta);
    }
  }
  return saw_difference == changed;
}
#endif
// Word with the lowest bit of every byte set (0x0101...01), assuming
// kUintptrAllBitsSet is the all-ones machine word its name describes.
const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;
// Word with the highest bit of every byte set; a word sharing any bit with
// this mask contains at least one byte above 0x7F.
const uintptr_t kAsciiMask = kOneInEveryByte << 7;

// Byte-parallel range test. For every byte b of |w|, the corresponding byte
// of the result is 0x80 when m < b < n (both bounds exclusive) and 0x00
// otherwise.
// Only worthwhile when the call is inlined and |m| and |n| are compile-time
// constants.
// Requires: every byte of |w| and both bounds must be ASCII (at most 0x7F);
// otherwise the per-byte additions/subtractions carry or borrow into
// neighbouring bytes.
static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
  // The inequalities are deliberately strict: the degenerate cases admit a
  // simpler formula and are not meant to go through this function.
  DCHECK(0 < m && m < n);
  // Per byte, (0x7F + n) - b has its high bit set exactly when b < n.
  const uintptr_t below_upper = kOneInEveryByte * (0x7F + n) - w;
  // Per byte, b + (0x7F - m) has its high bit set exactly when b > m.
  const uintptr_t above_lower = w + kOneInEveryByte * (0x7F - m);
  // Keep only the high bit of each byte where both conditions hold.
  const uintptr_t high_bits = kOneInEveryByte * 0x80;
  return below_upper & above_lower & high_bits;
}
// Converts the leading ASCII portion of the |length| bytes at |src| to lower
// case (is_lower == true) or upper case (is_lower == false), writing the
// result to |dst|.
//
// Returns |length| when the whole input is ASCII; |*changed_out| is then set
// to whether any byte was modified. Otherwise returns the index of the first
// unprocessed byte: the start of the machine word containing the first
// non-ASCII byte when it is found by the word-at-a-time loops, or the exact
// position of that byte when it is found by the byte-at-a-time loop. In that
// case dst[0, return value) holds valid converted output and |*changed_out|
// is NOT written, so callers must not rely on it.
//
// |dst| must be aligned to sizeof(uintptr_t).
template <bool is_lower>
int FastAsciiConvert(char* dst, const char* src, int length,
                     bool* changed_out) {
#ifdef DEBUG
  char* saved_dst = dst;
#endif
  const char* saved_src = src;
  DisallowHeapAllocation no_gc;
  // We rely on the distance between upper and lower case letters
  // being a known power of 2.
  DCHECK('a' - 'A' == (1 << 5));
  // Boundaries for the range of input characters that require conversion.
  static const char lo = is_lower ? 'A' - 1 : 'a' - 1;
  static const char hi = is_lower ? 'Z' + 1 : 'z' + 1;
  bool changed = false;
  const char* const limit = src + length;

  // dst is newly allocated and always aligned.
  DCHECK(IsAligned(reinterpret_cast<intptr_t>(dst), sizeof(uintptr_t)));
  // Only attempt processing one word at a time if src is also aligned.
  if (IsAligned(reinterpret_cast<intptr_t>(src), sizeof(uintptr_t))) {
    // Process the prefix of the input that requires no conversion one aligned
    // (machine) word at a time.
    // The loop tests the number of remaining bytes instead of computing
    // `limit - sizeof(uintptr_t)`: for inputs shorter than a word that
    // expression would point before the start of the buffer, which is
    // undefined pointer arithmetic.
    while (static_cast<size_t>(limit - src) >= sizeof(uintptr_t)) {
      const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
      // Bail out before AsciiRangeMask, which requires ASCII-only input.
      if ((w & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
      if (AsciiRangeMask(w, lo, hi) != 0) {
        changed = true;
        break;
      }
      *reinterpret_cast<uintptr_t*>(dst) = w;
      src += sizeof(uintptr_t);
      dst += sizeof(uintptr_t);
    }
    // Process the remainder of the input performing conversion when
    // required one word at a time.
    while (static_cast<size_t>(limit - src) >= sizeof(uintptr_t)) {
      const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);
      if ((w & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
      uintptr_t m = AsciiRangeMask(w, lo, hi);
      // The mask has high (7th) bit set in every byte that needs
      // conversion and we know that the distance between cases is
      // 1 << 5.
      *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
      src += sizeof(uintptr_t);
      dst += sizeof(uintptr_t);
    }
  }
  // Process the last few bytes of the input (or the whole input if
  // unaligned access is not supported).
  while (src < limit) {
    char c = *src;
    // A byte above 0x7F has its top bit set whether char is signed or
    // unsigned, so it always intersects kAsciiMask after integer conversion.
    if ((c & kAsciiMask) != 0) return static_cast<int>(src - saved_src);
    if (lo < c && c < hi) {
      c ^= (1 << 5);
      changed = true;
    }
    *dst = c;
    ++src;
    ++dst;
  }

  DCHECK(
      CheckFastAsciiConvert(saved_dst, saved_src, length, changed, is_lower));

  *changed_out = changed;
  return length;
}

// Explicit instantiations: the template is declared in the header but defined
// only here, so both directions must be instantiated in this file.
template int FastAsciiConvert<false>(char* dst, const char* src, int length,
                                     bool* changed_out);
template int FastAsciiConvert<true>(char* dst, const char* src, int length,
                                    bool* changed_out);
} // namespace internal
} // namespace v8
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_STRING_CASE_H_
#define V8_STRING_CASE_H_
namespace v8 {
namespace internal {
// Converts the leading ASCII portion of src[0, length) to lower case
// (is_lower == true) or upper case (is_lower == false) and stores it in dst.
// Returns length if the entire input is ASCII, in which case *changed_out
// tells whether any character was modified. Otherwise returns the index of
// the first unprocessed character (at or before the first non-ASCII byte);
// dst is valid up to that index and *changed_out is left untouched.
// dst is expected to be aligned to sizeof(uintptr_t) (checked in debug
// builds). Defined and explicitly instantiated for both values of is_lower
// in the accompanying .cc file.
template <bool is_lower>
int FastAsciiConvert(char* dst, const char* src, int length, bool* changed_out);
} // namespace internal
} // namespace v8
#endif // V8_STRING_CASE_H_
...@@ -1228,6 +1228,8 @@ ...@@ -1228,6 +1228,8 @@
'startup-data-util.h', 'startup-data-util.h',
'string-builder.cc', 'string-builder.cc',
'string-builder.h', 'string-builder.h',
'string-case.cc',
'string-case.h',
'string-search.h', 'string-search.h',
'string-stream.cc', 'string-stream.cc',
'string-stream.h', 'string-stream.h',
......
...@@ -16,14 +16,33 @@ assertEquals("σς", "\u03A3\u03A3".toLowerCase()); ...@@ -16,14 +16,33 @@ assertEquals("σς", "\u03A3\u03A3".toLowerCase());
// Expand sharp s in latin1 fastpath // Expand sharp s in latin1 fastpath
assertEquals("ASSB", "A\u00DFB".toUpperCase()); assertEquals("ASSB", "A\u00DFB".toUpperCase());
assertEquals("AB", "Ab".toUpperCase()); assertEquals("AB", "Ab".toUpperCase());
// Find first upper case in fastpath // Find first uppercase in fastpath
// Input length < a machine word size
assertEquals("ab", "ab".toLowerCase());
assertEquals("ab", "aB".toLowerCase()); assertEquals("ab", "aB".toLowerCase());
assertEquals("AÜ", "aü".toUpperCase()); assertEquals("AÜ", "aü".toUpperCase());
assertEquals("AÜ", "AÜ".toUpperCase()); assertEquals("AÜ", "AÜ".toUpperCase());
assertEquals("aü", "aü".toLowerCase()); assertEquals("aü", "aü".toLowerCase());
assertEquals("aü", "aÜ".toLowerCase());
assertEquals("aü", "AÜ".toLowerCase()); assertEquals("aü", "AÜ".toLowerCase());
assertEquals("aü", "AÜ".toLowerCase()); assertEquals("aü", "AÜ".toLowerCase());
// Input length >= a machine word size
assertEquals("abcdefghij", "abcdefghij".toLowerCase());
assertEquals("abcdefghij", "abcdefghiJ".toLowerCase());
assertEquals("abçdefghij", "abçdefghiJ".toLowerCase());
assertEquals("abçdefghij", "abÇdefghiJ".toLowerCase());
assertEquals("abcdefghiá", "abcdeFghiá".toLowerCase());
assertEquals("abcdefghiá", "abcdeFghiÁ".toLowerCase());
assertEquals("ABCDEFGHIJ", "ABCDEFGHIJ".toUpperCase());
assertEquals("ABCDEFGHIJ", "ABCDEFGHIj".toUpperCase());
assertEquals("ABÇDEFGHIJ", "ABÇDEFGHIj".toUpperCase());
assertEquals("ABÇDEFGHIJ", "ABçDEFGHIj".toUpperCase());
assertEquals("ABCDEFGHIÁ", "ABCDEfGHIÁ".toUpperCase());
assertEquals("ABCDEFGHIÁ", "ABCDEfGHIá".toUpperCase());
// Starts with fastpath, but switches to full Unicode path // Starts with fastpath, but switches to full Unicode path
// U+00FF is uppercased to U+0178. // U+00FF is uppercased to U+0178.
assertEquals("AŸ", "aÿ".toUpperCase()); assertEquals("AŸ", "aÿ".toUpperCase());
...@@ -33,6 +52,10 @@ assertEquals("AΜ", "aµ".toUpperCase()); ...@@ -33,6 +52,10 @@ assertEquals("AΜ", "aµ".toUpperCase());
// Buffer size increase // Buffer size increase
assertEquals("CSSBẶ", "cßbặ".toUpperCase()); assertEquals("CSSBẶ", "cßbặ".toUpperCase());
assertEquals("FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04".toUpperCase()); assertEquals("FIFLFFIFFL", "\uFB01\uFB02\uFB03\uFB04".toUpperCase());
assertEquals("ABCÀCSSA", "abcàcßa".toUpperCase());
assertEquals("ABCDEFGHIÀCSSA", "ABCDEFGHIàcßa".toUpperCase());
assertEquals("ABCDEFGHIÀCSSA", "abcdeFghiàcßa".toUpperCase());
// OneByte input with buffer size increase: non-fast path // OneByte input with buffer size increase: non-fast path
assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr")); assertEquals("ABCSS", "abCß".toLocaleUpperCase("tr"));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment