Commit eb550c6d authored by yangguo@chromium.org

Fix y-umlaut to uppercase.

R=dcarney@chromium.org
BUG=v8:2984

Review URL: https://codereview.chromium.org/59853006

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@17545 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent c44a4d38
......@@ -4111,13 +4111,12 @@ MaybeObject* Heap::LookupSingleCharacterStringFromCode(uint16_t code) {
return result;
}
Object* result;
SeqTwoByteString* result;
{ MaybeObject* maybe_result = AllocateRawTwoByteString(1);
if (!maybe_result->ToObject(&result)) return maybe_result;
if (!maybe_result->To<SeqTwoByteString>(&result)) return maybe_result;
}
String* answer = String::cast(result);
answer->Set(0, code);
return answer;
result->SeqTwoByteStringSet(0, code);
return result;
}
......
......@@ -6192,6 +6192,7 @@ template <class Converter>
MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
Isolate* isolate,
String* s,
String::Encoding result_encoding,
int length,
int input_string_length,
unibrow::Mapping<Converter, 128>* mapping) {
......@@ -6207,7 +6208,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
// might break in the future if we implement more context and locale
// dependent upper/lower conversions.
Object* o;
{ MaybeObject* maybe_o = s->IsOneByteRepresentation()
{ MaybeObject* maybe_o = result_encoding == String::ONE_BYTE_ENCODING
? isolate->heap()->AllocateRawOneByteString(length)
: isolate->heap()->AllocateRawTwoByteString(length);
if (!maybe_o->ToObject(&o)) return maybe_o;
......@@ -6215,6 +6216,8 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
String* result = String::cast(o);
bool has_changed_character = false;
DisallowHeapAllocation no_gc;
// Convert all characters to upper case, assuming that they will fit
// in the buffer
Access<ConsStringIteratorOp> op(
......@@ -6223,6 +6226,10 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
unibrow::uchar chars[Converter::kMaxWidth];
// We can assume that the string is not empty
uc32 current = stream.GetNext();
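// y with umlaut (U+00FF) is the only character that stops fitting into a
// one-byte string when converted to uppercase.
// NOTE(review): U+00B5 (micro sign) uppercases to U+039C, which is also
// outside the one-byte range -- confirm whether it needs the same handling.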
static const uc32 yuml_code = 0xff;
bool ignore_yuml = result->IsSeqTwoByteString() || Converter::kIsToLower;
for (int i = 0; i < length;) {
bool has_next = stream.HasMore();
uc32 next = has_next ? stream.GetNext() : 0;
......@@ -6231,13 +6238,14 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
// The case conversion of this character is the character itself.
result->Set(i, current);
i++;
} else if (char_length == 1) {
} else if (char_length == 1 && (ignore_yuml || current != yuml_code)) {
// Common case: converting the letter resulted in one character.
ASSERT(static_cast<uc32>(chars[0]) != current);
result->Set(i, chars[0]);
has_changed_character = true;
i++;
} else if (length == input_string_length) {
bool found_yuml = (current == yuml_code);
// We've assumed that the result would be as long as the
// input but here is a character that converts to several
// characters. No matter, we calculate the exact length
......@@ -6257,6 +6265,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
int current_length = i + char_length + next_length;
while (stream.HasMore()) {
current = stream.GetNext();
found_yuml |= (current == yuml_code);
// NOTE: we use 0 as the next character here because, while
// the next character may affect what a character converts to,
// it does not in any case affect the length of what it converts
......@@ -6269,8 +6278,10 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(
return Failure::OutOfMemoryException(0x13);
}
}
// Try again with the real length.
return Smi::FromInt(current_length);
// Try again with the real length. Return the length negated if we need
// to allocate a two-byte string for y-umlaut to uppercase.
return (found_yuml && !ignore_yuml) ? Smi::FromInt(-current_length)
: Smi::FromInt(current_length);
} else {
for (int j = 0; j < char_length; j++) {
result->Set(i, chars[j]);
......@@ -6316,121 +6327,107 @@ static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {
}
enum AsciiCaseConversion {
ASCII_TO_LOWER,
ASCII_TO_UPPER
};
template <AsciiCaseConversion dir>
struct FastAsciiConverter {
static bool Convert(char* dst, char* src, int length, bool* changed_out) {
template<class Converter>
static bool FastAsciiConvert(char* dst,
char* src,
int length,
bool* changed_out) {
#ifdef DEBUG
char* saved_dst = dst;
char* saved_src = src;
#endif
// We rely on the distance between upper and lower case letters
// being a known power of 2.
ASSERT('a' - 'A' == (1 << 5));
// Boundaries for the range of input characters that require conversion.
const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;
const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;
bool changed = false;
uintptr_t or_acc = 0;
char* const limit = src + length;
DisallowHeapAllocation no_gc;
// We rely on the distance between upper and lower case letters
// being a known power of 2.
ASSERT('a' - 'A' == (1 << 5));
// Boundaries for the range of input characters that require conversion.
static const char lo = Converter::kIsToLower ? 'A' - 1 : 'a' - 1;
static const char hi = Converter::kIsToLower ? 'Z' + 1 : 'z' + 1;
bool changed = false;
uintptr_t or_acc = 0;
char* const limit = src + length;
#ifdef V8_HOST_CAN_READ_UNALIGNED
// Process the prefix of the input that requires no conversion one
// (machine) word at a time.
while (src <= limit - sizeof(uintptr_t)) {
uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
or_acc |= w;
if (AsciiRangeMask(w, lo, hi) != 0) {
changed = true;
break;
}
*reinterpret_cast<uintptr_t*>(dst) = w;
src += sizeof(uintptr_t);
dst += sizeof(uintptr_t);
}
// Process the remainder of the input performing conversion when
// required one word at a time.
while (src <= limit - sizeof(uintptr_t)) {
uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
or_acc |= w;
uintptr_t m = AsciiRangeMask(w, lo, hi);
// The mask has high (7th) bit set in every byte that needs
// conversion and we know that the distance between cases is
// 1 << 5.
*reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
src += sizeof(uintptr_t);
dst += sizeof(uintptr_t);
}
#endif
// Process the last few bytes of the input (or the whole input if
// unaligned access is not supported).
while (src < limit) {
char c = *src;
or_acc |= c;
if (lo < c && c < hi) {
c ^= (1 << 5);
changed = true;
}
*dst = c;
++src;
++dst;
}
if ((or_acc & kAsciiMask) != 0) {
return false;
// Process the prefix of the input that requires no conversion one
// (machine) word at a time.
while (src <= limit - sizeof(uintptr_t)) {
uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
or_acc |= w;
if (AsciiRangeMask(w, lo, hi) != 0) {
changed = true;
break;
}
#ifdef DEBUG
CheckConvert(saved_dst, saved_src, length, changed);
*reinterpret_cast<uintptr_t*>(dst) = w;
src += sizeof(uintptr_t);
dst += sizeof(uintptr_t);
}
// Process the remainder of the input performing conversion when
// required one word at a time.
while (src <= limit - sizeof(uintptr_t)) {
uintptr_t w = *reinterpret_cast<uintptr_t*>(src);
or_acc |= w;
uintptr_t m = AsciiRangeMask(w, lo, hi);
// The mask has high (7th) bit set in every byte that needs
// conversion and we know that the distance between cases is
// 1 << 5.
*reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);
src += sizeof(uintptr_t);
dst += sizeof(uintptr_t);
}
#endif
*changed_out = changed;
return true;
// Process the last few bytes of the input (or the whole input if
// unaligned access is not supported).
while (src < limit) {
char c = *src;
or_acc |= c;
if (lo < c && c < hi) {
c ^= (1 << 5);
changed = true;
}
*dst = c;
++src;
++dst;
}
if ((or_acc & kAsciiMask) != 0) {
return false;
}
ASSERT(CheckFastAsciiConvert(
saved_dst, saved_src, length, changed, Converter::kIsToLower));
*changed_out = changed;
return true;
}
#ifdef DEBUG
static void CheckConvert(char* dst, char* src, int length, bool changed) {
bool expected_changed = false;
for (int i = 0; i < length; i++) {
if (dst[i] == src[i]) continue;
expected_changed = true;
if (dir == ASCII_TO_LOWER) {
ASSERT('A' <= src[i] && src[i] <= 'Z');
ASSERT(dst[i] == src[i] + ('a' - 'A'));
} else {
ASSERT(dir == ASCII_TO_UPPER);
ASSERT('a' <= src[i] && src[i] <= 'z');
ASSERT(dst[i] == src[i] - ('a' - 'A'));
}
static bool CheckFastAsciiConvert(char* dst,
char* src,
int length,
bool changed,
bool is_to_lower) {
bool expected_changed = false;
for (int i = 0; i < length; i++) {
if (dst[i] == src[i]) continue;
expected_changed = true;
if (is_to_lower) {
ASSERT('A' <= src[i] && src[i] <= 'Z');
ASSERT(dst[i] == src[i] + ('a' - 'A'));
} else {
ASSERT('a' <= src[i] && src[i] <= 'z');
ASSERT(dst[i] == src[i] - ('a' - 'A'));
}
ASSERT(expected_changed == changed);
}
return (expected_changed == changed);
}
#endif
};
struct ToLowerTraits {
typedef unibrow::ToLowercase UnibrowConverter;
typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;
};
struct ToUpperTraits {
typedef unibrow::ToUppercase UnibrowConverter;
typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;
};
} // namespace
template <typename ConvertTraits>
template <class Converter>
MUST_USE_RESULT static MaybeObject* ConvertCase(
Arguments args,
Isolate* isolate,
unibrow::Mapping<typename ConvertTraits::UnibrowConverter, 128>* mapping) {
unibrow::Mapping<Converter, 128>* mapping) {
SealHandleScope shs(isolate);
CONVERT_ARG_CHECKED(String, s, 0);
s = s->TryFlattenGetString();
......@@ -6452,7 +6449,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(
}
SeqOneByteString* result = SeqOneByteString::cast(o);
bool has_changed_character;
bool is_ascii = ConvertTraits::AsciiConverter::Convert(
bool is_ascii = FastAsciiConvert<Converter>(
reinterpret_cast<char*>(result->GetChars()),
reinterpret_cast<char*>(SeqOneByteString::cast(s)->GetChars()),
length,
......@@ -6463,31 +6460,35 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(
}
}
String::Encoding result_encoding = s->IsOneByteRepresentationUnderneath()
? String::ONE_BYTE_ENCODING : String::TWO_BYTE_ENCODING;
Object* answer;
{ MaybeObject* maybe_answer =
ConvertCaseHelper(isolate, s, length, length, mapping);
{ MaybeObject* maybe_answer = ConvertCaseHelper(
isolate, s, result_encoding, length, length, mapping);
if (!maybe_answer->ToObject(&answer)) return maybe_answer;
}
if (answer->IsSmi()) {
// Retry with correct length.
{ MaybeObject* maybe_answer =
ConvertCaseHelper(isolate,
s, Smi::cast(answer)->value(), length, mapping);
if (!maybe_answer->ToObject(&answer)) return maybe_answer;
int new_length = Smi::cast(answer)->value();
if (new_length < 0) {
result_encoding = String::TWO_BYTE_ENCODING;
new_length = -new_length;
}
MaybeObject* maybe_answer = ConvertCaseHelper(
isolate, s, result_encoding, new_length, length, mapping);
if (!maybe_answer->ToObject(&answer)) return maybe_answer;
}
return answer;
}
RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToLowerCase) {
return ConvertCase<ToLowerTraits>(
return ConvertCase(
args, isolate, isolate->runtime_state()->to_lower_mapping());
}
RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToUpperCase) {
return ConvertCase<ToUpperTraits>(
return ConvertCase(
args, isolate, isolate->runtime_state()->to_upper_mapping());
}
......
......@@ -235,6 +235,7 @@ struct ConnectorPunctuation {
};
struct ToLowercase {
static const int kMaxWidth = 3;
static const bool kIsToLower = true;
static int Convert(uchar c,
uchar n,
uchar* result,
......@@ -242,6 +243,7 @@ struct ToLowercase {
};
struct ToUppercase {
static const int kMaxWidth = 3;
static const bool kIsToLower = false;
static int Convert(uchar c,
uchar n,
uchar* result,
......
// Copyright 2013 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Regression test for y-umlaut case conversion (BUG=v8:2984).
// "\xff" is U+00FF (y with diaeresis), a one-byte character; its uppercase
// counterpart "\u0178" (U+0178) does not fit in a one-byte string.
// Single-character input: uppercasing must produce the two-byte character.
assertEquals("\u0178", "\xff".toUpperCase());
// Lowercasing a longer string containing U+0178 must yield "\xff" in place.
// NOTE(review): the "+" concatenation presumably produces a non-flat input
// string -- confirm before simplifying the literals.
assertEquals("abcdefghijklmn\xffopq",
("ABCDEFGHIJKL" + "MN\u0178OPQ").toLowerCase());
// Single-character input: lowercasing U+0178 must give back U+00FF.
assertEquals("\xff", "\u0178".toLowerCase());
// Uppercasing a longer, otherwise-ASCII string that contains U+00FF: every
// letter is uppercased and the y-umlaut becomes U+0178.
assertEquals("ABCDEFGHIJKLMN\u0178OPQ",
("abcdefghijk" + "lmn\xffopq").toUpperCase());
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment