Commit ec30cf47 authored by Maya Lekova, committed by Commit Bot

Revert "[utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental"

This reverts commit 73dd9b55.

Reason for revert: Broke telemetry layout tests - https://ci.chromium.org/p/chromium/builders/luci.chromium.try/win7-rel/9936 as can be seen in this roll - https://chromium-review.googlesource.com/c/chromium/src/+/1454259

Original change's description:
> [utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental
> 
> This is 3-4x faster than using the Utf8Decoder. This matters for proper
> parse-time measurements using d8.
> 
> Change-Id: I9870e9fbe400ec022a6eeb20491c80a2a32f8519
> Reviewed-on: https://chromium-review.googlesource.com/c/1451827
> Commit-Queue: Toon Verwaest <verwaest@chromium.org>
> Reviewed-by: Leszek Swirski <leszeks@chromium.org>
> Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#59347}

TBR=ulan@chromium.org,leszeks@chromium.org,verwaest@chromium.org

# Not skipping CQ checks because original CL landed > 1 day ago.

Change-Id: I3f8faebb61c19a41ee496a571228f53c0d5fc8dd
Reviewed-on: https://chromium-review.googlesource.com/c/1454495
Reviewed-by: Maya Lekova <mslekova@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#59378}
parent 85fcaff1
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#include "src/objects/stack-frame-info-inl.h" #include "src/objects/stack-frame-info-inl.h"
#include "src/objects/struct-inl.h" #include "src/objects/struct-inl.h"
#include "src/unicode-cache.h" #include "src/unicode-cache.h"
#include "src/unicode-inl.h" #include "src/unicode-decoder.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -661,38 +661,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string, ...@@ -661,38 +661,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure); return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure);
} }
std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]); // Non-ASCII and we need to decode.
auto non_ascii = string.SubVector(non_ascii_start, length);
const uint8_t* cursor = Access<UnicodeCache::Utf8Decoder> decoder(
reinterpret_cast<const uint8_t*>(&string[non_ascii_start]); isolate()->unicode_cache()->utf8_decoder());
const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end()); decoder->Reset(non_ascii);
uint16_t* output_cursor = buffer.get();
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
*(output_cursor++) = static_cast<uc16>(t);
}
DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start); int utf16_length = static_cast<int>(decoder->Utf16Length());
int utf16_length = static_cast<int>(output_cursor - buffer.get());
DCHECK_GT(utf16_length, 0); DCHECK_GT(utf16_length, 0);
// Allocate string. // Allocate string.
...@@ -701,13 +676,15 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string, ...@@ -701,13 +676,15 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
isolate(), result, isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String); NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);
DCHECK_LE(non_ascii_start + utf16_length, length); // Copy ASCII portion.
DisallowHeapAllocation no_gc; DisallowHeapAllocation no_gc;
uint16_t* data = result->GetChars(no_gc); uint16_t* data = result->GetChars(no_gc);
CopyChars(data, ascii_data, non_ascii_start); for (int i = 0; i < non_ascii_start; i++) {
CopyChars(data + non_ascii_start, buffer.get(), utf16_length); *data++ = *ascii_data++;
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length, non_ascii);
return result; return result;
} }
......
...@@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) { ...@@ -514,38 +514,23 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
unibrow::Utf8::State state = chunk.start.state; unibrow::Utf8::State state = chunk.start.state;
uint32_t incomplete_char = chunk.start.incomplete_char; uint32_t incomplete_char = chunk.start.incomplete_char;
size_t it = current_.pos.bytes - chunk.start.bytes; size_t it = current_.pos.bytes - chunk.start.bytes;
const uint8_t* cursor = &chunk.data[it]; size_t chars = chunk.start.chars;
const uint8_t* end = &chunk.data[chunk.length]; while (it < chunk.length && chars < position) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
size_t chars = current_.pos.chars; chunk.data[it], &it, &state, &incomplete_char);
if (t == kUtf8Bom && current_.pos.chars == 0) {
if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) { // BOM detected at beginning of the stream. Don't copy it.
while (cursor < end) { } else if (t != unibrow::Utf8::kIncomplete) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t == unibrow::Utf8::kIncomplete) continue;
if (t != kUtf8Bom) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
break;
}
}
while (cursor < end && chars < position) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
chars++; chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
} }
} }
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); current_.pos.bytes += it;
current_.pos.chars = chars; current_.pos.chars = chars;
current_.pos.incomplete_char = incomplete_char; current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state; current_.pos.state = state;
current_.chunk_no += (cursor == end); current_.chunk_no += (it == chunk.length);
return current_.pos.chars == position; return current_.pos.chars == position;
} }
...@@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -559,8 +544,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
// The buffer_ is writable, but buffer_*_ members are const. So we get a // The buffer_ is writable, but buffer_*_ members are const. So we get a
// non-const pointer into buffer that points to the same char as buffer_end_. // non-const pointer into buffer that points to the same char as buffer_end_.
uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_); uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(output_cursor, buffer_end_); DCHECK_EQ(cursor, buffer_end_);
unibrow::Utf8::State state = current_.pos.state; unibrow::Utf8::State state = current_.pos.state;
uint32_t incomplete_char = current_.pos.incomplete_char; uint32_t incomplete_char = current_.pos.incomplete_char;
...@@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -571,7 +556,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) { if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK_EQ(t, unibrow::Utf8::kBadChar); DCHECK_EQ(t, unibrow::Utf8::kBadChar);
*output_cursor = static_cast<uc16>(t); *cursor = static_cast<uc16>(t);
buffer_end_++; buffer_end_++;
current_.pos.chars++; current_.pos.chars++;
current_.pos.incomplete_char = 0; current_.pos.incomplete_char = 0;
...@@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { ...@@ -581,50 +566,30 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
} }
size_t it = current_.pos.bytes - chunk.start.bytes; size_t it = current_.pos.bytes - chunk.start.bytes;
const uint8_t* cursor = chunk.data + it; while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
const uint8_t* end = chunk.data + chunk.length; unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
// Deal with possible BOM. if (V8_LIKELY(t < kUtf8Bom)) {
if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) { *(cursor++) = static_cast<uc16>(t); // The by most frequent case.
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (t == kUtf8Bom) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(output_cursor++) = static_cast<uc16>(t);
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
break;
}
}
while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) { } else if (t == unibrow::Utf8::kIncomplete) {
continue; continue;
} else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
} else { } else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t); *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t); *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
} }
} }
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data); current_.pos.bytes = chunk.start.bytes + it;
current_.pos.chars += (output_cursor - buffer_end_); current_.pos.chars += (cursor - buffer_end_);
current_.pos.incomplete_char = incomplete_char; current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state; current_.pos.state = state;
current_.chunk_no += (cursor == end); current_.chunk_no += (it == chunk.length);
buffer_end_ = output_cursor; buffer_end_ = cursor;
} }
bool Utf8ExternalStreamingStream::FetchChunk() { bool Utf8ExternalStreamingStream::FetchChunk() {
......
...@@ -56,53 +56,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, ...@@ -56,53 +56,6 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
} }
} }
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
// when the stream is complete, to ensure incomplete sequences are handled.
uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
Utf8IncrementalBuffer* buffer) {
DCHECK_NOT_NULL(buffer);
State old_state = *state;
byte next = **cursor;
*cursor += 1;
if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
DCHECK_EQ(0u, *buffer);
return static_cast<uchar>(next);
}
// So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
// char in that sequence.
Utf8DfaDecoder::Decode(next, state, buffer);
switch (*state) {
case State::kAccept: {
uchar t = *buffer;
*buffer = 0;
return t;
}
case State::kReject:
*state = State::kAccept;
*buffer = 0;
// If we hit a bad byte, we need to determine if we were trying to start
// a sequence or continue one. If we were trying to start a sequence,
// that means it's just an invalid lead byte and we need to continue to
// the next (which we already did above). If we were already in a
// sequence, we need to reprocess this same byte after resetting to the
// initial state.
if (old_state != State::kAccept) {
// We were trying to continue a sequence, so let's reprocess this byte
// next time.
*cursor -= 1;
}
return kBadChar;
default:
return kIncomplete;
}
}
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
static const int kMask = ~(1 << 6); static const int kMask = ~(1 << 6);
......
...@@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { ...@@ -203,17 +203,62 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
Utf8IncrementalBuffer buffer = 0; Utf8IncrementalBuffer buffer = 0;
uchar t; uchar t;
const byte* start = str; size_t i = 0;
const byte* end = str + max_length;
do { do {
t = ValueOfIncremental(&str, &state, &buffer); t = ValueOfIncremental(str[i], &i, &state, &buffer);
} while (str < end && t == kIncomplete); } while (i < max_length && t == kIncomplete);
*cursor += str - start; *cursor += i;
return (state == State::kAccept) ? t : kBadChar; return (state == State::kAccept) ? t : kBadChar;
} }
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
// when the stream is complete, to ensure incomplete sequences are handled.
uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
Utf8IncrementalBuffer* buffer) {
DCHECK_NOT_NULL(buffer);
State old_state = *state;
*cursor += 1;
if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
DCHECK_EQ(0u, *buffer);
return static_cast<uchar>(next);
}
// So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
// char in that sequence.
Utf8DfaDecoder::Decode(next, state, buffer);
switch (*state) {
case State::kAccept: {
uchar t = *buffer;
*buffer = 0;
return t;
}
case State::kReject:
*state = State::kAccept;
*buffer = 0;
// If we hit a bad byte, we need to determine if we were trying to start
// a sequence or continue one. If we were trying to start a sequence,
// that means it's just an invalid lead byte and we need to continue to
// the next (which we already did above). If we were already in a
// sequence, we need to reprocess this same byte after resetting to the
// initial state.
if (old_state != State::kAccept) {
// We were trying to continue a sequence, so let's reprocess this byte
// next time.
*cursor -= 1;
}
return kBadChar;
default:
return kIncomplete;
}
}
// Finishes the incremental decoding, ensuring that if an unfinished sequence // Finishes the incremental decoding, ensuring that if an unfinished sequence
// is left that it is replaced by a replacement char. // is left that it is replaced by a replacement char.
uchar Utf8::ValueOfIncrementalFinish(State* state) { uchar Utf8::ValueOfIncrementalFinish(State* state) {
......
...@@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 { ...@@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 {
static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor); static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
typedef uint32_t Utf8IncrementalBuffer; typedef uint32_t Utf8IncrementalBuffer;
static inline uchar ValueOfIncremental(const byte** cursor, State* state, static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state,
Utf8IncrementalBuffer* buffer); Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncrementalFinish(State* state); static uchar ValueOfIncrementalFinish(State* state);
// Excludes non-characters from the set of valid code points. // Excludes non-characters from the set of valid code points.
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
// found in the LICENSE file. // found in the LICENSE file.
#include "test/cctest/unicode-helpers.h" #include "test/cctest/unicode-helpers.h"
#include "src/unicode-inl.h"
int Ucs2CharLength(unibrow::uchar c) { int Ucs2CharLength(unibrow::uchar c) {
if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) { if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
...@@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) { ...@@ -20,9 +19,10 @@ int Utf8LengthHelper(const char* s) {
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
int length = 0; int length = 0;
const uint8_t* c = reinterpret_cast<const uint8_t*>(s); size_t i = 0;
while (*c != '\0') { while (s[i] != '\0') {
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer); unibrow::uchar tmp =
unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer);
length += Ucs2CharLength(tmp); length += Ucs2CharLength(tmp);
} }
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state); unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state);
......
...@@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector<byte>& bytes, ...@@ -50,11 +50,9 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) { std::vector<unibrow::uchar>* output) {
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0; unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
const byte* cursor = &bytes[0]; for (size_t i = 0; i < bytes.size();) {
const byte* end = &bytes[bytes.size()];
while (cursor < end) {
unibrow::uchar result = unibrow::uchar result =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer); unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
if (result != unibrow::Utf8::kIncomplete) { if (result != unibrow::Utf8::kIncomplete) {
output->push_back(result); output->push_back(result);
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment