Commit 73dd9b55 authored by Toon Verwaest, committed by Commit Bot

[utf8] Rewrite NewStringFromUtf8 using Utf8::ValueOfIncremental

This is 3-4x faster than using the Utf8Decoder. This matters for proper
parse-time measurements using d8.

Change-Id: I9870e9fbe400ec022a6eeb20491c80a2a32f8519
Reviewed-on: https://chromium-review.googlesource.com/c/1451827
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#59347}
parent d8ca31ab
......@@ -42,7 +42,7 @@
#include "src/objects/stack-frame-info-inl.h"
#include "src/objects/struct-inl.h"
#include "src/unicode-cache.h"
#include "src/unicode-decoder.h"
#include "src/unicode-inl.h"
namespace v8 {
namespace internal {
......@@ -670,13 +670,38 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
return NewStringFromOneByte(Vector<const uint8_t>::cast(string), pretenure);
}
// Non-ASCII and we need to decode.
auto non_ascii = string.SubVector(non_ascii_start, length);
Access<UnicodeCache::Utf8Decoder> decoder(
isolate()->unicode_cache()->utf8_decoder());
decoder->Reset(non_ascii);
std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
const uint8_t* cursor =
reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
uint16_t* output_cursor = buffer.get();
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
int utf16_length = static_cast<int>(decoder->Utf16Length());
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
*(output_cursor++) = static_cast<uc16>(t);
}
DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
int utf16_length = static_cast<int>(output_cursor - buffer.get());
DCHECK_GT(utf16_length, 0);
// Allocate string.
......@@ -685,15 +710,13 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);
// Copy ASCII portion.
DCHECK_LE(non_ascii_start + utf16_length, length);
DisallowHeapAllocation no_gc;
uint16_t* data = result->GetChars(no_gc);
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
}
CopyChars(data, ascii_data, non_ascii_start);
CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length, non_ascii);
return result;
}
......
......@@ -514,23 +514,38 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
unibrow::Utf8::State state = chunk.start.state;
uint32_t incomplete_char = chunk.start.incomplete_char;
size_t it = current_.pos.bytes - chunk.start.bytes;
size_t chars = chunk.start.chars;
while (it < chunk.length && chars < position) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
if (t == kUtf8Bom && current_.pos.chars == 0) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t != unibrow::Utf8::kIncomplete) {
const uint8_t* cursor = &chunk.data[it];
const uint8_t* end = &chunk.data[chunk.length];
size_t chars = current_.pos.chars;
if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t == unibrow::Utf8::kIncomplete) continue;
if (t != kUtf8Bom) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
break;
}
}
while (cursor < end && chars < position) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
}
current_.pos.bytes += it;
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
current_.pos.chars = chars;
current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state;
current_.chunk_no += (it == chunk.length);
current_.chunk_no += (cursor == end);
return current_.pos.chars == position;
}
......@@ -544,8 +559,8 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
// The buffer_ is writable, but buffer_*_ members are const. So we get a
// non-const pointer into buffer that points to the same char as buffer_end_.
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_);
uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(output_cursor, buffer_end_);
unibrow::Utf8::State state = current_.pos.state;
uint32_t incomplete_char = current_.pos.incomplete_char;
......@@ -556,7 +571,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK_EQ(t, unibrow::Utf8::kBadChar);
*cursor = static_cast<uc16>(t);
*output_cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
current_.pos.incomplete_char = 0;
......@@ -566,30 +581,50 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
}
size_t it = current_.pos.bytes - chunk.start.bytes;
while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t); // The by most frequent case.
const uint8_t* cursor = chunk.data + it;
const uint8_t* end = chunk.data + chunk.length;
// Deal with possible BOM.
if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (t == kUtf8Bom) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(output_cursor++) = static_cast<uc16>(t);
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
break;
}
}
while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t < unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
} else {
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
current_.pos.bytes = chunk.start.bytes + it;
current_.pos.chars += (cursor - buffer_end_);
current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
current_.pos.chars += (output_cursor - buffer_end_);
current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state;
current_.chunk_no += (it == chunk.length);
current_.chunk_no += (cursor == end);
buffer_end_ = cursor;
buffer_end_ = output_cursor;
}
bool Utf8ExternalStreamingStream::FetchChunk() {
......
......@@ -56,6 +56,53 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
}
// Incremental UTF-8 decoding step: consumes the byte at |*cursor| and advances
// |*cursor| past it. The byte is read unconditionally, so |*cursor| must point
// at a readable byte when this is called.
//
// Returns:
//   - the byte itself when it is a one-byte character and no multi-byte
//     sequence is in progress,
//   - the accumulated value once a multi-byte sequence is completed,
//   - kBadChar when the decoder rejects the byte (|*state| and |*buffer| are
//     reset; if a sequence was in progress, |*cursor| is moved back so the
//     same byte is decoded again on the next call),
//   - kIncomplete when more bytes are required.
//
// Once the stream is exhausted, callers **must** invoke
// ValueOfIncrementalFinish so that a trailing unfinished sequence is handled.
uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  const State previous_state = *state;
  const byte current = *(*cursor)++;

  // Fast path: a one-byte character seen outside of any multi-byte sequence.
  if (V8_LIKELY(previous_state == State::kAccept &&
                current <= kMaxOneByteChar)) {
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(current);
  }

  // Either the lead byte of a 2/3/4-byte sequence or a continuation byte of
  // one: let the DFA update |state| and |buffer|.
  Utf8DfaDecoder::Decode(current, state, buffer);

  if (*state == State::kAccept) {
    // Sequence finished; hand out the value and clear the accumulator.
    const uchar code_point = *buffer;
    *buffer = 0;
    return code_point;
  }

  if (*state != State::kReject) {
    // Still in the middle of a sequence.
    return kIncomplete;
  }

  // Rejected byte: go back to the initial state before reporting the error.
  *state = State::kAccept;
  *buffer = 0;
  // An invalid lead byte is simply skipped (the cursor has already moved past
  // it). A byte that broke a sequence in progress, however, has to be looked
  // at again from the initial state, so un-consume it.
  if (previous_state != State::kAccept) {
    *cursor -= 1;
  }
  return kBadChar;
}
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
static const int kMask = ~(1 << 6);
......
......@@ -203,62 +203,17 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
Utf8IncrementalBuffer buffer = 0;
uchar t;
size_t i = 0;
const byte* start = str;
const byte* end = str + max_length;
do {
t = ValueOfIncremental(str[i], &i, &state, &buffer);
} while (i < max_length && t == kIncomplete);
t = ValueOfIncremental(&str, &state, &buffer);
} while (str < end && t == kIncomplete);
*cursor += i;
*cursor += str - start;
return (state == State::kAccept) ? t : kBadChar;
}
// Feeds one byte, |next|, into the incremental UTF-8 decoder and bumps the
// caller's index |*cursor| by one to account for it.
//
// The result is the byte itself for a one-byte character seen while no
// sequence is pending, the accumulated value when |next| completes a
// multi-byte sequence, kIncomplete while a sequence is still pending, or
// kBadChar when the decoder rejects |next|. On rejection the decoder state and
// |*buffer| are reset, and if a sequence was pending |*cursor| is decremented
// again so that the caller re-submits the same byte.
//
// This **must** be followed by a call to ValueOfIncrementalFinish when the
// stream is complete, to ensure incomplete sequences are handled.
uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  const State entry_state = *state;
  const bool sequence_pending = entry_state != State::kAccept;
  ++*cursor;

  if (V8_LIKELY(next <= kMaxOneByteChar && !sequence_pending)) {
    // One-byte character outside of any sequence: nothing to accumulate.
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(next);
  }

  // |next| is the lead byte of a 2/3/4-byte sequence or one of its
  // continuation bytes.
  Utf8DfaDecoder::Decode(next, state, buffer);

  if (*state == State::kReject) {
    *state = State::kAccept;
    *buffer = 0;
    // A bad lead byte is skipped (|*cursor| already points past it). A byte
    // that interrupted a pending sequence must be reprocessed from the
    // initial state, so step the index back onto it.
    if (sequence_pending) {
      --*cursor;
    }
    return kBadChar;
  }

  if (*state != State::kAccept) {
    // More bytes are needed to finish the sequence.
    return kIncomplete;
  }

  // Sequence complete: return the value and clear the accumulator.
  const uchar decoded = *buffer;
  *buffer = 0;
  return decoded;
}
// Finishes the incremental decoding, ensuring that if an unfinished sequence
// is left that it is replaced by a replacement char.
uchar Utf8::ValueOfIncrementalFinish(State* state) {
......
......@@ -163,8 +163,8 @@ class V8_EXPORT_PRIVATE Utf8 {
static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor);
typedef uint32_t Utf8IncrementalBuffer;
static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state,
Utf8IncrementalBuffer* buffer);
static inline uchar ValueOfIncremental(const byte** cursor, State* state,
Utf8IncrementalBuffer* buffer);
static uchar ValueOfIncrementalFinish(State* state);
// Excludes non-characters from the set of valid code points.
......
......@@ -3,6 +3,7 @@
// found in the LICENSE file.
#include "test/cctest/unicode-helpers.h"
#include "src/unicode-inl.h"
int Ucs2CharLength(unibrow::uchar c) {
if (c == unibrow::Utf8::kIncomplete || c == unibrow::Utf8::kBufferEmpty) {
......@@ -19,10 +20,9 @@ int Utf8LengthHelper(const char* s) {
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
int length = 0;
size_t i = 0;
while (s[i] != '\0') {
unibrow::uchar tmp =
unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer);
const uint8_t* c = reinterpret_cast<const uint8_t*>(s);
while (*c != '\0') {
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(&c, &state, &buffer);
length += Ucs2CharLength(tmp);
}
unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state);
......
......@@ -50,9 +50,11 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) {
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
for (size_t i = 0; i < bytes.size();) {
const byte* cursor = &bytes[0];
const byte* end = &bytes[bytes.size()];
while (cursor < end) {
unibrow::uchar result =
unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer);
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer);
if (result != unibrow::Utf8::kIncomplete) {
output->push_back(result);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment