Commit a64ccef7, authored by Toon Verwaest, committed by Commit Bot

[scanner] Separate ascii-in-utf8 length computation from decoding the chars

This way we walk the input string twice, but we reduce the number of branches
per ascii char in the long-ascii-sequence case from 2 per char to ~ 1 + 2 /
sizeof(intptr). Let's land and see what the bots say.

Change-Id: I574971c7df896237f3382be634a9bedc920fc827
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1649356
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62046}
parent faaf4a8a
@@ -607,13 +607,10 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
     size_t max_buffer = max_buffer_end - output_cursor;
     int max_length = static_cast<int>(Min(remaining, max_buffer));
     DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
-    const uint8_t* read_end = cursor + max_length;
-    for (; cursor < read_end; cursor++) {
-      uint8_t c = *cursor;
-      DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
-      if (c > unibrow::Utf8::kMaxOneByteChar) break;
-      *(output_cursor++) = c;
-    }
+    int ascii_length = NonAsciiStart(cursor, max_length);
+    CopyChars(output_cursor, cursor, ascii_length);
+    cursor += ascii_length;
+    output_cursor += ascii_length;
   }
   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment