Commit 928e7b29, authored by Toon Verwaest, committed by Commit Bot

[scanner] Decode utf8 as chunks come in to utf16, allowing unbuffered streaming

Change-Id: Iaad8bc94e9222d309749491df9a500544b5b37da
Reviewed-on: https://chromium-review.googlesource.com/1158687
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Marja Hölttä <marja@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54877}
parent f57273ae
......@@ -88,8 +88,8 @@ class ExternalStringStream {
template <typename Char>
class ChunkedStream {
public:
explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source,
RuntimeCallStats* stats)
ChunkedStream(ScriptCompiler::ExternalSourceStream* source,
RuntimeCallStats* stats)
: source_(source), stats_(stats) {}
Range<Char> GetDataAt(size_t pos) {
......@@ -100,15 +100,15 @@ class ChunkedStream {
}
~ChunkedStream() {
for (size_t i = 0; i < chunks_.size(); i++) {
delete[] chunks_[i].data;
}
for (Chunk& chunk : chunks_) delete[] chunk.data;
}
static const bool kCanAccessHeap = false;
private:
struct Chunk {
Chunk(const Char* const data, size_t position, size_t length)
: data(data), position(position), length(length) {}
const Char* const data;
// The logical position of data.
const size_t position;
......@@ -117,7 +117,7 @@ class ChunkedStream {
};
Chunk FindChunk(size_t position) {
if (chunks_.empty()) FetchChunk(size_t{0});
while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0});
// Walk forwards while the position is in front of the current chunk.
while (position >= chunks_.back().end_position() &&
......@@ -134,6 +134,14 @@ class ChunkedStream {
UNREACHABLE();
}
virtual void ProcessChunk(const uint8_t* data, size_t position,
size_t length) {
// Incoming data has to be aligned to Char size.
DCHECK_EQ(0, length % sizeof(Char));
chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
length / sizeof(Char));
}
void FetchChunk(size_t position) {
const uint8_t* data = nullptr;
size_t length;
......@@ -142,21 +150,110 @@ class ChunkedStream {
RuntimeCallCounterId::kGetMoreDataCallback);
length = source_->GetMoreData(&data);
}
// Incoming data has to be aligned to Char size.
DCHECK_EQ(0, length % sizeof(Char));
chunks_.push_back(
{reinterpret_cast<const Char*>(data), position, length / sizeof(Char)});
ProcessChunk(data, position, length);
}
std::vector<struct Chunk> chunks_;
ScriptCompiler::ExternalSourceStream* source_;
RuntimeCallStats* stats_;
protected:
std::vector<struct Chunk> chunks_;
};
// A ChunkedStream<uint16_t> whose incoming chunks are utf-8 bytes. Each chunk
// is decoded into utf-16 code units as soon as it arrives, so chunks_ only ever
// holds utf-16 data. The decoder state (state_, incomplete_char_) is carried
// across calls, so a utf-8 sequence may be split over chunk boundaries.
// Char is only used for the size check below; the stored data is always
// uint16_t.
template <typename Char>
class Utf8ChunkedStream : public ChunkedStream<uint16_t> {
public:
Utf8ChunkedStream(ScriptCompiler::ExternalSourceStream* source,
RuntimeCallStats* stats)
: ChunkedStream<uint16_t>(source, stats) {}
STATIC_ASSERT(sizeof(Char) == sizeof(uint16_t));
// Decodes the |length| utf-8 bytes at |data| and appends the resulting utf-16
// code units to chunks_ as one chunk starting at logical position |position|.
// |data| is deleted on every path out of this function.
// - length == 0: flushes the decoder. A pending incomplete sequence becomes a
//   single kBadChar chunk; afterwards an empty chunk (nullptr, length 0) is
//   appended (presumably the end-of-stream marker for FindChunk; confirm
//   against the base class).
// - No complete character in the chunk: only the decoder state is updated and
//   no chunk is appended.
void ProcessChunk(const uint8_t* data, size_t position, size_t length) final {
if (length == 0) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state_);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK_EQ(t, unibrow::Utf8::kBadChar);
incomplete_char_ = 0;
uint16_t* result = new uint16_t[1];
result[0] = unibrow::Utf8::kBadChar;
chunks_.emplace_back(result, position, 1);
position++;
}
chunks_.emplace_back(nullptr, position, 0);
delete[] data;
return;
}
// First count the number of complete characters that can be produced.
// This pass works on copies of the decoder state so that the members are
// still untouched when the second pass decodes the same bytes again.
unibrow::Utf8::State state = state_;
uint32_t incomplete_char = incomplete_char_;
bool seen_bom = seen_bom_;
size_t i = 0;
size_t chars = 0;
while (i < length) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state,
&incomplete_char);
if (!seen_bom && t == kUtf8Bom && position + chars == 0) {
seen_bom = true;
// BOM detected at beginning of the stream. Don't copy it.
} else if (t != unibrow::Utf8::kIncomplete) {
chars++;
// Characters above the non-surrogate range need a surrogate pair.
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
}
}
// Process the data.
// If there aren't any complete characters, update the state without
// producing a chunk.
if (chars == 0) {
state_ = state;
incomplete_char_ = incomplete_char;
seen_bom_ = seen_bom;
delete[] data;
return;
}
// Update the state and produce a chunk with complete characters.
// This pass must emit exactly |chars| code units: the branches below have to
// stay in sync with the counting pass above, or |result| is over- or
// under-filled.
uint16_t* result = new uint16_t[chars];
uint16_t* cursor = result;
i = 0;
while (i < length) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state_,
&incomplete_char_);
if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t);  // By far the most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else if (!seen_bom_ && t == kUtf8Bom && position == 0 &&
cursor == result) {
// BOM detected at beginning of the stream. Don't copy it.
seen_bom_ = true;
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
} else {
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
chunks_.emplace_back(result, position, chars);
delete[] data;
}
private:
// Bits of a utf-8 sequence that started in an earlier chunk and is not yet
// complete.
uint32_t incomplete_char_ = 0;
// Incremental utf-8 decoder state carried from one chunk to the next.
unibrow::Utf8::State state_ = unibrow::Utf8::State::kAccept;
// Set once the leading BOM has been dropped, so that a second BOM that is
// still at logical position 0 is kept.
bool seen_bom_ = false;
};
// Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
// Chars are buffered if either the underlying stream isn't utf-16 or the
// underlying utf-16 stream might move (is on-heap).
template <typename Char, template <typename T> class ByteStream>
template <template <typename T> class ByteStream>
class BufferedCharacterStream : public CharacterStream<uint16_t> {
public:
template <class... TArgs>
......@@ -171,7 +268,7 @@ class BufferedCharacterStream : public CharacterStream<uint16_t> {
buffer_start_ = &buffer_[0];
buffer_cursor_ = buffer_start_;
Range<Char> range = byte_stream_.GetDataAt(position);
Range<uint8_t> range = byte_stream_.GetDataAt(position);
if (range.length() == 0) {
buffer_end_ = buffer_start_;
return false;
......@@ -183,12 +280,12 @@ class BufferedCharacterStream : public CharacterStream<uint16_t> {
return true;
}
bool can_access_heap() final { return ByteStream<uint16_t>::kCanAccessHeap; }
bool can_access_heap() final { return ByteStream<uint8_t>::kCanAccessHeap; }
private:
static const size_t kBufferSize = 512;
uc16 buffer_[kBufferSize];
ByteStream<Char> byte_stream_;
ByteStream<uint8_t> byte_stream_;
};
// Provides an unbuffered utf-16 view on the bytes from the underlying
......@@ -260,328 +357,6 @@ class RelocatingCharacterStream
Isolate* isolate_;
};
// ----------------------------------------------------------------------------
// BufferedUtf16CharacterStreams
//
// A buffered character stream based on a random access character
// source (ReadBlock can be called with pos() pointing to any position,
// even positions before the current).
//
// TODO(verwaest): Remove together with Utf8 external streaming streams.
// Base class for utf-16 character streams that serve characters out of a
// fixed-size internal buffer which subclasses refill through FillBuffer().
class BufferedUtf16CharacterStream : public CharacterStream<uint16_t> {
public:
// Sets the stream up with an empty buffer backed by buffer_.
BufferedUtf16CharacterStream();
protected:
// Capacity of buffer_, in utf-16 code units.
static const size_t kBufferSize = 512;
// Refills buffer_ from the current position by calling FillBuffer(); returns
// whether at least one character is available afterwards.
bool ReadBlock() final;
// FillBuffer should read up to kBufferSize characters at position and store
// them into buffer_[0..]. It returns the number of characters stored.
virtual size_t FillBuffer(size_t position) = 0;
// Fixed sized buffer that this class reads from.
// The base class' buffer_start_ should always point to buffer_.
uc16 buffer_[kBufferSize];
};
// Hands buffer_ to the base class three times plus a zero; presumably these
// are (buffer start, cursor, end, initial position), i.e. an empty buffer at
// position 0 -- confirm against the CharacterStream constructor.
BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
: CharacterStream(buffer_, buffer_, buffer_, 0) {}
// Refills buffer_ starting at the stream's current position.
//
// The buffer is rewound so that its first slot corresponds to pos(), the
// subclass-provided FillBuffer() stores the characters, and buffer_end_ is
// set just past the last character stored. Returns true if at least one
// character is now available.
bool BufferedUtf16CharacterStream::ReadBlock() {
  DCHECK_EQ(buffer_start_, buffer_);

  // Rewind: the buffer now begins at the current logical position.
  const size_t start = pos();
  buffer_pos_ = start;
  buffer_cursor_ = buffer_;

  const size_t stored = FillBuffer(start);
  buffer_end_ = buffer_ + stored;

  // Filling must not move the logical position or overrun the buffer.
  DCHECK_EQ(pos(), start);
  DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);

  return buffer_end_ > buffer_cursor_;
}
// ----------------------------------------------------------------------------
// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
//
// This implementation is fairly complex, since data arrives in chunks which
// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
// character position is tricky because the byte position cannot be derived
// from the character position.
//
// TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
// instead so we don't need to buffer.
// A BufferedUtf16CharacterStream that pulls utf-8 chunks from an
// ExternalSourceStream on demand, keeps every chunk it has fetched, and decodes
// from them into the inherited buffer_ in FillBuffer().
class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
public:
// Starts at chunk 0, byte 0, char 0 with an empty decoder state. No data is
// fetched here.
Utf8ExternalStreamingStream(
ScriptCompiler::ExternalSourceStream* source_stream,
RuntimeCallStats* stats)
: current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
source_stream_(source_stream),
stats_(stats) {}
// Frees the data of every chunk fetched so far.
~Utf8ExternalStreamingStream() final {
for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
}
bool can_access_heap() final { return false; }
protected:
// Decodes characters starting at logical position |position| into buffer_ and
// returns how many were stored (0 once the data is exhausted).
size_t FillBuffer(size_t position) final;
private:
// A position within the data stream. It stores:
// - The 'physical' position (# of bytes in the stream),
// - the 'logical' position (# of ucs-2 characters, also within the stream),
// - a possibly incomplete utf-8 char at the current 'physical' position.
struct StreamPosition {
size_t bytes;
size_t chars;
uint32_t incomplete_char;
unibrow::Utf8::State state;
};
// Position contains a StreamPosition and the index of the chunk the position
// points into. (The chunk_no could be derived from pos, but that'd be
// an expensive search through all chunks.)
struct Position {
size_t chunk_no;
StreamPosition pos;
};
// A chunk in the list of chunks, containing:
// - The chunk data (data pointer and length), and
// - the position at the first byte of the chunk.
struct Chunk {
const uint8_t* data;
size_t length;
StreamPosition start;
};
// Within the current chunk, skip forward from current_ towards position.
bool SkipToPosition(size_t position);
// Within the current chunk, fill the buffer_ (while it has capacity).
void FillBufferFromCurrentChunk();
// Fetch a new chunk (assuming current_ is at the end of the current data).
bool FetchChunk();
// Search through the chunks and set current_ to point to the given position.
// (This call is potentially expensive.)
void SearchPosition(size_t position);
// All chunks fetched so far, in stream order. A chunk of length 0 is the
// terminating chunk.
std::vector<Chunk> chunks_;
// Where decoding will continue; chunk_no == chunks_.size() means the next
// chunk has not been fetched yet.
Position current_;
ScriptCompiler::ExternalSourceStream* source_stream_;
RuntimeCallStats* stats_;
};
// Within the chunk current_ points into, decodes forward from current_ until
// the logical (utf-16 code unit) position |position| is reached or the chunk
// is exhausted, then stores the resulting location in current_.
//
// Returns true iff current_ ends up exactly at |position|. If the chunk ends
// first, current_.chunk_no is advanced to the following chunk and false is
// returned so that the caller can continue there.
//
// Decoding resumes from current_ itself (byte offset, char count and decoder
// state all taken from current_.pos), so the function is correct both when
// current_ sits at the chunk start and when it sits in the middle of a chunk.
bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
  DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.

  // Already there? Then return immediately.
  if (current_.pos.chars == position) return true;

  const Chunk& chunk = chunks_[current_.chunk_no];
  DCHECK(current_.pos.bytes >= chunk.start.bytes);

  // |it| is a byte offset relative to the start of the chunk.
  size_t it = current_.pos.bytes - chunk.start.bytes;
  size_t chars = current_.pos.chars;
  unibrow::Utf8::State state = current_.pos.state;
  uint32_t incomplete_char = current_.pos.incomplete_char;

  while (it < chunk.length && chars < position) {
    unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
        chunk.data[it], &it, &state, &incomplete_char);
    if (t == kUtf8Bom && chunk.start.bytes + it == 3) {
      // BOM detected at beginning of the stream. Don't count it.
      // Only the BOM that occupies the first three bytes of the stream is
      // dropped; this is the same rule FillBufferFromCurrentChunk applies, so
      // the positions computed here agree with the characters it emits.
    } else if (t != unibrow::Utf8::kIncomplete) {
      chars++;
      // Characters outside the BMP occupy two utf-16 code units.
      if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
    }
  }

  // |it| is chunk-relative, so the absolute byte position is the chunk's
  // start plus |it| (adding |it| to the old position would count the bytes
  // before the old position twice).
  current_.pos.bytes = chunk.start.bytes + it;
  current_.pos.chars = chars;
  current_.pos.incomplete_char = incomplete_char;
  current_.pos.state = state;
  // Consumed the whole chunk? Then continue in the next one.
  current_.chunk_no += (it == chunk.length);

  return current_.pos.chars == position;
}
// Decodes utf-8 bytes of the chunk at current_.chunk_no, starting at
// current_.pos, and appends the resulting utf-16 code units to buffer_ at
// buffer_end_. Decoding stops when the chunk is exhausted or when fewer than
// two slots remain in buffer_ (two are needed for a surrogate pair).
// Afterwards current_ and buffer_end_ reflect what was consumed and produced;
// current_.chunk_no moves to the next chunk if this one was fully consumed.
void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
DCHECK_LT(current_.chunk_no, chunks_.size());
DCHECK_EQ(buffer_start_, buffer_cursor_);
DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
const Chunk& chunk = chunks_[current_.chunk_no];
// The buffer_ is writable, but buffer_*_ members are const. So we get a
// non-const pointer into buffer that points to the same char as buffer_end_.
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_);
unibrow::Utf8::State state = current_.pos.state;
uint32_t incomplete_char = current_.pos.incomplete_char;
// If the current chunk is the last (empty) chunk we'll have to process
// any left-over, partial characters.
if (chunk.length == 0) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK_EQ(t, unibrow::Utf8::kBadChar);
*cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
current_.pos.incomplete_char = 0;
current_.pos.state = state;
}
return;
}
// Byte offset of current_ relative to the start of the chunk.
size_t it = current_.pos.bytes - chunk.start.bytes;
while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
chunk.data[it], &it, &state, &incomplete_char);
if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t);  // By far the most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
// NOTE(review): |it| is chunk-relative, so this sum equals the absolute byte
// position only while current_ is at the chunk start -- confirm.
} else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
} else {
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
// Publish the new position; the number of chars produced is how far cursor
// moved past the old buffer_end_.
current_.pos.bytes = chunk.start.bytes + it;
current_.pos.chars += (cursor - buffer_end_);
current_.pos.incomplete_char = incomplete_char;
current_.pos.state = state;
current_.chunk_no += (it == chunk.length);
buffer_end_ = cursor;
}
// Pulls the next chunk from the source stream and appends it to chunks_,
// recording current_.pos as the position of its first byte.
//
// May only be called when current_ is past all chunks fetched so far and the
// terminating (empty) chunk has not been seen yet. Returns false if the
// chunk just fetched is empty, true otherwise.
bool Utf8ExternalStreamingStream::FetchChunk() {
  RuntimeCallTimerScope scope(stats_,
                              RuntimeCallCounterId::kGetMoreDataCallback);
  DCHECK_EQ(current_.chunk_no, chunks_.size());
  DCHECK(chunks_.empty() || chunks_.back().length != 0);

  const uint8_t* bytes = nullptr;
  const size_t byte_count = source_stream_->GetMoreData(&bytes);

  // The empty chunk is stored as well.
  chunks_.push_back({bytes, byte_count, current_.pos});
  return byte_count != 0;
}
// Sets current_ to the logical (utf-16 code unit) position |position|,
// fetching further chunks from the source if the position lies beyond the data
// seen so far. If the stream ends before |position|, current_ is left pointing
// at the terminating (zero-length) chunk instead.
void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
// If current_ already points to the right position, we're done.
//
// This is expected to be the common case, since we typically call
// FillBuffer right after the current buffer.
if (current_.pos.chars == position) return;
// No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
if (chunks_.empty()) {
DCHECK_EQ(current_.chunk_no, 0u);
DCHECK_EQ(current_.pos.bytes, 0u);
DCHECK_EQ(current_.pos.chars, 0u);
FetchChunk();
}
// Search for the last chunk whose start position is less or equal to
// position.
size_t chunk_no = chunks_.size() - 1;
while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
chunk_no--;
}
// Did we find the terminating (zero-length) chunk? Then we're seeking
// behind the end of the data, and position does not exist.
// Set current_ to point to the terminating chunk.
if (chunks_[chunk_no].length == 0) {
current_ = {chunk_no, chunks_[chunk_no].start};
return;
}
// Did we find the non-last chunk? Then our position must be within chunk_no.
if (chunk_no + 1 < chunks_.size()) {
// Fancy-pants optimization for ASCII chunks within a utf-8 stream.
// (Many web sites declare utf-8 encoding, but use only (or almost only) the
// ASCII subset for their JavaScript sources. We can exploit this, by
// checking whether the # bytes in a chunk are equal to the # chars, and if
// so avoid the expensive SkipToPosition.)
bool ascii_only_chunk =
chunks_[chunk_no].start.incomplete_char == 0 &&
(chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
(chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
if (ascii_only_chunk) {
// One byte per char, so the byte offset equals the char offset.
size_t skip = position - chunks_[chunk_no].start.chars;
current_ = {chunk_no,
{chunks_[chunk_no].start.bytes + skip,
chunks_[chunk_no].start.chars + skip, 0,
unibrow::Utf8::State::kAccept}};
} else {
current_ = {chunk_no, chunks_[chunk_no].start};
SkipToPosition(position);
}
// Since position was within the chunk, SkipToPosition should have found
// something.
DCHECK_EQ(position, current_.pos.chars);
return;
}
// What's left: We're in the last, non-terminating chunk. Our position
// may be in the chunk, but it may also be in 'future' chunks, which we'll
// have to obtain.
DCHECK_EQ(chunk_no, chunks_.size() - 1);
current_ = {chunk_no, chunks_[chunk_no].start};
bool have_more_data = true;
bool found = SkipToPosition(position);
// Each failed skip leaves current_ just past the last chunk, so the next
// chunk is fetched and the skip is continued there.
while (have_more_data && !found) {
DCHECK_EQ(current_.chunk_no, chunks_.size());
have_more_data = FetchChunk();
found = have_more_data && SkipToPosition(position);
}
// We'll return with a position != the desired position only if we're out
// of data. In that case, we'll point to the terminating chunk.
DCHECK_EQ(found, current_.pos.chars == position);
DCHECK_EQ(have_more_data, chunks_.back().length != 0);
DCHECK_IMPLIES(!found, !have_more_data);
DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
}
// Decodes characters beginning at logical position |position| into buffer_
// and returns how many were stored.
//
// Returns 0 if current_ ends up on the terminating (empty) chunk with no
// partial character pending. Otherwise chunks are decoded, and fetched as
// needed, until the buffer holds at least one character or the source has
// no more data.
size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
  // Start from an empty buffer.
  buffer_end_ = buffer_;
  buffer_cursor_ = buffer_;

  SearchPosition(position);

  // Sitting on the terminating chunk with nothing left over: no more data.
  if (current_.chunk_no != chunks_.size()) {
    const bool on_terminator = chunks_[current_.chunk_no].length == 0;
    if (on_terminator && current_.pos.incomplete_char == 0) return 0;
  }

  // Keep going until there is at least one char (or the data runs out).
  // A single chunk cannot guarantee progress: the embedder may hand out
  // 1-byte blocks that fall inside a utf-8 character.
  for (bool exhausted = false; !exhausted && buffer_cursor_ == buffer_end_;) {
    // All fetched data consumed, but there might be more? Then fetch it.
    if (current_.chunk_no == chunks_.size()) exhausted = !FetchChunk();
    FillBufferFromCurrentChunk();
  }

  DCHECK_EQ(current_.pos.chars - position,
            static_cast<size_t>(buffer_end_ - buffer_cursor_));
  return buffer_end_ - buffer_cursor_;
}
// ----------------------------------------------------------------------------
// ScannerStream: Create stream instances.
......@@ -605,7 +380,7 @@ ScannerStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
data = String::Flatten(isolate, data);
}
if (data->IsExternalOneByteString()) {
return new BufferedCharacterStream<uint8_t, ExternalStringStream>(
return new BufferedCharacterStream<ExternalStringStream>(
static_cast<size_t>(start_pos),
ExternalOneByteString::cast(*data)->GetChars() + start_offset,
static_cast<size_t>(end_pos));
......@@ -615,7 +390,7 @@ ScannerStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
ExternalTwoByteString::cast(*data)->GetChars() + start_offset,
static_cast<size_t>(end_pos));
} else if (data->IsSeqOneByteString()) {
return new BufferedCharacterStream<uint8_t, OnHeapStream>(
return new BufferedCharacterStream<OnHeapStream>(
static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
start_offset, static_cast<size_t>(end_pos));
} else if (data->IsSeqTwoByteString()) {
......@@ -636,7 +411,7 @@ std::unique_ptr<CharacterStream<uint16_t>> ScannerStream::ForTesting(
std::unique_ptr<CharacterStream<uint16_t>> ScannerStream::ForTesting(
const char* data, size_t length) {
return std::unique_ptr<CharacterStream<uint16_t>>(
new BufferedCharacterStream<uint8_t, ExternalStringStream>(
new BufferedCharacterStream<ExternalStringStream>(
static_cast<size_t>(0), reinterpret_cast<const uint8_t*>(data),
static_cast<size_t>(length)));
}
......@@ -650,10 +425,11 @@ ScannerStream* ScannerStream::For(
return new UnbufferedCharacterStream<ChunkedStream>(
static_cast<size_t>(0), source_stream, stats);
case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
return new BufferedCharacterStream<uint8_t, ChunkedStream>(
static_cast<size_t>(0), source_stream, stats);
return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
source_stream, stats);
case v8::ScriptCompiler::StreamedSource::UTF8:
return new Utf8ExternalStreamingStream(source_stream, stats);
return new UnbufferedCharacterStream<Utf8ChunkedStream>(
static_cast<size_t>(0), source_stream, stats);
}
UNREACHABLE();
}
......
......@@ -188,6 +188,18 @@ TEST(Utf8SplitBOM) {
}
}
TEST(Utf8SplitMultiBOM) {
  // Source data: a split BOM followed by a second split BOM.
  const char* split_boms = "\xef\xbb\0\xbf\xef\xbb\0\xbf\0\0";
  ChunkSource<char> source(split_boms);

  i::ScannerStream* raw_stream = v8::internal::ScannerStream::For(
      &source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr);
  std::unique_ptr<i::ScannerStream> stream(raw_stream);

  // Only the leading BOM may be dropped: exactly one of the two must come
  // back, followed directly by the end of input.
  CHECK_EQ(0xFEFF, stream->Advance());
  CHECK_EQ(i::ScannerStream::kEndOfInput, stream->Advance());
}
TEST(Ucs2AdvanceUntil) {
// Test utf-8 advancing until a certain char.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment