Commit ce538f70 authored by Wiktor Garbacz's avatar Wiktor Garbacz Committed by Commit Bot

[parser] Refactor streaming scanner streams.

Unify, simplify logic, reduce UTF8 specific handling.

Intend of this is also to have stream views.
Stream views can be used concurrently by multiple threads, but
only one thread may fetch new data from the underlying source.
This together with unified stream view creation is intended to be
used for parse tasks.

BUG=v8:6093

Change-Id: Ied8e93090c506d4735080298f0fdaeed32043915
Reviewed-on: https://chromium-review.googlesource.com/501789
Commit-Queue: Wiktor Garbacz <wiktorg@google.com>
Reviewed-by: 's avatarDaniel Vogelheim <vogelheim@chromium.org>
Reviewed-by: 's avatarMarja Hölttä <marja@chromium.org>
Cr-Commit-Position: refs/heads/master@{#45336}
parent e418a1e4
......@@ -1266,11 +1266,6 @@ class V8_EXPORT ScriptCompiler {
* length of the data returned. When the data ends, GetMoreData should
* return 0. Caller takes ownership of the data.
*
* When streaming UTF-8 data, V8 handles multi-byte characters split between
* two data chunks, but doesn't handle multi-byte characters split between
* more than two data chunks. The embedder can avoid this problem by always
* returning at least 2 bytes of data.
*
* If the embedder wants to cancel the streaming, they should make the next
* GetMoreData call return 0. V8 will interpret it as end of data (and most
* probably, parsing will fail). The streaming task will return as soon as
......
......@@ -4,6 +4,8 @@
#include "src/parsing/scanner-character-streams.h"
#include <memory>
#include "include/v8.h"
#include "src/counters.h"
#include "src/globals.h"
......@@ -16,7 +18,8 @@ namespace v8 {
namespace internal {
namespace {
const unibrow::uchar kUtf8Bom = 0xfeff;
constexpr unibrow::uchar kUtf8Bom = 0xfeff;
constexpr uint8_t kUtf8BomBytes[] = {0xef, 0xbb, 0xbf};
} // namespace
// ----------------------------------------------------------------------------
......@@ -30,7 +33,7 @@ class BufferedUtf16CharacterStream : public Utf16CharacterStream {
BufferedUtf16CharacterStream();
protected:
static const size_t kBufferSize = 512;
static constexpr size_t kBufferCharacterSize = 512;
bool ReadBlock() override;
......@@ -40,21 +43,22 @@ class BufferedUtf16CharacterStream : public Utf16CharacterStream {
// Fixed sized buffer that this class reads from.
// The base class' buffer_start_ should always point to buffer_.
uc16 buffer_[kBufferSize];
uc16 buffer_[kBufferCharacterSize];
static constexpr size_t kBufferSizeInBytes =
sizeof(BufferedUtf16CharacterStream::buffer_);
};
BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
: Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
bool BufferedUtf16CharacterStream::ReadBlock() {
DCHECK_EQ(buffer_start_, buffer_);
size_t position = pos();
buffer_pos_ = position;
buffer_cursor_ = buffer_;
buffer_start_ = buffer_cursor_ = buffer_;
buffer_end_ = buffer_ + FillBuffer(position);
DCHECK_EQ(pos(), position);
DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
DCHECK_LE(buffer_end_, buffer_start_ + kBufferCharacterSize);
return buffer_cursor_ < buffer_end_;
}
......@@ -87,7 +91,7 @@ GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
if (from_pos >= length_) return 0;
size_t length = i::Min(kBufferSize, length_ - from_pos);
size_t length = i::Min(kBufferCharacterSize, length_ - from_pos);
String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos),
static_cast<int>(from_pos + length));
return length;
......@@ -179,634 +183,486 @@ ExternalOneByteStringUtf16CharacterStream::
size_t ExternalOneByteStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
if (from_pos >= length_) return 0;
size_t length = Min(kBufferSize, length_ - from_pos);
size_t length = Min(kBufferCharacterSize, length_ - from_pos);
i::CopyCharsUnsigned(buffer_, raw_data_ + from_pos, length);
return length;
}
// ----------------------------------------------------------------------------
// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
//
// This implementation is fairly complex, since data arrives in chunks which
// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
// character position is tricky because the byte position cannot be dericed
// from the character position.
namespace {
class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
public:
Utf8ExternalStreamingStream(
ScriptCompiler::ExternalSourceStream* source_stream,
RuntimeCallStats* stats)
: current_({0, {0, 0, unibrow::Utf8::Utf8IncrementalBuffer(0)}}),
source_stream_(source_stream),
stats_(stats) {}
~Utf8ExternalStreamingStream() override {
for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
// A linked-list container for the chunk data.
// Shared by ChunksView and Chunks (which may execute on different threads).
struct Chunk {
enum Type : uint8_t { ONE_BYTE, TWO_BYTE };
Chunk() : Chunk(ONE_BYTE, 0, nullptr, 0, 0, 0) {}
Chunk(Type type, int first_char_start_offset, const uint8_t* data,
size_t byte_length, size_t char_length, size_t char_pos)
: type(type),
first_char_start_offset(first_char_start_offset),
data(data),
byte_length(byte_length),
char_length(char_length),
char_pos(char_pos) {}
~Chunk() { delete next.Value(); }
static Chunk* CreateOneByte(const uint8_t* data, size_t byte_length,
size_t char_pos, size_t start_offset = 0) {
return new Chunk(ONE_BYTE, static_cast<int>(start_offset), data,
byte_length, byte_length, char_pos);
}
protected:
size_t FillBuffer(size_t position) override;
private:
// A position within the data stream. It stores:
// - The 'physical' position (# of bytes in the stream),
// - the 'logical' position (# of ucs-2 characters, also within the stream),
// - a possibly incomplete utf-8 char at the current 'physical' position.
struct StreamPosition {
size_t bytes;
size_t chars;
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char;
};
// Position contains a StreamPosition and the index of the chunk the position
// points into. (The chunk_no could be derived from pos, but that'd be
// an expensive search through all chunks.)
struct Position {
size_t chunk_no;
StreamPosition pos;
};
// A chunk in the list of chunks, containing:
// - The chunk data (data pointer and length), and
// - the position at the first byte of the chunk.
struct Chunk {
const uint8_t* data;
size_t length;
StreamPosition start;
};
// Within the current chunk, skip forward from current_ towards position.
bool SkipToPosition(size_t position);
// Within the current chunk, fill the buffer_ (while it has capacity).
void FillBufferFromCurrentChunk();
// Fetch a new chunk (assuming current_ is at the end of the current data).
bool FetchChunk();
// Search through the chunks and set current_ to point to the given position.
// (This call is potentially expensive.)
void SearchPosition(size_t position);
std::vector<Chunk> chunks_;
Position current_;
ScriptCompiler::ExternalSourceStream* source_stream_;
RuntimeCallStats* stats_;
};
bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
DCHECK_LE(current_.pos.chars, position); // We can only skip forward.
// Already there? Then return immediately.
if (current_.pos.chars == position) return true;
const Chunk& chunk = chunks_[current_.chunk_no];
DCHECK(current_.pos.bytes >= chunk.start.bytes);
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
chunk.start.incomplete_char;
size_t it = current_.pos.bytes - chunk.start.bytes;
size_t chars = chunk.start.chars;
while (it < chunk.length && chars < position) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
if (t == kUtf8Bom && current_.pos.chars == 0) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t != unibrow::Utf8::kIncomplete) {
chars++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
static Chunk* CreateTwoByte(const uint8_t* data, size_t byte_length,
size_t char_pos, bool odd_start) {
size_t char_length = (odd_start + byte_length) / 2;
return new Chunk(TWO_BYTE, odd_start, data, byte_length, char_length,
char_pos);
}
// Position of the first character which starts in this chunk.
// This is always greater or equal to char_pos, so if no char starts in this
// chunk (i.e. char_length == 0) char_pos+1 is returned.
size_t FirstCharPosition() const {
// There are no split chars for one byte chunks, but there might be some
// start offset
if (type == ONE_BYTE) return char_pos;
return char_pos + (first_char_start_offset != 0);
}
size_t OffsetOf(size_t position) const {
DCHECK_LE(FirstCharPosition(), position);
DCHECK_GE(char_pos + char_length, position);
switch (type) {
case ONE_BYTE:
return position - FirstCharPosition() + first_char_start_offset;
case TWO_BYTE:
return 2 * (position - FirstCharPosition()) + first_char_start_offset;
}
it++;
UNREACHABLE();
}
current_.pos.bytes += it;
current_.pos.chars = chars;
current_.pos.incomplete_char = incomplete_char;
current_.chunk_no += (it == chunk.length);
return current_.pos.chars == position;
}
void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
DCHECK_LT(current_.chunk_no, chunks_.size());
DCHECK_EQ(buffer_start_, buffer_cursor_);
DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
const Chunk& chunk = chunks_[current_.chunk_no];
// The buffer_ is writable, but buffer_*_ members are const. So we get a
// non-const pointer into buffer that points to the same char as buffer_end_.
uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
DCHECK_EQ(cursor, buffer_end_);
// If the current chunk is the last (empty) chunk we'll have to process
// any left-over, partial characters.
if (chunk.length == 0) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
*cursor = static_cast<uc16>(t);
buffer_end_++;
current_.pos.chars++;
size_t CopyToBuffer(uint8_t* buffer, size_t size, size_t offset) const {
switch (type) {
case ONE_BYTE: {
size_t chars_to_copy = i::Min(size / 2, char_length - offset);
i::CopyCharsUnsigned(reinterpret_cast<uc16*>(buffer), &data[offset],
chars_to_copy);
return 2 * chars_to_copy;
}
case TWO_BYTE: {
size_t bytes_to_copy = i::Min(size, byte_length - offset);
i::MemCopy(buffer, &data[offset], bytes_to_copy);
return bytes_to_copy;
}
}
return;
UNREACHABLE();
}
const Type type;
// Offset to the first byte of first character that starts in this chunk.
const int first_char_start_offset;
const std::unique_ptr<const uint8_t[]> data;
const size_t byte_length;
const size_t char_length;
const size_t char_pos;
base::AtomicValue<const Chunk*> next{nullptr};
const Chunk* prev = nullptr;
};
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
current_.pos.incomplete_char;
size_t it;
for (it = current_.pos.bytes - chunk.start.bytes;
it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize; it++) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
if (t == unibrow::Utf8::kIncomplete) continue;
if (V8_LIKELY(t < kUtf8Bom)) {
*(cursor++) = static_cast<uc16>(t); // The by most frequent case.
} else if (t == kUtf8Bom && current_.pos.bytes + it == 2) {
// BOM detected at beginning of the stream. Don't copy it.
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(cursor++) = static_cast<uc16>(t);
// An interface that delivers a sequence of Chunks.
class ChunkSource {
public:
virtual ~ChunkSource() {}
// Fetch and return next chunk.
// Returns nullptr if source is exhausted.
// Should not be called after source is exhausted.
virtual Chunk* GetNextChunk() = 0;
};
// Manages chunks, provides seeking to chunk containing start byte or end byte
// of a character. It cannot fetch new data, but can read data fetched by viewed
// stream (also data fetched after this stream view was created). Multiple
// ChunksViews can operate on same chunks concurrently.
class ChunksView {
public:
ChunksView(const ChunksView& other) = default;
virtual ~ChunksView() {}
// Return the chunk containing the last byte of the char at position.
// Position is in chars not bytes.
// If position is behind the end of the stream, nullptr is returned.
virtual const Chunk* SeekToEndOf(size_t position) {
DCHECK_NOT_NULL(current_);
DCHECK_NOT_NULL(last_seen_);
// We almost always 'stream', meaning we want data from the next chunk
if (V8_LIKELY(position >= last_seen_->char_pos)) {
// Fast-forward to last seen chunk
current_ = last_seen_;
// Continue going forward until position found or out of chunks
while (position >= current_->char_pos + current_->char_length) {
const Chunk* next = current_->next.Value();
if (!next) return nullptr;
current_ = last_seen_ = next;
}
} else if (position >= current_->char_pos) {
// Go forward until position found or out of chunks
while (position >= current_->char_pos + current_->char_length) {
current_ = current_->next.Value();
if (!current_) return nullptr;
}
} else {
*(cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(cursor++) = unibrow::Utf16::TrailSurrogate(t);
// Shortcut jump to first chunk
DCHECK_NOT_NULL(start_.get());
DCHECK_EQ(start_->char_pos, 0);
DCHECK_EQ(start_->byte_length, 0);
const Chunk* first = start_->next.Value();
DCHECK_NOT_NULL(first);
DCHECK_EQ(first->char_pos, 0);
if (position < first->char_pos + first->char_length) {
current_ = first;
}
// Go backwards until position found
while (position < current_->char_pos) {
current_ = current_->prev;
DCHECK_NOT_NULL(current_);
}
}
DCHECK_LE(current_->char_pos, position);
DCHECK_LT(position, current_->char_pos + current_->char_length);
return current_;
}
current_.pos.bytes = chunk.start.bytes + it;
current_.pos.chars += (cursor - buffer_end_);
current_.pos.incomplete_char = incomplete_char;
current_.chunk_no += (it == chunk.length);
buffer_end_ = cursor;
}
bool Utf8ExternalStreamingStream::FetchChunk() {
RuntimeCallTimerScope scope(stats_, &RuntimeCallStats::GetMoreDataCallback);
DCHECK_EQ(current_.chunk_no, chunks_.size());
DCHECK(chunks_.empty() || chunks_.back().length != 0);
const uint8_t* chunk = nullptr;
size_t length = source_stream_->GetMoreData(&chunk);
chunks_.push_back({chunk, length, current_.pos});
return length > 0;
}
void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
// If current_ already points to the right position, we're done.
//
// This is expected to be the common case, since we typically call
// FillBuffer right after the current buffer.
if (current_.pos.chars == position) return;
// No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
if (chunks_.empty()) {
DCHECK_EQ(current_.chunk_no, 0u);
DCHECK_EQ(current_.pos.bytes, 0u);
DCHECK_EQ(current_.pos.chars, 0u);
FetchChunk();
// Return the chunk containing the first byte of the char at position.
// Position is in chars not bytes.
// If position is behind the end of the stream, nullptr is returned.
// This call also guarantees that a whole char is available.
const Chunk* SeekToStartOf(size_t position) {
const Chunk* chunk = SeekToEndOf(position);
if (chunk) {
while (V8_UNLIKELY(position < chunk->FirstCharPosition())) {
chunk = chunk->prev;
DCHECK_NOT_NULL(chunk);
}
current_ = chunk;
}
return chunk;
}
// Search for the last chunk whose start position is less or equal to
// position.
size_t chunk_no = chunks_.size() - 1;
while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
chunk_no--;
// Returns next chunk if there is one.
// This function never blocks/fetches new data.
const Chunk* MoveToNext() {
DCHECK_NOT_NULL(current_);
DCHECK_NOT_NULL(last_seen_);
const Chunk* next = current_->next.Value();
if (next) {
// Stream view may move to a previously unseen chunk
if (current_ == last_seen_) {
last_seen_ = next;
}
current_ = next;
}
return next;
}
// Did we find the terminating (zero-length) chunk? Then we're seeking
// behind the end of the data, and position does not exist.
// Set current_ to point to the terminating chunk.
if (chunks_[chunk_no].length == 0) {
current_ = {chunk_no, chunks_[chunk_no].start};
return;
}
protected:
ChunksView()
: start_(std::make_shared<Chunk>()),
current_(start_.get()),
last_seen_(current_) {}
// Start sentinel shared between stream and stream views
// start_->next is first chunk with real data
const std::shared_ptr<Chunk> start_;
// Chunk we're currently at.
// Seek*, MoveToNext alter/work in respect to this.
const Chunk* current_;
// Last chunk this instance of ChunksView has seen so far.
const Chunk* last_seen_;
};
// Did we find the non-last chunk? Then our position must be within chunk_no.
if (chunk_no + 1 < chunks_.size()) {
// Fancy-pants optimization for ASCII chunks within a utf-8 stream.
// (Many web sites declare utf-8 encoding, but use only (or almost only) the
// ASCII subset for their JavaScript sources. We can exploit this, by
// checking whether the # bytes in a chunk are equal to the # chars, and if
// so avoid the expensive SkipToPosition.)
bool ascii_only_chunk =
chunks_[chunk_no].start.incomplete_char ==
unibrow::Utf8::Utf8IncrementalBuffer(0) &&
(chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
(chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
if (ascii_only_chunk) {
size_t skip = position - chunks_[chunk_no].start.chars;
current_ = {chunk_no,
{chunks_[chunk_no].start.bytes + skip,
chunks_[chunk_no].start.chars + skip,
unibrow::Utf8::Utf8IncrementalBuffer(0)}};
} else {
current_ = {chunk_no, chunks_[chunk_no].start};
SkipToPosition(position);
// A ChunksView subclass that does fetch new data from a ChunkSource if needed.
class Chunks : public ChunksView {
public:
Chunks(std::unique_ptr<ChunkSource> source, RuntimeCallStats* stats)
: source_(std::move(source)),
stats_(stats),
last_added_(start_.get()),
source_exhausted_(false) {}
const Chunk* SeekToEndOf(size_t position) override {
if (!source_exhausted_) {
GetNewDataIfNeeded(position);
}
// Since position was within the chunk, SkipToPosition should have found
// something.
DCHECK_EQ(position, current_.pos.chars);
return;
return ChunksView::SeekToEndOf(position);
}
// What's left: We're in the last, non-terminating chunk. Our position
// may be in the chunk, but it may also be in 'future' chunks, which we'll
// have to obtain.
DCHECK_EQ(chunk_no, chunks_.size() - 1);
current_ = {chunk_no, chunks_[chunk_no].start};
bool have_more_data = true;
bool found = SkipToPosition(position);
while (have_more_data && !found) {
DCHECK_EQ(current_.chunk_no, chunks_.size());
have_more_data = FetchChunk();
found = have_more_data && SkipToPosition(position);
protected:
void GetNewDataIfNeeded(size_t position) {
DCHECK(!source_exhausted_);
DCHECK_NOT_NULL(last_added_);
size_t char_end_pos = last_added_->char_pos + last_added_->char_length;
// Get more data if needed.
if (char_end_pos <= position) {
RuntimeCallTimerScope scope(stats_,
&RuntimeCallStats::GetMoreDataCallback);
while (char_end_pos <= position) {
Chunk* chunk = source_->GetNextChunk();
if (chunk) {
chunk->prev = last_added_;
last_added_->next.SetValue(chunk);
last_added_ = chunk;
char_end_pos += chunk->char_length;
DCHECK_EQ(char_end_pos,
last_added_->char_pos + last_added_->char_length);
} else {
source_exhausted_ = true;
break;
}
}
}
DCHECK_NOT_NULL(last_added_);
last_seen_ = last_added_;
// We either got the chunk we were looking for, or source is exhausted.
DCHECK_IMPLIES(!source_exhausted_, position < char_end_pos);
}
const std::unique_ptr<ChunkSource> source_;
RuntimeCallStats* const stats_;
Chunk* last_added_;
bool source_exhausted_;
};
// We'll return with a postion != the desired position only if we're out
// of data. In that case, we'll point to the terminating chunk.
DCHECK_EQ(found, current_.pos.chars == position);
DCHECK_EQ(have_more_data, chunks_.back().length != 0);
DCHECK_IMPLIES(!found, !have_more_data);
DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
}
size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
buffer_cursor_ = buffer_;
buffer_end_ = buffer_;
SearchPosition(position);
bool out_of_data = current_.chunk_no != chunks_.size() &&
chunks_[current_.chunk_no].length == 0;
if (out_of_data) return 0;
// Fill the buffer, until we have at least one char (or are out of data).
// (The embedder might give us 1-byte blocks within a utf-8 char, so we
// can't guarantee progress with one chunk. Thus we iterate.)
while (!out_of_data && buffer_cursor_ == buffer_end_) {
// At end of current data, but there might be more? Then fetch it.
if (current_.chunk_no == chunks_.size()) {
out_of_data = !FetchChunk();
// ChunkSource for one-byte ExternalSourceStream
class OneByteChunkSource : public ChunkSource {
public:
explicit OneByteChunkSource(ScriptCompiler::ExternalSourceStream* source)
: source_(source) {}
Chunk* GetNextChunk() override {
const uint8_t* data = nullptr;
DCHECK_NOT_NULL(source_);
const size_t byte_len = source_->GetMoreData(&data);
if (V8_UNLIKELY(byte_len == 0)) {
delete[] data;
return nullptr;
}
FillBufferFromCurrentChunk();
size_t char_pos = byte_pos_;
byte_pos_ += byte_len;
return Chunk::CreateOneByte(data, byte_len, char_pos);
}
DCHECK_EQ(current_.pos.chars - position,
static_cast<size_t>(buffer_end_ - buffer_cursor_));
return buffer_end_ - buffer_cursor_;
}
private:
ScriptCompiler::ExternalSourceStream* const source_;
size_t byte_pos_ = 0;
};
// ----------------------------------------------------------------------------
// Chunks - helper for One- + TwoByteExternalStreamingStream
namespace {
// ChunkSource for two-byte ExternalSourceStream
class TwoByteChunkSource : public ChunkSource {
public:
explicit TwoByteChunkSource(ScriptCompiler::ExternalSourceStream* source)
: source_(source) {}
Chunk* GetNextChunk() override {
const uint8_t* data = nullptr;
DCHECK_NOT_NULL(source_);
const size_t byte_len = source_->GetMoreData(&data);
if (V8_UNLIKELY(byte_len == 0)) {
delete[] data;
return nullptr;
}
bool odd_start = (byte_pos_ % 2) == 1;
size_t char_pos = byte_pos_ / 2;
byte_pos_ += byte_len;
return Chunk::CreateTwoByte(data, byte_len, char_pos, odd_start);
}
struct Chunk {
const uint8_t* data;
size_t byte_length;
size_t byte_pos;
private:
ScriptCompiler::ExternalSourceStream* const source_;
size_t byte_pos_ = 0;
};
typedef std::vector<struct Chunk> Chunks;
void DeleteChunks(Chunks& chunks) {
for (size_t i = 0; i < chunks.size(); i++) delete[] chunks[i].data;
}
// ChunkSource that decodes incoming utf8 to two-byte chunks if needed.
// Byte Order Mark at the beginning of the stream is skipped.
// ASCII only chunks are kept as one-byte chunks.
// Performance is optimized for source files with mostly ASCII characters.
class Utf8ChunkSource : public ChunkSource {
public:
explicit Utf8ChunkSource(ScriptCompiler::ExternalSourceStream* source)
: source_(source),
incomplete_char_(unibrow::Utf8::Utf8IncrementalBuffer(0)) {}
// Return the chunk index for the chunk containing position.
// If position is behind the end of the stream, the index of the last,
// zero-length chunk is returned.
size_t FindChunk(Chunks& chunks, ScriptCompiler::ExternalSourceStream* source,
size_t position, RuntimeCallStats* stats) {
size_t end_pos =
chunks.empty() ? 0 : (chunks.back().byte_pos + chunks.back().byte_length);
// Get more data if needed. We usually won't enter the loop body.
bool out_of_data = !chunks.empty() && chunks.back().byte_length == 0;
{
RuntimeCallTimerScope scope(stats, &RuntimeCallStats::GetMoreDataCallback);
while (!out_of_data && end_pos <= position + 1) {
const uint8_t* chunk = nullptr;
size_t len = source->GetMoreData(&chunk);
chunks.push_back({chunk, len, end_pos});
end_pos += len;
out_of_data = (len == 0);
protected:
size_t ComputeAsciiPrefixLength(const uint8_t* data, size_t byte_length) {
if (incomplete_char_ != unibrow::Utf8::Utf8IncrementalBuffer(0)) {
return 0;
}
for (size_t i = 0; i < byte_length; ++i) {
if (data[i] > unibrow::Utf8::kMaxOneByteChar) {
return i;
}
}
return byte_length;
}
// Here, we should always have at least one chunk, and we either have the
// chunk we were looking for, or we're out of data. Also, out_of_data and
// end_pos are current (and designate whether we have exhausted the stream,
// and the length of data received so far, respectively).
DCHECK(!chunks.empty());
DCHECK_EQ(end_pos, chunks.back().byte_pos + chunks.back().byte_length);
DCHECK_EQ(out_of_data, chunks.back().byte_length == 0);
DCHECK(position < end_pos || out_of_data);
// Edge case: position is behind the end of stream: Return the last (length 0)
// chunk to indicate the end of the stream.
if (position >= end_pos) {
DCHECK(out_of_data);
return chunks.size() - 1;
size_t ConvertToUtf16(const uint8_t* data, size_t byte_length,
size_t ascii_prefix_len, const uint8_t** result) {
DCHECK_IMPLIES(ascii_prefix_len != 0,
incomplete_char_ == unibrow::Utf8::Utf8IncrementalBuffer(0));
// We get at most 1 character per byte.
// Only exception is the first byte of chunk which may finish an incomplete
// surrogate pair, hence addition of 1.
uc16* decoded_data = new uc16[byte_length + 1];
i::CopyCharsUnsigned(decoded_data, data, ascii_prefix_len);
size_t decoded_len = ascii_prefix_len;
for (size_t i = ascii_prefix_len; i < byte_length; ++i) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(data[i], &incomplete_char_);
if (t == unibrow::Utf8::kIncomplete) continue;
if (V8_LIKELY(t < kUtf8Bom)) {
decoded_data[decoded_len++] = static_cast<uc16>(t);
} else if (V8_UNLIKELY(is_at_first_char_ && t == kUtf8Bom)) {
// Skip BOM at the beggining of the stream
is_at_first_char_ = false;
} else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
decoded_data[decoded_len++] = static_cast<uc16>(t);
} else {
decoded_data[decoded_len++] = unibrow::Utf16::LeadSurrogate(t);
decoded_data[decoded_len++] = unibrow::Utf16::TrailSurrogate(t);
}
}
*result = reinterpret_cast<uint8_t*>(decoded_data);
return 2 * decoded_len;
}
// We almost always 'stream', meaning we want data from the last chunk, so
// let's look at chunks back-to-front.
size_t chunk_no = chunks.size() - 1;
while (chunks[chunk_no].byte_pos > position) {
DCHECK_NE(chunk_no, 0u);
chunk_no--;
// Skip possible BOM at the beginning of the stream.
// Return number of bytes skipped.
size_t SkipBOM(const uint8_t* data, size_t byte_length) {
if (V8_UNLIKELY(is_at_first_char_ && byte_length >= 3 &&
data[0] == kUtf8BomBytes[0] &&
data[1] == kUtf8BomBytes[1] &&
data[2] == kUtf8BomBytes[2])) {
return 3;
}
return 0;
}
DCHECK_LE(chunks[chunk_no].byte_pos, position);
DCHECK_LT(position, chunks[chunk_no].byte_pos + chunks[chunk_no].byte_length);
return chunk_no;
}
} // anonymous namespace
// ----------------------------------------------------------------------------
// OneByteExternalStreamingStream
//
// A stream of latin-1 encoded, chunked data.
class OneByteExternalStreamingStream : public BufferedUtf16CharacterStream {
public:
explicit OneByteExternalStreamingStream(
ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
: source_(source), stats_(stats) {}
~OneByteExternalStreamingStream() override { DeleteChunks(chunks_); }
protected:
size_t FillBuffer(size_t position) override;
Chunk* GetNextChunk() override {
const uint8_t* data = nullptr;
size_t byte_length = 0;
if (!is_at_end_) {
byte_length = source_->GetMoreData(&data);
}
// end of stream
if (V8_UNLIKELY(byte_length == 0)) {
delete[] data;
is_at_end_ = true;
unibrow::uchar t =
unibrow::Utf8::ValueOfIncrementalFinish(&incomplete_char_);
if (t != unibrow::Utf8::kBufferEmpty) {
DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
uc16* char_data = new uc16[1];
char_data[0] = static_cast<uc16>(t);
return Chunk::CreateTwoByte(reinterpret_cast<uint8_t*>(char_data), 2,
char_pos_++, false);
}
return nullptr;
}
size_t skipped = SkipBOM(data, byte_length);
if (V8_UNLIKELY(skipped == byte_length)) {
is_at_first_char_ = false;
delete[] data;
return GetNextChunk();
}
size_t ascii_prefix_len =
ComputeAsciiPrefixLength(data + skipped, byte_length - skipped);
bool is_ascii_only = ascii_prefix_len == (byte_length - skipped);
if (V8_LIKELY(is_ascii_only)) {
is_at_first_char_ = false;
Chunk* chunk =
Chunk::CreateOneByte(data, byte_length, char_pos_, skipped);
char_pos_ += chunk->char_length;
return chunk;
}
const uint8_t* decoded_data;
size_t decoded_byte_len = ConvertToUtf16(
data + skipped, byte_length - skipped, ascii_prefix_len, &decoded_data);
delete[] data;
if (V8_UNLIKELY(decoded_byte_len == 0)) {
delete[] decoded_data;
return GetNextChunk();
}
is_at_first_char_ = false;
Chunk* chunk =
Chunk::CreateTwoByte(decoded_data, decoded_byte_len, char_pos_, false);
char_pos_ += chunk->char_length;
return chunk;
}
private:
Chunks chunks_;
ScriptCompiler::ExternalSourceStream* source_;
RuntimeCallStats* stats_;
unibrow::Utf8::Utf8IncrementalBuffer incomplete_char_;
bool is_at_first_char_ = true;
bool is_at_end_ = false;
size_t char_pos_ = 0;
};
size_t OneByteExternalStreamingStream::FillBuffer(size_t position) {
const Chunk& chunk = chunks_[FindChunk(chunks_, source_, position, stats_)];
if (chunk.byte_length == 0) return 0;
size_t start_pos = position - chunk.byte_pos;
size_t len = i::Min(kBufferSize, chunk.byte_length - start_pos);
i::CopyCharsUnsigned(buffer_, chunk.data + start_pos, len);
return len;
}
#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
// ----------------------------------------------------------------------------
// TwoByteExternalStreamingStream
//
// A stream of ucs-2 data, delivered in chunks. Chunks may be 'cut' into the
// middle of characters (or even contain only one byte), which adds a bit
// of complexity. This stream avoid all data copying, except for characters
// that cross chunk boundaries.
} // anonymous namespace
class TwoByteExternalStreamingStream : public Utf16CharacterStream {
// A stream of chunked data
class ExternalStreamingStream : public BufferedUtf16CharacterStream {
public:
explicit TwoByteExternalStreamingStream(
ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
~TwoByteExternalStreamingStream() override;
explicit ExternalStreamingStream(std::unique_ptr<ChunkSource> source,
RuntimeCallStats* stats)
: chunks_(new Chunks(std::move(source), stats)) {}
protected:
bool ReadBlock() override;
size_t FillBuffer(size_t position) override;
Chunks chunks_;
ScriptCompiler::ExternalSourceStream* source_;
RuntimeCallStats* stats_;
uc16 one_char_buffer_;
std::unique_ptr<ChunksView> chunks_;
};
TwoByteExternalStreamingStream::TwoByteExternalStreamingStream(
ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
: Utf16CharacterStream(&one_char_buffer_, &one_char_buffer_,
&one_char_buffer_, 0),
source_(source),
stats_(stats),
one_char_buffer_(0) {}
TwoByteExternalStreamingStream::~TwoByteExternalStreamingStream() {
DeleteChunks(chunks_);
}
bool TwoByteExternalStreamingStream::ReadBlock() {
bool ExternalStreamingStream::ReadBlock() {
size_t position = pos();
// Find chunk in which the position belongs
const Chunk* chunk = chunks_->SeekToEndOf(position);
// We'll search for the 2nd byte of our character, to make sure we
// have enough data for at least one character.
size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
// Out of data? Return 0.
if (chunks_[chunk_no].byte_length == 0) {
buffer_cursor_ = buffer_start_;
buffer_end_ = buffer_start_;
// Out of data? Return false.
if (!chunk) {
buffer_cursor_ = buffer_end_ = buffer_start_;
return false;
}
Chunk& current = chunks_[chunk_no];
// Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
// character may be split between the previous and the current chunk.
// If we find such a lonely byte at the beginning of the chunk, we'll use
// one_char_buffer_ to hold the full character.
bool lonely_byte = (chunks_[chunk_no].byte_pos == (2 * position + 1));
if (lonely_byte) {
DCHECK_NE(chunk_no, 0u);
Chunk& previous_chunk = chunks_[chunk_no - 1];
#ifdef V8_TARGET_BIG_ENDIAN
uc16 character = current.data[0] |
previous_chunk.data[previous_chunk.byte_length - 1] << 8;
#else
uc16 character = previous_chunk.data[previous_chunk.byte_length - 1] |
current.data[0] << 8;
#endif
one_char_buffer_ = character;
buffer_pos_ = position;
buffer_start_ = &one_char_buffer_;
buffer_cursor_ = &one_char_buffer_;
buffer_end_ = &one_char_buffer_ + 1;
return true;
bool odd_start = chunk->first_char_start_offset == 1;
if (odd_start || chunk->type != Chunk::TWO_BYTE) {
return BufferedUtf16CharacterStream::ReadBlock();
}
// Common case: character is in current chunk.
DCHECK_LE(current.byte_pos, 2 * position);
DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);
// Determine # of full ucs-2 chars in stream, and whether we started on an odd
// byte boundary.
bool odd_start = (current.byte_pos % 2) == 1;
size_t number_chars = (current.byte_length - odd_start) / 2;
// Point the buffer_*_ members into the current chunk and set buffer_cursor_
// to point to position. Be careful when converting the byte positions (in
// Chunk) to the ucs-2 character positions (in buffer_*_ members).
buffer_start_ = reinterpret_cast<const uint16_t*>(current.data + odd_start);
buffer_end_ = buffer_start_ + number_chars;
buffer_pos_ = (current.byte_pos + odd_start) / 2;
// Aligned access is important on MIPS and ARM.
DCHECK_EQ((reinterpret_cast<uintptr_t>(chunk->data.get()) % 2), 0);
buffer_start_ = reinterpret_cast<const uint16_t*>(chunk->data.get());
buffer_end_ = buffer_start_ + chunk->char_length;
buffer_pos_ = chunk->char_pos;
buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
DCHECK_EQ(position, pos());
return true;
}
#else
// ----------------------------------------------------------------------------
// TwoByteExternalBufferedStream
//
// This class is made specifically to address unaligned access to 16-bit data
// in MIPS and ARM architectures. It replaces class
// TwoByteExternalStreamingStream which in some cases does have unaligned
// accesse to 16-bit data
class TwoByteExternalBufferedStream : public Utf16CharacterStream {
public:
explicit TwoByteExternalBufferedStream(
ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
~TwoByteExternalBufferedStream();
protected:
static const size_t kBufferSize = 512;
bool ReadBlock() override;
// FillBuffer should read up to kBufferSize characters at position and store
// them into buffer_[0..]. It returns the number of characters stored.
size_t FillBuffer(size_t position, size_t chunk_no);
// Fixed sized buffer that this class reads from.
// The base class' buffer_start_ should always point to buffer_.
uc16 buffer_[kBufferSize];
Chunks chunks_;
ScriptCompiler::ExternalSourceStream* source_;
RuntimeCallStats* stats_;
};
TwoByteExternalBufferedStream::TwoByteExternalBufferedStream(
ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
: Utf16CharacterStream(buffer_, buffer_, buffer_, 0),
source_(source),
stats_(stats) {}
TwoByteExternalBufferedStream::~TwoByteExternalBufferedStream() {
DeleteChunks(chunks_);
}
bool TwoByteExternalBufferedStream::ReadBlock() {
size_t position = pos();
// Find chunk in which the position belongs
size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
size_t ExternalStreamingStream::FillBuffer(size_t position) {
size_t copied_bytes = 0;
const Chunk* chunk = chunks_->SeekToStartOf(position);
// Out of data? Return 0.
if (chunks_[chunk_no].byte_length == 0) {
buffer_cursor_ = buffer_start_;
buffer_end_ = buffer_start_;
return false;
}
Chunk& current = chunks_[chunk_no];
bool odd_start = current.byte_pos % 2;
// Common case: character is in current chunk.
DCHECK_LE(current.byte_pos, 2 * position + odd_start);
DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);
// If character starts on odd address copy text in buffer so there is always
// aligned access to characters. This is important on MIPS and ARM
// architectures. Otherwise read characters from memory directly.
if (!odd_start) {
buffer_start_ = reinterpret_cast<const uint16_t*>(current.data);
size_t number_chars = current.byte_length / 2;
buffer_end_ = buffer_start_ + number_chars;
buffer_pos_ = current.byte_pos / 2;
buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
DCHECK_EQ(position, pos());
return true;
} else {
buffer_start_ = buffer_;
buffer_pos_ = position;
buffer_cursor_ = buffer_;
buffer_end_ = buffer_ + FillBuffer(position, chunk_no);
DCHECK_EQ(pos(), position);
DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
return buffer_cursor_ < buffer_end_;
}
}
size_t TwoByteExternalBufferedStream::FillBuffer(size_t position,
size_t chunk_no) {
DCHECK_EQ(chunks_[chunk_no].byte_pos % 2, 1u);
bool odd_start = true;
// Align buffer_pos_ to the size of the buffer.
{
size_t new_pos = position / kBufferSize * kBufferSize;
if (new_pos != position) {
chunk_no = FindChunk(chunks_, source_, 2 * new_pos + 1, stats_);
buffer_pos_ = new_pos;
buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
position = new_pos;
odd_start = chunks_[chunk_no].byte_pos % 2;
}
}
Chunk* current = &chunks_[chunk_no];
// Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
// character may be split between the previous and the current chunk.
// If we find such a lonely byte at the beginning of the chunk, we'll copy
// it to the first byte in buffer_.
size_t totalLength = 0;
bool lonely_byte = (current->byte_pos == (2 * position + 1));
if (lonely_byte) {
DCHECK_NE(chunk_no, 0u);
Chunk& previous_chunk = chunks_[chunk_no - 1];
*reinterpret_cast<uint8_t*>(buffer_) =
previous_chunk.data[previous_chunk.byte_length - 1];
totalLength++;
if (!chunk) {
return 0;
}
// Common case: character is in current chunk.
DCHECK_LE(current->byte_pos, 2 * position + odd_start);
DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);
// Copy characters from current chunk starting from chunk_pos to the end of
// buffer or chunk.
size_t chunk_pos = position - current->byte_pos / 2;
size_t start_offset = odd_start && chunk_pos != 0;
size_t bytes_to_move =
i::Min(2 * kBufferSize - lonely_byte,
current->byte_length - 2 * chunk_pos + start_offset);
i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + lonely_byte,
current->data + 2 * chunk_pos - start_offset, bytes_to_move);
// Fill up the rest of the buffer if there is space and data left.
totalLength += bytes_to_move;
position = (current->byte_pos + current->byte_length) / 2;
if (position - buffer_pos_ < kBufferSize) {
chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
current = &chunks_[chunk_no];
odd_start = current->byte_pos % 2;
bytes_to_move = i::Min(2 * kBufferSize - totalLength, current->byte_length);
while (bytes_to_move) {
// Common case: character is in current chunk.
DCHECK_LE(current->byte_pos, 2 * position + odd_start);
DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);
i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + totalLength,
current->data, bytes_to_move);
totalLength += bytes_to_move;
position = (current->byte_pos + current->byte_length) / 2;
chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
current = &chunks_[chunk_no];
odd_start = current->byte_pos % 2;
bytes_to_move =
i::Min(2 * kBufferSize - totalLength, current->byte_length);
size_t offset = chunk->OffsetOf(position);
while (copied_bytes < kBufferSizeInBytes && chunk) {
// Stop copying if next chunk could be used directly and at least one char
// was copied.
if (chunk->type == Chunk::TWO_BYTE && copied_bytes > 2 &&
chunk->first_char_start_offset == 0) {
break;
}
size_t space_left = kBufferSizeInBytes - copied_bytes;
DCHECK_IMPLIES(chunk->type == Chunk::ONE_BYTE, (copied_bytes % 2) == 0);
copied_bytes += chunk->CopyToBuffer(
reinterpret_cast<uint8_t*>(buffer_) + copied_bytes, space_left, offset);
// chunk after the first one are copied from beginning
offset = 0;
chunk = chunks_->MoveToNext();
}
return totalLength / 2;
return copied_bytes / 2;
}
#endif
// ----------------------------------------------------------------------------
// ScannerStream: Create stream instances.
......@@ -850,20 +706,20 @@ Utf16CharacterStream* ScannerStream::For(
ScriptCompiler::ExternalSourceStream* source_stream,
v8::ScriptCompiler::StreamedSource::Encoding encoding,
RuntimeCallStats* stats) {
std::unique_ptr<ChunkSource> source;
switch (encoding) {
case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
return new TwoByteExternalStreamingStream(source_stream, stats);
#else
return new TwoByteExternalBufferedStream(source_stream, stats);
#endif
source.reset(new TwoByteChunkSource(source_stream));
break;
case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
return new OneByteExternalStreamingStream(source_stream, stats);
source.reset(new OneByteChunkSource(source_stream));
break;
case v8::ScriptCompiler::StreamedSource::UTF8:
return new Utf8ExternalStreamingStream(source_stream, stats);
source.reset(new Utf8ChunkSource(source_stream));
break;
}
UNREACHABLE();
return nullptr;
DCHECK_NOT_NULL(source.get());
return new ExternalStreamingStream(std::move(source), stats);
}
} // namespace internal
......
......@@ -435,6 +435,18 @@ TEST(CharacterStreams) {
TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
}
TEST(Uft8MultipleBOMChunks) {
const char* chunks = "\xef\xbb\xbf\0\xef\xbb\xbf\0\xef\xbb\xbf\0a\0";
const uint16_t unicode[] = {0xFEFF, 0xFEFF, 97};
ChunkSource chunk_source(chunks);
std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
&chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
for (size_t i = 0; i < arraysize(unicode); i++) {
CHECK_EQ(unicode[i], stream->Advance());
}
CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
}
// Regression test for crbug.com/651333. Read invalid utf-8.
TEST(Regress651333) {
const uint8_t bytes[] =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment