[parser] Refactor streaming scanner streams.

Unify, simplify logic, reduce UTF8 specific handling. Intend of this is also to have stream views. Stream views can be used concurrently by multiple threads, but only one thread may fetch new data from the underlying source. This together with unified stream view creation is intended to be used for parse tasks. BUG=v8:6093 Change-Id: Ied8e93090c506d4735080298f0fdaeed32043915 Reviewed-on: https://chromium-review.googlesource.com/501789 Commit-Queue: Wiktor Garbacz <wiktorg@google.com> Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org> Reviewed-by: Marja Hölttä <marja@chromium.org> Cr-Commit-Position: refs/heads/master@{#45336}

[parser] Refactor streaming scanner streams.
Unify, simplify logic, reduce UTF8 specific handling. Intend of this is also to have stream views. Stream views can be used concurrently by multiple threads, but only one thread may fetch new data from the underlying source. This together with unified stream view creation is intended to be used for parse tasks. BUG=v8:6093 Change-Id: Ied8e93090c506d4735080298f0fdaeed32043915 Reviewed-on: https://chromium-review.googlesource.com/501789 Commit-Queue: Wiktor Garbacz <wiktorg@google.com> Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org> Reviewed-by: Marja Hölttä <marja@chromium.org> Cr-Commit-Position: refs/heads/master@{#45336}
ce538f70 · Wiktor Garbacz · Commit Bot · e418a1e4 · ce538f70 · ce538f70
Commit ce538f70 authored Apr 28, 2017 by Wiktor Garbacz Committed by Commit Bot May 16, 2017
Showing with 444 additions and 581 deletions

v8.h include/v8.h +0 -5

scanner-character-streams.cc src/parsing/scanner-character-streams.cc +432 -576

test-scanner-streams.cc test/cctest/parsing/test-scanner-streams.cc +12 -0

No files found.
--- a/include/v8.h
+++ b/include/v8.h
@@ -1266,11 +1266,6 @@ class V8_EXPORT ScriptCompiler {
     * length of the data returned. When the data ends, GetMoreData should
     * return 0. Caller takes ownership of the data.
     *
-     * When streaming UTF-8 data, V8 handles multi-byte characters split between
-     * two data chunks, but doesn't handle multi-byte characters split between
-     * more than two data chunks. The embedder can avoid this problem by always
-     * returning at least 2 bytes of data.
-     *
     * If the embedder wants to cancel the streaming, they should make the next
     * GetMoreData call return 0. V8 will interpret it as end of data (and most
     * probably, parsing will fail). The streaming task will return as soon as

--- a/src/parsing/scanner-character-streams.cc
+++ b/src/parsing/scanner-character-streams.cc
@@ -4,6 +4,8 @@
 #include "src/parsing/scanner-character-streams.h"
+#include <memory>
 #include "include/v8.h"
 #include "src/counters.h"
 #include "src/globals.h"
@@ -16,7 +18,8 @@ namespace v8 {
 namespace internal {
 namespace {
-const unibrow::uchar kUtf8Bom = 0xfeff;
+constexpr unibrow::uchar kUtf8Bom = 0xfeff;
+constexpr uint8_t kUtf8BomBytes[] = {0xef, 0xbb, 0xbf};
 }  // namespace
 // ----------------------------------------------------------------------------
@@ -30,7 +33,7 @@ class BufferedUtf16CharacterStream : public Utf16CharacterStream {
  BufferedUtf16CharacterStream();
 protected:
-  static const size_t kBufferSize = 512;
+  static constexpr size_t kBufferCharacterSize = 512;
  bool ReadBlock() override;
@@ -40,21 +43,22 @@ class BufferedUtf16CharacterStream : public Utf16CharacterStream {
  // Fixed sized buffer that this class reads from.
  // The base class' buffer_start_ should always point to buffer_.
-  uc16 buffer_[kBufferSize];
+  uc16 buffer_[kBufferCharacterSize];
+  static constexpr size_t kBufferSizeInBytes =
+      sizeof(BufferedUtf16CharacterStream::buffer_);
 };
 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
    : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
 bool BufferedUtf16CharacterStream::ReadBlock() {
-  DCHECK_EQ(buffer_start_, buffer_);
  size_t position = pos();
  buffer_pos_ = position;
-  buffer_cursor_ = buffer_;
+  buffer_start_ = buffer_cursor_ = buffer_;
  buffer_end_ = buffer_ + FillBuffer(position);
  DCHECK_EQ(pos(), position);
-  DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
+  DCHECK_LE(buffer_end_, buffer_start_ + kBufferCharacterSize);
  return buffer_cursor_ < buffer_end_;
 }
@@ -87,7 +91,7 @@ GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
  if (from_pos >= length_) return 0;
-  size_t length = i::Min(kBufferSize, length_ - from_pos);
+  size_t length = i::Min(kBufferCharacterSize, length_ - from_pos);
  String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos),
                            static_cast<int>(from_pos + length));
  return length;
@@ -179,634 +183,486 @@ ExternalOneByteStringUtf16CharacterStream::
 size_t ExternalOneByteStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
  if (from_pos >= length_) return 0;
-  size_t length = Min(kBufferSize, length_ - from_pos);
+  size_t length = Min(kBufferCharacterSize, length_ - from_pos);
  i::CopyCharsUnsigned(buffer_, raw_data_ + from_pos, length);
  return length;
 }
-// ----------------------------------------------------------------------------
+namespace {
-// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
-//
-// This implementation is fairly complex, since data arrives in chunks which
-// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
-// character position is tricky because the byte position cannot be dericed
-// from the character position.
-class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
+// A linked-list container for the chunk data.
- public:
+// Shared by ChunksView and Chunks (which may execute on different threads).
-  Utf8ExternalStreamingStream(
+struct Chunk {
-      ScriptCompiler::ExternalSourceStream* source_stream,
+  enum Type : uint8_t { ONE_BYTE, TWO_BYTE };
-      RuntimeCallStats* stats)
+  Chunk() : Chunk(ONE_BYTE, 0, nullptr, 0, 0, 0) {}
-      : current_({0, {0, 0, unibrow::Utf8::Utf8IncrementalBuffer(0)}}),
+  Chunk(Type type, int first_char_start_offset, const uint8_t* data,
-        source_stream_(source_stream),
+        size_t byte_length, size_t char_length, size_t char_pos)
-        stats_(stats) {}
+      : type(type),
-  ~Utf8ExternalStreamingStream() override {
+        first_char_start_offset(first_char_start_offset),
-    for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
+        data(data),
+        byte_length(byte_length),
+        char_length(char_length),
+        char_pos(char_pos) {}
+  ~Chunk() { delete next.Value(); }
+  static Chunk* CreateOneByte(const uint8_t* data, size_t byte_length,
+                              size_t char_pos, size_t start_offset = 0) {
+    return new Chunk(ONE_BYTE, static_cast<int>(start_offset), data,
+                     byte_length, byte_length, char_pos);
  }
+  static Chunk* CreateTwoByte(const uint8_t* data, size_t byte_length,
- protected:
+                              size_t char_pos, bool odd_start) {
-  size_t FillBuffer(size_t position) override;
+    size_t char_length = (odd_start + byte_length) / 2;
+    return new Chunk(TWO_BYTE, odd_start, data, byte_length, char_length,
- private:
+                     char_pos);
-  // A position within the data stream. It stores:
+  }
-  // - The 'physical' position (# of bytes in the stream),
+  // Position of the first character which starts in this chunk.
-  // - the 'logical' position (# of ucs-2 characters, also within the stream),
+  // This is always greater or equal to char_pos, so if no char starts in this
-  // - a possibly incomplete utf-8 char at the current 'physical' position.
+  // chunk (i.e. char_length == 0) char_pos+1 is returned.
-  struct StreamPosition {
+  size_t FirstCharPosition() const {
-    size_t bytes;
+    // There are no split chars for one byte chunks, but there might be some
-    size_t chars;
+    // start offset
-    unibrow::Utf8::Utf8IncrementalBuffer incomplete_char;
+    if (type == ONE_BYTE) return char_pos;
-  };
+    return char_pos + (first_char_start_offset != 0);
+  }
-  // Position contains a StreamPosition and the index of the chunk the position
+  size_t OffsetOf(size_t position) const {
-  // points into. (The chunk_no could be derived from pos, but that'd be
+    DCHECK_LE(FirstCharPosition(), position);
-  // an expensive search through all chunks.)
+    DCHECK_GE(char_pos + char_length, position);
-  struct Position {
+    switch (type) {
-    size_t chunk_no;
+      case ONE_BYTE:
-    StreamPosition pos;
+        return position - FirstCharPosition() + first_char_start_offset;
-  };
+      case TWO_BYTE:
+        return 2 * (position - FirstCharPosition()) + first_char_start_offset;
-  // A chunk in the list of chunks, containing:
-  // - The chunk data (data pointer and length), and
-  // - the position at the first byte of the chunk.
-  struct Chunk {
-    const uint8_t* data;
-    size_t length;
-    StreamPosition start;
-  };
-  // Within the current chunk, skip forward from current_ towards position.
-  bool SkipToPosition(size_t position);
-  // Within the current chunk, fill the buffer_ (while it has capacity).
-  void FillBufferFromCurrentChunk();
-  // Fetch a new chunk (assuming current_ is at the end of the current data).
-  bool FetchChunk();
-  // Search through the chunks and set current_ to point to the given position.
-  // (This call is potentially expensive.)
-  void SearchPosition(size_t position);
-  std::vector<Chunk> chunks_;
-  Position current_;
-  ScriptCompiler::ExternalSourceStream* source_stream_;
-  RuntimeCallStats* stats_;
-};
-bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
-  DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.
-  // Already there? Then return immediately.
-  if (current_.pos.chars == position) return true;
-  const Chunk& chunk = chunks_[current_.chunk_no];
-  DCHECK(current_.pos.bytes >= chunk.start.bytes);
-  unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
-      chunk.start.incomplete_char;
-  size_t it = current_.pos.bytes - chunk.start.bytes;
-  size_t chars = chunk.start.chars;
-  while (it < chunk.length && chars < position) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
-    if (t == kUtf8Bom && current_.pos.chars == 0) {
-      // BOM detected at beginning of the stream. Don't copy it.
-    } else if (t != unibrow::Utf8::kIncomplete) {
-      chars++;
-      if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
    }
-    it++;
+    UNREACHABLE();
  }
+  size_t CopyToBuffer(uint8_t* buffer, size_t size, size_t offset) const {
-  current_.pos.bytes += it;
+    switch (type) {
-  current_.pos.chars = chars;
+      case ONE_BYTE: {
-  current_.pos.incomplete_char = incomplete_char;
+        size_t chars_to_copy = i::Min(size / 2, char_length - offset);
-  current_.chunk_no += (it == chunk.length);
+        i::CopyCharsUnsigned(reinterpret_cast<uc16*>(buffer), &data[offset],
+                             chars_to_copy);
-  return current_.pos.chars == position;
+        return 2 * chars_to_copy;
-}
+      }
+      case TWO_BYTE: {
-void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
+        size_t bytes_to_copy = i::Min(size, byte_length - offset);
-  DCHECK_LT(current_.chunk_no, chunks_.size());
+        i::MemCopy(buffer, &data[offset], bytes_to_copy);
-  DCHECK_EQ(buffer_start_, buffer_cursor_);
+        return bytes_to_copy;
-  DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
+      }
-  const Chunk& chunk = chunks_[current_.chunk_no];
-  // The buffer_ is writable, but buffer_*_ members are const. So we get a
-  // non-const pointer into buffer that points to the same char as buffer_end_.
-  uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
-  DCHECK_EQ(cursor, buffer_end_);
-  // If the current chunk is the last (empty) chunk we'll have to process
-  // any left-over, partial characters.
-  if (chunk.length == 0) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncrementalFinish(&current_.pos.incomplete_char);
-    if (t != unibrow::Utf8::kBufferEmpty) {
-      DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
-      *cursor = static_cast<uc16>(t);
-      buffer_end_++;
-      current_.pos.chars++;
    }
-    return;
+    UNREACHABLE();
  }
+  const Type type;
+  // Offset to the first byte of first character that starts in this chunk.
+  const int first_char_start_offset;
+  const std::unique_ptr<const uint8_t[]> data;
+  const size_t byte_length;
+  const size_t char_length;
+  const size_t char_pos;
+  base::AtomicValue<const Chunk*> next{nullptr};
+  const Chunk* prev = nullptr;
+};
-  unibrow::Utf8::Utf8IncrementalBuffer incomplete_char =
+// An interface that delivers a sequence of Chunks.
-      current_.pos.incomplete_char;
+class ChunkSource {
-  size_t it;
+ public:
-  for (it = current_.pos.bytes - chunk.start.bytes;
+  virtual ~ChunkSource() {}
-       it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize; it++) {
+  // Fetch and return next chunk.
-    unibrow::uchar t =
+  // Returns nullptr if source is exhausted.
-        unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char);
+  // Should not be called after source is exhausted.
-    if (t == unibrow::Utf8::kIncomplete) continue;
+  virtual Chunk* GetNextChunk() = 0;
-    if (V8_LIKELY(t < kUtf8Bom)) {
+};
-      *(cursor++) = static_cast<uc16>(t);  // The by most frequent case.
-    } else if (t == kUtf8Bom && current_.pos.bytes + it == 2) {
+// Manages chunks, provides seeking to chunk containing start byte or end byte
-      // BOM detected at beginning of the stream. Don't copy it.
+// of a character. It cannot fetch new data, but can read data fetched by viewed
-    } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+// stream (also data fetched after this stream view was created). Multiple
-      *(cursor++) = static_cast<uc16>(t);
+// ChunksViews can operate on same chunks concurrently.
+class ChunksView {
+ public:
+  ChunksView(const ChunksView& other) = default;
+  virtual ~ChunksView() {}
+  // Return the chunk containing the last byte of the char at position.
+  // Position is in chars not bytes.
+  // If position is behind the end of the stream, nullptr is returned.
+  virtual const Chunk* SeekToEndOf(size_t position) {
+    DCHECK_NOT_NULL(current_);
+    DCHECK_NOT_NULL(last_seen_);
+    // We almost always 'stream', meaning we want data from the next chunk
+    if (V8_LIKELY(position >= last_seen_->char_pos)) {
+      // Fast-forward to last seen chunk
+      current_ = last_seen_;
+      // Continue going forward until position found or out of chunks
+      while (position >= current_->char_pos + current_->char_length) {
+        const Chunk* next = current_->next.Value();
+        if (!next) return nullptr;
+        current_ = last_seen_ = next;
+      }
+    } else if (position >= current_->char_pos) {
+      // Go forward until position found or out of chunks
+      while (position >= current_->char_pos + current_->char_length) {
+        current_ = current_->next.Value();
+        if (!current_) return nullptr;
+      }
    } else {
-      *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
+      // Shortcut jump to first chunk
-      *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
+      DCHECK_NOT_NULL(start_.get());
+      DCHECK_EQ(start_->char_pos, 0);
+      DCHECK_EQ(start_->byte_length, 0);
+      const Chunk* first = start_->next.Value();
+      DCHECK_NOT_NULL(first);
+      DCHECK_EQ(first->char_pos, 0);
+      if (position < first->char_pos + first->char_length) {
+        current_ = first;
+      }
+      // Go backwards until position found
+      while (position < current_->char_pos) {
+        current_ = current_->prev;
+        DCHECK_NOT_NULL(current_);
+      }
    }
+    DCHECK_LE(current_->char_pos, position);
+    DCHECK_LT(position, current_->char_pos + current_->char_length);
+    return current_;
  }
-  current_.pos.bytes = chunk.start.bytes + it;
+  // Return the chunk containing the first byte of the char at position.
-  current_.pos.chars += (cursor - buffer_end_);
+  // Position is in chars not bytes.
-  current_.pos.incomplete_char = incomplete_char;
+  // If position is behind the end of the stream, nullptr is returned.
-  current_.chunk_no += (it == chunk.length);
+  // This call also guarantees that a whole char is available.
+  const Chunk* SeekToStartOf(size_t position) {
-  buffer_end_ = cursor;
+    const Chunk* chunk = SeekToEndOf(position);
-}
+    if (chunk) {
+      while (V8_UNLIKELY(position < chunk->FirstCharPosition())) {
-bool Utf8ExternalStreamingStream::FetchChunk() {
+        chunk = chunk->prev;
-  RuntimeCallTimerScope scope(stats_, &RuntimeCallStats::GetMoreDataCallback);
+        DCHECK_NOT_NULL(chunk);
-  DCHECK_EQ(current_.chunk_no, chunks_.size());
+      }
-  DCHECK(chunks_.empty() || chunks_.back().length != 0);
+      current_ = chunk;
+    }
-  const uint8_t* chunk = nullptr;
+    return chunk;
-  size_t length = source_stream_->GetMoreData(&chunk);
-  chunks_.push_back({chunk, length, current_.pos});
-  return length > 0;
-}
-void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
-  // If current_ already points to the right position, we're done.
-  //
-  // This is expected to be the common case, since we typically call
-  // FillBuffer right after the current buffer.
-  if (current_.pos.chars == position) return;
-  // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
-  if (chunks_.empty()) {
-    DCHECK_EQ(current_.chunk_no, 0u);
-    DCHECK_EQ(current_.pos.bytes, 0u);
-    DCHECK_EQ(current_.pos.chars, 0u);
-    FetchChunk();
  }
-  // Search for the last chunk whose start position is less or equal to
+  // Returns next chunk if there is one.
-  // position.
+  // This function never blocks/fetches new data.
-  size_t chunk_no = chunks_.size() - 1;
+  const Chunk* MoveToNext() {
-  while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
+    DCHECK_NOT_NULL(current_);
-    chunk_no--;
+    DCHECK_NOT_NULL(last_seen_);
+    const Chunk* next = current_->next.Value();
+    if (next) {
+      // Stream view may move to a previously unseen chunk
+      if (current_ == last_seen_) {
+        last_seen_ = next;
+      }
+      current_ = next;
+    }
+    return next;
  }
-  // Did we find the terminating (zero-length) chunk? Then we're seeking
+ protected:
-  // behind the end of the data, and position does not exist.
+  ChunksView()
-  // Set current_ to point to the terminating chunk.
+      : start_(std::make_shared<Chunk>()),
-  if (chunks_[chunk_no].length == 0) {
+        current_(start_.get()),
-    current_ = {chunk_no, chunks_[chunk_no].start};
+        last_seen_(current_) {}
-    return;
-  }
+  // Start sentinel shared between stream and stream views
+  // start_->next is first chunk with real data
+  const std::shared_ptr<Chunk> start_;
+  // Chunk we're currently at.
+  // Seek*, MoveToNext alter/work in respect to this.
+  const Chunk* current_;
+  // Last chunk this instance of ChunksView has seen so far.
+  const Chunk* last_seen_;
+};
-  // Did we find the non-last chunk? Then our position must be within chunk_no.
+// A ChunksView subclass that does fetch new data from a ChunkSource if needed.
-  if (chunk_no + 1 < chunks_.size()) {
+class Chunks : public ChunksView {
-    // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
+ public:
-    // (Many web sites declare utf-8 encoding, but use only (or almost only) the
+  Chunks(std::unique_ptr<ChunkSource> source, RuntimeCallStats* stats)
-    //  ASCII subset for their JavaScript sources. We can exploit this, by
+      : source_(std::move(source)),
-    //  checking whether the # bytes in a chunk are equal to the # chars, and if
+        stats_(stats),
-    //  so avoid the expensive SkipToPosition.)
+        last_added_(start_.get()),
-    bool ascii_only_chunk =
+        source_exhausted_(false) {}
-        chunks_[chunk_no].start.incomplete_char ==
+  const Chunk* SeekToEndOf(size_t position) override {
-            unibrow::Utf8::Utf8IncrementalBuffer(0) &&
+    if (!source_exhausted_) {
-        (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
+      GetNewDataIfNeeded(position);
-            (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
-    if (ascii_only_chunk) {
-      size_t skip = position - chunks_[chunk_no].start.chars;
-      current_ = {chunk_no,
-                  {chunks_[chunk_no].start.bytes + skip,
-                   chunks_[chunk_no].start.chars + skip,
-                   unibrow::Utf8::Utf8IncrementalBuffer(0)}};
-    } else {
-      current_ = {chunk_no, chunks_[chunk_no].start};
-      SkipToPosition(position);
    }
+    return ChunksView::SeekToEndOf(position);
-    // Since position was within the chunk, SkipToPosition should have found
-    // something.
-    DCHECK_EQ(position, current_.pos.chars);
-    return;
  }
-  // What's left: We're in the last, non-terminating chunk. Our position
+ protected:
-  // may be in the chunk, but it may also be in 'future' chunks, which we'll
+  void GetNewDataIfNeeded(size_t position) {
-  // have to obtain.
+    DCHECK(!source_exhausted_);
-  DCHECK_EQ(chunk_no, chunks_.size() - 1);
+    DCHECK_NOT_NULL(last_added_);
-  current_ = {chunk_no, chunks_[chunk_no].start};
+    size_t char_end_pos = last_added_->char_pos + last_added_->char_length;
-  bool have_more_data = true;
+    // Get more data if needed.
-  bool found = SkipToPosition(position);
+    if (char_end_pos <= position) {
-  while (have_more_data && !found) {
+      RuntimeCallTimerScope scope(stats_,
-    DCHECK_EQ(current_.chunk_no, chunks_.size());
+                                  &RuntimeCallStats::GetMoreDataCallback);
-    have_more_data = FetchChunk();
+      while (char_end_pos <= position) {
-    found = have_more_data && SkipToPosition(position);
+        Chunk* chunk = source_->GetNextChunk();
+        if (chunk) {
+          chunk->prev = last_added_;
+          last_added_->next.SetValue(chunk);
+          last_added_ = chunk;
+          char_end_pos += chunk->char_length;
+          DCHECK_EQ(char_end_pos,
+                    last_added_->char_pos + last_added_->char_length);
+        } else {
+          source_exhausted_ = true;
+          break;
+        }
+      }
+    }
+    DCHECK_NOT_NULL(last_added_);
+    last_seen_ = last_added_;
+    // We either got the chunk we were looking for, or source is exhausted.
+    DCHECK_IMPLIES(!source_exhausted_, position < char_end_pos);
  }
+  const std::unique_ptr<ChunkSource> source_;
+  RuntimeCallStats* const stats_;
+  Chunk* last_added_;
+  bool source_exhausted_;
+};
-  // We'll return with a postion != the desired position only if we're out
+// ChunkSource for one-byte ExternalSourceStream
-  // of data. In that case, we'll point to the terminating chunk.
+class OneByteChunkSource : public ChunkSource {
-  DCHECK_EQ(found, current_.pos.chars == position);
+ public:
-  DCHECK_EQ(have_more_data, chunks_.back().length != 0);
+  explicit OneByteChunkSource(ScriptCompiler::ExternalSourceStream* source)
-  DCHECK_IMPLIES(!found, !have_more_data);
+      : source_(source) {}
-  DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
+  Chunk* GetNextChunk() override {
-}
+    const uint8_t* data = nullptr;
+    DCHECK_NOT_NULL(source_);
-size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
+    const size_t byte_len = source_->GetMoreData(&data);
-  buffer_cursor_ = buffer_;
+    if (V8_UNLIKELY(byte_len == 0)) {
-  buffer_end_ = buffer_;
+      delete[] data;
+      return nullptr;
-  SearchPosition(position);
-  bool out_of_data = current_.chunk_no != chunks_.size() &&
-                     chunks_[current_.chunk_no].length == 0;
-  if (out_of_data) return 0;
-  // Fill the buffer, until we have at least one char (or are out of data).
-  // (The embedder might give us 1-byte blocks within a utf-8 char, so we
-  //  can't guarantee progress with one chunk. Thus we iterate.)
-  while (!out_of_data && buffer_cursor_ == buffer_end_) {
-    // At end of current data, but there might be more? Then fetch it.
-    if (current_.chunk_no == chunks_.size()) {
-      out_of_data = !FetchChunk();
    }
-    FillBufferFromCurrentChunk();
+    size_t char_pos = byte_pos_;
+    byte_pos_ += byte_len;
+    return Chunk::CreateOneByte(data, byte_len, char_pos);
  }
-  DCHECK_EQ(current_.pos.chars - position,
+ private:
-            static_cast<size_t>(buffer_end_ - buffer_cursor_));
+  ScriptCompiler::ExternalSourceStream* const source_;
-  return buffer_end_ - buffer_cursor_;
+  size_t byte_pos_ = 0;
-}
+};
-// ----------------------------------------------------------------------------
+// ChunkSource for two-byte ExternalSourceStream
-// Chunks - helper for One- + TwoByteExternalStreamingStream
+class TwoByteChunkSource : public ChunkSource {
-namespace {
+ public:
+  explicit TwoByteChunkSource(ScriptCompiler::ExternalSourceStream* source)
+      : source_(source) {}
+  Chunk* GetNextChunk() override {
+    const uint8_t* data = nullptr;
+    DCHECK_NOT_NULL(source_);
+    const size_t byte_len = source_->GetMoreData(&data);
+    if (V8_UNLIKELY(byte_len == 0)) {
+      delete[] data;
+      return nullptr;
+    }
+    bool odd_start = (byte_pos_ % 2) == 1;
+    size_t char_pos = byte_pos_ / 2;
+    byte_pos_ += byte_len;
+    return Chunk::CreateTwoByte(data, byte_len, char_pos, odd_start);
+  }
-struct Chunk {
+ private:
-  const uint8_t* data;
+  ScriptCompiler::ExternalSourceStream* const source_;
-  size_t byte_length;
+  size_t byte_pos_ = 0;
-  size_t byte_pos;
 };
-typedef std::vector<struct Chunk> Chunks;
+// ChunkSource that decodes incoming utf8 to two-byte chunks if needed.
+// Byte Order Mark at the beginning of the stream is skipped.
-void DeleteChunks(Chunks& chunks) {
+// ASCII only chunks are kept as one-byte chunks.
-  for (size_t i = 0; i < chunks.size(); i++) delete[] chunks[i].data;
+// Performance is optimized for source files with mostly ASCII characters.
-}
+class Utf8ChunkSource : public ChunkSource {
+ public:
+  explicit Utf8ChunkSource(ScriptCompiler::ExternalSourceStream* source)
+      : source_(source),
+        incomplete_char_(unibrow::Utf8::Utf8IncrementalBuffer(0)) {}
-// Return the chunk index for the chunk containing position.
+ protected:
-// If position is behind the end of the stream, the index of the last,
+  size_t ComputeAsciiPrefixLength(const uint8_t* data, size_t byte_length) {
-// zero-length chunk is returned.
+    if (incomplete_char_ != unibrow::Utf8::Utf8IncrementalBuffer(0)) {
-size_t FindChunk(Chunks& chunks, ScriptCompiler::ExternalSourceStream* source,
+      return 0;
-                 size_t position, RuntimeCallStats* stats) {
+    }
-  size_t end_pos =
+    for (size_t i = 0; i < byte_length; ++i) {
-      chunks.empty() ? 0 : (chunks.back().byte_pos + chunks.back().byte_length);
+      if (data[i] > unibrow::Utf8::kMaxOneByteChar) {
+        return i;
-  // Get more data if needed. We usually won't enter the loop body.
+      }
-  bool out_of_data = !chunks.empty() && chunks.back().byte_length == 0;
-  {
-    RuntimeCallTimerScope scope(stats, &RuntimeCallStats::GetMoreDataCallback);
-    while (!out_of_data && end_pos <= position + 1) {
-      const uint8_t* chunk = nullptr;
-      size_t len = source->GetMoreData(&chunk);
-      chunks.push_back({chunk, len, end_pos});
-      end_pos += len;
-      out_of_data = (len == 0);
    }
+    return byte_length;
  }
+  size_t ConvertToUtf16(const uint8_t* data, size_t byte_length,
-  // Here, we should always have at least one chunk, and we either have the
+                        size_t ascii_prefix_len, const uint8_t** result) {
-  // chunk we were looking for, or we're out of data. Also, out_of_data and
+    DCHECK_IMPLIES(ascii_prefix_len != 0,
-  // end_pos are current (and designate whether we have exhausted the stream,
+                   incomplete_char_ == unibrow::Utf8::Utf8IncrementalBuffer(0));
-  // and the length of data received so far, respectively).
+    // We get at most 1 character per byte.
-  DCHECK(!chunks.empty());
+    // Only exception is the first byte of chunk which may finish an incomplete
-  DCHECK_EQ(end_pos, chunks.back().byte_pos + chunks.back().byte_length);
+    // surrogate pair, hence addition of 1.
-  DCHECK_EQ(out_of_data, chunks.back().byte_length == 0);
+    uc16* decoded_data = new uc16[byte_length + 1];
-  DCHECK(position < end_pos || out_of_data);
+    i::CopyCharsUnsigned(decoded_data, data, ascii_prefix_len);
+    size_t decoded_len = ascii_prefix_len;
-  // Edge case: position is behind the end of stream: Return the last (length 0)
+    for (size_t i = ascii_prefix_len; i < byte_length; ++i) {
-  // chunk to indicate the end of the stream.
+      unibrow::uchar t =
-  if (position >= end_pos) {
+          unibrow::Utf8::ValueOfIncremental(data[i], &incomplete_char_);
-    DCHECK(out_of_data);
+      if (t == unibrow::Utf8::kIncomplete) continue;
-    return chunks.size() - 1;
+      if (V8_LIKELY(t < kUtf8Bom)) {
+        decoded_data[decoded_len++] = static_cast<uc16>(t);
+      } else if (V8_UNLIKELY(is_at_first_char_ && t == kUtf8Bom)) {
+        // Skip BOM at the beggining of the stream
+        is_at_first_char_ = false;
+      } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+        decoded_data[decoded_len++] = static_cast<uc16>(t);
+      } else {
+        decoded_data[decoded_len++] = unibrow::Utf16::LeadSurrogate(t);
+        decoded_data[decoded_len++] = unibrow::Utf16::TrailSurrogate(t);
+      }
+    }
+    *result = reinterpret_cast<uint8_t*>(decoded_data);
+    return 2 * decoded_len;
  }
-  // We almost always 'stream', meaning we want data from the last chunk, so
+  // Skip possible BOM at the beginning of the stream.
-  // let's look at chunks back-to-front.
+  // Return number of bytes skipped.
-  size_t chunk_no = chunks.size() - 1;
+  size_t SkipBOM(const uint8_t* data, size_t byte_length) {
-  while (chunks[chunk_no].byte_pos > position) {
+    if (V8_UNLIKELY(is_at_first_char_ && byte_length >= 3 &&
-    DCHECK_NE(chunk_no, 0u);
+                    data[0] == kUtf8BomBytes[0] &&
-    chunk_no--;
+                    data[1] == kUtf8BomBytes[1] &&
+                    data[2] == kUtf8BomBytes[2])) {
+      return 3;
+    }
+    return 0;
  }
-  DCHECK_LE(chunks[chunk_no].byte_pos, position);
-  DCHECK_LT(position, chunks[chunk_no].byte_pos + chunks[chunk_no].byte_length);
-  return chunk_no;
-}
-}  // anonymous namespace
-// ----------------------------------------------------------------------------
+  Chunk* GetNextChunk() override {
-// OneByteExternalStreamingStream
+    const uint8_t* data = nullptr;
-//
+    size_t byte_length = 0;
-// A stream of latin-1 encoded, chunked data.
+    if (!is_at_end_) {
+      byte_length = source_->GetMoreData(&data);
-class OneByteExternalStreamingStream : public BufferedUtf16CharacterStream {
+    }
- public:
+    // end of stream
-  explicit OneByteExternalStreamingStream(
+    if (V8_UNLIKELY(byte_length == 0)) {
-      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
+      delete[] data;
-      : source_(source), stats_(stats) {}
+      is_at_end_ = true;
-  ~OneByteExternalStreamingStream() override { DeleteChunks(chunks_); }
+      unibrow::uchar t =
+          unibrow::Utf8::ValueOfIncrementalFinish(&incomplete_char_);
- protected:
+      if (t != unibrow::Utf8::kBufferEmpty) {
-  size_t FillBuffer(size_t position) override;
+        DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode);
+        uc16* char_data = new uc16[1];
+        char_data[0] = static_cast<uc16>(t);
+        return Chunk::CreateTwoByte(reinterpret_cast<uint8_t*>(char_data), 2,
+                                    char_pos_++, false);
+      }
+      return nullptr;
+    }
+    size_t skipped = SkipBOM(data, byte_length);
+    if (V8_UNLIKELY(skipped == byte_length)) {
+      is_at_first_char_ = false;
+      delete[] data;
+      return GetNextChunk();
+    }
+    size_t ascii_prefix_len =
+        ComputeAsciiPrefixLength(data + skipped, byte_length - skipped);
+    bool is_ascii_only = ascii_prefix_len == (byte_length - skipped);
+    if (V8_LIKELY(is_ascii_only)) {
+      is_at_first_char_ = false;
+      Chunk* chunk =
+          Chunk::CreateOneByte(data, byte_length, char_pos_, skipped);
+      char_pos_ += chunk->char_length;
+      return chunk;
+    }
+    const uint8_t* decoded_data;
+    size_t decoded_byte_len = ConvertToUtf16(
+        data + skipped, byte_length - skipped, ascii_prefix_len, &decoded_data);
+    delete[] data;
+    if (V8_UNLIKELY(decoded_byte_len == 0)) {
+      delete[] decoded_data;
+      return GetNextChunk();
+    }
+    is_at_first_char_ = false;
+    Chunk* chunk =
+        Chunk::CreateTwoByte(decoded_data, decoded_byte_len, char_pos_, false);
+    char_pos_ += chunk->char_length;
+    return chunk;
+  }
 private:
-  Chunks chunks_;
  ScriptCompiler::ExternalSourceStream* source_;
-  RuntimeCallStats* stats_;
+  unibrow::Utf8::Utf8IncrementalBuffer incomplete_char_;
+  bool is_at_first_char_ = true;
+  bool is_at_end_ = false;
+  size_t char_pos_ = 0;
 };
-size_t OneByteExternalStreamingStream::FillBuffer(size_t position) {
+}  // anonymous namespace
-  const Chunk& chunk = chunks_[FindChunk(chunks_, source_, position, stats_)];
-  if (chunk.byte_length == 0) return 0;
-  size_t start_pos = position - chunk.byte_pos;
-  size_t len = i::Min(kBufferSize, chunk.byte_length - start_pos);
-  i::CopyCharsUnsigned(buffer_, chunk.data + start_pos, len);
-  return len;
-}
-#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
-// ----------------------------------------------------------------------------
-// TwoByteExternalStreamingStream
-//
-// A stream of ucs-2 data, delivered in chunks. Chunks may be 'cut' into the
-// middle of characters (or even contain only one byte), which adds a bit
-// of complexity. This stream avoid all data copying, except for characters
-// that cross chunk boundaries.
-class TwoByteExternalStreamingStream : public Utf16CharacterStream {
+// A stream of chunked data
+class ExternalStreamingStream : public BufferedUtf16CharacterStream {
 public:
-  explicit TwoByteExternalStreamingStream(
+  explicit ExternalStreamingStream(std::unique_ptr<ChunkSource> source,
-      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
+                                   RuntimeCallStats* stats)
-  ~TwoByteExternalStreamingStream() override;
+      : chunks_(new Chunks(std::move(source), stats)) {}
 protected:
  bool ReadBlock() override;
+  size_t FillBuffer(size_t position) override;
-  Chunks chunks_;
+  std::unique_ptr<ChunksView> chunks_;
-  ScriptCompiler::ExternalSourceStream* source_;
-  RuntimeCallStats* stats_;
-  uc16 one_char_buffer_;
 };
-TwoByteExternalStreamingStream::TwoByteExternalStreamingStream(
+bool ExternalStreamingStream::ReadBlock() {
-    ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
-    : Utf16CharacterStream(&one_char_buffer_, &one_char_buffer_,
-                           &one_char_buffer_, 0),
-      source_(source),
-      stats_(stats),
-      one_char_buffer_(0) {}
-TwoByteExternalStreamingStream::~TwoByteExternalStreamingStream() {
-  DeleteChunks(chunks_);
-}
-bool TwoByteExternalStreamingStream::ReadBlock() {
  size_t position = pos();
+  // Find chunk in which the position belongs
+  const Chunk* chunk = chunks_->SeekToEndOf(position);
-  // We'll search for the 2nd byte of our character, to make sure we
+  // Out of data? Return false.
-  // have enough data for at least one character.
+  if (!chunk) {
-  size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
+    buffer_cursor_ = buffer_end_ = buffer_start_;
-  // Out of data? Return 0.
-  if (chunks_[chunk_no].byte_length == 0) {
-    buffer_cursor_ = buffer_start_;
-    buffer_end_ = buffer_start_;
    return false;
  }
-  Chunk& current = chunks_[chunk_no];
+  bool odd_start = chunk->first_char_start_offset == 1;
-  // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
+  if (odd_start || chunk->type != Chunk::TWO_BYTE) {
-  // character may be split between the previous and the current chunk.
+    return BufferedUtf16CharacterStream::ReadBlock();
-  // If we find such a lonely byte at the beginning of the chunk, we'll use
-  // one_char_buffer_ to hold the full character.
-  bool lonely_byte = (chunks_[chunk_no].byte_pos == (2 * position + 1));
-  if (lonely_byte) {
-    DCHECK_NE(chunk_no, 0u);
-    Chunk& previous_chunk = chunks_[chunk_no - 1];
-#ifdef V8_TARGET_BIG_ENDIAN
-    uc16 character = current.data[0] |
-                     previous_chunk.data[previous_chunk.byte_length - 1] << 8;
-#else
-    uc16 character = previous_chunk.data[previous_chunk.byte_length - 1] |
-                     current.data[0] << 8;
-#endif
-    one_char_buffer_ = character;
-    buffer_pos_ = position;
-    buffer_start_ = &one_char_buffer_;
-    buffer_cursor_ = &one_char_buffer_;
-    buffer_end_ = &one_char_buffer_ + 1;
-    return true;
  }
-  // Common case: character is in current chunk.
+  // Aligned access is important on MIPS and ARM.
-  DCHECK_LE(current.byte_pos, 2 * position);
+  DCHECK_EQ((reinterpret_cast<uintptr_t>(chunk->data.get()) % 2), 0);
-  DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);
+  buffer_start_ = reinterpret_cast<const uint16_t*>(chunk->data.get());
-  // Determine # of full ucs-2 chars in stream, and whether we started on an odd
+  buffer_end_ = buffer_start_ + chunk->char_length;
-  // byte boundary.
+  buffer_pos_ = chunk->char_pos;
-  bool odd_start = (current.byte_pos % 2) == 1;
-  size_t number_chars = (current.byte_length - odd_start) / 2;
-  // Point the buffer_*_ members into the current chunk and set buffer_cursor_
-  // to point to position. Be careful when converting the byte positions (in
-  // Chunk) to the ucs-2 character positions (in buffer_*_ members).
-  buffer_start_ = reinterpret_cast<const uint16_t*>(current.data + odd_start);
-  buffer_end_ = buffer_start_ + number_chars;
-  buffer_pos_ = (current.byte_pos + odd_start) / 2;
  buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
  DCHECK_EQ(position, pos());
  return true;
 }
-#else
+size_t ExternalStreamingStream::FillBuffer(size_t position) {
+  size_t copied_bytes = 0;
-// ----------------------------------------------------------------------------
+  const Chunk* chunk = chunks_->SeekToStartOf(position);
-// TwoByteExternalBufferedStream
-//
-// This class is made specifically to address unaligned access to 16-bit data
-// in MIPS and ARM architectures. It replaces class
-// TwoByteExternalStreamingStream which in some cases does have unaligned
-// accesse to 16-bit data
-class TwoByteExternalBufferedStream : public Utf16CharacterStream {
- public:
-  explicit TwoByteExternalBufferedStream(
-      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
-  ~TwoByteExternalBufferedStream();
- protected:
-  static const size_t kBufferSize = 512;
-  bool ReadBlock() override;
-  // FillBuffer should read up to kBufferSize characters at position and store
-  // them into buffer_[0..]. It returns the number of characters stored.
-  size_t FillBuffer(size_t position, size_t chunk_no);
-  // Fixed sized buffer that this class reads from.
-  // The base class' buffer_start_ should always point to buffer_.
-  uc16 buffer_[kBufferSize];
-  Chunks chunks_;
-  ScriptCompiler::ExternalSourceStream* source_;
-  RuntimeCallStats* stats_;
-};
-TwoByteExternalBufferedStream::TwoByteExternalBufferedStream(
-    ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
-    : Utf16CharacterStream(buffer_, buffer_, buffer_, 0),
-      source_(source),
-      stats_(stats) {}
-TwoByteExternalBufferedStream::~TwoByteExternalBufferedStream() {
-  DeleteChunks(chunks_);
-}
-bool TwoByteExternalBufferedStream::ReadBlock() {
-  size_t position = pos();
-  // Find chunk in which the position belongs
-  size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
  // Out of data? Return 0.
-  if (chunks_[chunk_no].byte_length == 0) {
+  if (!chunk) {
-    buffer_cursor_ = buffer_start_;
+    return 0;
-    buffer_end_ = buffer_start_;
-    return false;
-  }
-  Chunk& current = chunks_[chunk_no];
-  bool odd_start = current.byte_pos % 2;
-  // Common case: character is in current chunk.
-  DCHECK_LE(current.byte_pos, 2 * position + odd_start);
-  DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);
-  // If character starts on odd address copy text in buffer so there is always
-  // aligned access to characters. This is important on MIPS and ARM
-  // architectures. Otherwise read characters from memory directly.
-  if (!odd_start) {
-    buffer_start_ = reinterpret_cast<const uint16_t*>(current.data);
-    size_t number_chars = current.byte_length / 2;
-    buffer_end_ = buffer_start_ + number_chars;
-    buffer_pos_ = current.byte_pos / 2;
-    buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
-    DCHECK_EQ(position, pos());
-    return true;
-  } else {
-    buffer_start_ = buffer_;
-    buffer_pos_ = position;
-    buffer_cursor_ = buffer_;
-    buffer_end_ = buffer_ + FillBuffer(position, chunk_no);
-    DCHECK_EQ(pos(), position);
-    DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
-    return buffer_cursor_ < buffer_end_;
-  }
-}
-size_t TwoByteExternalBufferedStream::FillBuffer(size_t position,
-                                                 size_t chunk_no) {
-  DCHECK_EQ(chunks_[chunk_no].byte_pos % 2, 1u);
-  bool odd_start = true;
-  // Align buffer_pos_ to the size of the buffer.
-  {
-    size_t new_pos = position / kBufferSize * kBufferSize;
-    if (new_pos != position) {
-      chunk_no = FindChunk(chunks_, source_, 2 * new_pos + 1, stats_);
-      buffer_pos_ = new_pos;
-      buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
-      position = new_pos;
-      odd_start = chunks_[chunk_no].byte_pos % 2;
-    }
-  }
-  Chunk* current = &chunks_[chunk_no];
-  // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
-  // character may be split between the previous and the current chunk.
-  // If we find such a lonely byte at the beginning of the chunk, we'll copy
-  // it to the first byte in buffer_.
-  size_t totalLength = 0;
-  bool lonely_byte = (current->byte_pos == (2 * position + 1));
-  if (lonely_byte) {
-    DCHECK_NE(chunk_no, 0u);
-    Chunk& previous_chunk = chunks_[chunk_no - 1];
-    *reinterpret_cast<uint8_t*>(buffer_) =
-        previous_chunk.data[previous_chunk.byte_length - 1];
-    totalLength++;
  }
+  size_t offset = chunk->OffsetOf(position);
-  // Common case: character is in current chunk.
+  while (copied_bytes < kBufferSizeInBytes && chunk) {
-  DCHECK_LE(current->byte_pos, 2 * position + odd_start);
+    // Stop copying if next chunk could be used directly and at least one char
-  DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);
+    // was copied.
+    if (chunk->type == Chunk::TWO_BYTE && copied_bytes > 2 &&
-  // Copy characters from current chunk starting from chunk_pos to the end of
+        chunk->first_char_start_offset == 0) {
-  // buffer or chunk.
+      break;
-  size_t chunk_pos = position - current->byte_pos / 2;
-  size_t start_offset = odd_start && chunk_pos != 0;
-  size_t bytes_to_move =
-      i::Min(2 * kBufferSize - lonely_byte,
-             current->byte_length - 2 * chunk_pos + start_offset);
-  i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + lonely_byte,
-             current->data + 2 * chunk_pos - start_offset, bytes_to_move);
-  // Fill up the rest of the buffer if there is space and data left.
-  totalLength += bytes_to_move;
-  position = (current->byte_pos + current->byte_length) / 2;
-  if (position - buffer_pos_ < kBufferSize) {
-    chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
-    current = &chunks_[chunk_no];
-    odd_start = current->byte_pos % 2;
-    bytes_to_move = i::Min(2 * kBufferSize - totalLength, current->byte_length);
-    while (bytes_to_move) {
-      // Common case: character is in current chunk.
-      DCHECK_LE(current->byte_pos, 2 * position + odd_start);
-      DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);
-      i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + totalLength,
-                 current->data, bytes_to_move);
-      totalLength += bytes_to_move;
-      position = (current->byte_pos + current->byte_length) / 2;
-      chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
-      current = &chunks_[chunk_no];
-      odd_start = current->byte_pos % 2;
-      bytes_to_move =
-          i::Min(2 * kBufferSize - totalLength, current->byte_length);
    }
+    size_t space_left = kBufferSizeInBytes - copied_bytes;
+    DCHECK_IMPLIES(chunk->type == Chunk::ONE_BYTE, (copied_bytes % 2) == 0);
+    copied_bytes += chunk->CopyToBuffer(
+        reinterpret_cast<uint8_t*>(buffer_) + copied_bytes, space_left, offset);
+    // chunk after the first one are copied from beginning
+    offset = 0;
+    chunk = chunks_->MoveToNext();
  }
-  return totalLength / 2;
+  return copied_bytes / 2;
 }
-#endif
 // ----------------------------------------------------------------------------
 // ScannerStream: Create stream instances.
@@ -850,20 +706,20 @@ Utf16CharacterStream* ScannerStream::For(
    ScriptCompiler::ExternalSourceStream* source_stream,
    v8::ScriptCompiler::StreamedSource::Encoding encoding,
    RuntimeCallStats* stats) {
+  std::unique_ptr<ChunkSource> source;
  switch (encoding) {
    case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
-#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
+      source.reset(new TwoByteChunkSource(source_stream));
-      return new TwoByteExternalStreamingStream(source_stream, stats);
+      break;
-#else
-      return new TwoByteExternalBufferedStream(source_stream, stats);
-#endif
    case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
-      return new OneByteExternalStreamingStream(source_stream, stats);
+      source.reset(new OneByteChunkSource(source_stream));
+      break;
    case v8::ScriptCompiler::StreamedSource::UTF8:
-      return new Utf8ExternalStreamingStream(source_stream, stats);
+      source.reset(new Utf8ChunkSource(source_stream));
+      break;
  }
-  UNREACHABLE();
+  DCHECK_NOT_NULL(source.get());
-  return nullptr;
+  return new ExternalStreamingStream(std::move(source), stats);
 }
 }  // namespace internal

--- a/test/cctest/parsing/test-scanner-streams.cc
+++ b/test/cctest/parsing/test-scanner-streams.cc
@@ -435,6 +435,18 @@ TEST(CharacterStreams) {
  TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
 }
+TEST(Uft8MultipleBOMChunks) {
+  const char* chunks = "\xef\xbb\xbf\0\xef\xbb\xbf\0\xef\xbb\xbf\0a\0";
+  const uint16_t unicode[] = {0xFEFF, 0xFEFF, 97};
+  ChunkSource chunk_source(chunks);
+  std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
+      &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
+  for (size_t i = 0; i < arraysize(unicode); i++) {
+    CHECK_EQ(unicode[i], stream->Advance());
+  }
+  CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
+}
 // Regression test for crbug.com/651333. Read invalid utf-8.
 TEST(Regress651333) {
  const uint8_t bytes[] =