scanner-character-streams.cc 28.5 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
#include "src/parsing/scanner-character-streams.h"
6

7 8 9
#include <memory>
#include <vector>

10
#include "include/v8.h"
11
#include "src/common/globals.h"
12
#include "src/handles/handles.h"
13
#include "src/logging/counters.h"
14
#include "src/objects/objects-inl.h"
15
#include "src/parsing/scanner.h"
16
#include "src/strings/unicode-inl.h"
17 18 19 20

namespace v8 {
namespace internal {

21 22
class ScopedExternalStringLock {
 public:
23 24
  explicit ScopedExternalStringLock(ExternalString string) {
    DCHECK(!string.is_null());
25 26
    if (string.IsExternalOneByteString()) {
      resource_ = ExternalOneByteString::cast(string).resource();
27
    } else {
28 29
      DCHECK(string.IsExternalTwoByteString());
      resource_ = ExternalTwoByteString::cast(string).resource();
30 31 32 33 34 35
    }
    DCHECK(resource_);
    resource_->Lock();
  }

  // Copying a lock increases the locking depth.
36
  ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
37 38 39 40 41 42 43 44 45 46 47
      : resource_(other.resource_) {
    resource_->Lock();
  }

  ~ScopedExternalStringLock() { resource_->Unlock(); }

 private:
  // Not nullptr.
  const v8::String::ExternalStringResourceBase* resource_;
};

marja's avatar
marja committed
48
namespace {
49
const unibrow::uchar kUtf8Bom = 0xFEFF;
marja's avatar
marja committed
50 51
}  // namespace

52 53 54 55
// A pair of pointers delimiting the characters [start, end).
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of characters between |start| and |end|.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True if |start| is not aligned to a multiple of sizeof(Char).
  bool unaligned_start() const {
    return reinterpret_cast<uintptr_t>(start) % sizeof(Char) != 0;
  }
};

63 64 65 66
// A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
template <typename Char>
class OnHeapStream {
 public:
67
  using String = typename CharTraits<Char>::String;
68

69 70
  OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
      : string_(string), start_offset_(start_offset), length_(end) {}
71

72
  OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {
73 74 75
    UNREACHABLE();
  }

76 77 78 79 80 81
  // The no_gc argument is only here because of the templated way this class
  // is used along with other implementations that require V8 heap access.
  Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
                        DisallowHeapAllocation* no_gc) {
    return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)],
            &string_->GetChars(*no_gc)[start_offset_ + length_]};
82
  }
83

84
  static const bool kCanBeCloned = false;
85
  static const bool kCanAccessHeap = true;
86

87 88 89 90 91 92 93 94 95 96
 private:
  Handle<String> string_;
  const size_t start_offset_;
  const size_t length_;
};

// A Char stream backed by an off-heap ExternalOneByteString or
// ExternalTwoByteString.
template <typename Char>
class ExternalStringStream {
97
  using ExternalString = typename CharTraits<Char>::ExternalString;
98

99
 public:
100
  ExternalStringStream(ExternalString string, size_t start_offset,
101 102
                       size_t length)
      : lock_(string),
103
        data_(string.GetChars() + start_offset),
104
        length_(length) {}
105

106 107 108 109
  ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
      : lock_(other.lock_),
        data_(other.data_),
        length_(other.length_) {}
110

111 112 113 114
  // The no_gc argument is only here because of the templated way this class
  // is used along with other implementations that require V8 heap access.
  Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
                        DisallowHeapAllocation* no_gc = nullptr) {
115 116
    return {&data_[Min(length_, pos)], &data_[length_]};
  }
117

118
  static const bool kCanBeCloned = true;
119
  static const bool kCanAccessHeap = false;
120

121 122 123 124 125 126 127 128 129 130 131 132
 private:
  ScopedExternalStringLock lock_;
  const Char* const data_;
  const size_t length_;
};

// A Char stream backed by a C array. Testing only.
template <typename Char>
class TestingStream {
 public:
  TestingStream(const Char* data, size_t length)
      : data_(data), length_(length) {}
133 134 135 136
  // The no_gc argument is only here because of the templated way this class
  // is used along with other implementations that require V8 heap access.
  Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
                        DisallowHeapAllocation* no_gc = nullptr) {
137 138 139 140 141 142
    return {&data_[Min(length_, pos)], &data_[length_]};
  }

  static const bool kCanBeCloned = true;
  static const bool kCanAccessHeap = false;

143 144 145
 private:
  const Char* const data_;
  const size_t length_;
146
};
147

148 149 150 151
// A Char stream backed by multiple source-stream provided off-heap chunks.
template <typename Char>
class ChunkedStream {
 public:
152 153
  explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
      : source_(source) {}
154

155
  ChunkedStream(const ChunkedStream&) V8_NOEXCEPT {
156 157 158 159
    // TODO(rmcilroy): Implement cloning for chunked streams.
    UNREACHABLE();
  }

160 161 162 163
  // The no_gc argument is only here because of the templated way this class
  // is used along with other implementations that require V8 heap access.
  Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
                        DisallowHeapAllocation* no_gc = nullptr) {
164
    Chunk chunk = FindChunk(pos, stats);
165
    size_t buffer_end = chunk.length;
166
    size_t buffer_pos = Min(buffer_end, pos - chunk.position);
167
    return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
168
  }
169

170
  ~ChunkedStream() {
171
    for (Chunk& chunk : chunks_) delete[] chunk.data;
172
  }
173

174
  static const bool kCanBeCloned = false;
175
  static const bool kCanAccessHeap = false;
176

177 178
 private:
  struct Chunk {
179 180
    Chunk(const Char* const data, size_t position, size_t length)
        : data(data), position(position), length(length) {}
181 182
    const Char* const data;
    // The logical position of data.
183
    const size_t position;
184 185
    const size_t length;
    size_t end_position() const { return position + length; }
186 187
  };

188 189
  Chunk FindChunk(size_t position, RuntimeCallStats* stats) {
    while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);
190 191 192 193

    // Walk forwards while the position is in front of the current chunk.
    while (position >= chunks_.back().end_position() &&
           chunks_.back().length > 0) {
194
      FetchChunk(chunks_.back().end_position(), stats);
195
    }
196

197
    // Walk backwards.
198
    for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
199 200 201 202
         ++reverse_it) {
      if (reverse_it->position <= position) return *reverse_it;
    }

203
    UNREACHABLE();
204 205
  }

206 207 208 209 210 211 212 213
  virtual void ProcessChunk(const uint8_t* data, size_t position,
                            size_t length) {
    // Incoming data has to be aligned to Char size.
    DCHECK_EQ(0, length % sizeof(Char));
    chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
                         length / sizeof(Char));
  }

214
  void FetchChunk(size_t position, RuntimeCallStats* stats) {
215 216 217
    const uint8_t* data = nullptr;
    size_t length;
    {
218
      RuntimeCallTimerScope scope(stats,
219 220 221
                                  RuntimeCallCounterId::kGetMoreDataCallback);
      length = source_->GetMoreData(&data);
    }
222
    ProcessChunk(data, position, length);
223 224 225
  }

  ScriptCompiler::ExternalSourceStream* source_;
226 227 228 229 230

 protected:
  std::vector<struct Chunk> chunks_;
};

231 232 233 234 235 236 237 238 239 240 241
// Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
// Chars are buffered if either the underlying stream isn't utf-16 or the
// underlying utf-16 stream might move (is on-heap).
template <template <typename T> class ByteStream>
class BufferedCharacterStream : public Utf16CharacterStream {
 public:
  template <class... TArgs>
  BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
    buffer_pos_ = pos;
  }

242 243 244 245 246 247 248 249 250 251
  bool can_be_cloned() const final {
    return ByteStream<uint16_t>::kCanBeCloned;
  }

  std::unique_ptr<Utf16CharacterStream> Clone() const override {
    CHECK(can_be_cloned());
    return std::unique_ptr<Utf16CharacterStream>(
        new BufferedCharacterStream<ByteStream>(*this));
  }

252 253 254 255 256 257 258
 protected:
  bool ReadBlock() final {
    size_t position = pos();
    buffer_pos_ = position;
    buffer_start_ = &buffer_[0];
    buffer_cursor_ = buffer_start_;

259
    DisallowHeapAllocation no_gc;
260
    Range<uint8_t> range =
261
        byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
262 263 264 265 266 267
    if (range.length() == 0) {
      buffer_end_ = buffer_start_;
      return false;
    }

    size_t length = Min(kBufferSize, range.length());
268
    i::CopyChars(buffer_, range.start, length);
269 270 271 272
    buffer_end_ = &buffer_[length];
    return true;
  }

273 274 275
  bool can_access_heap() const final {
    return ByteStream<uint8_t>::kCanAccessHeap;
  }
276 277

 private:
278 279 280
  BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
      : byte_stream_(other.byte_stream_) {}

281 282 283 284 285
  static const size_t kBufferSize = 512;
  uc16 buffer_[kBufferSize];
  ByteStream<uint8_t> byte_stream_;
};

286 287
// Provides a unbuffered utf-16 view on the bytes from the underlying
// ByteStream.
288 289
template <template <typename T> class ByteStream>
class UnbufferedCharacterStream : public Utf16CharacterStream {
290
 public:
291 292
  template <class... TArgs>
  UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
293
    buffer_pos_ = pos;
294
  }
295

296 297 298 299 300 301 302 303 304 305 306 307 308
  bool can_access_heap() const final {
    return ByteStream<uint16_t>::kCanAccessHeap;
  }

  bool can_be_cloned() const final {
    return ByteStream<uint16_t>::kCanBeCloned;
  }

  std::unique_ptr<Utf16CharacterStream> Clone() const override {
    return std::unique_ptr<Utf16CharacterStream>(
        new UnbufferedCharacterStream<ByteStream>(*this));
  }

309
 protected:
310
  bool ReadBlock() final {
311 312
    size_t position = pos();
    buffer_pos_ = position;
313
    DisallowHeapAllocation no_gc;
314
    Range<uint16_t> range =
315
        byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
316 317 318
    buffer_start_ = range.start;
    buffer_end_ = range.end;
    buffer_cursor_ = buffer_start_;
319 320 321
    if (range.length() == 0) return false;

    DCHECK(!range.unaligned_start());
322
    DCHECK_LE(buffer_start_, buffer_end_);
323 324
    return true;
  }
325

326 327
  UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
      : byte_stream_(other.byte_stream_) {}
328

329
  ByteStream<uint16_t> byte_stream_;
330 331
};

332 333 334
// Provides a unbuffered utf-16 view on the bytes from the underlying
// ByteStream.
class RelocatingCharacterStream
335
    : public UnbufferedCharacterStream<OnHeapStream> {
336 337 338
 public:
  template <class... TArgs>
  RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
339
      : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
        isolate_(isolate) {
    isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
                                           v8::kGCTypeAll, this);
  }

 private:
  ~RelocatingCharacterStream() final {
    isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
                                               this);
  }

  static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
                                           v8::GCType type,
                                           v8::GCCallbackFlags flags,
                                           void* stream) {
355
    reinterpret_cast<RelocatingCharacterStream*>(stream)
356 357 358 359
        ->UpdateBufferPointers();
  }

  void UpdateBufferPointers() {
360
    DisallowHeapAllocation no_gc;
361
    Range<uint16_t> range =
362
        byte_stream_.GetDataAt(buffer_pos_, runtime_call_stats(), &no_gc);
363 364 365 366
    if (range.start != buffer_start_) {
      buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
      buffer_start_ = range.start;
      buffer_end_ = range.end;
367 368 369 370 371 372
    }
  }

  Isolate* isolate_;
};

373
// ----------------------------------------------------------------------------
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
// BufferedUtf16CharacterStreams
//
// A buffered character stream based on a random access character
// source (ReadBlock can be called with pos() pointing to any position,
// even positions before the current).
//
// Subclasses supply the characters by implementing FillBuffer(); ReadBlock()
// calls it to refill buffer_.
//
// TODO(verwaest): Remove together with Utf8 external streaming streams.
class BufferedUtf16CharacterStream : public Utf16CharacterStream {
 public:
  BufferedUtf16CharacterStream();

 protected:
  // Capacity of buffer_, in uc16 units.
  static const size_t kBufferSize = 512;

  // Refills buffer_ through FillBuffer() at the current pos().
  bool ReadBlock() final;

  // FillBuffer should read up to kBufferSize characters at position and store
  // them into buffer_[0..]. It returns the number of characters stored.
  virtual size_t FillBuffer(size_t position) = 0;

  // Fixed sized buffer that this class reads from.
  // The base class' buffer_start_ should always point to buffer_.
  uc16 buffer_[kBufferSize];
};

// Hands buffer_ to the base class for each of its three pointer arguments and
// 0 for the last one. ReadBlock() DCHECKs that buffer_start_ still equals
// buffer_.
BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
    : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}

bool BufferedUtf16CharacterStream::ReadBlock() {
  DCHECK_EQ(buffer_start_, buffer_);

  size_t position = pos();
  buffer_pos_ = position;
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_ + FillBuffer(position);
  DCHECK_EQ(pos(), position);
  DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
  return buffer_cursor_ < buffer_end_;
}

// ----------------------------------------------------------------------------
// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
//
// This implementation is fairly complex, since data arrives in chunks which
// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
419
// character position is tricky because the byte position cannot be derived
420 421 422 423 424 425 426 427
// from the character position.
//
// TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
// instead so we don't need to buffer.

class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
 public:
  Utf8ExternalStreamingStream(
428
      ScriptCompiler::ExternalSourceStream* source_stream)
429
      : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
430
        source_stream_(source_stream) {}
431
  ~Utf8ExternalStreamingStream() final {
432
    for (const Chunk& chunk : chunks_) delete[] chunk.data;
433 434
  }

435 436 437 438 439 440 441
  bool can_access_heap() const final { return false; }

  bool can_be_cloned() const final { return false; }

  std::unique_ptr<Utf16CharacterStream> Clone() const override {
    UNREACHABLE();
  }
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473

 protected:
  size_t FillBuffer(size_t position) final;

 private:
  // A position within the data stream. It stores:
  // - The 'physical' position (# of bytes in the stream),
  // - the 'logical' position (# of ucs-2 characters, also within the stream),
  // - a possibly incomplete utf-8 char at the current 'physical' position.
  struct StreamPosition {
    size_t bytes;
    size_t chars;
    uint32_t incomplete_char;
    unibrow::Utf8::State state;
  };

  // Position contains a StreamPosition and the index of the chunk the position
  // points into. (The chunk_no could be derived from pos, but that'd be
  // an expensive search through all chunks.)
  struct Position {
    size_t chunk_no;
    StreamPosition pos;
  };

  // A chunk in the list of chunks, containing:
  // - The chunk data (data pointer and length), and
  // - the position at the first byte of the chunk.
  struct Chunk {
    const uint8_t* data;
    size_t length;
    StreamPosition start;
  };
474

475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
  // Within the current chunk, skip forward from current_ towards position.
  bool SkipToPosition(size_t position);
  // Within the current chunk, fill the buffer_ (while it has capacity).
  void FillBufferFromCurrentChunk();
  // Fetch a new chunk (assuming current_ is at the end of the current data).
  bool FetchChunk();
  // Search through the chunks and set current_ to point to the given position.
  // (This call is potentially expensive.)
  void SearchPosition(size_t position);

  std::vector<Chunk> chunks_;
  Position current_;
  ScriptCompiler::ExternalSourceStream* source_stream_;
};

// Decodes forward within the current chunk until |position| (counted in
// utf-16 code units) is reached or the chunk is exhausted, then stores the
// result in current_. Returns true if current_ ends up exactly at |position|.
bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
  DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.

  // Nothing to do if we are already there.
  if (current_.pos.chars == position) return true;

  const Chunk& chunk = chunks_[current_.chunk_no];
  DCHECK(current_.pos.bytes >= chunk.start.bytes);

  unibrow::Utf8::State state = chunk.start.state;
  uint32_t incomplete_char = chunk.start.incomplete_char;
  const size_t byte_offset = current_.pos.bytes - chunk.start.bytes;
  const uint8_t* cursor = chunk.data + byte_offset;
  const uint8_t* const chunk_end = chunk.data + chunk.length;

  size_t chars = current_.pos.chars;

  // At the very start of the stream, decode one character separately so that
  // a BOM is not counted.
  if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
    while (cursor < chunk_end) {
      const unibrow::uchar decoded =
          unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
      if (decoded == unibrow::Utf8::kIncomplete) continue;
      if (decoded != kUtf8Bom) {
        // Characters above the non-surrogate range count as two code units.
        chars += (decoded > unibrow::Utf16::kMaxNonSurrogateCharCode) ? 2 : 1;
      }
      break;
    }
  }

  while (cursor < chunk_end && chars < position) {
    const unibrow::uchar decoded =
        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
    if (decoded == unibrow::Utf8::kIncomplete) continue;
    chars += (decoded > unibrow::Utf16::kMaxNonSurrogateCharCode) ? 2 : 1;
  }

  current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
  current_.pos.chars = chars;
  current_.pos.incomplete_char = incomplete_char;
  current_.pos.state = state;
  // Once the whole chunk has been consumed, current_ refers to the next one.
  if (cursor == chunk_end) current_.chunk_no++;

  return current_.pos.chars == position;
}

void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
  DCHECK_LT(current_.chunk_no, chunks_.size());
  DCHECK_EQ(buffer_start_, buffer_cursor_);
  DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);

  const Chunk& chunk = chunks_[current_.chunk_no];

  // The buffer_ is writable, but buffer_*_ members are const. So we get a
  // non-const pointer into buffer that points to the same char as buffer_end_.
547 548
  uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
  DCHECK_EQ(output_cursor, buffer_end_);
549 550 551 552 553 554 555 556 557 558

  unibrow::Utf8::State state = current_.pos.state;
  uint32_t incomplete_char = current_.pos.incomplete_char;

  // If the current chunk is the last (empty) chunk we'll have to process
  // any left-over, partial characters.
  if (chunk.length == 0) {
    unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
    if (t != unibrow::Utf8::kBufferEmpty) {
      DCHECK_EQ(t, unibrow::Utf8::kBadChar);
559
      *output_cursor = static_cast<uc16>(t);
560 561 562 563 564 565 566 567 568
      buffer_end_++;
      current_.pos.chars++;
      current_.pos.incomplete_char = 0;
      current_.pos.state = state;
    }
    return;
  }

  size_t it = current_.pos.bytes - chunk.start.bytes;
569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
  const uint8_t* cursor = chunk.data + it;
  const uint8_t* end = chunk.data + chunk.length;

  // Deal with possible BOM.
  if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
    while (cursor < end) {
      unibrow::uchar t =
          unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
      if (V8_LIKELY(t < kUtf8Bom)) {
        *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
      } else if (t == unibrow::Utf8::kIncomplete) {
        continue;
      } else if (t == kUtf8Bom) {
        // BOM detected at beginning of the stream. Don't copy it.
      } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
        *(output_cursor++) = static_cast<uc16>(t);
      } else {
        *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
        *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
      }
      break;
    }
  }

593 594
  const uint16_t* max_buffer_end = buffer_start_ + kBufferSize;
  while (cursor < end && output_cursor + 1 < max_buffer_end) {
595 596 597 598
    unibrow::uchar t =
        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
    if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
      *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
599 600 601
    } else if (t == unibrow::Utf8::kIncomplete) {
      continue;
    } else {
602 603
      *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
      *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
604
    }
605 606 607 608 609
    // Fast path for ascii sequences.
    size_t remaining = end - cursor;
    size_t max_buffer = max_buffer_end - output_cursor;
    int max_length = static_cast<int>(Min(remaining, max_buffer));
    DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
610 611 612 613
    int ascii_length = NonAsciiStart(cursor, max_length);
    CopyChars(output_cursor, cursor, ascii_length);
    cursor += ascii_length;
    output_cursor += ascii_length;
614 615
  }

616 617
  current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
  current_.pos.chars += (output_cursor - buffer_end_);
618 619
  current_.pos.incomplete_char = incomplete_char;
  current_.pos.state = state;
620
  current_.chunk_no += (cursor == end);
621

622
  buffer_end_ = output_cursor;
623 624 625
}

bool Utf8ExternalStreamingStream::FetchChunk() {
626
  RuntimeCallTimerScope scope(runtime_call_stats(),
627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
                              RuntimeCallCounterId::kGetMoreDataCallback);
  DCHECK_EQ(current_.chunk_no, chunks_.size());
  DCHECK(chunks_.empty() || chunks_.back().length != 0);

  const uint8_t* chunk = nullptr;
  size_t length = source_stream_->GetMoreData(&chunk);
  chunks_.push_back({chunk, length, current_.pos});
  return length > 0;
}

// Sets current_ to point at character |position|, fetching further chunks if
// the position lies beyond the data received so far. If the data ends before
// |position|, current_ is left pointing at the terminating chunk.
void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
  // Common case: FillBuffer is usually called right after the current buffer,
  // so current_ is already where we need it.
  if (current_.pos.chars == position) return;

  // Make sure there is at least one chunk, so that the code below can assume
  // !chunks_.empty().
  if (chunks_.empty()) {
    DCHECK_EQ(current_.chunk_no, 0u);
    DCHECK_EQ(current_.pos.bytes, 0u);
    DCHECK_EQ(current_.pos.chars, 0u);
    FetchChunk();
  }

  // Find the last chunk whose start position is less than or equal to
  // position.
  size_t chunk_no = chunks_.size() - 1;
  while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
    chunk_no--;
  }
  const Chunk& candidate = chunks_[chunk_no];

  // Landing on the terminating (zero-length) chunk means we are seeking past
  // the end of the data, so position does not exist. Point current_ at the
  // terminating chunk.
  if (candidate.length == 0) {
    current_ = {chunk_no, candidate.start};
    return;
  }

  // Not the last chunk? Then position must lie within chunk_no.
  const bool is_last_chunk = (chunk_no + 1 == chunks_.size());
  if (!is_last_chunk) {
    // Optimization for ASCII chunks within a utf-8 stream. (Many web sites
    // declare utf-8 encoding but use only, or almost only, the ASCII subset
    // for their JavaScript sources.) If a chunk has as many bytes as chars,
    // the byte position follows directly from the char position and the
    // expensive SkipToPosition can be avoided.
    const StreamPosition& this_start = candidate.start;
    const StreamPosition& next_start = chunks_[chunk_no + 1].start;
    const bool ascii_only_chunk =
        this_start.incomplete_char == 0 &&
        (next_start.bytes - this_start.bytes) ==
            (next_start.chars - this_start.chars);
    if (ascii_only_chunk) {
      const size_t skip = position - this_start.chars;
      current_ = {chunk_no,
                  {this_start.bytes + skip, this_start.chars + skip, 0,
                   unibrow::Utf8::State::kAccept}};
    } else {
      current_ = {chunk_no, this_start};
      SkipToPosition(position);
    }

    // Since position was within the chunk, it must have been found.
    DCHECK_EQ(position, current_.pos.chars);
    return;
  }

  // Remaining case: the last chunk, which is not the terminating one. The
  // position may be inside it, or in 'future' chunks that still have to be
  // obtained.
  DCHECK_EQ(chunk_no, chunks_.size() - 1);
  current_ = {chunk_no, candidate.start};
  bool have_more_data = true;
  bool found = SkipToPosition(position);
  while (have_more_data && !found) {
    DCHECK_EQ(current_.chunk_no, chunks_.size());
    have_more_data = FetchChunk();
    found = have_more_data && SkipToPosition(position);
  }

  // We return with a position != the desired position only if we ran out of
  // data. In that case current_ points to the terminating chunk.
  DCHECK_EQ(found, current_.pos.chars == position);
  DCHECK_EQ(have_more_data, chunks_.back().length != 0);
  DCHECK_IMPLIES(!found, !have_more_data);
  DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
}
715

716 717 718
// Resets the buffer, seeks to |position| and decodes characters into buffer_.
// Returns the number of characters now in the buffer; 0 means the data ended.
size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_;

  SearchPosition(position);

  // Sitting on the terminating (empty) chunk with no partial character left
  // over means there is nothing more to deliver.
  const bool at_terminating_chunk =
      current_.chunk_no != chunks_.size() &&
      chunks_[current_.chunk_no].length == 0;
  if (at_terminating_chunk && current_.pos.incomplete_char == 0) return 0;

  // Fill the buffer until it holds at least one char (or we are out of data).
  // The embedder might hand out 1-byte blocks within a utf-8 char, so a single
  // chunk does not guarantee progress. Hence the loop.
  bool out_of_data = false;
  while (!out_of_data && buffer_cursor_ == buffer_end_) {
    // At the end of the current data, but there might be more? Then fetch it.
    if (current_.chunk_no == chunks_.size()) {
      out_of_data = !FetchChunk();
    }
    FillBufferFromCurrentChunk();
  }

  DCHECK_EQ(current_.pos.chars - position,
            static_cast<size_t>(buffer_end_ - buffer_cursor_));
  return static_cast<size_t>(buffer_end_ - buffer_cursor_);
}
742

743 744 745 746 747
// ----------------------------------------------------------------------------
// ScannerStream: Create stream instances.

// Creates a stream over all of |data|, i.e. the range [0, data->length()).
Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
                                         Handle<String> data) {
  const int end_pos = data->length();
  return ScannerStream::For(isolate, data, 0, end_pos);
}
750

751 752
// Creates a stream over |data|, starting at |start_pos| and ending at
// |end_pos|. A sliced string is replaced by its parent (looking through a thin
// parent) plus the slice offset; any other string is flattened. The stream
// class is then chosen by the string's representation. The caller receives a
// stream allocated with new.
Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
                                         int start_pos, int end_pos) {
  DCHECK_GE(start_pos, 0);
  DCHECK_LE(start_pos, end_pos);
  DCHECK_LE(end_pos, data->length());

  size_t start_offset = 0;
  if (data->IsSlicedString()) {
    SlicedString sliced = SlicedString::cast(*data);
    start_offset = sliced.offset();
    String parent = sliced.parent();
    if (parent.IsThinString()) parent = ThinString::cast(parent).actual();
    data = handle(parent, isolate);
  } else {
    data = String::Flatten(isolate, data);
  }

  const size_t stream_pos = static_cast<size_t>(start_pos);
  const size_t stream_end = static_cast<size_t>(end_pos);

  if (data->IsExternalOneByteString()) {
    return new BufferedCharacterStream<ExternalStringStream>(
        stream_pos, ExternalOneByteString::cast(*data), start_offset,
        stream_end);
  }
  if (data->IsExternalTwoByteString()) {
    return new UnbufferedCharacterStream<ExternalStringStream>(
        stream_pos, ExternalTwoByteString::cast(*data), start_offset,
        stream_end);
  }
  if (data->IsSeqOneByteString()) {
    return new BufferedCharacterStream<OnHeapStream>(
        stream_pos, Handle<SeqOneByteString>::cast(data), start_offset,
        stream_end);
  }
  if (data->IsSeqTwoByteString()) {
    return new RelocatingCharacterStream(
        isolate, stream_pos, Handle<SeqTwoByteString>::cast(data),
        start_offset, stream_end);
  }
  UNREACHABLE();
}

788
// Creates a testing stream over the NUL-terminated string |data|. A null
// |data| is treated as the empty string.
std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
    const char* data) {
  // strlen() must not be handed a null pointer. The two-argument overload
  // accepts a null |data| together with a length of 0.
  const size_t length = (data == nullptr) ? 0 : strlen(data);
  return ScannerStream::ForTesting(data, length);
}

793
std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
794
    const char* data, size_t length) {
795 796 797 798 799 800 801 802 803 804
  if (data == nullptr) {
    DCHECK_EQ(length, 0);

    // We don't want to pass in a null pointer into the the character stream,
    // because then the one-past-the-end pointer is undefined, so instead pass
    // through this static array.
    static const char non_null_empty_string[1] = {0};
    data = non_null_empty_string;
  }

805
  return std::unique_ptr<Utf16CharacterStream>(
806
      new BufferedCharacterStream<TestingStream>(
807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823
          0, reinterpret_cast<const uint8_t*>(data), length));
}

std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
    const uint16_t* data, size_t length) {
  if (data == nullptr) {
    DCHECK_EQ(length, 0);

    // We don't want to pass in a null pointer into the the character stream,
    // because then the one-past-the-end pointer is undefined, so instead pass
    // through this static array.
    static const uint16_t non_null_empty_uint16_t_string[1] = {0};
    data = non_null_empty_uint16_t_string;
  }

  return std::unique_ptr<Utf16CharacterStream>(
      new UnbufferedCharacterStream<TestingStream>(0, data, length));
824 825
}

826
// Creates a stream that pulls its data from |source_stream|, picking the
// stream class that matches |encoding|. The caller receives a stream allocated
// with new.
Utf16CharacterStream* ScannerStream::For(
    ScriptCompiler::ExternalSourceStream* source_stream,
    v8::ScriptCompiler::StreamedSource::Encoding encoding) {
  const size_t start_pos = 0;
  switch (encoding) {
    case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
      return new UnbufferedCharacterStream<ChunkedStream>(start_pos,
                                                          source_stream);
    case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
      return new BufferedCharacterStream<ChunkedStream>(start_pos,
                                                        source_stream);
    case v8::ScriptCompiler::StreamedSource::UTF8:
      return new Utf8ExternalStreamingStream(source_stream);
  }
  UNREACHABLE();
}

842 843
}  // namespace internal
}  // namespace v8