scanner-character-streams.cc 31.6 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
#include "src/parsing/scanner-character-streams.h"
6

7
#include "include/v8.h"
8
#include "src/counters.h"
9
#include "src/globals.h"
10
#include "src/handles.h"
11
#include "src/objects-inl.h"
12
#include "src/parsing/scanner.h"
13
#include "src/unicode-inl.h"
14 15 16 17

namespace v8 {
namespace internal {

marja's avatar
marja committed
18
namespace {
19
const unibrow::uchar kUtf8Bom = 0xFEFF;
marja's avatar
marja committed
20 21
}  // namespace

22 23 24 25 26 27 28 29 30
// ----------------------------------------------------------------------------
// BufferedUtf16CharacterStreams
//
// A buffered character stream based on a random access character
// source (ReadBlock can be called with pos() pointing to any position,
// even positions before the current).
class BufferedUtf16CharacterStream : public Utf16CharacterStream {
 public:
  BufferedUtf16CharacterStream();
31

32
 protected:
33
  static const size_t kBufferSize = 512;
34

35
  bool ReadBlock() override;
36

37 38 39
  // FillBuffer should read up to kBufferSize characters at position and store
  // them into buffer_[0..]. It returns the number of characters stored.
  virtual size_t FillBuffer(size_t position) = 0;
40

41 42
  // Fixed sized buffer that this class reads from.
  // The base class' buffer_start_ should always point to buffer_.
43
  uc16 buffer_[kBufferSize];
44
};
45

46 47
// Creates a stream with an empty buffer: the three pointer arguments handed to
// the base class all refer to buffer_, and the last argument is 0 (presumably
// the initial buffer position -- confirm against Utf16CharacterStream).
BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
    : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
48

49
bool BufferedUtf16CharacterStream::ReadBlock() {
50 51
  DCHECK_EQ(buffer_start_, buffer_);

52 53
  size_t position = pos();
  buffer_pos_ = position;
54
  buffer_cursor_ = buffer_;
55 56
  buffer_end_ = buffer_ + FillBuffer(position);
  DCHECK_EQ(pos(), position);
57
  DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
58
  return buffer_cursor_ < buffer_end_;
59 60
}

61 62 63 64
// ----------------------------------------------------------------------------
// GenericStringUtf16CharacterStream.
//
// A stream w/ a data source being a (flattened) Handle<String>.
65

66 67 68 69
class GenericStringUtf16CharacterStream : public BufferedUtf16CharacterStream {
 public:
  GenericStringUtf16CharacterStream(Handle<String> data, size_t start_position,
                                    size_t end_position);
70

71 72
  bool can_access_heap() override { return true; }

73 74
 protected:
  size_t FillBuffer(size_t position) override;
75

76 77 78
  Handle<String> string_;
  size_t length_;
};
79

80 81 82 83 84 85 86
// Sets up a stream over |data| that starts reading at |start_position| and
// reports end-of-stream at |end_position|. Both are indices into |data|.
GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
    Handle<String> data, size_t start_position, size_t end_position)
    : string_(data), length_(end_position) {
  DCHECK_GE(end_position, start_position);
  // FillBuffer() copies characters of string_ up to index length_
  // (== end_position), so the end index itself -- not merely the size of the
  // requested range -- has to lie within the string.
  DCHECK_GE(static_cast<size_t>(string_->length()), end_position);
  buffer_pos_ = start_position;
}

89 90
size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
  if (from_pos >= length_) return 0;
91

92
  size_t length = i::Min(kBufferSize, length_ - from_pos);
93 94 95
  String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos),
                            static_cast<int>(from_pos + length));
  return length;
96 97
}

98 99 100 101 102
// ----------------------------------------------------------------------------
// ExternalTwoByteStringUtf16CharacterStream.
//
// A stream whose data source is a Handle<ExternalTwoByteString>. It avoids
// all data copying.
103

104 105 106 107 108
class ExternalTwoByteStringUtf16CharacterStream : public Utf16CharacterStream {
 public:
  ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,
                                            size_t start_position,
                                            size_t end_position);
109

110 111
  bool can_access_heap() override { return false; }

112 113
 private:
  bool ReadBlock() override;
114

115 116 117 118
  const uc16* raw_data_;  // Pointer to the actual array of characters.
  size_t start_pos_;
  size_t end_pos_;
};
119

120 121 122 123 124 125 126 127 128 129 130
// Points the stream's buffer directly at the external string's characters, so
// that the whole range [start_position, end_position) is available without
// copying.
ExternalTwoByteStringUtf16CharacterStream::
    ExternalTwoByteStringUtf16CharacterStream(
        Handle<ExternalTwoByteString> data, size_t start_position,
        size_t end_position)
    : raw_data_(data->GetTwoByteData(static_cast<int>(start_position))),
      start_pos_(start_position),
      end_pos_(end_position) {
  const size_t char_count = end_pos_ - start_pos_;
  buffer_pos_ = start_pos_;
  buffer_start_ = raw_data_;
  buffer_cursor_ = raw_data_;
  buffer_end_ = raw_data_ + char_count;
}

133 134 135 136 137 138 139 140 141 142 143 144 145 146
// Repositions the buffer for the current pos(). If the position lies inside
// [start_pos_, end_pos_) the buffer again spans the whole character range with
// the cursor on the requested character and true is returned; otherwise the
// buffer is made empty at that position and false is returned.
bool ExternalTwoByteStringUtf16CharacterStream::ReadBlock() {
  size_t position = pos();
  bool have_data = start_pos_ <= position && position < end_pos_;
  if (have_data) {
    buffer_pos_ = start_pos_;
    // Each assignment is its own statement (the cursor assignment used to be
    // chained to the next one with an accidental comma operator).
    buffer_cursor_ = raw_data_ + (position - start_pos_);
    buffer_end_ = raw_data_ + (end_pos_ - start_pos_);
  } else {
    buffer_pos_ = position;
    buffer_cursor_ = raw_data_;
    buffer_end_ = raw_data_;
  }
  return have_data;
}
147

148 149 150 151
// ----------------------------------------------------------------------------
// ExternalOneByteStringUtf16CharacterStream
//
// A stream whose data source is a Handle<ExternalOneByteString>.
152

153 154 155 156 157 158
class ExternalOneByteStringUtf16CharacterStream
    : public BufferedUtf16CharacterStream {
 public:
  ExternalOneByteStringUtf16CharacterStream(Handle<ExternalOneByteString> data,
                                            size_t start_position,
                                            size_t end_position);
159

160 161
  // For testing:
  ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length);
162

163 164
  bool can_access_heap() override { return false; }

165 166
 protected:
  size_t FillBuffer(size_t position) override;
167

168 169 170
  const uint8_t* raw_data_;  // Pointer to the actual array of characters.
  size_t length_;
};
171

172 173 174 175 176 177 178
// Streams the characters of the external one-byte string |data|, starting at
// |start_position| and ending at |end_position|.
ExternalOneByteStringUtf16CharacterStream::
    ExternalOneByteStringUtf16CharacterStream(
        Handle<ExternalOneByteString> data, size_t start_position,
        size_t end_position)
    : raw_data_(data->GetChars()), length_(end_position) {
  DCHECK(end_position >= start_position);
  // The first ReadBlock() will fetch characters from here on.
  buffer_pos_ = start_position;
}

181 182 183
// Testing constructor: treats the |length| bytes at |data| as one-byte
// characters. The buffer position is left at whatever the base class
// constructor chose (0 in BufferedUtf16CharacterStream's constructor).
ExternalOneByteStringUtf16CharacterStream::
    ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length)
    : raw_data_(reinterpret_cast<const uint8_t*>(data)), length_(length) {}
184

185
size_t ExternalOneByteStringUtf16CharacterStream::FillBuffer(size_t from_pos) {
186
  if (from_pos >= length_) return 0;
187

188
  size_t length = Min(kBufferSize, length_ - from_pos);
189
  i::CopyCharsUnsigned(buffer_, raw_data_ + from_pos, length);
190 191 192
  return length;
}

193 194 195 196 197 198 199
// ----------------------------------------------------------------------------
// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
//
// This implementation is fairly complex, since data arrives in chunks which
// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
// character position is tricky because the byte position cannot be derived
// from the character position.
200

201 202 203 204 205
class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
 public:
  Utf8ExternalStreamingStream(
      ScriptCompiler::ExternalSourceStream* source_stream,
      RuntimeCallStats* stats)
206
      : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
207 208 209 210
        source_stream_(source_stream),
        stats_(stats) {}
  ~Utf8ExternalStreamingStream() override {
    for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
211
  }
212

213 214 215 216 217 218 219 220 221 222 223 224 225
  bool can_access_heap() override { return false; }

 protected:
  size_t FillBuffer(size_t position) override;

 private:
  // A position within the data stream. It stores:
  // - The 'physical' position (# of bytes in the stream),
  // - the 'logical' position (# of ucs-2 characters, also within the stream),
  // - a possibly incomplete utf-8 char at the current 'physical' position.
  struct StreamPosition {
    size_t bytes;
    size_t chars;
226 227
    uint32_t incomplete_char;
    unibrow::Utf8::State state;
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
  };

  // Position contains a StreamPosition and the index of the chunk the position
  // points into. (The chunk_no could be derived from pos, but that'd be
  // an expensive search through all chunks.)
  struct Position {
    size_t chunk_no;
    StreamPosition pos;
  };

  // A chunk in the list of chunks, containing:
  // - The chunk data (data pointer and length), and
  // - the position at the first byte of the chunk.
  struct Chunk {
    const uint8_t* data;
    size_t length;
    StreamPosition start;
  };

  // Within the current chunk, skip forward from current_ towards position.
  bool SkipToPosition(size_t position);
  // Within the current chunk, fill the buffer_ (while it has capacity).
  void FillBufferFromCurrentChunk();
  // Fetch a new chunk (assuming current_ is at the end of the current data).
  bool FetchChunk();
  // Search through the chunks and set current_ to point to the given position.
  // (This call is potentially expensive.)
  void SearchPosition(size_t position);

  std::vector<Chunk> chunks_;
  Position current_;
  ScriptCompiler::ExternalSourceStream* source_stream_;
  RuntimeCallStats* stats_;
261 262
};

263 264 265 266 267 268 269 270 271
bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
  DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.

  // Already there? Then return immediately.
  if (current_.pos.chars == position) return true;

  const Chunk& chunk = chunks_[current_.chunk_no];
  DCHECK(current_.pos.bytes >= chunk.start.bytes);

272 273
  unibrow::Utf8::State state = chunk.start.state;
  uint32_t incomplete_char = chunk.start.incomplete_char;
274 275 276
  size_t it = current_.pos.bytes - chunk.start.bytes;
  size_t chars = chunk.start.chars;
  while (it < chunk.length && chars < position) {
277 278
    unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
        chunk.data[it], &it, &state, &incomplete_char);
279 280 281 282 283
    if (t == kUtf8Bom && current_.pos.chars == 0) {
      // BOM detected at beginning of the stream. Don't copy it.
    } else if (t != unibrow::Utf8::kIncomplete) {
      chars++;
      if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
284
    }
285
  }
286

287 288 289
  current_.pos.bytes += it;
  current_.pos.chars = chars;
  current_.pos.incomplete_char = incomplete_char;
290
  current_.pos.state = state;
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307
  current_.chunk_no += (it == chunk.length);

  return current_.pos.chars == position;
}

void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
  DCHECK_LT(current_.chunk_no, chunks_.size());
  DCHECK_EQ(buffer_start_, buffer_cursor_);
  DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);

  const Chunk& chunk = chunks_[current_.chunk_no];

  // The buffer_ is writable, but buffer_*_ members are const. So we get a
  // non-const pointer into buffer that points to the same char as buffer_end_.
  uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
  DCHECK_EQ(cursor, buffer_end_);

308 309 310
  unibrow::Utf8::State state = current_.pos.state;
  uint32_t incomplete_char = current_.pos.incomplete_char;

311 312 313
  // If the current chunk is the last (empty) chunk we'll have to process
  // any left-over, partial characters.
  if (chunk.length == 0) {
314
    unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
315
    if (t != unibrow::Utf8::kBufferEmpty) {
316
      DCHECK_EQ(t, unibrow::Utf8::kBadChar);
317 318 319
      *cursor = static_cast<uc16>(t);
      buffer_end_++;
      current_.pos.chars++;
320 321
      current_.pos.incomplete_char = 0;
      current_.pos.state = state;
322
    }
323
    return;
vogelheim's avatar
vogelheim committed
324
  }
325

326 327 328 329
  size_t it = current_.pos.bytes - chunk.start.bytes;
  while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
    unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
        chunk.data[it], &it, &state, &incomplete_char);
330 331
    if (V8_LIKELY(t < kUtf8Bom)) {
      *(cursor++) = static_cast<uc16>(t);  // The by most frequent case.
332 333 334
    } else if (t == unibrow::Utf8::kIncomplete) {
      continue;
    } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
335 336 337 338 339 340
      // BOM detected at beginning of the stream. Don't copy it.
    } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
      *(cursor++) = static_cast<uc16>(t);
    } else {
      *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
      *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
341
    }
342
  }
343

344 345 346
  current_.pos.bytes = chunk.start.bytes + it;
  current_.pos.chars += (cursor - buffer_end_);
  current_.pos.incomplete_char = incomplete_char;
347
  current_.pos.state = state;
348
  current_.chunk_no += (it == chunk.length);
349

350 351 352 353
  buffer_end_ = cursor;
}

bool Utf8ExternalStreamingStream::FetchChunk() {
354 355
  RuntimeCallTimerScope scope(stats_,
                              RuntimeCallCounterId::kGetMoreDataCallback);
356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
  DCHECK_EQ(current_.chunk_no, chunks_.size());
  DCHECK(chunks_.empty() || chunks_.back().length != 0);

  const uint8_t* chunk = nullptr;
  size_t length = source_stream_->GetMoreData(&chunk);
  chunks_.push_back({chunk, length, current_.pos});
  return length > 0;
}

void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
  // If current_ already points to the right position, we're done.
  //
  // This is expected to be the common case, since we typically call
  // FillBuffer right after the current buffer.
  if (current_.pos.chars == position) return;

  // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
  if (chunks_.empty()) {
    DCHECK_EQ(current_.chunk_no, 0u);
    DCHECK_EQ(current_.pos.bytes, 0u);
    DCHECK_EQ(current_.pos.chars, 0u);
    FetchChunk();
378
  }
379

380 381 382 383 384
  // Search for the last chunk whose start position is less or equal to
  // position.
  size_t chunk_no = chunks_.size() - 1;
  while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
    chunk_no--;
385
  }
386

387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
  // Did we find the terminating (zero-length) chunk? Then we're seeking
  // behind the end of the data, and position does not exist.
  // Set current_ to point to the terminating chunk.
  if (chunks_[chunk_no].length == 0) {
    current_ = {chunk_no, chunks_[chunk_no].start};
    return;
  }

  // Did we find the non-last chunk? Then our position must be within chunk_no.
  if (chunk_no + 1 < chunks_.size()) {
    // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
    // (Many web sites declare utf-8 encoding, but use only (or almost only) the
    //  ASCII subset for their JavaScript sources. We can exploit this, by
    //  checking whether the # bytes in a chunk are equal to the # chars, and if
    //  so avoid the expensive SkipToPosition.)
    bool ascii_only_chunk =
403
        chunks_[chunk_no].start.incomplete_char == 0 &&
404 405 406 407 408 409
        (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
            (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
    if (ascii_only_chunk) {
      size_t skip = position - chunks_[chunk_no].start.chars;
      current_ = {chunk_no,
                  {chunks_[chunk_no].start.bytes + skip,
410 411
                   chunks_[chunk_no].start.chars + skip, 0,
                   unibrow::Utf8::State::kAccept}};
412 413 414
    } else {
      current_ = {chunk_no, chunks_[chunk_no].start};
      SkipToPosition(position);
415
    }
416 417 418 419 420

    // Since position was within the chunk, SkipToPosition should have found
    // something.
    DCHECK_EQ(position, current_.pos.chars);
    return;
421
  }
422

423 424 425 426 427 428 429 430 431 432 433 434
  // What's left: We're in the last, non-terminating chunk. Our position
  // may be in the chunk, but it may also be in 'future' chunks, which we'll
  // have to obtain.
  DCHECK_EQ(chunk_no, chunks_.size() - 1);
  current_ = {chunk_no, chunks_[chunk_no].start};
  bool have_more_data = true;
  bool found = SkipToPosition(position);
  while (have_more_data && !found) {
    DCHECK_EQ(current_.chunk_no, chunks_.size());
    have_more_data = FetchChunk();
    found = have_more_data && SkipToPosition(position);
  }
435

436 437 438 439 440 441 442 443 444 445 446 447 448 449
  // We'll return with a postion != the desired position only if we're out
  // of data. In that case, we'll point to the terminating chunk.
  DCHECK_EQ(found, current_.pos.chars == position);
  DCHECK_EQ(have_more_data, chunks_.back().length != 0);
  DCHECK_IMPLIES(!found, !have_more_data);
  DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
}

size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_;

  SearchPosition(position);
  bool out_of_data = current_.chunk_no != chunks_.size() &&
450 451 452
                     chunks_[current_.chunk_no].length == 0 &&
                     current_.pos.incomplete_char == 0;

453 454 455 456 457 458 459 460 461
  if (out_of_data) return 0;

  // Fill the buffer, until we have at least one char (or are out of data).
  // (The embedder might give us 1-byte blocks within a utf-8 char, so we
  //  can't guarantee progress with one chunk. Thus we iterate.)
  while (!out_of_data && buffer_cursor_ == buffer_end_) {
    // At end of current data, but there might be more? Then fetch it.
    if (current_.chunk_no == chunks_.size()) {
      out_of_data = !FetchChunk();
462
    }
463
    FillBufferFromCurrentChunk();
464
  }
465

466 467 468 469 470 471 472 473 474 475 476 477 478
  DCHECK_EQ(current_.pos.chars - position,
            static_cast<size_t>(buffer_end_ - buffer_cursor_));
  return buffer_end_ - buffer_cursor_;
}

// ----------------------------------------------------------------------------
// Chunks - helper for One- + TwoByteExternalStreamingStream
namespace {

// One block of raw bytes received from the external source.
struct Chunk {
  const uint8_t* data;  // The bytes of the chunk.
  size_t byte_length;   // Number of bytes in |data|; 0 marks end of stream.
  size_t byte_pos;      // Byte offset of data[0] within the whole stream.
};
480

481
typedef std::vector<struct Chunk> Chunks;
482

483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
// Frees the data buffer of every chunk in |chunks| with delete[]. The vector
// itself is left untouched.
void DeleteChunks(Chunks& chunks) {
  for (const Chunk& chunk : chunks) delete[] chunk.data;
}

// Returns the index of the chunk that contains byte position |position|,
// pulling additional chunks from |source| into |chunks| when needed.
// If |position| lies behind the end of the stream, the index of the last,
// zero-length chunk is returned instead.
size_t FindChunk(Chunks& chunks, ScriptCompiler::ExternalSourceStream* source,
                 size_t position, RuntimeCallStats* stats) {
  // Number of bytes received so far.
  size_t end_pos = 0;
  if (!chunks.empty()) {
    end_pos = chunks.back().byte_pos + chunks.back().byte_length;
  }

  // Get more data if needed. We usually won't enter the loop body.
  bool out_of_data = !chunks.empty() && chunks.back().byte_length == 0;
  {
    RuntimeCallTimerScope scope(stats,
                                RuntimeCallCounterId::kGetMoreDataCallback);
    while (!out_of_data && end_pos <= position + 1) {
      const uint8_t* data = nullptr;
      const size_t len = source->GetMoreData(&data);

      chunks.push_back({data, len, end_pos});
      end_pos += len;
      out_of_data = (len == 0);
    }
  }

  // At this point there is always at least one chunk, and we either hold the
  // chunk we were looking for or the stream is exhausted. out_of_data and
  // end_pos are up to date (whether the stream is exhausted, and the number
  // of bytes received so far, respectively).
  DCHECK(!chunks.empty());
  DCHECK_EQ(end_pos, chunks.back().byte_pos + chunks.back().byte_length);
  DCHECK_EQ(out_of_data, chunks.back().byte_length == 0);
  DCHECK(position < end_pos || out_of_data);

  const size_t last = chunks.size() - 1;

  // Edge case: position is behind the end of the stream. Return the last
  // (length 0) chunk to signal the end of the stream.
  if (position >= end_pos) {
    DCHECK(out_of_data);
    return last;
  }

  // We almost always 'stream', i.e. want data from the last chunk, so search
  // the chunks back-to-front.
  size_t chunk_no = last;
  while (chunks[chunk_no].byte_pos > position) {
    DCHECK_NE(chunk_no, 0u);
    chunk_no--;
  }
  DCHECK_LE(chunks[chunk_no].byte_pos, position);
  DCHECK_LT(position, chunks[chunk_no].byte_pos + chunks[chunk_no].byte_length);
  return chunk_no;
}

}  // anonymous namespace

// ----------------------------------------------------------------------------
// OneByteExternalStreamingStream
//
// A stream of latin-1 encoded, chunked data.

class OneByteExternalStreamingStream : public BufferedUtf16CharacterStream {
 public:
  explicit OneByteExternalStreamingStream(
      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
      : source_(source), stats_(stats) {}
  ~OneByteExternalStreamingStream() override { DeleteChunks(chunks_); }

  bool can_access_heap() override { return false; }

 protected:
  size_t FillBuffer(size_t position) override;
556 557

 private:
558
  Chunks chunks_;
559
  ScriptCompiler::ExternalSourceStream* source_;
560
  RuntimeCallStats* stats_;
561 562
};

563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
size_t OneByteExternalStreamingStream::FillBuffer(size_t position) {
  const Chunk& chunk = chunks_[FindChunk(chunks_, source_, position, stats_)];
  if (chunk.byte_length == 0) return 0;

  size_t start_pos = position - chunk.byte_pos;
  size_t len = i::Min(kBufferSize, chunk.byte_length - start_pos);
  i::CopyCharsUnsigned(buffer_, chunk.data + start_pos, len);
  return len;
}

#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
// ----------------------------------------------------------------------------
// TwoByteExternalStreamingStream
//
// A stream of ucs-2 data, delivered in chunks. Chunks may be 'cut' into the
// middle of characters (or even contain only one byte), which adds a bit
// of complexity. This stream avoids all data copying, except for characters
// that cross chunk boundaries.
581

582
class TwoByteExternalStreamingStream : public Utf16CharacterStream {
583
 public:
584 585 586
  explicit TwoByteExternalStreamingStream(
      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
  ~TwoByteExternalStreamingStream() override;
587

588 589
  bool can_access_heap() override { return false; }

590 591 592
 protected:
  bool ReadBlock() override;

593 594 595 596
  Chunks chunks_;
  ScriptCompiler::ExternalSourceStream* source_;
  RuntimeCallStats* stats_;
  uc16 one_char_buffer_;
597 598
};

599 600 601 602 603 604 605 606 607 608 609 610 611
// Creates a stream with no chunks yet. The three pointer arguments passed to
// the base class all refer to one_char_buffer_, i.e. the buffer starts out
// empty, and the last argument is 0.
TwoByteExternalStreamingStream::TwoByteExternalStreamingStream(
    ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
    : Utf16CharacterStream(&one_char_buffer_, &one_char_buffer_,
                           &one_char_buffer_, 0),
      source_(source),
      stats_(stats),
      one_char_buffer_(0) {}

// Releases the data buffers of all chunks that were received from the source.
TwoByteExternalStreamingStream::~TwoByteExternalStreamingStream() {
  DeleteChunks(chunks_);
}

bool TwoByteExternalStreamingStream::ReadBlock() {
612
  size_t position = pos();
613

614 615 616 617 618 619
  // We'll search for the 2nd byte of our character, to make sure we
  // have enough data for at least one character.
  size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);

  // Out of data? Return 0.
  if (chunks_[chunk_no].byte_length == 0) {
620
    buffer_pos_ = position;
621 622
    buffer_cursor_ = buffer_start_;
    buffer_end_ = buffer_start_;
623 624
    return false;
  }
625

626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
  Chunk& current = chunks_[chunk_no];

  // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
  // character may be split between the previous and the current chunk.
  // If we find such a lonely byte at the beginning of the chunk, we'll use
  // one_char_buffer_ to hold the full character.
  bool lonely_byte = (chunks_[chunk_no].byte_pos == (2 * position + 1));
  if (lonely_byte) {
    DCHECK_NE(chunk_no, 0u);
    Chunk& previous_chunk = chunks_[chunk_no - 1];
#ifdef V8_TARGET_BIG_ENDIAN
    uc16 character = current.data[0] |
                     previous_chunk.data[previous_chunk.byte_length - 1] << 8;
#else
    uc16 character = previous_chunk.data[previous_chunk.byte_length - 1] |
                     current.data[0] << 8;
#endif

    one_char_buffer_ = character;
    buffer_pos_ = position;
    buffer_start_ = &one_char_buffer_;
    buffer_cursor_ = &one_char_buffer_;
    buffer_end_ = &one_char_buffer_ + 1;
    return true;
650
  }
651

652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
  // Common case: character is in current chunk.
  DCHECK_LE(current.byte_pos, 2 * position);
  DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);

  // Determine # of full ucs-2 chars in stream, and whether we started on an odd
  // byte boundary.
  bool odd_start = (current.byte_pos % 2) == 1;
  size_t number_chars = (current.byte_length - odd_start) / 2;

  // Point the buffer_*_ members into the current chunk and set buffer_cursor_
  // to point to position. Be careful when converting the byte positions (in
  // Chunk) to the ucs-2 character positions (in buffer_*_ members).
  buffer_start_ = reinterpret_cast<const uint16_t*>(current.data + odd_start);
  buffer_end_ = buffer_start_ + number_chars;
  buffer_pos_ = (current.byte_pos + odd_start) / 2;
667 668 669
  buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
  DCHECK_EQ(position, pos());
  return true;
670 671
}

672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722
#else

// ----------------------------------------------------------------------------
// TwoByteExternalBufferedStream
//
// This class is made specifically to address unaligned access to 16-bit data
// in MIPS and ARM architectures. It replaces class
// TwoByteExternalStreamingStream which in some cases does have unaligned
// accesses to 16-bit data.

class TwoByteExternalBufferedStream : public Utf16CharacterStream {
 public:
  explicit TwoByteExternalBufferedStream(
      ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats);
  ~TwoByteExternalBufferedStream();

  bool can_access_heap() override { return false; }

 protected:
  static const size_t kBufferSize = 512;

  bool ReadBlock() override;

  // FillBuffer should read up to kBufferSize characters at position and store
  // them into buffer_[0..]. It returns the number of characters stored.
  size_t FillBuffer(size_t position, size_t chunk_no);

  // Fixed sized buffer that this class reads from.
  // The base class' buffer_start_ should always point to buffer_.
  uc16 buffer_[kBufferSize];

  Chunks chunks_;
  ScriptCompiler::ExternalSourceStream* source_;
  RuntimeCallStats* stats_;
};

// Creates a stream with no chunks yet and an empty buffer: the three pointer
// arguments passed to the base class all refer to buffer_, and the last
// argument is 0.
TwoByteExternalBufferedStream::TwoByteExternalBufferedStream(
    ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats)
    : Utf16CharacterStream(buffer_, buffer_, buffer_, 0),
      source_(source),
      stats_(stats) {}

// Releases the data buffers of all chunks that were received from the source.
TwoByteExternalBufferedStream::~TwoByteExternalBufferedStream() {
  DeleteChunks(chunks_);
}

bool TwoByteExternalBufferedStream::ReadBlock() {
  size_t position = pos();
  // Find chunk in which the position belongs
  size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);

723
  // Out of data? Return 0.
724 725 726 727 728
  if (chunks_[chunk_no].byte_length == 0) {
    buffer_pos_ = position;
    buffer_cursor_ = buffer_start_;
    buffer_end_ = buffer_start_;
    return false;
729
  }
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827

  Chunk& current = chunks_[chunk_no];

  bool odd_start = current.byte_pos % 2;
  // Common case: character is in current chunk.
  DCHECK_LE(current.byte_pos, 2 * position + odd_start);
  DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length);

  // If character starts on odd address copy text in buffer so there is always
  // aligned access to characters. This is important on MIPS and ARM
  // architectures. Otherwise read characters from memory directly.
  if (!odd_start) {
    buffer_start_ = reinterpret_cast<const uint16_t*>(current.data);
    size_t number_chars = current.byte_length / 2;
    buffer_end_ = buffer_start_ + number_chars;
    buffer_pos_ = current.byte_pos / 2;
    buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
    DCHECK_EQ(position, pos());
    return true;
  } else {
    buffer_start_ = buffer_;
    buffer_pos_ = position;
    buffer_cursor_ = buffer_;
    buffer_end_ = buffer_ + FillBuffer(position, chunk_no);
    DCHECK_EQ(pos(), position);
    DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
    return buffer_cursor_ < buffer_end_;
  }
}

size_t TwoByteExternalBufferedStream::FillBuffer(size_t position,
                                                 size_t chunk_no) {
  DCHECK_EQ(chunks_[chunk_no].byte_pos % 2, 1u);
  bool odd_start = true;
  // Align buffer_pos_ to the size of the buffer.
  {
    size_t new_pos = position / kBufferSize * kBufferSize;
    if (new_pos != position) {
      chunk_no = FindChunk(chunks_, source_, 2 * new_pos + 1, stats_);
      buffer_pos_ = new_pos;
      buffer_cursor_ = buffer_start_ + (position - buffer_pos_);
      position = new_pos;
      odd_start = chunks_[chunk_no].byte_pos % 2;
    }
  }

  Chunk* current = &chunks_[chunk_no];

  // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a
  // character may be split between the previous and the current chunk.
  // If we find such a lonely byte at the beginning of the chunk, we'll copy
  // it to the first byte in buffer_.
  size_t totalLength = 0;
  bool lonely_byte = (current->byte_pos == (2 * position + 1));
  if (lonely_byte) {
    DCHECK_NE(chunk_no, 0u);
    Chunk& previous_chunk = chunks_[chunk_no - 1];
    *reinterpret_cast<uint8_t*>(buffer_) =
        previous_chunk.data[previous_chunk.byte_length - 1];
    totalLength++;
  }

  // Common case: character is in current chunk.
  DCHECK_LE(current->byte_pos, 2 * position + odd_start);
  DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);

  // Copy characters from current chunk starting from chunk_pos to the end of
  // buffer or chunk.
  size_t chunk_pos = position - current->byte_pos / 2;
  size_t start_offset = odd_start && chunk_pos != 0;
  size_t bytes_to_move =
      i::Min(2 * kBufferSize - lonely_byte,
             current->byte_length - 2 * chunk_pos + start_offset);
  i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + lonely_byte,
             current->data + 2 * chunk_pos - start_offset, bytes_to_move);

  // Fill up the rest of the buffer if there is space and data left.
  totalLength += bytes_to_move;
  position = (current->byte_pos + current->byte_length) / 2;
  if (position - buffer_pos_ < kBufferSize) {
    chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
    current = &chunks_[chunk_no];
    odd_start = current->byte_pos % 2;
    bytes_to_move = i::Min(2 * kBufferSize - totalLength, current->byte_length);
    while (bytes_to_move) {
      // Common case: character is in current chunk.
      DCHECK_LE(current->byte_pos, 2 * position + odd_start);
      DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length);

      i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + totalLength,
                 current->data, bytes_to_move);
      totalLength += bytes_to_move;
      position = (current->byte_pos + current->byte_length) / 2;
      chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_);
      current = &chunks_[chunk_no];
      odd_start = current->byte_pos % 2;
      bytes_to_move =
          i::Min(2 * kBufferSize - totalLength, current->byte_length);
828 829
    }
  }
830
  return totalLength / 2;
831
}
832
#endif
833

834 835
// ----------------------------------------------------------------------------
// ScannerStream: Create stream instances.
836

837 838 839
// Creates a stream over the whole of |data|. Returns a raw pointer to a
// newly allocated stream.
Utf16CharacterStream* ScannerStream::For(Handle<String> data) {
  const int whole_length = data->length();
  return ScannerStream::For(data, 0, whole_length);
}
840

841 842
// Creates a stream over the characters [start_pos, end_pos) of |data|,
// choosing the stream class that matches the string's representation.
// Returns a raw pointer to a newly allocated stream.
Utf16CharacterStream* ScannerStream::For(Handle<String> data, int start_pos,
                                         int end_pos) {
  DCHECK_GE(start_pos, 0);
  DCHECK_LE(start_pos, end_pos);
  DCHECK_LE(end_pos, data->length());

  const size_t first = static_cast<size_t>(start_pos);
  const size_t limit = static_cast<size_t>(end_pos);

  if (data->IsExternalOneByteString()) {
    return new ExternalOneByteStringUtf16CharacterStream(
        Handle<ExternalOneByteString>::cast(data), first, limit);
  }
  if (data->IsExternalTwoByteString()) {
    return new ExternalTwoByteStringUtf16CharacterStream(
        Handle<ExternalTwoByteString>::cast(data), first, limit);
  }
  // Any other string representation goes through the generic copying stream.
  // TODO(vogelheim): Maybe call data.Flatten() first?
  return new GenericStringUtf16CharacterStream(data, first, limit);
}

861 862 863
// Testing helper: creates a stream over the NUL-terminated string |data|
// (the terminator is not part of the stream).
std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
    const char* data) {
  const size_t length = strlen(data);
  return ScannerStream::ForTesting(data, length);
}

866 867 868 869
// Testing helper: creates a stream over the |length| one-byte characters at
// |data|. The stream reads from |data| directly rather than copying it up
// front, so |data| must stay valid while the stream is in use.
std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
    const char* data, size_t length) {
  std::unique_ptr<Utf16CharacterStream> stream(
      new ExternalOneByteStringUtf16CharacterStream(data, length));
  return stream;
}

872 873
// Creates a stream that pulls chunked data from |source_stream| and decodes
// it according to |encoding|. |stats| is handed to the created stream.
// Returns a raw pointer to a newly allocated stream.
Utf16CharacterStream* ScannerStream::For(
    ScriptCompiler::ExternalSourceStream* source_stream,
    v8::ScriptCompiler::StreamedSource::Encoding encoding,
    RuntimeCallStats* stats) {
  switch (encoding) {
    case v8::ScriptCompiler::StreamedSource::UTF8:
      return new Utf8ExternalStreamingStream(source_stream, stats);
    case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
      return new OneByteExternalStreamingStream(source_stream, stats);
    case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
      // MIPS targets get the buffered variant, which avoids unaligned 16-bit
      // reads.
#if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64)
      return new TwoByteExternalStreamingStream(source_stream, stats);
#else
      return new TwoByteExternalBufferedStream(source_stream, stats);
#endif
  }
  UNREACHABLE();
  return nullptr;
}

892 893
}  // namespace internal
}  // namespace v8