scanner.h 26.6 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
// Features shared by parsing and pre-parsing scanners.

7 8
#ifndef V8_PARSING_SCANNER_H_
#define V8_PARSING_SCANNER_H_
9

10 11
#include <algorithm>

12
#include "src/allocation.h"
13
#include "src/base/logging.h"
14 15
#include "src/char-predicates.h"
#include "src/globals.h"
16
#include "src/message-template.h"
17
#include "src/parsing/token.h"
18
#include "src/pointer-with-payload.h"
19
#include "src/unicode-decoder.h"
lpy's avatar
lpy committed
20
#include "src/unicode.h"
21

22 23
namespace v8 {
namespace internal {
24

25 26
class AstRawString;
class AstValueFactory;
27 28
class ExternalOneByteString;
class ExternalTwoByteString;
29
class ParserRecorder;
30
class RuntimeCallStats;
31
class Zone;
32

33 34 35 36 37 38 39 40
// ---------------------------------------------------------------------
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16 bit value representing either a 16 bit code point
// or one part of a surrogate pair that make a single 21 bit code point.
class Utf16CharacterStream {
 public:
  static const uc32 kEndOfInput = -1;

41
  virtual ~Utf16CharacterStream() = default;
42

43
  // Puts the stream into the parser-error state. The cursor is moved to the
  // end of the buffer so that the next read goes through ReadBlockChecked(),
  // which does not fetch more data while the error flag is set.
  V8_INLINE void set_parser_error() {
    buffer_cursor_ = buffer_end_;
    has_parser_error_ = true;
  }
47 48
  // Clears the parser-error flag; the buffer cursor is not modified.
  V8_INLINE void reset_parser_error_flag() { has_parser_error_ = false; }
  // Returns true while the parser-error flag is set.
  V8_INLINE bool has_parser_error() const { return has_parser_error_; }
49

50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
  // Returns the next UTF-16 code unit without consuming it. When the buffer is
  // exhausted a new block is requested; if none is available the result is
  // kEndOfInput.
  inline uc32 Peek() {
    if (V8_LIKELY(buffer_cursor_ < buffer_end_) || ReadBlockChecked()) {
      return static_cast<uc32>(*buffer_cursor_);
    }
    return kEndOfInput;
  }

  // Consumes and returns the next UTF-16 code unit of the input stream, or
  // kEndOfInput if there are no more code units. The cursor is advanced
  // unconditionally, including at end of input.
  inline uc32 Advance() {
    const uc32 code_unit = Peek();
    ++buffer_cursor_;
    return code_unit;
  }

  // Skips code units until one is found for which |check| returns true, then
  // returns that code unit and leaves the cursor just past it. Buffers are
  // refilled as needed; if the input ends before a match, kEndOfInput is
  // returned (with the cursor advanced by one, as Advance() does at the end of
  // input).
  template <typename FunctionType>
  V8_INLINE uc32 AdvanceUntil(FunctionType check) {
    while (true) {
      auto next_cursor_pos =
          std::find_if(buffer_cursor_, buffer_end_, [&check](uint16_t raw_c0_) {
            uc32 c0_ = static_cast<uc32>(raw_c0_);
            return check(c0_);
          });

      if (next_cursor_pos == buffer_end_) {
        // No match in the current buffer: consume it and try the next block.
        buffer_cursor_ = buffer_end_;
        if (!ReadBlockChecked()) {
          buffer_cursor_++;
          return kEndOfInput;
        }
      } else {
        buffer_cursor_ = next_cursor_pos + 1;
        return static_cast<uc32>(*next_cursor_pos);
      }
    }
  }

  // Steps back by one code unit in the input stream, undoing the most recent
  // Advance().
  inline void Back() {
    // Common case: the previous code unit is still inside the current buffer
    // and can be handled locally.
    if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
      --buffer_cursor_;
      return;
    }
    // Otherwise the block containing the previous position must be fetched.
    ReadBlockAt(pos() - 1);
  }

  // Returns the stream position of the cursor: the position of the buffer
  // start plus the cursor's offset within the buffer.
  inline size_t pos() const {
    const auto offset_in_buffer = buffer_cursor_ - buffer_start_;
    return buffer_pos_ + offset_in_buffer;
  }

  // Moves the cursor to stream position |pos|. Positions inside the current
  // buffer only move the cursor; anything else requests a new block.
  inline void Seek(size_t pos) {
    const size_t buffer_limit = buffer_pos_ + (buffer_end_ - buffer_start_);
    if (V8_LIKELY(pos >= buffer_pos_ && pos < buffer_limit)) {
      buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
      return;
    }
    ReadBlockAt(pos);
  }

119 120 121 122 123
  // Returns true if the stream can be cloned (can_be_cloned()) and does not
  // access the V8 heap after construction (!can_access_heap()).
  bool can_be_cloned_for_parallel_access() const {
    return can_be_cloned() && !can_access_heap();
  }

124 125 126 127 128 129 130 131
  // Returns true if the stream can be cloned with Clone.
  // TODO(rmcilroy): Remove this once ChunkedStreams can be cloned.
  virtual bool can_be_cloned() const = 0;

  // Clones the character stream to enable another independent scanner to access
  // the same underlying stream.
  virtual std::unique_ptr<Utf16CharacterStream> Clone() const = 0;

132
  // Returns true if the stream could access the V8 heap after construction.
133
  virtual bool can_access_heap() const = 0;
134

135 136 137 138 139
  // Accessors for the RuntimeCallStats pointer associated with this stream.
  // The stream only stores the pointer.
  RuntimeCallStats* runtime_call_stats() const { return runtime_call_stats_; }
  void set_runtime_call_stats(RuntimeCallStats* runtime_call_stats) {
    runtime_call_stats_ = runtime_call_stats;
  }

140 141 142 143 144 145 146 147 148 149 150 151 152
 protected:
  // Creates a stream over [buffer_start, buffer_end) whose first code unit is
  // at stream position |buffer_pos| and whose cursor starts at
  // |buffer_cursor|. runtime_call_stats_ starts out null so that
  // runtime_call_stats() never returns an indeterminate pointer when
  // set_runtime_call_stats() has not been called.
  Utf16CharacterStream(const uint16_t* buffer_start,
                       const uint16_t* buffer_cursor,
                       const uint16_t* buffer_end, size_t buffer_pos)
      : buffer_start_(buffer_start),
        buffer_cursor_(buffer_cursor),
        buffer_end_(buffer_end),
        buffer_pos_(buffer_pos),
        runtime_call_stats_(nullptr) {}
  Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}

  bool ReadBlockChecked() {
    size_t position = pos();
    USE(position);
153
    bool success = !has_parser_error() && ReadBlock();
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199

    // Post-conditions: 1, We should always be at the right position.
    //                  2, Cursor should be inside the buffer.
    //                  3, We should have more characters available iff success.
    DCHECK_EQ(pos(), position);
    DCHECK_LE(buffer_cursor_, buffer_end_);
    DCHECK_LE(buffer_start_, buffer_cursor_);
    DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
    return success;
  }

  // Repositions the stream at |new_pos| and reads the block that contains it.
  void ReadBlockAt(size_t new_pos) {
    // Callers (Back/Seek) handle the easy case of staying within the current
    // buffer themselves, so this should only be reached when new data is
    // actually required.
    // (This is really an efficiency check, not a correctness invariant.)
    DCHECK(new_pos < buffer_pos_ ||
           new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));

    // Make pos() report new_pos: cursor at the buffer start, and the buffer
    // start mapped to new_pos.
    buffer_cursor_ = buffer_start_;
    buffer_pos_ = new_pos;
    DCHECK_EQ(pos(), new_pos);
    ReadBlockChecked();
  }

  // Read more data, and update buffer_*_ to point to it.
  // Returns true if more data was available.
  //
  // ReadBlock() may modify any of the buffer_*_ members, but must ensure that
  // the result of pos() remains unaffected.
  //
  // Examples:
  // - a stream could either fill a separate buffer. Then buffer_start_ and
  //   buffer_cursor_ would point to the beginning of the buffer, and
  //   buffer_pos would be the old pos().
  // - a stream with existing buffer chunks would set buffer_start_ and
  //   buffer_end_ to cover the full chunk, and then buffer_cursor_ would
  //   point into the middle of the buffer, while buffer_pos_ would describe
  //   the start of the buffer.
  virtual bool ReadBlock() = 0;

  const uint16_t* buffer_start_;
  const uint16_t* buffer_cursor_;
  const uint16_t* buffer_end_;
  size_t buffer_pos_;
200
  RuntimeCallStats* runtime_call_stats_;
201
  bool has_parser_error_ = false;
202 203
};

204
// ----------------------------------------------------------------------------
205
// JavaScript Scanner.
206 207

class Scanner {
208
 public:
209 210 211
  // Scoped helper for a re-settable bookmark.
  class BookmarkScope {
   public:
212
    explicit BookmarkScope(Scanner* scanner)
213 214
        : scanner_(scanner),
          bookmark_(kNoBookmark),
215
          had_parser_error_(scanner->has_parser_error()) {
216 217
      DCHECK_NOT_NULL(scanner_);
    }
218
    ~BookmarkScope() = default;
219

220
    void Set(size_t bookmark);
221
    void Apply();
222 223
    bool HasBeenSet() const;
    bool HasBeenApplied() const;
224 225

   private:
226 227 228
    static const size_t kNoBookmark;
    static const size_t kBookmarkWasApplied;

229
    Scanner* scanner_;
230
    size_t bookmark_;
231
    bool had_parser_error_;
232 233 234 235

    DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
  };

236 237
  // Sets the Scanner into an error state to stop further scanning and terminate
  // the parsing by only returning ILLEGAL tokens after that.
238
  V8_INLINE void set_parser_error() {
239
    if (!has_parser_error()) {
240 241
      c0_ = kEndOfInput;
      source_->set_parser_error();
242
      for (TokenDesc& desc : token_storage_) desc.token = Token::ILLEGAL;
243 244
    }
  }
245 246 247
  // Clears the parser-error flag on the underlying character stream. Scanner
  // state changed by set_parser_error() (c0_, token values) is not restored
  // here.
  V8_INLINE void reset_parser_error_flag() {
    source_->reset_parser_error_flag();
  }
248
  // Returns true if the underlying character stream is flagged with a parser
  // error.
  V8_INLINE bool has_parser_error() const {
    return source_->has_parser_error();
  }
251

252
  // Representation of an interval of source positions.
253 254 255 256
  struct Location {
    Location(int b, int e) : beg_pos(b), end_pos(e) { }
    Location() : beg_pos(0), end_pos(0) { }

257
    int length() const { return end_pos - beg_pos; }
258
    bool IsValid() const { return IsInRange(beg_pos, 0, end_pos); }
259

260
    static Location invalid() { return Location(-1, 0); }
261 262 263 264 265

    int beg_pos;
    int end_pos;
  };

266 267
  // -1 is outside of the range of any real source code.
  static const int kNoOctalLocation = -1;
268
  static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
269

270
  explicit Scanner(Utf16CharacterStream* source, bool is_module);
271

272
  void Initialize();
273 274

  // Returns the next token and advances input.
275
  Token::Value Next();
littledan's avatar
littledan committed
276
  // Returns the token following peek()
277
  Token::Value PeekAhead();
278
  // Returns the current token again.
279
  Token::Value current_token() const { return current().token; }
280

281
  // Returns the location information for the current token
282
  // (the token last returned by Next()).
283
  const Location& location() const { return current().location; }
284

285
  // This error is specifically an invalid hex or unicode escape sequence.
286
  bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
287
  // Returns the recorded scanner error (MessageTemplate::kNone if there is
  // none).
  MessageTemplate error() const { return scanner_error_; }
288
  // Returns the source location stored together with the scanner error.
  const Location& error_location() const { return scanner_error_location_; }
289

290
  bool has_invalid_template_escape() const {
291
    return current().invalid_template_escape_message != MessageTemplate::kNone;
292
  }
293
  // Returns the invalid-template-escape message of the current token. Must
  // only be called when has_invalid_template_escape() is true.
  MessageTemplate invalid_template_escape_message() const {
    DCHECK(has_invalid_template_escape());
    return current().invalid_template_escape_message;
  }
297 298 299 300 301 302

  // Resets the current token's invalid-template-escape message to kNone. Must
  // only be called when has_invalid_template_escape() is true.
  void clear_invalid_template_escape_message() {
    DCHECK(has_invalid_template_escape());
    current_->invalid_template_escape_message = MessageTemplate::kNone;
  }

303 304
  // Returns the location stored with the current token's invalid template
  // escape. Must only be called when has_invalid_template_escape() is true.
  Location invalid_template_escape_location() const {
    DCHECK(has_invalid_template_escape());
    return current().invalid_template_escape_location;
  }

308 309 310
  // Similar functions for the upcoming token.

  // One token look-ahead (past the token returned by Next()).
311
  Token::Value peek() const { return next().token; }
312

313
  // Returns the location information for the look-ahead token.
  const Location& peek_location() const { return next().location; }
314 315

  bool literal_contains_escapes() const {
316
    return LiteralContainsEscapes(current());
317
  }
318

319
  const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory) const;
320

321 322 323
  const AstRawString* NextSymbol(AstValueFactory* ast_value_factory) const;
  const AstRawString* CurrentRawSymbol(
      AstValueFactory* ast_value_factory) const;
324 325

  double DoubleValue();
326

327 328
  const char* CurrentLiteralAsCString(Zone* zone) const;

329 330
  // Returns true if the current token is |token|, which must be a keyword
  // token.
  inline bool CurrentMatches(Token::Value token) const {
    DCHECK(Token::IsKeyword(token));
    return current().token == token;
  }
333

334 335 336 337 338 339 340 341 342 343 344 345 346 347
  template <size_t N>
  bool NextLiteralEquals(const char (&s)[N]) {
    DCHECK_EQ(Token::STRING, peek());
    // The length of the token is used to make sure the literal equals without
    // taking escape sequences (e.g., "use \x73trict") or line continuations
    // (e.g., "use \(newline) strict") into account.
    if (!is_next_literal_one_byte()) return false;
    if (peek_location().length() != N + 1) return false;

    Vector<const uint8_t> next = next_literal_one_byte_string();
    const char* chars = reinterpret_cast<const char*>(next.start());
    return next.length() == N - 1 && strncmp(s, chars, N - 1) == 0;
  }

348 349
  // Returns the location of the last seen octal literal. After
  // clear_octal_position() this is Location::invalid().
  Location octal_position() const { return octal_pos_; }
350 351 352 353
  // Forgets the last seen octal literal: resets both its message and its
  // location.
  void clear_octal_position() {
    octal_message_ = MessageTemplate::kNone;
    octal_pos_ = Location::invalid();
  }
354
  // Returns the message recorded for the last seen octal literal
  // (MessageTemplate::kNone after clear_octal_position()).
  MessageTemplate octal_message() const { return octal_message_; }
355

verwaest's avatar
verwaest committed
356
  // Returns the value of the last smi that was scanned.
357
  uint32_t smi_value() const { return current().smi_value_; }
verwaest's avatar
verwaest committed
358

359 360 361 362 363 364 365 366
  // Seek forward to the given position.  This operation does not
  // work in general, for instance when there are pushed back
  // characters, but works for seeking forward until simple delimiter
  // tokens, which is what it is used for.
  void SeekForward(int pos);

  // Returns true if there was a line terminator before the peek'ed token,
  // possibly inside a multi-line comment.
367 368
  bool HasLineTerminatorBeforeNext() const {
    return next().after_line_terminator;
369 370
  }

371
  bool HasLineTerminatorAfterNext() {
372 373
    Token::Value ensure_next_next = PeekAhead();
    USE(ensure_next_next);
374
    return next_next().after_line_terminator;
375 376
  }

377 378
  // Scans the input as a regular expression pattern, next token must be /(=).
  // Returns true if a pattern is scanned.
379
  bool ScanRegExpPattern();
380
  // Scans the input as regular expression flags. Returns the flags on success.
381
  Maybe<RegExp::Flags> ScanRegExpFlags();
382

383
  // Scans the input as a template literal
384
  Token::Value ScanTemplateContinuation() {
385
    DCHECK_EQ(next().token, Token::RBRACE);
386
    DCHECK_EQ(source_pos() - 1, next().location.beg_pos);
387
    return ScanTemplateSpan();
388
  }
389

390 391
  Handle<String> SourceUrl(Isolate* isolate) const;
  Handle<String> SourceMappingUrl(Isolate* isolate) const;
392

393 394
  // Returns whether this scanner encountered an HTML comment.
  bool FoundHtmlComment() const { return found_html_comment_; }

395 396 397 398 399 400
  // Getter and setter for the harmony private-fields flag.
  bool allow_harmony_private_fields() const {
    return allow_harmony_private_fields_;
  }
  void set_allow_harmony_private_fields(bool allow) {
    allow_harmony_private_fields_ = allow;
  }
401 402 403 404 405 406
  // Getter and setter for the harmony numeric-separator flag.
  bool allow_harmony_numeric_separator() const {
    return allow_harmony_numeric_separator_;
  }
  void set_allow_harmony_numeric_separator(bool allow) {
    allow_harmony_numeric_separator_ = allow;
  }
407

408 409
  // Returns the character stream this scanner reads from (read-only view).
  const Utf16CharacterStream* stream() const { return source_; }

410 411 412
  // If the next characters in the stream are "#!", the line is skipped.
  void SkipHashBang();

413
 private:
414 415 416 417 418
  // Scoped helper for saving & restoring scanner error state.
  // This is used for tagged template literals, in which normally forbidden
  // escape sequences are allowed.
  class ErrorState;

419 420 421
  // LiteralBuffer -  Collector of chars of literals.
  class LiteralBuffer {
   public:
422
    // Creates an empty buffer in one-byte mode; no storage is set up here.
    LiteralBuffer() : backing_store_(), position_(0), is_one_byte_(true) {}
423 424 425

    // Disposes the backing store.
    ~LiteralBuffer() { backing_store_.Dispose(); }

426
    // Appends an ASCII character as a single byte. The buffer must still be in
    // one-byte mode (checked by AddOneByteChar).
    V8_INLINE void AddChar(char code_unit) {
      DCHECK(IsValidAscii(code_unit));
      AddOneByteChar(static_cast<byte>(code_unit));
    }

431
    // Appends |code_unit|. While the buffer is in one-byte mode, code units up
    // to unibrow::Latin1::kMaxChar are stored as single bytes; the first larger
    // code unit converts the buffer to two-byte mode before it is appended.
    V8_INLINE void AddChar(uc32 code_unit) {
      if (is_one_byte()) {
        if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
          AddOneByteChar(static_cast<byte>(code_unit));
          return;
        }
        ConvertToTwoByte();
      }
      AddTwoByteChar(code_unit);
    }

442
    // Returns true while the buffer is in one-byte mode.
    bool is_one_byte() const { return is_one_byte_; }
443

444
    // Returns true if the buffer is in one-byte mode and its contents are
    // byte-for-byte equal to |keyword|.
    bool Equals(Vector<const char> keyword) const {
      return is_one_byte() && keyword.length() == position_ &&
             (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
    }

    // Returns a view of the contents as UTF-16 code units. Only valid in
    // two-byte mode; position_ counts bytes and must therefore be even.
    Vector<const uint16_t> two_byte_literal() const {
      DCHECK(!is_one_byte());
      DCHECK_EQ(position_ & 0x1, 0);
      return Vector<const uint16_t>(
          reinterpret_cast<const uint16_t*>(backing_store_.start()),
          position_ >> 1);
    }

    // Returns a view of the contents as single bytes. Only valid in one-byte
    // mode.
    Vector<const uint8_t> one_byte_literal() const {
      DCHECK(is_one_byte());
      return Vector<const uint8_t>(
          reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
    }

463
    // Returns the number of characters stored. position_ counts bytes, so it
    // is halved in two-byte mode.
    int length() const { return is_one_byte() ? position_ : (position_ >> 1); }
464

465
    void Start() {
466
      position_ = 0;
467
      is_one_byte_ = true;
468 469 470 471 472 473
    }

    Handle<String> Internalize(Isolate* isolate) const;

   private:
    static const int kInitialCapacity = 16;
474
    static const int kGrowthFactor = 4;
475 476 477 478 479 480 481 482 483 484
    static const int kMaxGrowth = 1 * MB;

    // Returns true iff |code_unit| is a valid ASCII character (0-127).
    //
    // The value is converted to unsigned char before the range test: char is
    // signed on some platforms, and handing a negative char to the <cctype>
    // classification functions is undefined behaviour. The explicit range test
    // also does not depend on the current C locale.
    inline bool IsValidAscii(char code_unit) {
      return static_cast<unsigned char>(code_unit) <= 0x7F;
    }

485
    // Appends a single byte, growing the backing store first when it is full.
    // Only valid in one-byte mode.
    V8_INLINE void AddOneByteChar(byte one_byte_char) {
      DCHECK(is_one_byte());
      if (position_ >= backing_store_.length()) ExpandBuffer();
      backing_store_[position_] = one_byte_char;
      position_ += kOneByteSize;
    }

492
    void AddTwoByteChar(uc32 code_unit);
493 494 495 496
    int NewCapacity(int min_capacity);
    void ExpandBuffer();
    void ConvertToTwoByte();

497
    Vector<byte> backing_store_;
498
    int position_;
499

500
    bool is_one_byte_;
501 502 503 504

    DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
  };

505 506
  // The current and look-ahead token.
  struct TokenDesc {
507 508 509 510
    Location location = {0, 0};
    LiteralBuffer literal_chars;
    LiteralBuffer raw_literal_chars;
    Token::Value token = Token::UNINITIALIZED;
511
    MessageTemplate invalid_template_escape_message = MessageTemplate::kNone;
512
    Location invalid_template_escape_location;
513 514
    uint32_t smi_value_ = 0;
    bool after_line_terminator = false;
515 516 517 518 519 520 521

#ifdef DEBUG
    bool CanAccessLiteral() const {
      return token == Token::PRIVATE_NAME || token == Token::ILLEGAL ||
             token == Token::UNINITIALIZED || token == Token::REGEXP_LITERAL ||
             token == Token::ESCAPED_KEYWORD ||
             IsInRange(token, Token::NUMBER, Token::STRING) ||
522
             (Token::IsAnyIdentifier(token) && !Token::IsKeyword(token)) ||
523 524 525 526 527 528 529
             IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
    }
    bool CanAccessRawLiteral() const {
      return token == Token::ILLEGAL || token == Token::UNINITIALIZED ||
             IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
    }
#endif  // DEBUG
530 531
  };

532 533 534 535 536 537 538 539 540
  // Classification of numeric literals used by the number-scanning helpers
  // (e.g. ScanImplicitOctalDigits reports its result through a NumberKind*).
  enum NumberKind {
    BINARY,
    OCTAL,
    IMPLICIT_OCTAL,
    HEX,
    DECIMAL,
    DECIMAL_WITH_LEADING_ZERO
  };

541
  static const int kCharacterLookaheadBufferSize = 1;
542
  static const int kMaxAscii = 127;
543 544

  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
545
  template <bool capture_raw>
546
  uc32 ScanOctalEscape(uc32 c, int length);
547

548 549 550 551 552
  // Call this after setting source_ to the input.
  void Init() {
    // Set c0_ (one character ahead)
    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    Advance();
553

554 555 556 557
    current_ = &token_storage_[0];
    next_ = &token_storage_[1];
    next_next_ = &token_storage_[2];

558 559 560
    found_html_comment_ = false;
    scanner_error_ = MessageTemplate::kNone;
  }
561

562
  void ReportScannerError(const Location& location, MessageTemplate error) {
563 564 565 566 567
    if (has_error()) return;
    scanner_error_ = error;
    scanner_error_location_ = location;
  }

568
  void ReportScannerError(int pos, MessageTemplate error) {
569 570 571 572 573
    if (has_error()) return;
    scanner_error_ = error;
    scanner_error_location_ = Location(pos, pos + 1);
  }

574 575 576
  // Seek to the next_ token at the given position.
  void SeekNext(size_t position);

577 578 579
  // Append |c| to the literal buffer of the token being scanned (next()).
  V8_INLINE void AddLiteralChar(uc32 c) { next().literal_chars.AddChar(c); }

  V8_INLINE void AddLiteralChar(char c) { next().literal_chars.AddChar(c); }
580

581
  // Appends |c| to the raw literal buffer of the token being scanned (next()).
  V8_INLINE void AddRawLiteralChar(uc32 c) {
    next().raw_literal_chars.AddChar(c);
  }

585
  // Appends the current character (c0_) to the literal and moves on to the
  // next character.
  V8_INLINE void AddLiteralCharAdvance() {
    AddLiteralChar(c0_);
    Advance();
  }

  // Low-level scanning support.
591
  template <bool capture_raw = false>
592
  void Advance() {
593 594 595 596
    if (capture_raw) {
      AddRawLiteralChar(c0_);
    }
    c0_ = source_->Advance();
597 598
  }

599
  template <typename FunctionType>
600
  V8_INLINE void AdvanceUntil(FunctionType check) {
601
    c0_ = source_->AdvanceUntil(check);
602 603
  }

604
  bool CombineSurrogatePair() {
605
    DCHECK(!unibrow::Utf16::IsLeadSurrogate(kEndOfInput));
606
    if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
607
      uc32 c1 = source_->Advance();
608
      DCHECK(!unibrow::Utf16::IsTrailSurrogate(kEndOfInput));
609
      if (unibrow::Utf16::IsTrailSurrogate(c1)) {
610
        c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
611
        return true;
612
      }
613
      source_->Back();
614
    }
615
    return false;
616 617
  }

618
  // Steps the stream back by one code unit and makes |ch| the current
  // look-ahead character. The current c0_ must not be above
  // kMaxNonSurrogateCharCode.
  void PushBack(uc32 ch) {
    DCHECK_LE(c0_, static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode));
    source_->Back();
    c0_ = ch;
  }

624
  // Returns the stream's next code unit (the one after c0_) without consuming
  // it.
  uc32 Peek() const { return source_->Peek(); }
625

626
  // Consumes the current character and returns |tok|.
  inline Token::Value Select(Token::Value tok) {
    Advance();
    return tok;
  }

  // Consumes the current character. If the following character equals |next|
  // it is consumed as well and |then| is returned; otherwise |else_| is
  // returned.
  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    Advance();
    if (c0_ == next) {
      Advance();
      return then;
    } else {
      return else_;
    }
  }
640 641
  // Returns the literal string, if any, for the current token (the
  // token last returned by Next()). The string is 0-terminated.
642 643 644
  // Literal strings are collected for identifiers, strings, numbers as well
  // as for template literals. For template literals we also collect the raw
  // form.
645 646
  // These functions only give the correct result if the literal was scanned
  // when a LiteralScope object is alive.
647 648 649 650 651 652
  //
  // Current usage of these functions is unfortunately a little undisciplined,
  // and is_literal_one_byte() + is_literal_one_byte_string() is also
  // requested for tokens that do not have a literal. Hence, we treat any
  // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
  // literal "function".
653
  // One-byte view of the current token's literal.
  Vector<const uint8_t> literal_one_byte_string() const {
    DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
    return current().literal_chars.one_byte_literal();
  }
  // Two-byte view of the current token's literal.
  Vector<const uint16_t> literal_two_byte_string() const {
    DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
    return current().literal_chars.two_byte_literal();
  }
  bool is_literal_one_byte() const {
662 663
    DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
    return current().literal_chars.is_one_byte();
664 665 666
  }
  // Returns the literal string for the next token (the token that
  // would be returned if Next() were called).
667
  Vector<const uint8_t> next_literal_one_byte_string() const {
668
    DCHECK(next().CanAccessLiteral());
669
    return next().literal_chars.one_byte_literal();
670
  }
671
  // Two-byte view of the next token's literal.
  Vector<const uint16_t> next_literal_two_byte_string() const {
    DCHECK(next().CanAccessLiteral());
    return next().literal_chars.two_byte_literal();
  }
  bool is_next_literal_one_byte() const {
676
    DCHECK(next().CanAccessLiteral());
677
    return next().literal_chars.is_one_byte();
678
  }
679
  // One-byte view of the current token's raw literal.
  Vector<const uint8_t> raw_literal_one_byte_string() const {
    DCHECK(current().CanAccessRawLiteral());
    return current().raw_literal_chars.one_byte_literal();
  }
  // Two-byte view of the current token's raw literal.
  Vector<const uint16_t> raw_literal_two_byte_string() const {
    DCHECK(current().CanAccessRawLiteral());
    return current().raw_literal_chars.two_byte_literal();
  }
  bool is_raw_literal_one_byte() const {
688
    DCHECK(current().CanAccessRawLiteral());
689
    return current().raw_literal_chars.is_one_byte();
690 691
  }

692
  template <bool capture_raw, bool unicode = false>
693
  uc32 ScanHexNumber(int expected_length);
marja's avatar
marja committed
694 695 696
  // Scan a number of any length but not bigger than max_value. For example, the
  // number can be 000000001, so it's very long in characters but its value is
  // small.
697
  template <bool capture_raw>
698
  uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
699

700
  // Scans a single JavaScript token.
701
  V8_INLINE Token::Value ScanSingleToken();
702
  V8_INLINE void Scan();
703 704 705 706 707
  // Performance hack: pass through a pre-calculated "next()" value to avoid
  // having to re-calculate it in Scan. You'd think the compiler would be able
  // to hoist the next() calculation out of the inlined Scan method, but seems
  // that pointer aliasing analysis fails to show that this is safe.
  V8_INLINE void Scan(TokenDesc* next_desc);
708

709
  V8_INLINE Token::Value SkipWhiteSpace();
710
  Token::Value SkipSingleHTMLComment();
711
  Token::Value SkipSingleLineComment();
712 713
  Token::Value SkipSourceURLComment();
  void TryToParseSourceURLComment();
714
  Token::Value SkipMultiLineComment();
715 716
  // Scans a possible HTML comment -- begins with '<!'.
  Token::Value ScanHtmlComment();
717

718 719
  bool ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
                                       bool is_check_first_digit);
720
  bool ScanDecimalDigits();
721
  // Optimized function to scan decimal number as Smi.
722 723 724 725 726 727
  bool ScanDecimalAsSmi(uint64_t* value);
  bool ScanDecimalAsSmiWithNumericSeparators(uint64_t* value);
  bool ScanHexDigits();
  bool ScanBinaryDigits();
  bool ScanSignedInteger();
  bool ScanOctalDigits();
728
  bool ScanImplicitOctalDigits(int start_pos, NumberKind* kind);
729

730
  Token::Value ScanNumber(bool seen_period);
731
  V8_INLINE Token::Value ScanIdentifierOrKeyword();
732
  V8_INLINE Token::Value ScanIdentifierOrKeywordInner();
733 734
  Token::Value ScanIdentifierOrKeywordInnerSlow(bool escaped,
                                                bool can_be_keyword);
735 736

  Token::Value ScanString();
737
  Token::Value ScanPrivateName();
738

739 740 741
  // Scans an escape-sequence which is part of a string and adds the
  // decoded character to the current literal. Returns true if a pattern
  // is scanned.
742
  template <bool capture_raw>
743
  bool ScanEscape();
744

745
  // Decodes a Unicode escape-sequence which is part of an identifier.
746 747
  // If the escape sequence cannot be decoded the result is kBadChar.
  uc32 ScanIdentifierUnicodeEscape();
marja's avatar
marja committed
748
  // Helper for the above functions.
749
  template <bool capture_raw>
marja's avatar
marja committed
750
  uc32 ScanUnicodeEscape();
751

752 753
  Token::Value ScanTemplateSpan();

754
  // Return the current source position.
755 756
  int source_pos() {
    return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
757 758
  }

759 760 761 762 763 764 765
  static bool LiteralContainsEscapes(const TokenDesc& token) {
    Location location = token.location;
    int source_length = (location.end_pos - location.beg_pos);
    if (token.token == Token::STRING) {
      // Subtract delimiters.
      source_length -= 2;
    }
766
    return token.literal_chars.length() != source_length;
767 768
  }

769 770 771 772
#ifdef DEBUG
  void SanityCheckTokenDesc(const TokenDesc&) const;
#endif

773
  // Mutable access to the look-ahead token descriptor (the one being filled
  // in while scanning).
  TokenDesc& next() { return *next_; }
774

775 776 777
  // Read-only access to the current, look-ahead and second look-ahead token
  // descriptors.
  const TokenDesc& current() const { return *current_; }
  const TokenDesc& next() const { return *next_; }
  const TokenDesc& next_next() const { return *next_next_; }
778

779 780 781
  TokenDesc* current_;    // desc for current token (as returned by Next())
  TokenDesc* next_;       // desc for next token (one token look-ahead)
  TokenDesc* next_next_;  // desc for the token after next (after PeekAhead())
782

783
  // Input stream. Must be initialized to an Utf16CharacterStream.
784
  Utf16CharacterStream* const source_;
785 786 787 788

  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
  uc32 c0_;

789 790
  TokenDesc token_storage_[3];

791 792
  // Whether this scanner encountered an HTML comment.
  bool found_html_comment_;
793

794 795
  // Harmony flags to allow ESNext features.
  bool allow_harmony_private_fields_;
796
  bool allow_harmony_numeric_separator_;
797

798 799
  const bool is_module_;

800 801 802 803 804 805
  // Values parsed from magic comments.
  LiteralBuffer source_url_;
  LiteralBuffer source_mapping_url_;

  // Last-seen positions of potentially problematic tokens.
  Location octal_pos_;
806
  MessageTemplate octal_message_;
807

808
  MessageTemplate scanner_error_;
809
  Location scanner_error_location_;
810 811
};

812 813
}  // namespace internal
}  // namespace v8
814

815
#endif  // V8_PARSING_SCANNER_H_