scanner.h 25 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
// Features shared by parsing and pre-parsing scanners.

7 8
#ifndef V8_PARSING_SCANNER_H_
#define V8_PARSING_SCANNER_H_
9

10
#include "src/allocation.h"
11
#include "src/base/logging.h"
12
#include "src/char-predicates.h"
13
#include "src/collector.h"
14 15 16
#include "src/globals.h"
#include "src/hashmap.h"
#include "src/list.h"
17
#include "src/parsing/token.h"
18
#include "src/unicode.h"
19
#include "src/unicode-decoder.h"
20

21 22
namespace v8 {
namespace internal {
23

24

25 26
class AstRawString;
class AstValueFactory;
27
class ParserRecorder;
28
class UnicodeCache;
29 30


31
// ---------------------------------------------------------------------
32 33 34
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16 bit value representing either a 16 bit code point
// or one part of a surrogate pair that make a single 21 bit code point.
35

36
class Utf16CharacterStream {
37
 public:
38 39
  // Starts at position zero with an empty buffer (null cursor and end), so
  // that an Advance() on a stream whose buffer was never set up goes to
  // ReadBlock() instead of comparing indeterminate pointers.
  // NOTE(review): subclasses presumably point the buffer members at their
  // own storage in their constructors -- confirm.
  Utf16CharacterStream()
      : buffer_cursor_(nullptr), buffer_end_(nullptr), pos_(0) { }
  virtual ~Utf16CharacterStream() { }
40

41 42
  // Returns and advances past the next UTF-16 code unit in the input
  // stream. If there are no more code units, it returns a negative
43 44 45 46 47 48 49 50
  // value.
  inline uc32 Advance() {
    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
      pos_++;
      return static_cast<uc32>(*(buffer_cursor_++));
    }
    // Note: currently the following increment is necessary to avoid a
    // parser problem! The scanner treats the final kEndOfInput as
51
    // a code unit with a position, and does math relative to that
52 53 54 55 56
    // position.
    pos_++;

    return kEndOfInput;
  }
57

58
  // Return the current position in the code unit stream.
59
  // Starts at zero.
60
  inline size_t pos() const { return pos_; }
61

62
  // Skips forward past the next code_unit_count UTF-16 code units
63
  // in the input, or until the end of input if that comes sooner.
64 65
  // Returns the number of code units actually skipped. If less
  // than code_unit_count,
66 67
  inline size_t SeekForward(size_t code_unit_count) {
    size_t buffered_chars = buffer_end_ - buffer_cursor_;
68 69 70 71
    if (code_unit_count <= buffered_chars) {
      buffer_cursor_ += code_unit_count;
      pos_ += code_unit_count;
      return code_unit_count;
72
    }
73
    return SlowSeekForward(code_unit_count);
74 75
  }

76
  // Pushes back the most recently read UTF-16 code unit (or negative
77 78 79
  // value if at end of input), i.e., the value returned by the most recent
  // call to Advance.
  // Must not be used right after calling SeekForward.
80
  virtual void PushBack(int32_t code_unit) = 0;
81

82 83 84
  virtual bool SetBookmark();
  virtual void ResetToBookmark();

85
 protected:
86 87
  static const uc32 kEndOfInput = -1;

88
  // Ensures that the buffer_cursor_ points to the code_unit at
89 90
  // position pos_ of the input, if possible. If the position
  // is at or after the end of the input, return false. If there
91
  // are more code_units available, return true.
92
  virtual bool ReadBlock() = 0;
93
  virtual size_t SlowSeekForward(size_t code_unit_count) = 0;
94

95 96
  const uint16_t* buffer_cursor_;
  const uint16_t* buffer_end_;
97
  size_t pos_;
98 99
};

100

101 102 103 104 105 106 107 108 109 110
// ---------------------------------------------------------------------
// DuplicateFinder discovers duplicate symbols.

class DuplicateFinder {
 public:
  // Creates a finder with no symbols recorded. |constants| is kept in
  // unicode_constants_, and the hash map is set up to compare its keys
  // with Match.
  explicit DuplicateFinder(UnicodeCache* constants)
      : unicode_constants_(constants),
        backing_store_(16),
        map_(&Match) { }

111 112
  int AddOneByteSymbol(Vector<const uint8_t> key, int value);
  int AddTwoByteSymbol(Vector<const uint16_t> key, int value);
113 114
  // Add a number literal by converting it (if necessary)
  // to the string that ToString(ToNumber(literal)) would generate.
115
  // and then adding that string with AddOneByteSymbol.
116 117
  // This string is the actual value used as key in an object literal,
  // and the one that must be different from the other keys.
118
  int AddNumber(Vector<const uint8_t> key, int value);
119 120

 private:
121
  int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value);
122 123
  // Backs up the key and its length in the backing store.
  // The backup is stored with a base 127 encoding of the
124
  // length (plus a bit saying whether the string is one byte),
125
  // followed by the bytes of the key.
126
  uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte);
127 128

  // Compare two encoded keys (both pointing into the backing store)
129
  // for having the same base-127 encoded lengths and representation.
130 131 132
  // and then having the same 'length' bytes following.
  static bool Match(void* first, void* second);
  // Creates a hash from a sequence of bytes.
133
  static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte);
134 135
  // Checks whether a string containing a JS number is its canonical
  // form.
136
  static bool IsNumberCanonical(Vector<const uint8_t> key);
137 138 139 140 141 142 143 144 145 146 147 148 149 150

  // Size of buffer. Sufficient for using it to call DoubleToCString
  // from conversions.h.
  static const int kBufferSize = 100;

  UnicodeCache* unicode_constants_;
  // Backing store used to store strings used as hashmap keys.
  SequenceCollector<unsigned char> backing_store_;
  HashMap map_;
  // Buffer used for string->number->canonical string conversions.
  char number_buffer_[kBufferSize];
};


151 152 153 154
// ----------------------------------------------------------------------------
// LiteralBuffer -  Collector of chars of literals.

class LiteralBuffer {
155
 public:
156
  LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { }
157

158
  ~LiteralBuffer() { backing_store_.Dispose(); }
159

160
  INLINE(void AddChar(uint32_t code_unit)) {
161
    if (position_ >= backing_store_.length()) ExpandBuffer();
162
    if (is_one_byte_) {
163
      if (code_unit <= unibrow::Latin1::kMaxChar) {
164
        backing_store_[position_] = static_cast<byte>(code_unit);
165
        position_ += kOneByteSize;
166 167
        return;
      }
168
      ConvertToTwoByte();
169
    }
170 171 172 173 174 175 176
    if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
      position_ += kUC16Size;
    } else {
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
          unibrow::Utf16::LeadSurrogate(code_unit);
      position_ += kUC16Size;
177
      if (position_ >= backing_store_.length()) ExpandBuffer();
178 179 180 181
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
          unibrow::Utf16::TrailSurrogate(code_unit);
      position_ += kUC16Size;
    }
182 183
  }

184
  bool is_one_byte() const { return is_one_byte_; }
185

186
  // True when the buffer holds a one-byte literal whose bytes are exactly
  // those of |keyword|.
  bool is_contextual_keyword(Vector<const char> keyword) const {
    if (!is_one_byte()) return false;
    if (keyword.length() != position_) return false;
    return memcmp(keyword.start(), backing_store_.start(), position_) == 0;
  }

191
  // Views the collected bytes as UTF-16 code units. Only valid while the
  // buffer is in two-byte mode; position_ counts bytes, hence the halving.
  Vector<const uint16_t> two_byte_literal() const {
    DCHECK(!is_one_byte_);
    DCHECK((position_ & 0x1) == 0);
    const uint16_t* code_units =
        reinterpret_cast<const uint16_t*>(backing_store_.start());
    return Vector<const uint16_t>(code_units, position_ >> 1);
  }

199
  // Views the collected bytes as one-byte characters. Only valid while the
  // buffer is in one-byte mode.
  Vector<const uint8_t> one_byte_literal() const {
    DCHECK(is_one_byte_);
    const uint8_t* chars =
        reinterpret_cast<const uint8_t*>(backing_store_.start());
    return Vector<const uint8_t>(chars, position_);
  }

206
  int length() const {
207
    return is_one_byte_ ? position_ : (position_ >> 1);
208 209
  }

210 211 212 213
  // Drops the last |delta| characters by moving the write position back by
  // the matching number of bytes for the current representation.
  void ReduceLength(int delta) {
    const int bytes_per_char = is_one_byte_ ? kOneByteSize : kUC16Size;
    position_ -= delta * bytes_per_char;
  }

214 215
  void Reset() {
    position_ = 0;
216
    is_one_byte_ = true;
217
  }
218

219 220
  Handle<String> Internalize(Isolate* isolate) const;

221 222 223 224 225 226 227 228 229 230 231
  void CopyFrom(const LiteralBuffer* other) {
    if (other == nullptr) {
      Reset();
    } else {
      is_one_byte_ = other->is_one_byte_;
      position_ = other->position_;
      backing_store_.Dispose();
      backing_store_ = other->backing_store_.Clone();
    }
  }

232 233 234 235 236 237 238 239 240 241 242 243 244
 private:
  static const int kInitialCapacity = 16;
  static const int kGrowthFactory = 4;
  static const int kMinConversionSlack = 256;
  static const int kMaxGrowth = 1 * MB;
  // Picks the size of the next backing store: starting from the larger of
  // |min_capacity| and the current store length, multiply by
  // kGrowthFactory, but never add more than kMaxGrowth in one step.
  inline int NewCapacity(int min_capacity) {
    const int base = Max(min_capacity, backing_store_.length());
    return Min(base * kGrowthFactory, base + kMaxGrowth);
  }

  void ExpandBuffer() {
    Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
245
    MemCopy(new_store.start(), backing_store_.start(), position_);
246 247 248 249
    backing_store_.Dispose();
    backing_store_ = new_store;
  }

250
  void ConvertToTwoByte() {
251
    DCHECK(is_one_byte_);
252 253 254
    Vector<byte> new_store;
    int new_content_size = position_ * kUC16Size;
    if (new_content_size >= backing_store_.length()) {
255 256
      // Ensure room for all currently read code units as UC16 as well
      // as the code unit about to be stored.
257 258 259 260
      new_store = Vector<byte>::New(NewCapacity(new_content_size));
    } else {
      new_store = backing_store_;
    }
261
    uint8_t* src = backing_store_.start();
262
    uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
263 264 265 266 267 268 269 270
    for (int i = position_ - 1; i >= 0; i--) {
      dst[i] = src[i];
    }
    if (new_store.start() != backing_store_.start()) {
      backing_store_.Dispose();
      backing_store_ = new_store;
    }
    position_ = new_content_size;
271
    is_one_byte_ = false;
272 273
  }

274
  bool is_one_byte_;
275 276
  int position_;
  Vector<byte> backing_store_;
277

278
  DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
279 280 281
};


282
// ----------------------------------------------------------------------------
283
// JavaScript Scanner.
284 285

class Scanner {
286
 public:
287 288
  // Scoped helper for literal recording. Automatically drops the literal
  // if aborting the scanning before it's complete.
289 290
  class LiteralScope {
   public:
291
    explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
292 293 294 295 296 297 298 299
      scanner_->StartLiteral();
    }
     ~LiteralScope() {
       if (!complete_) scanner_->DropLiteral();
     }
    void Complete() {
      complete_ = true;
    }
300 301 302 303 304 305

   private:
    Scanner* scanner_;
    bool complete_;
  };

306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
  // Scoped helper for a re-settable bookmark.
  class BookmarkScope {
   public:
    // Binds the scope to |scanner|, which must not be null. The constructor
    // itself does not set a bookmark.
    explicit BookmarkScope(Scanner* scanner) : scanner_(scanner) {
      DCHECK_NOT_NULL(scanner_);
    }
    // Unconditionally asks the scanner to drop its bookmark.
    ~BookmarkScope() { scanner_->DropBookmark(); }

    // The functions below forward directly to the scanner's private
    // bookmark functions of the same (or corresponding) name.
    bool Set() { return scanner_->SetBookmark(); }
    void Reset() { scanner_->ResetToBookmark(); }
    bool HasBeenSet() { return scanner_->BookmarkHasBeenSet(); }
    bool HasBeenReset() { return scanner_->BookmarkHasBeenReset(); }

   private:
    // The scanner whose bookmark this scope manages.
    Scanner* scanner_;

    DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
  };

325
  // Representation of an interval of source positions.
326 327 328 329 330 331 332 333 334 335 336 337 338 339
  struct Location {
    // The default location is the empty interval at position zero.
    Location() : Location(0, 0) { }
    Location(int b, int e) : beg_pos(b), end_pos(e) { }

    // A sentinel that never passes IsValid().
    static Location invalid() { return Location(-1, -1); }

    // Valid means a non-negative start and an end not before the start.
    bool IsValid() const {
      return !(beg_pos < 0 || end_pos < beg_pos);
    }

    int beg_pos;
    int end_pos;
  };

340 341 342 343 344
  // -1 is outside of the range of any real source code.
  static const int kNoOctalLocation = -1;

  explicit Scanner(UnicodeCache* scanner_contants);

345
  void Initialize(Utf16CharacterStream* source);
346 347 348

  // Returns the next token and advances input.
  Token::Value Next();
littledan's avatar
littledan committed
349 350
  // Returns the token following peek()
  Token::Value PeekAhead();
351 352
  // Returns the current token again.
  Token::Value current_token() { return current_.token; }
353
  // Returns the location information for the current token
354
  // (the token last returned by Next()).
355
  Location location() const { return current_.location; }
356 357 358 359 360 361 362

  // Similar functions for the upcoming token.

  // One token look-ahead (past the token returned by Next()).
  Token::Value peek() const { return next_.token; }

  Location peek_location() const { return next_.location; }
363 364

  bool literal_contains_escapes() const {
365 366 367 368
    return LiteralContainsEscapes(current_);
  }
  bool next_literal_contains_escapes() const {
    return LiteralContainsEscapes(next_);
369
  }
370
  bool is_literal_contextual_keyword(Vector<const char> keyword) {
371
    DCHECK_NOT_NULL(current_.literal_chars);
372
    return current_.literal_chars->is_contextual_keyword(keyword);
373
  }
374
  bool is_next_contextual_keyword(Vector<const char> keyword) {
375
    DCHECK_NOT_NULL(next_.literal_chars);
376 377
    return next_.literal_chars->is_contextual_keyword(keyword);
  }
378

379 380
  const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
  const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
381
  const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
382 383

  double DoubleValue();
384
  bool ContainsDot();
385
  bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
386 387
    if (is_literal_one_byte() &&
        literal_length() == length &&
388
        (allow_escapes || !literal_contains_escapes())) {
389 390 391
      const char* token =
          reinterpret_cast<const char*>(literal_one_byte_string().start());
      return !strncmp(token, data, length);
392 393 394
    }
    return false;
  }
395 396 397 398
  inline bool UnescapedLiteralMatches(const char* data, int length) {
    return LiteralMatches(data, length, false);
  }

399 400 401 402
  void IsGetOrSet(bool* is_get, bool* is_set) {
    if (is_literal_one_byte() &&
        literal_length() == 3 &&
        !literal_contains_escapes()) {
403 404
      const char* token =
          reinterpret_cast<const char*>(literal_one_byte_string().start());
405 406 407 408 409
      *is_get = strncmp(token, "get", 3) == 0;
      *is_set = !*is_get && strncmp(token, "set", 3) == 0;
    }
  }

410
  int FindSymbol(DuplicateFinder* finder, int value);
411

412
  UnicodeCache* unicode_cache() { return unicode_cache_; }
413 414 415 416 417

  // Returns the location of the last seen octal literal.
  Location octal_position() const { return octal_pos_; }
  void clear_octal_position() { octal_pos_ = Location::invalid(); }

verwaest's avatar
verwaest committed
418
  // Returns the value of the last smi that was scanned.
verwaest's avatar
verwaest committed
419
  int smi_value() const { return current_.smi_value_; }
verwaest's avatar
verwaest committed
420

421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
  // Seek forward to the given position.  This operation does not
  // work in general, for instance when there are pushed back
  // characters, but works for seeking forward until simple delimiter
  // tokens, which is what it is used for.
  void SeekForward(int pos);

  // Returns true if there was a line terminator before the peek'ed token,
  // possibly inside a multi-line comment.
  bool HasAnyLineTerminatorBeforeNext() const {
    return has_line_terminator_before_next_ ||
           has_multiline_comment_before_next_;
  }

  // Scans the input as a regular expression pattern, previous
  // character(s) must be /(=). Returns true if a pattern is scanned.
  bool ScanRegExpPattern(bool seen_equal);
437 438
  // Scans the input as regular expression flags. Returns the flags on success.
  Maybe<RegExp::Flags> ScanRegExpFlags();
439

440
  // Scans the input as a template literal
441 442
  Token::Value ScanTemplateStart();
  Token::Value ScanTemplateContinuation();
443

444 445 446 447 448
  const LiteralBuffer* source_url() const { return &source_url_; }
  const LiteralBuffer* source_mapping_url() const {
    return &source_mapping_url_;
  }

449 450
  bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;

451 452
  bool FoundHtmlComment() const { return found_html_comment_; }

453
 private:
454 455 456 457 458
  // The current and look-ahead token.
  struct TokenDesc {
    Token::Value token;
    Location location;
    LiteralBuffer* literal_chars;
459
    LiteralBuffer* raw_literal_chars;
verwaest's avatar
verwaest committed
460
    int smi_value_;
461 462
  };

463 464 465
  static const int kCharacterLookaheadBufferSize = 1;

  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
466
  template <bool capture_raw>
467 468
  uc32 ScanOctalEscape(uc32 c, int length);

469 470 471 472 473 474 475
  // Call this after setting source_ to the input.
  void Init() {
    // Set c0_ (one character ahead)
    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    Advance();
    // Initialize current_ to not refer to a literal.
    current_.literal_chars = NULL;
476
    current_.raw_literal_chars = NULL;
littledan's avatar
littledan committed
477
    next_next_.token = Token::UNINITIALIZED;
478
    found_html_comment_ = false;
479 480
  }

481 482 483 484 485 486 487 488
  // Support BookmarkScope functionality.
  bool SetBookmark();
  void ResetToBookmark();
  bool BookmarkHasBeenSet();
  bool BookmarkHasBeenReset();
  void DropBookmark();
  static void CopyTokenDesc(TokenDesc* to, TokenDesc* from);

489 490
  // Literal buffer support
  inline void StartLiteral() {
littledan's avatar
littledan committed
491 492 493 494 495
    LiteralBuffer* free_buffer =
        (current_.literal_chars == &literal_buffer0_)
            ? &literal_buffer1_
            : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
                                                            : &literal_buffer0_;
496 497 498 499
    free_buffer->Reset();
    next_.literal_chars = free_buffer;
  }

500
  inline void StartRawLiteral() {
501
    LiteralBuffer* free_buffer =
littledan's avatar
littledan committed
502 503 504 505 506
        (current_.raw_literal_chars == &raw_literal_buffer0_)
            ? &raw_literal_buffer1_
            : (current_.raw_literal_chars == &raw_literal_buffer1_)
                  ? &raw_literal_buffer2_
                  : &raw_literal_buffer0_;
507 508
    free_buffer->Reset();
    next_.raw_literal_chars = free_buffer;
509 510
  }

511
  INLINE(void AddLiteralChar(uc32 c)) {
512
    DCHECK_NOT_NULL(next_.literal_chars);
513 514 515
    next_.literal_chars->AddChar(c);
  }

516 517 518
  INLINE(void AddRawLiteralChar(uc32 c)) {
    DCHECK_NOT_NULL(next_.raw_literal_chars);
    next_.raw_literal_chars->AddChar(c);
519 520
  }

521 522 523 524 525
  INLINE(void ReduceRawLiteralLength(int delta)) {
    DCHECK_NOT_NULL(next_.raw_literal_chars);
    next_.raw_literal_chars->ReduceLength(delta);
  }

526 527 528 529
  // Stops scanning of a literal and drop the collected characters,
  // e.g., due to an encountered error.
  inline void DropLiteral() {
    next_.literal_chars = NULL;
530
    next_.raw_literal_chars = NULL;
531 532 533 534 535 536 537 538
  }

  inline void AddLiteralCharAdvance() {
    AddLiteralChar(c0_);
    Advance();
  }

  // Low-level scanning support.
539
  template <bool capture_raw = false, bool check_surrogate = true>
540
  void Advance() {
541
    if (capture_raw) {
542 543
      AddRawLiteralChar(c0_);
    }
544
    c0_ = source_->Advance();
verwaest's avatar
verwaest committed
545
    if (check_surrogate) HandleLeadSurrogate();
546 547
  }

verwaest's avatar
verwaest committed
548
  void HandleLeadSurrogate() {
549 550 551 552 553 554 555 556 557 558
    if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
      uc32 c1 = source_->Advance();
      if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
        source_->PushBack(c1);
      } else {
        c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
      }
    }
  }

559
  // Un-reads the lookahead character: c0_ is handed back to source_ and
  // |ch| becomes the new lookahead.
  void PushBack(uc32 ch) {
    // The value returned to the stream is c0_, not ch, so it is c0_ that
    // decides how many UTF-16 code units go back. A c0_ above the
    // non-surrogate range is a combined surrogate pair (see
    // HandleLeadSurrogate) and must be pushed back as two code units --
    // trail first, so that the lead surrogate is the next one read.
    if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
      source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
      source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
    } else {
      source_->PushBack(c0_);
    }
    c0_ = ch;
  }

  inline Token::Value Select(Token::Value tok) {
    Advance();
    return tok;
  }

  // Consumes the current character. If the following character equals
  // |next| it is consumed as well and |then| is returned; otherwise
  // |else_| is returned.
  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    Advance();
    if (c0_ != next) return else_;
    Advance();
    return then;
  }

584 585
  // Returns the literal string, if any, for the current token (the
  // token last returned by Next()). The string is 0-terminated.
586 587 588
  // Literal strings are collected for identifiers, strings, numbers as well
  // as for template literals. For template literals we also collect the raw
  // form.
589 590
  // These functions only give the correct result if the literal was scanned
  // when a LiteralScope object is alive.
591
  Vector<const uint8_t> literal_one_byte_string() {
592
    DCHECK_NOT_NULL(current_.literal_chars);
593 594
    return current_.literal_chars->one_byte_literal();
  }
595
  Vector<const uint16_t> literal_two_byte_string() {
596
    DCHECK_NOT_NULL(current_.literal_chars);
597
    return current_.literal_chars->two_byte_literal();
598 599
  }
  bool is_literal_one_byte() {
600
    DCHECK_NOT_NULL(current_.literal_chars);
601 602 603
    return current_.literal_chars->is_one_byte();
  }
  int literal_length() const {
604
    DCHECK_NOT_NULL(current_.literal_chars);
605 606 607 608
    return current_.literal_chars->length();
  }
  // Returns the literal string for the next token (the token that
  // would be returned if Next() were called).
609
  Vector<const uint8_t> next_literal_one_byte_string() {
610
    DCHECK_NOT_NULL(next_.literal_chars);
611 612
    return next_.literal_chars->one_byte_literal();
  }
613
  Vector<const uint16_t> next_literal_two_byte_string() {
614
    DCHECK_NOT_NULL(next_.literal_chars);
615
    return next_.literal_chars->two_byte_literal();
616 617
  }
  bool is_next_literal_one_byte() {
618
    DCHECK_NOT_NULL(next_.literal_chars);
619 620
    return next_.literal_chars->is_one_byte();
  }
621 622 623 624 625 626 627
  Vector<const uint8_t> raw_literal_one_byte_string() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->one_byte_literal();
  }
  Vector<const uint16_t> raw_literal_two_byte_string() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->two_byte_literal();
628
  }
629 630 631 632 633
  bool is_raw_literal_one_byte() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->is_one_byte();
  }

634
  template <bool capture_raw>
635
  uc32 ScanHexNumber(int expected_length);
marja's avatar
marja committed
636 637 638
  // Scan a number of any length but not bigger than max_value. For example, the
  // number can be 000000001, so it's very long in characters but its value is
  // small.
639
  template <bool capture_raw>
marja's avatar
marja committed
640
  uc32 ScanUnlimitedLengthHexNumber(int max_value);
641

642 643
  // Scans a single JavaScript token.
  void Scan();
644 645 646

  bool SkipWhiteSpace();
  Token::Value SkipSingleLineComment();
647 648
  Token::Value SkipSourceURLComment();
  void TryToParseSourceURLComment();
649
  Token::Value SkipMultiLineComment();
650 651
  // Scans a possible HTML comment -- begins with '<!'.
  Token::Value ScanHtmlComment();
652 653 654 655

  void ScanDecimalDigits();
  Token::Value ScanNumber(bool seen_period);
  Token::Value ScanIdentifierOrKeyword();
656
  Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
657 658 659

  Token::Value ScanString();

660 661 662
  // Scans an escape-sequence which is part of a string and adds the
  // decoded character to the current literal. Returns true if a pattern
  // is scanned.
663
  template <bool capture_raw, bool in_template_literal>
664
  bool ScanEscape();
665

666
  // Decodes a Unicode escape-sequence which is part of an identifier.
667 668
  // If the escape sequence cannot be decoded the result is kBadChar.
  uc32 ScanIdentifierUnicodeEscape();
marja's avatar
marja committed
669
  // Helper for the above functions.
670
  template <bool capture_raw>
marja's avatar
marja committed
671
  uc32 ScanUnicodeEscape();
672

673 674
  Token::Value ScanTemplateSpan();

675 676
  // Return the current source position.
  int source_pos() {
677
    return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
678 679
  }

680 681 682 683 684 685 686 687 688 689
  // Reports whether the literal of |token| was written with escapes. This
  // is inferred by comparing the length of the decoded literal with the
  // length of the token's source span (for strings, minus the two
  // delimiters): a mismatch means the source text was not taken verbatim.
  // A token with no literal attached is reported as escape-free.
  static bool LiteralContainsEscapes(const TokenDesc& token) {
    // literal_chars is NULL after Init() or DropLiteral(); do not
    // dereference it in that case.
    if (token.literal_chars == nullptr) return false;
    Location location = token.location;
    int source_length = (location.end_pos - location.beg_pos);
    if (token.token == Token::STRING) {
      // Subtract delimiters.
      source_length -= 2;
    }
    return token.literal_chars->length() != source_length;
  }

690 691 692
  UnicodeCache* unicode_cache_;

  // Buffers collecting literal strings, numbers, etc.
littledan's avatar
littledan committed
693
  LiteralBuffer literal_buffer0_;
694 695 696
  LiteralBuffer literal_buffer1_;
  LiteralBuffer literal_buffer2_;

697 698 699 700
  // Values parsed from magic comments.
  LiteralBuffer source_url_;
  LiteralBuffer source_mapping_url_;

701
  // Buffer to store raw string values
littledan's avatar
littledan committed
702
  LiteralBuffer raw_literal_buffer0_;
703 704
  LiteralBuffer raw_literal_buffer1_;
  LiteralBuffer raw_literal_buffer2_;
705

littledan's avatar
littledan committed
706 707 708
  TokenDesc current_;    // desc for current token (as returned by Next())
  TokenDesc next_;       // desc for next token (one token look-ahead)
  TokenDesc next_next_;  // desc for the token after next (after PeekAhead())
709

710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740
  // Variables for Scanner::BookmarkScope and the *Bookmark implementation.
  // These variables contain the scanner state when a bookmark is set.
  //
  // We will use bookmark_c0_ as a 'control' variable, where:
  // - bookmark_c0_ >= 0: A bookmark has been set and this contains c0_.
  // - bookmark_c0_ == -1: No bookmark has been set.
  // - bookmark_c0_ == -2: The bookmark has been applied (ResetToBookmark).
  //
  // Which state is being bookmarked? The parser state is distributed over
  // several variables, roughly like this:
  //   ...    1234        +       5678 ..... [character stream]
  //       [current_] [next_] c0_ |      [scanner state]
  // So when the scanner is logically at the beginning of an expression
  // like "1234 + 5678", then:
  // - current_ contains "1234"
  // - next_ contains "+"
  // - c0_ contains ' ' (the space between "+" and "5678"),
  // - the source_ character stream points to the beginning of "5678".
  // To be able to restore this state, we will keep copies of current_, next_,
  // and c0_; we'll ask the stream to bookmark itself, and we'll copy the
  // contents of current_'s and next_'s literal buffers to bookmark_*_literal_.
  static const uc32 kNoBookmark = -1;
  static const uc32 kBookmarkWasApplied = -2;
  uc32 bookmark_c0_;
  TokenDesc bookmark_current_;
  TokenDesc bookmark_next_;
  LiteralBuffer bookmark_current_literal_;
  LiteralBuffer bookmark_current_raw_literal_;
  LiteralBuffer bookmark_next_literal_;
  LiteralBuffer bookmark_next_raw_literal_;

741 742
  // Input stream. Must be initialized to an Utf16CharacterStream.
  Utf16CharacterStream* source_;
743 744


745 746 747
  // Start position of the octal literal last scanned.
  Location octal_pos_;

748 749 750
  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
  uc32 c0_;

751 752 753 754 755 756 757
  // Whether there is a line terminator whitespace character after
  // the current token, and  before the next. Does not count newlines
  // inside multiline comments.
  bool has_line_terminator_before_next_;
  // Whether there is a multi-line comment that contains a
  // line-terminator after the current token, and before the next.
  bool has_multiline_comment_before_next_;
758 759 760

  // Whether this scanner encountered an HTML comment.
  bool found_html_comment_;
761 762
};

763 764
}  // namespace internal
}  // namespace v8
765

766
#endif  // V8_PARSING_SCANNER_H_