scanner.h 26.3 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
// Features shared by parsing and pre-parsing scanners.

7 8 9
#ifndef V8_SCANNER_H_
#define V8_SCANNER_H_

10
#include "src/allocation.h"
11
#include "src/base/logging.h"
12 13 14 15 16 17
#include "src/char-predicates.h"
#include "src/globals.h"
#include "src/hashmap.h"
#include "src/list.h"
#include "src/token.h"
#include "src/unicode-inl.h"
18
#include "src/unicode-decoder.h"
19
#include "src/utils.h"
20

21 22
namespace v8 {
namespace internal {
23

24

25 26
class AstRawString;
class AstValueFactory;
27 28 29
class ParserRecorder;


30 31 32 33 34 35 36 37 38 39 40 41
// Maps a hexadecimal digit character to its numeric value.
// Yields 0..9 for '0'..'9' and 10..15 for 'a'..'f' / 'A'..'F'; any other
// character yields -1.
inline int HexValue(uc32 c) {
  const int digit = c - '0';
  if (static_cast<unsigned>(digit) <= 9) return digit;
  // Relative to '0', 'A'..'F' are 0x11..0x16 and 'a'..'f' are 0x31..0x36.
  // Setting bit 0x20 folds the upper-case range onto the lower-case one, and
  // the unsigned comparison rejects everything outside 0..5 in one test.
  const int letter = (digit | 0x20) - ('a' - '0');
  return (static_cast<unsigned>(letter) <= 5) ? letter + 10 : -1;
}


// ---------------------------------------------------------------------
42 43 44
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16 bit value representing either a 16 bit code point
// or one part of a surrogate pair that makes a single 21 bit code point.
45

46
class Utf16CharacterStream {
47
 public:
48 49
  Utf16CharacterStream() : pos_(0) { }
  virtual ~Utf16CharacterStream() { }
50

51 52
  // Returns and advances past the next UTF-16 code unit in the input
  // stream. If there are no more code units, it returns a negative
53 54 55 56 57 58 59 60
  // value.
  inline uc32 Advance() {
    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
      pos_++;
      return static_cast<uc32>(*(buffer_cursor_++));
    }
    // Note: currently the following increment is necessary to avoid a
    // parser problem! The scanner treats the final kEndOfInput as
61
    // a code unit with a position, and does math relative to that
62 63 64 65 66
    // position.
    pos_++;

    return kEndOfInput;
  }
67

68
  // Return the current position in the code unit stream.
69
  // Starts at zero.
70
  inline size_t pos() const { return pos_; }
71

72
  // Skips forward past the next code_unit_count UTF-16 code units
73
  // in the input, or until the end of input if that comes sooner.
74 75
  // Returns the number of code units actually skipped. If less
  // than code_unit_count,
76 77
  inline size_t SeekForward(size_t code_unit_count) {
    size_t buffered_chars = buffer_end_ - buffer_cursor_;
78 79 80 81
    if (code_unit_count <= buffered_chars) {
      buffer_cursor_ += code_unit_count;
      pos_ += code_unit_count;
      return code_unit_count;
82
    }
83
    return SlowSeekForward(code_unit_count);
84 85
  }

86
  // Pushes back the most recently read UTF-16 code unit (or negative
87 88 89
  // value if at end of input), i.e., the value returned by the most recent
  // call to Advance.
  // Must not be used right after calling SeekForward.
90
  virtual void PushBack(int32_t code_unit) = 0;
91

92 93 94
  virtual bool SetBookmark();
  virtual void ResetToBookmark();

95
 protected:
96 97
  static const uc32 kEndOfInput = -1;

98
  // Ensures that the buffer_cursor_ points to the code_unit at
99 100
  // position pos_ of the input, if possible. If the position
  // is at or after the end of the input, return false. If there
101
  // are more code_units available, return true.
102
  virtual bool ReadBlock() = 0;
103
  virtual size_t SlowSeekForward(size_t code_unit_count) = 0;
104

105 106
  const uint16_t* buffer_cursor_;
  const uint16_t* buffer_end_;
107
  size_t pos_;
108 109
};

110

111 112
// ---------------------------------------------------------------------
// Caching predicates used by scanners.
113 114

class UnicodeCache {
115 116
 public:
  UnicodeCache() {}
117
  typedef unibrow::Utf8Decoder<512> Utf8Decoder;
118 119 120 121 122 123 124 125

  StaticResource<Utf8Decoder>* utf8_decoder() {
    return &utf8_decoder_;
  }

  bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
  bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
  bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
126 127 128 129 130 131
  bool IsLineTerminatorSequence(unibrow::uchar c, unibrow::uchar next) {
    if (!IsLineTerminator(c)) return false;
    if (c == 0x000d && next == 0x000a) return false;  // CR with following LF.
    return true;
  }

132
  bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
133 134 135
  bool IsWhiteSpaceOrLineTerminator(unibrow::uchar c) {
    return kIsWhiteSpaceOrLineTerminator.get(c);
  }
136

137 138 139 140
 private:
  unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
  unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
  unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
141 142 143
  unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
  unibrow::Predicate<WhiteSpaceOrLineTerminator, 128>
      kIsWhiteSpaceOrLineTerminator;
144 145 146
  StaticResource<Utf8Decoder> utf8_decoder_;

  DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
147 148 149
};


150 151 152 153 154 155 156 157 158 159
// ---------------------------------------------------------------------
// DuplicateFinder discovers duplicate symbols.

class DuplicateFinder {
 public:
  explicit DuplicateFinder(UnicodeCache* constants)
      : unicode_constants_(constants),
        backing_store_(16),
        map_(&Match) { }

  int AddOneByteSymbol(Vector<const uint8_t> key, int value);
  int AddTwoByteSymbol(Vector<const uint16_t> key, int value);
  // Adds a number literal. The literal is first converted (if necessary) to
  // the string that ToString(ToNumber(literal)) would generate, and that
  // string is then added with AddOneByteSymbol.
  // The converted string is the actual value used as key in an object
  // literal, and the one that must be different from the other keys.
  int AddNumber(Vector<const uint8_t> key, int value);

 private:
  int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value);
  // Stores a copy of the key and its length in the backing store.
  // The copy consists of a base 127 encoding of the length (plus a bit
  // saying whether the string is one byte), followed by the bytes of the key.
  uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte);

  // Compares two encoded keys (both pointing into the backing store):
  // they match if they have the same base-127 encoded length and
  // representation, and the same 'length' bytes following.
  static bool Match(void* first, void* second);
  // Creates a hash from a sequence of bytes.
  static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte);
  // Checks whether a string containing a JS number is in its canonical
  // form.
  static bool IsNumberCanonical(Vector<const uint8_t> key);

  // Size of number_buffer_. Sufficient for calling DoubleToCString
  // (from conversions.h) on it.
  static const int kBufferSize = 100;

  UnicodeCache* unicode_constants_;
  // Backing store used to store strings used as hashmap keys.
  SequenceCollector<unsigned char> backing_store_;
  HashMap map_;
  // Buffer used for string->number->canonical string conversions.
  char number_buffer_[kBufferSize];
};


200 201 202 203
// ----------------------------------------------------------------------------
// LiteralBuffer -  Collector of chars of literals.

class LiteralBuffer {
204
 public:
205
  LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { }
206

207 208 209 210 211 212
  ~LiteralBuffer() {
    if (backing_store_.length() > 0) {
      backing_store_.Dispose();
    }
  }

213
  INLINE(void AddChar(uint32_t code_unit)) {
214
    if (position_ >= backing_store_.length()) ExpandBuffer();
215
    if (is_one_byte_) {
216
      if (code_unit <= unibrow::Latin1::kMaxChar) {
217
        backing_store_[position_] = static_cast<byte>(code_unit);
218
        position_ += kOneByteSize;
219 220
        return;
      }
221
      ConvertToTwoByte();
222
    }
223 224 225 226 227 228 229
    if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
      position_ += kUC16Size;
    } else {
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
          unibrow::Utf16::LeadSurrogate(code_unit);
      position_ += kUC16Size;
230
      if (position_ >= backing_store_.length()) ExpandBuffer();
231 232 233 234
      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
          unibrow::Utf16::TrailSurrogate(code_unit);
      position_ += kUC16Size;
    }
235 236
  }

237
  bool is_one_byte() const { return is_one_byte_; }
238

239
  bool is_contextual_keyword(Vector<const char> keyword) const {
240
    return is_one_byte() && keyword.length() == position_ &&
241 242 243
        (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
  }

244
  Vector<const uint16_t> two_byte_literal() const {
245 246
    DCHECK(!is_one_byte_);
    DCHECK((position_ & 0x1) == 0);
247 248
    return Vector<const uint16_t>(
        reinterpret_cast<const uint16_t*>(backing_store_.start()),
249 250 251
        position_ >> 1);
  }

252
  Vector<const uint8_t> one_byte_literal() const {
253
    DCHECK(is_one_byte_);
254 255
    return Vector<const uint8_t>(
        reinterpret_cast<const uint8_t*>(backing_store_.start()),
256 257 258
        position_);
  }

259
  int length() const {
260
    return is_one_byte_ ? position_ : (position_ >> 1);
261 262
  }

263 264 265 266
  void ReduceLength(int delta) {
    position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
  }

267 268
  void Reset() {
    position_ = 0;
269
    is_one_byte_ = true;
270
  }
271

272 273
  Handle<String> Internalize(Isolate* isolate) const;

274 275 276 277 278 279 280 281 282 283 284
  void CopyFrom(const LiteralBuffer* other) {
    if (other == nullptr) {
      Reset();
    } else {
      is_one_byte_ = other->is_one_byte_;
      position_ = other->position_;
      backing_store_.Dispose();
      backing_store_ = other->backing_store_.Clone();
    }
  }

285 286 287 288 289 290 291 292 293 294 295 296 297
 private:
  static const int kInitialCapacity = 16;
  static const int kGrowthFactory = 4;
  static const int kMinConversionSlack = 256;
  static const int kMaxGrowth = 1 * MB;
  inline int NewCapacity(int min_capacity) {
    int capacity = Max(min_capacity, backing_store_.length());
    int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
    return new_capacity;
  }

  void ExpandBuffer() {
    Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
298
    MemCopy(new_store.start(), backing_store_.start(), position_);
299 300 301 302
    backing_store_.Dispose();
    backing_store_ = new_store;
  }

303
  void ConvertToTwoByte() {
304
    DCHECK(is_one_byte_);
305 306 307
    Vector<byte> new_store;
    int new_content_size = position_ * kUC16Size;
    if (new_content_size >= backing_store_.length()) {
308 309
      // Ensure room for all currently read code units as UC16 as well
      // as the code unit about to be stored.
310 311 312 313
      new_store = Vector<byte>::New(NewCapacity(new_content_size));
    } else {
      new_store = backing_store_;
    }
314
    uint8_t* src = backing_store_.start();
315
    uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
316 317 318 319 320 321 322 323
    for (int i = position_ - 1; i >= 0; i--) {
      dst[i] = src[i];
    }
    if (new_store.start() != backing_store_.start()) {
      backing_store_.Dispose();
      backing_store_ = new_store;
    }
    position_ = new_content_size;
324
    is_one_byte_ = false;
325 326
  }

327
  bool is_one_byte_;
328 329
  int position_;
  Vector<byte> backing_store_;
330

331
  DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
332 333 334
};


335
// ----------------------------------------------------------------------------
336
// JavaScript Scanner.
337 338

class Scanner {
339
 public:
340 341
  // Scoped helper for literal recording. Automatically drops the literal
  // if aborting the scanning before it's complete.
342 343
  class LiteralScope {
   public:
344
    explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
345 346 347 348 349 350 351 352
      scanner_->StartLiteral();
    }
     ~LiteralScope() {
       if (!complete_) scanner_->DropLiteral();
     }
    void Complete() {
      complete_ = true;
    }
353 354 355 356 357 358

   private:
    Scanner* scanner_;
    bool complete_;
  };

359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
  // Scoped helper around the scanner's re-settable bookmark. Every member
  // forwards to the scanner; the destructor drops the bookmark.
  class BookmarkScope {
   public:
    explicit BookmarkScope(Scanner* scanner) : scanner_(scanner) {
      DCHECK_NOT_NULL(scanner_);
    }
    ~BookmarkScope() {
      scanner_->DropBookmark();
    }

    bool Set() {
      return scanner_->SetBookmark();
    }
    void Reset() {
      scanner_->ResetToBookmark();
    }
    bool HasBeenSet() {
      return scanner_->BookmarkHasBeenSet();
    }
    bool HasBeenReset() {
      return scanner_->BookmarkHasBeenReset();
    }

   private:
    Scanner* scanner_;

    DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
  };

378
  // An interval of source positions, given by its begin and end position.
  struct Location {
    // The default location is the interval from 0 to 0.
    Location() : beg_pos(0), end_pos(0) { }
    Location(int b, int e) : beg_pos(b), end_pos(e) { }

    // The location (-1, -1), for which IsValid() is false.
    static Location invalid() { return Location(-1, -1); }

    // A location is valid when it starts at a non-negative position and
    // does not end before it starts.
    bool IsValid() const {
      if (beg_pos < 0) return false;
      return beg_pos <= end_pos;
    }

    int beg_pos;
    int end_pos;
  };

393 394 395 396 397
  // -1 is outside of the range of any real source code.
  static const int kNoOctalLocation = -1;

  explicit Scanner(UnicodeCache* scanner_contants);

398
  void Initialize(Utf16CharacterStream* source);
399 400 401 402 403

  // Advances the input and returns the next token.
  Token::Value Next();
  // Returns the current token again, without advancing.
  Token::Value current_token() {
    return current_.token;
  }
  // Location of the current token, i.e. of the token last returned by
  // Next().
  Location location() const {
    return current_.location;
  }

  // The same queries for the upcoming token.

  // One token of look-ahead: the token after the one returned by Next().
  Token::Value peek() const {
    return next_.token;
  }

  // Location of the look-ahead token.
  Location peek_location() const {
    return next_.location;
  }
414 415 416 417 418 419 420 421 422 423

  // Compares the length of the current token's literal with the length of
  // the source range the token covers (minus the two delimiters for a
  // STRING token). Returns true when the two lengths differ.
  bool literal_contains_escapes() const {
    const int delimiter_count = (current_.token == Token::STRING) ? 2 : 0;
    const int source_length = current_.location.end_pos -
                              current_.location.beg_pos - delimiter_count;
    return current_.literal_chars->length() != source_length;
  }
424
  // True if the literal of the current token is exactly |keyword|.
  bool is_literal_contextual_keyword(Vector<const char> keyword) {
    DCHECK_NOT_NULL(current_.literal_chars);
    const LiteralBuffer* literal = current_.literal_chars;
    return literal->is_contextual_keyword(keyword);
  }
  // True if the literal of the look-ahead token is exactly |keyword|.
  bool is_next_contextual_keyword(Vector<const char> keyword) {
    DCHECK_NOT_NULL(next_.literal_chars);
    const LiteralBuffer* literal = next_.literal_chars;
    return literal->is_contextual_keyword(keyword);
  }
432

433 434
  const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
  const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
435
  const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
436 437

  double DoubleValue();
438
  bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
439 440
    if (is_literal_one_byte() &&
        literal_length() == length &&
441
        (allow_escapes || !literal_contains_escapes())) {
442 443 444
      const char* token =
          reinterpret_cast<const char*>(literal_one_byte_string().start());
      return !strncmp(token, data, length);
445 446 447
    }
    return false;
  }
448 449 450 451
  inline bool UnescapedLiteralMatches(const char* data, int length) {
    return LiteralMatches(data, length, false);
  }

452 453 454 455
  void IsGetOrSet(bool* is_get, bool* is_set) {
    if (is_literal_one_byte() &&
        literal_length() == 3 &&
        !literal_contains_escapes()) {
456 457
      const char* token =
          reinterpret_cast<const char*>(literal_one_byte_string().start());
458 459 460 461 462
      *is_get = strncmp(token, "get", 3) == 0;
      *is_set = !*is_get && strncmp(token, "set", 3) == 0;
    }
  }

463
  int FindSymbol(DuplicateFinder* finder, int value);
464

465
  UnicodeCache* unicode_cache() { return unicode_cache_; }
466 467 468 469 470

  // Returns the location of the last seen octal literal.
  Location octal_position() const { return octal_pos_; }
  void clear_octal_position() { octal_pos_ = Location::invalid(); }

verwaest's avatar
verwaest committed
471
  // Returns the value of the last smi that was scanned.
verwaest's avatar
verwaest committed
472
  int smi_value() const { return current_.smi_value_; }
verwaest's avatar
verwaest committed
473

474 475 476 477 478 479
  // Seek forward to the given position.  This operation does not
  // work in general, for instance when there are pushed back
  // characters, but works for seeking forward until simple delimiter
  // tokens, which is what it is used for.
  void SeekForward(int pos);

480 481 482 483 484
  bool HarmonyModules() const {
    return harmony_modules_;
  }
  void SetHarmonyModules(bool modules) {
    harmony_modules_ = modules;
485
  }
486 487 488 489 490 491
  bool HarmonyClasses() const {
    return harmony_classes_;
  }
  void SetHarmonyClasses(bool classes) {
    harmony_classes_ = classes;
  }
marja's avatar
marja committed
492 493
  bool HarmonyUnicode() const { return harmony_unicode_; }
  void SetHarmonyUnicode(bool unicode) { harmony_unicode_ = unicode; }
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508

  // True if a line terminator precedes the peek'ed token, including one that
  // sits inside a multi-line comment.
  bool HasAnyLineTerminatorBeforeNext() const {
    if (has_line_terminator_before_next_) return true;
    return has_multiline_comment_before_next_;
  }

  // Scans the input as a regular expression pattern, previous
  // character(s) must be /(=). Returns true if a pattern is scanned.
  bool ScanRegExpPattern(bool seen_equal);
  // Returns true if regexp flags are scanned (always since flags can
  // be empty).
  bool ScanRegExpFlags();

509
  // Scans the input as a template literal
510 511
  Token::Value ScanTemplateStart();
  Token::Value ScanTemplateContinuation();
512

513 514 515 516 517
  // Buffers holding the values parsed from magic comments.
  const LiteralBuffer* source_url() const {
    return &source_url_;
  }
  const LiteralBuffer* source_mapping_url() const { return &source_mapping_url_; }

518 519
  bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;

520
 private:
521 522 523 524 525
  // Description of one token; the scanner keeps one for the current token
  // and one for the look-ahead token.
  struct TokenDesc {
    Token::Value token;
    Location location;
    // Buffer holding the token's literal; NULL when the token does not refer
    // to a literal.
    LiteralBuffer* literal_chars;
    // Buffer holding the raw form of the literal; NULL when there is none.
    LiteralBuffer* raw_literal_chars;
    // Value reported by Scanner::smi_value().
    int smi_value_;
  };

530 531 532
  static const int kCharacterLookaheadBufferSize = 1;

  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
533
  template <bool capture_raw>
534 535
  uc32 ScanOctalEscape(uc32 c, int length);

536 537 538 539 540 541 542
  // To be called once source_ has been set to the input.
  void Init() {
    // Fill the one character look-ahead c0_.
    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
    Advance();
    // current_ starts out without a literal and without a raw literal.
    current_.literal_chars = NULL;
    current_.raw_literal_chars = NULL;
  }

546 547 548 549 550 551 552 553
  // Support BookmarkScope functionality.
  bool SetBookmark();
  void ResetToBookmark();
  bool BookmarkHasBeenSet();
  bool BookmarkHasBeenReset();
  void DropBookmark();
  static void CopyTokenDesc(TokenDesc* to, TokenDesc* from);

554 555 556 557 558 559 560 561
  // Literal buffer support
  inline void StartLiteral() {
    LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
            &literal_buffer2_ : &literal_buffer1_;
    free_buffer->Reset();
    next_.literal_chars = free_buffer;
  }

562
  inline void StartRawLiteral() {
563 564 565 566 567
    LiteralBuffer* free_buffer =
        (current_.raw_literal_chars == &raw_literal_buffer1_) ?
            &raw_literal_buffer2_ : &raw_literal_buffer1_;
    free_buffer->Reset();
    next_.raw_literal_chars = free_buffer;
568 569
  }

570
  INLINE(void AddLiteralChar(uc32 c)) {
571
    DCHECK_NOT_NULL(next_.literal_chars);
572 573 574
    next_.literal_chars->AddChar(c);
  }

575 576 577
  INLINE(void AddRawLiteralChar(uc32 c)) {
    DCHECK_NOT_NULL(next_.raw_literal_chars);
    next_.raw_literal_chars->AddChar(c);
578 579
  }

580 581 582 583 584
  INLINE(void ReduceRawLiteralLength(int delta)) {
    DCHECK_NOT_NULL(next_.raw_literal_chars);
    next_.raw_literal_chars->ReduceLength(delta);
  }

585 586 587 588
  // Stops scanning of a literal and drop the collected characters,
  // e.g., due to an encountered error.
  inline void DropLiteral() {
    next_.literal_chars = NULL;
589
    next_.raw_literal_chars = NULL;
590 591 592 593 594 595 596 597
  }

  inline void AddLiteralCharAdvance() {
    AddLiteralChar(c0_);
    Advance();
  }

  // Low-level scanning support.
598
  template <bool capture_raw = false, bool check_surrogate = true>
599
  void Advance() {
600
    if (capture_raw) {
601 602
      AddRawLiteralChar(c0_);
    }
603
    c0_ = source_->Advance();
verwaest's avatar
verwaest committed
604
    if (check_surrogate) HandleLeadSurrogate();
605 606
  }

verwaest's avatar
verwaest committed
607
  void HandleLeadSurrogate() {
608 609 610 611 612 613 614 615 616 617
    if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
      uc32 c1 = source_->Advance();
      if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
        source_->PushBack(c1);
      } else {
        c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
      }
    }
  }

618
  // Hands the current look-ahead character c0_ back to the source stream and
  // makes ch the new look-ahead character.
  //
  // The character being pushed back is c0_, so the decision how to push it
  // back must be based on c0_ (not on ch): a c0_ above
  // kMaxNonSurrogateCharCode was combined from two code units by
  // HandleLeadSurrogate() and has to be returned as two code units, trail
  // surrogate first and lead surrogate second (reverse reading order). Any
  // other c0_ is returned as a single code unit.
  void PushBack(uc32 ch) {
    if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
      source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
      source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
    } else {
      source_->PushBack(c0_);
    }
    c0_ = ch;
  }

  // Consumes the look-ahead character and yields tok.
  inline Token::Value Select(Token::Value tok) {
    Advance();
    return tok;
  }

  // Consumes the look-ahead character. If the following character equals
  // next, that character is consumed as well and `then` is returned;
  // otherwise else_ is returned.
  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
    Advance();
    if (c0_ != next) return else_;
    Advance();
    return then;
  }

643 644
  // Returns the literal string, if any, for the current token (the
  // token last returned by Next()). The string is 0-terminated.
645 646 647
  // Literal strings are collected for identifiers, strings, numbers as well
  // as for template literals. For template literals we also collect the raw
  // form.
648 649
  // These functions only give the correct result if the literal was scanned
  // when a LiteralScope object is alive.
650
  Vector<const uint8_t> literal_one_byte_string() {
651
    DCHECK_NOT_NULL(current_.literal_chars);
652 653
    return current_.literal_chars->one_byte_literal();
  }
654
  Vector<const uint16_t> literal_two_byte_string() {
655
    DCHECK_NOT_NULL(current_.literal_chars);
656
    return current_.literal_chars->two_byte_literal();
657 658
  }
  bool is_literal_one_byte() {
659
    DCHECK_NOT_NULL(current_.literal_chars);
660 661 662
    return current_.literal_chars->is_one_byte();
  }
  int literal_length() const {
663
    DCHECK_NOT_NULL(current_.literal_chars);
664 665 666 667
    return current_.literal_chars->length();
  }
  // Returns the literal string for the next token (the token that
  // would be returned if Next() were called).
668
  Vector<const uint8_t> next_literal_one_byte_string() {
669
    DCHECK_NOT_NULL(next_.literal_chars);
670 671
    return next_.literal_chars->one_byte_literal();
  }
672
  Vector<const uint16_t> next_literal_two_byte_string() {
673
    DCHECK_NOT_NULL(next_.literal_chars);
674
    return next_.literal_chars->two_byte_literal();
675 676
  }
  bool is_next_literal_one_byte() {
677
    DCHECK_NOT_NULL(next_.literal_chars);
678 679
    return next_.literal_chars->is_one_byte();
  }
680 681 682 683 684 685 686
  Vector<const uint8_t> raw_literal_one_byte_string() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->one_byte_literal();
  }
  Vector<const uint16_t> raw_literal_two_byte_string() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->two_byte_literal();
687
  }
688 689 690 691 692
  bool is_raw_literal_one_byte() {
    DCHECK_NOT_NULL(current_.raw_literal_chars);
    return current_.raw_literal_chars->is_one_byte();
  }

693
  template <bool capture_raw>
694
  uc32 ScanHexNumber(int expected_length);
marja's avatar
marja committed
695 696 697
  // Scan a number of any length but not bigger than max_value. For example, the
  // number can be 000000001, so it's very long in characters but its value is
  // small.
698
  template <bool capture_raw>
marja's avatar
marja committed
699
  uc32 ScanUnlimitedLengthHexNumber(int max_value);
700

701 702
  // Scans a single JavaScript token.
  void Scan();
703 704 705

  bool SkipWhiteSpace();
  Token::Value SkipSingleLineComment();
706 707
  Token::Value SkipSourceURLComment();
  void TryToParseSourceURLComment();
708
  Token::Value SkipMultiLineComment();
709 710
  // Scans a possible HTML comment -- begins with '<!'.
  Token::Value ScanHtmlComment();
711 712 713 714 715 716 717 718

  void ScanDecimalDigits();
  Token::Value ScanNumber(bool seen_period);
  Token::Value ScanIdentifierOrKeyword();
  Token::Value ScanIdentifierSuffix(LiteralScope* literal);

  Token::Value ScanString();

719 720 721
  // Scans an escape-sequence which is part of a string and adds the
  // decoded character to the current literal. Returns true if a pattern
  // is scanned.
722
  template <bool capture_raw, bool in_template_literal>
723
  bool ScanEscape();
724

725
  // Decodes a Unicode escape-sequence which is part of an identifier.
726 727
  // If the escape sequence cannot be decoded the result is kBadChar.
  uc32 ScanIdentifierUnicodeEscape();
marja's avatar
marja committed
728
  // Helper for the above functions.
729
  template <bool capture_raw>
marja's avatar
marja committed
730
  uc32 ScanUnicodeEscape();
731

732 733
  Token::Value ScanTemplateSpan();

734 735
  // Return the current source position.
  int source_pos() {
736
    return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
737 738 739 740 741 742 743 744
  }

  UnicodeCache* unicode_cache_;

  // Buffers collecting literal strings, numbers, etc.
  LiteralBuffer literal_buffer1_;
  LiteralBuffer literal_buffer2_;

745 746 747 748
  // Values parsed from magic comments.
  LiteralBuffer source_url_;
  LiteralBuffer source_mapping_url_;

749
  // Buffer to store raw string values
750 751
  LiteralBuffer raw_literal_buffer1_;
  LiteralBuffer raw_literal_buffer2_;
752

753 754 755
  TokenDesc current_;  // desc for current token (as returned by Next())
  TokenDesc next_;     // desc for next token (one token look-ahead)

756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
  // Variables for Scanner::BookmarkScope and the *Bookmark implementation.
  // These variables contain the scanner state when a bookmark is set.
  //
  // We will use bookmark_c0_ as a 'control' variable, where:
  // - bookmark_c0_ >= 0: A bookmark has been set and this contains c0_.
  // - bookmark_c0_ == -1: No bookmark has been set.
  // - bookmark_c0_ == -2: The bookmark has been applied (ResetToBookmark).
  //
  // Which state is being bookmarked? The parser state is distributed over
  // several variables, roughly like this:
  //   ...    1234        +       5678 ..... [character stream]
  //       [current_] [next_] c0_ |      [scanner state]
  // So when the scanner is logically at the beginning of an expression
  // like "1234 + 5678", then:
  // - current_ contains "1234"
  // - next_ contains "+"
  // - c0_ contains ' ' (the space between "+" and "5678"),
  // - the source_ character stream points to the beginning of "5678".
  // To be able to restore this state, we will keep copies of current_, next_,
  // and c0_; we'll ask the stream to bookmark itself, and we'll copy the
  // contents of current_'s and next_'s literal buffers to bookmark_*_literal_.
  static const uc32 kNoBookmark = -1;
  static const uc32 kBookmarkWasApplied = -2;
  uc32 bookmark_c0_;
  TokenDesc bookmark_current_;
  TokenDesc bookmark_next_;
  LiteralBuffer bookmark_current_literal_;
  LiteralBuffer bookmark_current_raw_literal_;
  LiteralBuffer bookmark_next_literal_;
  LiteralBuffer bookmark_next_raw_literal_;

787 788
  // Input stream. Must be initialized to an Utf16CharacterStream.
  Utf16CharacterStream* source_;
789 790


791 792 793
  // Start position of the octal literal last scanned.
  Location octal_pos_;

794 795 796
  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
  uc32 c0_;

797 798 799 800 801 802 803
  // Whether there is a line terminator whitespace character after
  // the current token, and  before the next. Does not count newlines
  // inside multiline comments.
  bool has_line_terminator_before_next_;
  // Whether there is a multi-line comment that contains a
  // line-terminator after the current token, and before the next.
  bool has_multiline_comment_before_next_;
804 805
  // Whether we scan 'module', 'import', 'export' as keywords.
  bool harmony_modules_;
arv@chromium.org's avatar
arv@chromium.org committed
806
  // Whether we scan 'class', 'extends', 'static' and 'super' as keywords.
807
  bool harmony_classes_;
marja's avatar
marja committed
808 809
  // Whether we allow \u{xxxxx}.
  bool harmony_unicode_;
810 811
};

812 813 814
} }  // namespace v8::internal

#endif  // V8_SCANNER_H_