scanner.cc 60 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
// Features shared by parsing and pre-parsing scanners.
6

7
#include "src/parsing/scanner.h"
8

9 10
#include <stdint.h>

11 12
#include <cmath>

13
#include "src/ast/ast-value-factory.h"
14 15
#include "src/char-predicates-inl.h"
#include "src/conversions-inl.h"
16
#include "src/objects/bigint.h"
17
#include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
18
#include "src/unicode-cache-inl.h"
19

20 21
namespace v8 {
namespace internal {
22

23
// Scoped guard around a (message, location) error slot that is reached through
// the two pointers handed to the constructor. Construction remembers the
// current contents and resets the slot to "no error"; destruction writes the
// remembered contents back. MoveErrorTo() hands an error that was recorded in
// between over to a token descriptor.
class Scanner::ErrorState {
 public:
  ErrorState(MessageTemplate::Template* message_stack,
             Scanner::Location* location_stack)
      : message_stack_(message_stack),
        old_message_(*message_stack),
        location_stack_(location_stack),
        old_location_(*location_stack) {
    ClearPendingError();
  }

  ~ErrorState() {
    *location_stack_ = old_location_;
    *message_stack_ = old_message_;
  }

  // If an error is pending, copies it into |dest|'s invalid_template_escape_*
  // fields (unless |dest| already carries one, in which case the pending error
  // is dropped) and then resets the pending slot. Does nothing when no error
  // is pending.
  void MoveErrorTo(TokenDesc* dest) {
    const bool has_pending_error = *message_stack_ != MessageTemplate::kNone;
    if (!has_pending_error) return;

    const bool dest_has_error =
        dest->invalid_template_escape_message != MessageTemplate::kNone;
    if (!dest_has_error) {
      dest->invalid_template_escape_location = *location_stack_;
      dest->invalid_template_escape_message = *message_stack_;
    }
    ClearPendingError();
  }

 private:
  // Puts the pointed-to message/location pair into the "no error" state.
  void ClearPendingError() {
    *message_stack_ = MessageTemplate::kNone;
    *location_stack_ = Location::invalid();
  }

  MessageTemplate::Template* const message_stack_;
  MessageTemplate::Template const old_message_;
  Scanner::Location* const location_stack_;
  Scanner::Location const old_location_;
};

59 60 61
// ----------------------------------------------------------------------------
// Scanner::LiteralBuffer

62
// Internalizes the buffered literal through |isolate|'s factory, picking the
// one-byte or two-byte entry point according to the buffer's current encoding.
Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
  auto factory = isolate->factory();
  if (!is_one_byte()) {
    return factory->InternalizeTwoByteString(two_byte_literal());
  }
  return factory->InternalizeOneByteString(one_byte_literal());
}

69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
// Returns the capacity to use for a reallocated backing store. The base is the
// larger of |min_capacity| and the current store length; the result is the
// smaller of "base times kGrowthFactory" and "base plus kMaxGrowth".
int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
  const int base = Max(min_capacity, backing_store_.length());
  const int multiplied = base * kGrowthFactory;
  const int capped = base + kMaxGrowth;
  return Min(multiplied, capped);
}

void Scanner::LiteralBuffer::ExpandBuffer() {
  Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
  MemCopy(new_store.start(), backing_store_.start(), position_);
  backing_store_.Dispose();
  backing_store_ = new_store;
}

void Scanner::LiteralBuffer::ConvertToTwoByte() {
  DCHECK(is_one_byte_);
  Vector<byte> new_store;
  int new_content_size = position_ * kUC16Size;
  if (new_content_size >= backing_store_.length()) {
    // Ensure room for all currently read code units as UC16 as well
    // as the code unit about to be stored.
    new_store = Vector<byte>::New(NewCapacity(new_content_size));
  } else {
    new_store = backing_store_;
  }
  uint8_t* src = backing_store_.start();
  uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
  for (int i = position_ - 1; i >= 0; i--) {
    dst[i] = src[i];
  }
  if (new_store.start() != backing_store_.start()) {
    backing_store_.Dispose();
    backing_store_ = new_store;
  }
  position_ = new_content_size;
  is_one_byte_ = false;
}

// Appends |code_unit| to the literal, growing the store when it is full.
// While the buffer is in one-byte mode, values up to Latin1::kMaxChar are
// stored as a single byte; the first larger value converts the buffer to
// two-byte mode. In two-byte mode, values above kMaxNonSurrogateCharCode are
// written as a lead/trail surrogate pair.
void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) {
  if (position_ >= backing_store_.length()) ExpandBuffer();

  if (is_one_byte_) {
    const bool fits_in_one_byte =
        code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar);
    if (fits_in_one_byte) {
      backing_store_[position_] = static_cast<byte>(code_unit);
      position_ += kOneByteSize;
      return;
    }
    ConvertToTwoByte();
  }

  // Stores one 16-bit code unit at position_ and steps past it.
  auto append_uc16 = [this](uint16_t unit) {
    *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = unit;
    position_ += kUC16Size;
  };

  const bool needs_surrogate_pair =
      code_unit >
      static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode);
  if (!needs_surrogate_pair) {
    append_uc16(static_cast<uint16_t>(code_unit));
    return;
  }

  append_uc16(unibrow::Utf16::LeadSurrogate(code_unit));
  // The lead surrogate may have used up the last free slot.
  if (position_ >= backing_store_.length()) ExpandBuffer();
  append_uc16(unibrow::Utf16::TrailSurrogate(code_unit));
}

131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
// ----------------------------------------------------------------------------
// Scanner::BookmarkScope

// Sentinel values for bookmark_. They occupy the three largest size_t values.

// The bookmark has been applied (see Apply()).
const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
    std::numeric_limits<size_t>::max();
// No bookmark has been set.
const size_t Scanner::BookmarkScope::kNoBookmark =
    std::numeric_limits<size_t>::max() - 1;
// The bookmark was set before the first token became current (see Set()).
const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
    std::numeric_limits<size_t>::max() - 2;

// Records the scanner's current position as the bookmark. Must only be called
// while no bookmark is set and while there is no buffered look-ahead token.
void Scanner::BookmarkScope::Set() {
  DCHECK_EQ(bookmark_, kNoBookmark);
  DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);

  // The first token is a bit special, since current_ will still be
  // uninitialized. In that case store kBookmarkAtFirstPos and special-case it
  // when applying the bookmark.
  const bool before_first_token =
      scanner_->current_.token == Token::UNINITIALIZED;
  DCHECK_IMPLIES(
      before_first_token,
      scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);

  if (before_first_token) {
    bookmark_ = kBookmarkAtFirstPos;
  } else {
    bookmark_ = scanner_->location().beg_pos;
  }
}

void Scanner::BookmarkScope::Apply() {
  DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
  if (bookmark_ == kBookmarkAtFirstPos) {
    scanner_->SeekNext(0);
  } else {
    scanner_->SeekNext(bookmark_);
    scanner_->Next();
164
    DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
165 166 167 168 169 170 171 172 173 174 175
  }
  bookmark_ = kBookmarkWasApplied;
}

// True while a bookmark is stored, i.e. bookmark_ is neither the "none" nor
// the "already applied" sentinel.
bool Scanner::BookmarkScope::HasBeenSet() {
  const bool is_unset = bookmark_ == kNoBookmark;
  const bool is_applied = bookmark_ == kBookmarkWasApplied;
  return !(is_unset || is_applied);
}

// True once Apply() has replaced the bookmark with the "applied" sentinel.
bool Scanner::BookmarkScope::HasBeenApplied() {
  const bool is_applied = bookmark_ == kBookmarkWasApplied;
  return is_applied;
}
176

177
// ----------------------------------------------------------------------------
178
// Scanner
179

180
// Constructs a scanner that uses |unicode_cache| for character
// classification. No octal position/message is recorded, no HTML comment has
// been seen, and the harmony bigint / numeric-separator switches start off.
// The character stream is attached separately (see Initialize()).
Scanner::Scanner(UnicodeCache* unicode_cache)
181
    : unicode_cache_(unicode_cache),
182
      octal_pos_(Location::invalid()),
183
      octal_message_(MessageTemplate::kNone),
184
      found_html_comment_(false),
185 186
      allow_harmony_bigint_(false),
      allow_harmony_numeric_separator_(false) {}
187

188
// Attaches the scanner to |source| (which must not be null), records whether
// the input is a module, and scans the first token into next_.
void Scanner::Initialize(Utf16CharacterStream* source, bool is_module) {
  DCHECK_NOT_NULL(source);
  is_module_ = is_module;
  source_ = source;

  // NOTE(review): Init() is defined outside this chunk; presumably it primes
  // c0_ and the token slots - confirm.
  Init();

  // The first token is flagged as being preceded by a line terminator.
  has_line_terminator_before_next_ = true;
  Scan();
}

199
template <bool capture_raw, bool unicode>
200
uc32 Scanner::ScanHexNumber(int expected_length) {
201
  DCHECK_LE(expected_length, 4);  // prevent overflow
202

203
  int begin = source_pos() - 2;
204 205 206 207
  uc32 x = 0;
  for (int i = 0; i < expected_length; i++) {
    int d = HexValue(c0_);
    if (d < 0) {
208 209 210 211
      ReportScannerError(Location(begin, begin + expected_length + 2),
                         unicode
                             ? MessageTemplate::kInvalidUnicodeEscapeSequence
                             : MessageTemplate::kInvalidHexEscapeSequence);
212 213 214
      return -1;
    }
    x = x * 16 + d;
215
    Advance<capture_raw>();
216 217 218
  }

  return x;
219
}
lrn@chromium.org's avatar
lrn@chromium.org committed
220

221
template <bool capture_raw>
222
uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
marja's avatar
marja committed
223 224
  uc32 x = 0;
  int d = HexValue(c0_);
225 226
  if (d < 0) return -1;

marja's avatar
marja committed
227 228
  while (d >= 0) {
    x = x * 16 + d;
229 230 231 232 233
    if (x > max_value) {
      ReportScannerError(Location(beg_pos, source_pos() + 1),
                         MessageTemplate::kUndefinedUnicodeCodePoint);
      return -1;
    }
234
    Advance<capture_raw>();
marja's avatar
marja committed
235 236
    d = HexValue(c0_);
  }
237

marja's avatar
marja committed
238 239 240 241
  return x;
}


242 243 244
// Ensure that tokens can be stored in a byte.
STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);

245 246
// Table of one-character tokens, by character (0x00..0x7F only).
// clang-format off
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
// Indexed by character code 0x00..0x7F, eight entries per row. Every entry is
// Token::ILLEGAL except for the characters that always form a complete token
// on their own: ( ) , : ; ? [ ] { } ~
static const byte one_char_tokens[] = {
  // 0x00 - 0x07
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x08 - 0x0F
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x10 - 0x17
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x18 - 0x1F
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x20 - 0x27
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x28 - 0x2F:  (  )  *  +  ,  -  .  /
  Token::LPAREN,  Token::RPAREN,  Token::ILLEGAL, Token::ILLEGAL,
  Token::COMMA,   Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x30 - 0x37
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x38 - 0x3F:  8  9  :  ;  <  =  >  ?
  Token::ILLEGAL, Token::ILLEGAL, Token::COLON,   Token::SEMICOLON,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::CONDITIONAL,
  // 0x40 - 0x47
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x48 - 0x4F
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x50 - 0x57
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x58 - 0x5F:  X  Y  Z  [  \  ]  ^  _
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACK,
  Token::ILLEGAL, Token::RBRACK,  Token::ILLEGAL, Token::ILLEGAL,
  // 0x60 - 0x67
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x68 - 0x6F
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x70 - 0x77
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,
  // 0x78 - 0x7F:  x  y  z  {  |  }  ~  DEL
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACE,
  Token::ILLEGAL, Token::RBRACE,  Token::BIT_NOT, Token::ILLEGAL
};
377
// clang-format on
378

379
// Advances the token stream by one: next_ becomes current_, and a new next_
// is taken from the buffered look-ahead (next_next_), from the one-character
// token table, or from a full Scan(). Returns the new current token.
Token::Value Scanner::Next() {
  // At end of input, let EOS inherit the location of the preceding token.
  if (next_.token == Token::EOS) {
    next_.location.end_pos = current_.location.end_pos;
    next_.location.beg_pos = current_.location.beg_pos;
  }
  current_ = next_;

  // A token buffered by PeekAhead() is consumed before scanning anything new.
  const bool has_buffered_lookahead =
      next_next_.token != Token::UNINITIALIZED;
  if (V8_UNLIKELY(has_buffered_lookahead)) {
    next_ = next_next_;
    next_next_.contextual_token = Token::UNINITIALIZED;
    next_next_.token = Token::UNINITIALIZED;
    has_line_terminator_before_next_ = has_line_terminator_after_next_;
    return current_.token;
  }

  has_multiline_comment_before_next_ = false;
  has_line_terminator_before_next_ = false;

  // Fast path: 7-bit characters that are a complete token by themselves.
  Token::Value single_char_token = Token::ILLEGAL;
  if (static_cast<unsigned>(c0_) <= 0x7F) {
    single_char_token = static_cast<Token::Value>(one_char_tokens[c0_]);
  }
  if (single_char_token == Token::ILLEGAL) {
    Scan();
    return current_.token;
  }

  const int start = source_pos();
  next_.token = single_char_token;
  next_.contextual_token = Token::UNINITIALIZED;
  next_.location.beg_pos = start;
  next_.location.end_pos = start + 1;
  next_.literal_chars = nullptr;
  next_.raw_literal_chars = nullptr;
  next_.invalid_template_escape_message = MessageTemplate::kNone;
  Advance();
  return current_.token;
}


littledan's avatar
littledan committed
414
// Returns the token after next_ without changing current_ or next_. The
// scanned token is kept in next_next_, so repeated calls do not rescan. Must
// not be called while next_ is DIV or ASSIGN_DIV.
Token::Value Scanner::PeekAhead() {
  DCHECK(next_.token != Token::DIV);
  DCHECK(next_.token != Token::ASSIGN_DIV);

  if (next_next_.token != Token::UNINITIALIZED) {
    return next_next_.token;
  }

  // Snapshot the state that Next() is about to overwrite.
  const TokenDesc saved_current = current_;
  const bool saved_terminator_before_next =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;

  Next();

  // What Next() recorded as "before next" describes the gap in front of the
  // look-ahead token, i.e. "after next" from the caller's point of view.
  has_line_terminator_after_next_ =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;
  has_line_terminator_before_next_ = saved_terminator_before_next;

  // Shift every token slot back by one position.
  next_next_ = next_;
  next_ = current_;
  current_ = saved_current;
  return next_next_.token;
}


436
// Skips whitespace, line terminators and "-->" comment lines. Returns
// Token::WHITESPACE if the source position moved, and Token::ILLEGAL if it did
// not move or if the HTML comment handler returned ILLEGAL.
Token::Value Scanner::SkipWhiteSpace() {
  const int initial_position = source_pos();

  for (;;) {
    // Consume WhiteSpace and LineTerminator characters, remembering whether a
    // line terminator was among them. Never reads past the end of input.
    while (c0_ != kEndOfInput) {
      if (unibrow::IsLineTerminator(c0_)) {
        has_line_terminator_before_next_ = true;
      } else if (!unicode_cache_->IsWhiteSpace(c0_)) {
        break;
      }
      Advance();
    }

    // An HTML comment end '-->' at the beginning of a line (with only
    // whitespace in front of it) makes the rest of that line a comment. This
    // is in line with the way SpiderMonkey handles it.
    const bool maybe_comment_end =
        c0_ == '-' && has_line_terminator_before_next_;
    if (!maybe_comment_end) break;

    Advance();
    if (c0_ != '-') {
      PushBack('-');  // undo Advance()
      break;
    }

    Advance();
    if (c0_ != '>') {
      PushBack2('-', '-');  // undo 2x Advance();
      break;
    }

    // Treat the rest of the line as a comment.
    const Token::Value comment_result = SkipSingleHTMLComment();
    if (comment_result == Token::ILLEGAL) {
      return comment_result;
    }
  }

  // Report whether or not any characters were skipped.
  return source_pos() == initial_position ? Token::ILLEGAL
                                          : Token::WHITESPACE;
}

// Skips an HTML-style comment as a single-line comment. When the source is a
// module, a scanner error is reported instead and Token::ILLEGAL is returned.
Token::Value Scanner::SkipSingleHTMLComment() {
  if (!is_module_) {
    return SkipSingleLineComment();
  }
  ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
  return Token::ILLEGAL;
}
lrn@chromium.org's avatar
lrn@chromium.org committed
494

495
// Consumes one character unconditionally and then everything up to, but not
// including, the next line terminator or the end of input. Always returns
// Token::WHITESPACE.
Token::Value Scanner::SkipSingleLineComment() {
  // The line terminator at the end of the line is not considered to be part
  // of the single-line comment; it is recognized separately by the lexical
  // grammar and becomes part of the stream of input elements for the
  // syntactic grammar (see ECMA-262, section 7.4).
  do {
    Advance();
  } while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_));

  return Token::WHITESPACE;
}


511 512
// Gives TryToParseSourceURLComment() a chance to pick up a magic comment, then
// discards whatever is left of the line (the line terminator itself is not
// consumed). Always returns Token::WHITESPACE.
Token::Value Scanner::SkipSourceURLComment() {
  TryToParseSourceURLComment();

  for (;;) {
    if (c0_ == kEndOfInput) break;
    if (unibrow::IsLineTerminator(c0_)) break;
    Advance();
  }

  return Token::WHITESPACE;
}


void Scanner::TryToParseSourceURLComment() {
522
  // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
523
  // function will just return if it cannot parse a magic comment.
524
  if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
525 526
  Advance();
  LiteralBuffer name;
527 528
  while (c0_ != kEndOfInput &&
         !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
529 530 531 532 533 534
    name.AddChar(c0_);
    Advance();
  }
  if (!name.is_one_byte()) return;
  Vector<const uint8_t> name_literal = name.one_byte_literal();
  LiteralBuffer* value;
535
  if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
536
    value = &source_url_;
537
  } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
538 539 540 541 542 543 544 545
    value = &source_mapping_url_;
  } else {
    return;
  }
  if (c0_ != '=')
    return;
  Advance();
  value->Reset();
546
  while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
547 548
    Advance();
  }
549
  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
550 551 552 553 554 555 556 557 558 559 560 561
    // Disallowed characters.
    if (c0_ == '"' || c0_ == '\'') {
      value->Reset();
      return;
    }
    if (unicode_cache_->IsWhiteSpace(c0_)) {
      break;
    }
    value->AddChar(c0_);
    Advance();
  }
  // Allow whitespace at the end.
562
  while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
563 564 565 566 567 568 569 570 571
    if (!unicode_cache_->IsWhiteSpace(c0_)) {
      value->Reset();
      break;
    }
    Advance();
  }
}


572
// Skips a /* ... */ comment; must be entered with c0_ on the '*' that opens
// it. Returns Token::WHITESPACE once the closing "*/" is found, and
// Token::ILLEGAL if the input ends first.
Token::Value Scanner::SkipMultiLineComment() {
  DCHECK_EQ(c0_, '*');
  Advance();

  while (c0_ != kEndOfInput) {
    const uc32 previous = c0_;
    Advance();

    if (unibrow::IsLineTerminator(previous)) {
      // Following ECMA-262, section 7.4, a comment containing a newline will
      // make the comment count as a line-terminator.
      if (c0_ != kEndOfInput) has_multiline_comment_before_next_ = true;
    } else if (previous == '*' && c0_ == '/') {
      // End of the comment: the '/' is replaced by a space in c0_, so every
      // multi-line comment is treated as whitespace.
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}

597
// Called with c0_ on the '!' that follows a '<'. If the input continues with
// "--", the sequence is an HTML comment start ("<!--") and the rest of the
// line is skipped as a comment. Otherwise the consumed characters are pushed
// back and the '<' is returned as Token::LT.
Token::Value Scanner::ScanHtmlComment() {
  DCHECK_EQ(c0_, '!');
  Advance();

  if (c0_ == '-') {
    Advance();
    if (c0_ == '-') {
      found_html_comment_ = true;
      return SkipSingleHTMLComment();
    }
    PushBack2('-', '!');  // undo 2x Advance()
  } else {
    PushBack('!');  // undo Advance()
  }
  return Token::LT;
}
615

616
void Scanner::Scan() {
617 618
  next_.literal_chars = nullptr;
  next_.raw_literal_chars = nullptr;
619
  next_.invalid_template_escape_message = MessageTemplate::kNone;
620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

638 639
      case '"':
      case '\'':
640 641 642 643 644 645 646 647 648 649
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
650
        } else if (c0_ == '!') {
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
678
        // = == === =>
679 680 681
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
682 683
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
716
          if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
717 718
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
719
            token = SkipSingleHTMLComment();
720 721 722 723 724 725 726 727 728 729 730 731
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
732
        Advance();
733
        if (c0_ == '*') {
734 735 736 737 738 739
          token = Select('=', Token::ASSIGN_EXP, Token::EXP);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_MUL);
        } else {
          token = Token::MUL;
        }
740 741 742 743 744 745 746 747 748 749 750
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
751
          Advance();
752
          if (c0_ == '#' || c0_ == '@') {
753 754 755 756 757 758
            Advance();
            token = SkipSourceURLComment();
          } else {
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
804 805 806 807 808 809 810 811 812
          if (c0_ == '.') {
            Advance();
            if (c0_ == '.') {
              Advance();
              token = Token::ELLIPSIS;
            } else {
              PushBack('.');
            }
          }
813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

860
      case '`':
861 862
        token = ScanTemplateStart();
        break;
863

864 865 866 867
      case '#':
        token = ScanPrivateName();
        break;

868
      default:
869
        if (c0_ == kEndOfInput) {
870 871
          token = Token::EOS;
        } else if (unicode_cache_->IsIdentifierStart(c0_)) {
872 873 874 875
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else {
876 877 878 879
          token = SkipWhiteSpace();
          if (token == Token::ILLEGAL) {
            Advance();
          }
880 881 882 883 884 885 886 887 888
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
889 890 891 892 893 894 895
  if (Token::IsContextualKeyword(token)) {
    next_.token = Token::IDENTIFIER;
    next_.contextual_token = token;
  } else {
    next_.token = token;
    next_.contextual_token = Token::UNINITIALIZED;
  }
896 897 898 899 900 901

#ifdef DEBUG
  SanityCheckTokenDesc(current_);
  SanityCheckTokenDesc(next_);
  SanityCheckTokenDesc(next_next_);
#endif
902 903
}

904 905 906 907 908 909 910 911
#ifdef DEBUG
// Debug-only consistency check for a token descriptor.
//
// Most tokens should not have literal_chars or even raw_literal_chars:
// - UNINITIALIZED: anything goes.
// - TEMPLATE_*: need both literal and raw literal chars.
// - IDENTIFIER, STRING, NUMBER, etc.: need a literal, but no raw literal.
// - everything else: should have neither.
// Furthermore, only TEMPLATE_* tokens can have an
// invalid_template_escape_message.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      break;

    case Token::TEMPLATE_TAIL:
    case Token::TEMPLATE_SPAN:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      break;

    case Token::IDENTIFIER:
    case Token::PRIVATE_NAME:
    case Token::STRING:
    case Token::NUMBER:
    case Token::SMI:
    case Token::BIGINT:
    case Token::REGEXP_LITERAL:
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;

    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
      break;
  }

  // A contextual token is only ever attached to an IDENTIFIER, and contextual
  // keywords never appear as the main token.
  const bool is_identifier = token.token == Token::IDENTIFIER;
  const bool has_contextual_token =
      token.contextual_token != Token::UNINITIALIZED;
  DCHECK_IMPLIES(!is_identifier, !has_contextual_token);
  DCHECK_IMPLIES(
      has_contextual_token,
      is_identifier && Token::IsContextualKeyword(token.contextual_token));
  DCHECK(!Token::IsContextualKeyword(token.token));
}
#endif  // DEBUG
953

954
void Scanner::SeekForward(int pos) {
955 956 957 958
  // After this call, we will have the token at the given position as
  // the "next" token. The "current" token will be invalid.
  if (pos == next_.location.beg_pos) return;
  int current_pos = source_pos();
959
  DCHECK_EQ(next_.location.end_pos, current_pos);
960
  // Positions inside the lookahead token aren't supported.
961
  DCHECK(pos >= current_pos);
962
  if (pos != current_pos) {
963
    source_->Seek(pos);
964 965 966 967 968 969 970 971
    Advance();
    // This function is only called to seek to the location
    // of the end of a function (at the "}" token). It doesn't matter
    // whether there was a line terminator in the part we skip.
    has_line_terminator_before_next_ = false;
    has_multiline_comment_before_next_ = false;
  }
  Scan();
972 973 974
}


975
template <bool capture_raw, bool in_template_literal>
976
bool Scanner::ScanEscape() {
977
  uc32 c = c0_;
978
  Advance<capture_raw>();
979 980

  // Skip escaped newlines.
981 982
  if (!in_template_literal && c0_ != kEndOfInput &&
      unibrow::IsLineTerminator(c)) {
983
    // Allow escaped CR+LF newlines in multiline string literals.
984
    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
985
    return true;
986
  }
987

988 989 990 991 992 993 994 995 996 997
  switch (c) {
    case '\'':  // fall through
    case '"' :  // fall through
    case '\\': break;
    case 'b' : c = '\b'; break;
    case 'f' : c = '\f'; break;
    case 'n' : c = '\n'; break;
    case 'r' : c = '\r'; break;
    case 't' : c = '\t'; break;
    case 'u' : {
998
      c = ScanUnicodeEscape<capture_raw>();
999
      if (c < 0) return false;
1000 1001
      break;
    }
1002 1003 1004 1005
    case 'v':
      c = '\v';
      break;
    case 'x': {
1006
      c = ScanHexNumber<capture_raw>(2);
1007
      if (c < 0) return false;
1008 1009
      break;
    }
1010
    case '0':  // Fall through.
1011 1012 1013 1014 1015 1016
    case '1':  // fall through
    case '2':  // fall through
    case '3':  // fall through
    case '4':  // fall through
    case '5':  // fall through
    case '6':  // fall through
1017
    case '7':
1018
      c = ScanOctalEscape<capture_raw>(c, 2, in_template_literal);
1019
      break;
1020
  }
1021

1022
  // Other escaped characters are interpreted as their non-escaped version.
1023
  AddLiteralChar(c);
1024
  return true;
1025 1026
}

1027
template <bool capture_raw>
1028
uc32 Scanner::ScanOctalEscape(uc32 c, int length, bool in_template_literal) {
1029 1030 1031 1032 1033 1034 1035 1036
  uc32 x = c - '0';
  int i = 0;
  for (; i < length; i++) {
    int d = c0_ - '0';
    if (d < 0 || d > 7) break;
    int nx = x * 8 + d;
    if (nx >= 256) break;
    x = nx;
1037
    Advance<capture_raw>();
1038 1039 1040 1041 1042 1043
  }
  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
  // Remember the position of octal escape sequences so that an error
  // can be reported later (in strict mode).
  // We don't report the error immediately, because the octal escape can
  // occur before the "use strict" directive.
1044
  if (c != '0' || i > 0 || c0_ == '8' || c0_ == '9') {
1045
    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
1046 1047 1048
    octal_message_ = in_template_literal
                         ? MessageTemplate::kTemplateOctalLiteral
                         : MessageTemplate::kStrictOctalEscape;
1049
  }
1050
  return x;
1051 1052 1053
}


1054
// Scans a string literal delimited by the quote character currently in c0_.
// Returns Token::STRING on success and Token::ILLEGAL if the literal is
// unterminated or contains an invalid escape sequence.
Token::Value Scanner::ScanString() {
  const uc32 delimiter = c0_;
  Advance<false, false>();  // Skip the opening quote.

  LiteralScope literal(this);

  // First pass: plain ASCII content without escapes. Leaves the loop as soon
  // as a backslash or a non-ASCII character shows up.
  for (;;) {
    if (c0_ > kMaxAscii) {
      HandleLeadSurrogate();
      break;
    }
    if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') {
      return Token::ILLEGAL;
    }
    if (c0_ == delimiter) {
      literal.Complete();
      Advance<false, false>();
      return Token::STRING;
    }
    if (c0_ == '\\') break;
    char ascii_char = static_cast<char>(c0_);
    Advance<false, false>();
    AddLiteralChar(ascii_char);
  }

  // Second pass: general content, including escapes and non-ASCII characters.
  bool (*is_terminator)(unsigned int) =
      FLAG_harmony_subsume_json ? unibrow::IsStringLiteralLineTerminator
                                : unibrow::IsLineTerminator;

  while (c0_ != delimiter && c0_ != kEndOfInput && !is_terminator(c0_)) {
    uc32 current = c0_;
    Advance();
    if (current != '\\') {
      AddLiteralChar(current);
      continue;
    }
    if (c0_ == kEndOfInput) return Token::ILLEGAL;
    if (!ScanEscape<false, false>()) return Token::ILLEGAL;
  }

  if (c0_ != delimiter) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // Skip the closing quote.
  return Token::STRING;
}
1097

1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
// Scans a private name: a '#' immediately followed by an identifier. The '#'
// is kept as part of the literal. Reports kInvalidOrUnexpectedToken and
// returns Token::ILLEGAL when private fields are disabled or when no
// identifier start follows the '#'.
Token::Value Scanner::ScanPrivateName() {
  if (!allow_harmony_private_fields()) {
    ReportScannerError(source_pos(),
                       MessageTemplate::kInvalidOrUnexpectedToken);
    return Token::ILLEGAL;
  }

  LiteralScope literal(this);
  DCHECK_EQ(c0_, '#');
  AddLiteralCharAdvance();

  const bool has_identifier_start =
      c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_);
  if (!has_identifier_start) {
    PushBack(c0_);
    ReportScannerError(source_pos(),
                       MessageTemplate::kInvalidOrUnexpectedToken);
    return Token::ILLEGAL;
  }

  if (ScanIdentifierOrKeywordInner(&literal) == Token::ILLEGAL) {
    return Token::ILLEGAL;
  }
  return Token::PRIVATE_NAME;
}
1118

1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132
// Scans one piece of a template literal, starting right after the opening
// '`' or the '}' that closes a substitution:
//
//   TEMPLATE_SPAN ::
//       ` LiteralChars* ${
//     | } LiteralChars* ${
//
//   TEMPLATE_TAIL ::
//       ` LiteralChars* `
//     | } LiteralChars* `
//
// A TEMPLATE_SPAN is always followed by an Expression; a TEMPLATE_TAIL ends
// the TemplateLiteral. Both the cooked and the raw literal are collected.
Token::Value Scanner::ScanTemplateSpan() {
  // Save the current error state and restore it on exit. Errors raised by
  // invalid escapes inside the template are moved into next_ instead, because
  // the parser decides how to treat them.
  ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
  ErrorState octal_error_state(&octal_message_, &octal_pos_);

  Token::Value result = Token::TEMPLATE_SPAN;
  LiteralScope literal(this);
  StartRawLiteral();
  const bool capture_raw = true;
  const bool in_template_literal = true;

  for (;;) {
    uc32 current = c0_;
    Advance<capture_raw>();

    if (current == '`') {
      result = Token::TEMPLATE_TAIL;
      ReduceRawLiteralLength(1);
      break;
    }

    if (current == '$' && c0_ == '{') {
      Advance<capture_raw>();  // Consume '{'.
      ReduceRawLiteralLength(2);
      break;
    }

    if (current == '\\') {
      const bool is_line_continuation =
          c0_ != kEndOfInput && unibrow::IsLineTerminator(c0_);
      if (!is_line_continuation) {
        bool success = ScanEscape<capture_raw, in_template_literal>();
        USE(success);
        DCHECK_EQ(!success, has_error());
        // Invalid escape sequences in templates are checked by the parser.
        scanner_error_state.MoveErrorTo(&next_);
        octal_error_state.MoveErrorTo(&next_);
        continue;
      }

      // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
      // code unit sequence, so nothing is added to the cooked literal.
      const uc32 terminator = c0_;
      Advance<capture_raw>();
      if (terminator != '\r') continue;
      ReduceRawLiteralLength(1);  // Drop the \r from the raw literal.
      if (c0_ == '\n') {
        Advance<capture_raw>();  // Captures the \n.
      } else {
        AddRawLiteralChar('\n');
      }
      continue;
    }

    if (current < 0) {
      // Unterminated template literal.
      PushBack(current);
      break;
    }

    // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A, and the TRV
    // of LineTerminatorSequence :: <CR><LF> is the sequence consisting of the
    // CV 0x000A.
    if (current == '\r') {
      ReduceRawLiteralLength(1);  // Drop the \r from the raw literal.
      if (c0_ == '\n') {
        Advance<capture_raw>();  // Captures the \n.
      } else {
        AddRawLiteralChar('\n');
      }
      current = '\n';
    }
    AddLiteralChar(current);
  }

  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;
  next_.contextual_token = Token::UNINITIALIZED;

  return result;
}


1207
// Begins scanning a template literal at its opening '`': records the start
// position in next_, consumes the backtick and scans the first span.
Token::Value Scanner::ScanTemplateStart() {
  DCHECK_EQ(c0_, '`');
  DCHECK_EQ(next_next_.token, Token::UNINITIALIZED);

  next_.location.beg_pos = source_pos();
  Advance();  // Skip the '`'.

  const Token::Value first_span = ScanTemplateSpan();
  return first_span;
}

1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226
// Returns the internalized source URL, or a default-constructed handle when
// no source URL has been recorded.
Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
  if (source_url_.length() <= 0) return Handle<String>();
  return source_url_.Internalize(isolate);
}

// Returns the internalized source mapping URL, or a default-constructed
// handle when no source mapping URL has been recorded.
Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
  if (source_mapping_url_.length() <= 0) return Handle<String>();
  return source_mapping_url_.Internalize(isolate);
}
1227

1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
// Scans a run of digits accepted by |predicate| in which single '_'
// separators may appear between digits. Digits are added to the literal;
// separators are skipped. Reports an error and returns false for two
// consecutive separators or a trailing separator. When |is_check_first_digit|
// is set, the run must start with a digit.
bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
                                              bool is_check_first_digit) {
  // There must be at least one digit after 'x'/'b'/'o'.
  if (is_check_first_digit && !predicate(c0_)) return false;

  bool last_was_separator = false;
  for (;;) {
    if (c0_ == '_') {
      Advance<false, false>();
      if (c0_ == '_') {
        ReportScannerError(Location(source_pos(), source_pos() + 1),
                           MessageTemplate::kContinuousNumericSeparator);
        return false;
      }
      last_was_separator = true;
    } else if (predicate(c0_)) {
      last_was_separator = false;
      AddLiteralCharAdvance();
    } else {
      break;
    }
  }

  if (last_was_separator) {
    ReportScannerError(Location(source_pos(), source_pos() + 1),
                       MessageTemplate::kTrailingNumericSeparator);
    return false;
  }
  return true;
}

1258
bool Scanner::ScanDecimalDigits() {
1259
  if (allow_harmony_numeric_separator()) {
1260
    return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
1261 1262 1263 1264 1265 1266 1267
  }
  while (IsDecimalDigit(c0_)) {
    AddLiteralCharAdvance();
  }
  return true;
}

1268
// Like ScanDecimalAsSmi, but accepts single '_' separators between digits.
// Each digit is folded into |*value| and added to the literal. Reports an
// error and returns false for two consecutive separators or a trailing
// separator.
bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
  bool last_was_separator = false;
  for (;;) {
    if (c0_ == '_') {
      Advance<false, false>();
      if (c0_ == '_') {
        ReportScannerError(Location(source_pos(), source_pos() + 1),
                           MessageTemplate::kContinuousNumericSeparator);
        return false;
      }
      last_was_separator = true;
    } else if (IsDecimalDigit(c0_)) {
      last_was_separator = false;
      *value = 10 * *value + (c0_ - '0');
      uc32 digit_char = c0_;
      Advance<false, false>();
      AddLiteralChar(digit_char);
    } else {
      break;
    }
  }

  if (last_was_separator) {
    ReportScannerError(Location(source_pos(), source_pos() + 1),
                       MessageTemplate::kTrailingNumericSeparator);
    return false;
  }
  return true;
}

1297
// Scans decimal digits, adding them to the literal and accumulating their
// numeric value into |*value| at the same time. Delegates to the
// separator-aware variant when numeric separators are enabled.
bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
  if (!allow_harmony_numeric_separator()) {
    while (IsDecimalDigit(c0_)) {
      *value = 10 * *value + (c0_ - '0');
      uc32 digit_char = c0_;
      Advance<false, false>();
      AddLiteralChar(digit_char);
    }
    return true;
  }
  return ScanDecimalAsSmiWithNumericSeparators(value);
}

1311
bool Scanner::ScanBinaryDigits() {
1312
  if (allow_harmony_numeric_separator()) {
1313
    return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
1314 1315
  }

1316
  // we must have at least one binary digit after 'b'/'B'
1317 1318 1319 1320
  if (!IsBinaryDigit(c0_)) {
    return false;
  }

1321 1322 1323 1324 1325 1326
  while (IsBinaryDigit(c0_)) {
    AddLiteralCharAdvance();
  }
  return true;
}

1327
bool Scanner::ScanOctalDigits() {
1328
  if (allow_harmony_numeric_separator()) {
1329
    return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
1330 1331
  }

1332
  // we must have at least one octal digit after 'o'/'O'
1333 1334 1335 1336
  if (!IsOctalDigit(c0_)) {
    return false;
  }

1337 1338 1339
  while (IsOctalDigit(c0_)) {
    AddLiteralCharAdvance();
  }
1340
  return true;
1341 1342
}

1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362
// Scans the digits of a literal that starts with '0' followed by an octal
// digit. If an '8' or '9' is encountered, |*kind| becomes
// DECIMAL_WITH_LEADING_ZERO and scanning stops there; otherwise |*kind| is
// IMPLICIT_OCTAL and the literal's extent (from |start_pos|) is recorded as an
// octal position. Always returns true.
bool Scanner::ScanImplicitOctalDigits(int start_pos,
                                      Scanner::NumberKind* kind) {
  *kind = IMPLICIT_OCTAL;

  // Consume everything that can still be part of an octal number.
  while ('0' <= c0_ && c0_ <= '7') {
    AddLiteralCharAdvance();
  }

  if (c0_ == '8' || c0_ == '9') {
    // Not an octal literal after all.
    *kind = DECIMAL_WITH_LEADING_ZERO;
    return true;
  }

  // Octal literal finished.
  octal_pos_ = Location(start_pos, source_pos());
  octal_message_ = MessageTemplate::kStrictOctalLiteral;
  return true;
}

1363
bool Scanner::ScanHexDigits() {
1364
  if (allow_harmony_numeric_separator()) {
1365
    return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
1366 1367
  }

1368
  // we must have at least one hex digit after 'x'/'X'
1369 1370 1371 1372
  if (!IsHexDigit(c0_)) {
    return false;
  }

1373 1374 1375 1376 1377 1378
  while (IsHexDigit(c0_)) {
    AddLiteralCharAdvance();
  }
  return true;
}

1379
bool Scanner::ScanSignedInteger() {
1380 1381 1382
  if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
  // we must have at least one decimal digit after 'e'/'E'
  if (!IsDecimalDigit(c0_)) return false;
1383
  return ScanDecimalDigits();
1384
}
1385

1386
// Scans a numeric literal. |seen_period| is true when the caller has already
// consumed a leading '.', in which case c0_ is the first digit of the
// fraction. Returns Token::SMI, Token::NUMBER or Token::BIGINT on success and
// Token::ILLEGAL for a malformed literal.
Token::Value Scanner::ScanNumber(bool seen_period) {
  // c0_ is the first digit of the number or of the fraction.
  DCHECK(IsDecimalDigit(c0_));

  NumberKind kind = DECIMAL;

  LiteralScope literal(this);
  bool try_smi = !seen_period;
  const int literal_start = source_pos();  // For reporting octal positions.

  if (seen_period) {
    // The decimal point of the float has already been consumed.
    AddLiteralChar('.');
    if (allow_harmony_numeric_separator() && c0_ == '_') {
      return Token::ILLEGAL;
    }
    // At least one digit is known to follow.
    if (!ScanDecimalDigits()) return Token::ILLEGAL;
  } else {
    // A leading '0' may introduce a hex, octal, binary or legacy octal
    // literal; otherwise it is 0, 0exxx, 0Exxx or 0.xxx.
    if (c0_ == '0') {
      AddLiteralCharAdvance();

      switch (c0_) {
        case 'x':
        case 'X':
          AddLiteralCharAdvance();
          kind = HEX;
          if (!ScanHexDigits()) return Token::ILLEGAL;
          break;
        case 'o':
        case 'O':
          AddLiteralCharAdvance();
          kind = OCTAL;
          if (!ScanOctalDigits()) return Token::ILLEGAL;
          break;
        case 'b':
        case 'B':
          AddLiteralCharAdvance();
          kind = BINARY;
          if (!ScanBinaryDigits()) return Token::ILLEGAL;
          break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
          kind = IMPLICIT_OCTAL;
          if (!ScanImplicitOctalDigits(literal_start, &kind)) {
            return Token::ILLEGAL;
          }
          if (kind == DECIMAL_WITH_LEADING_ZERO) {
            try_smi = false;
          }
          break;
        case '8':
        case '9':
          kind = DECIMAL_WITH_LEADING_ZERO;
          break;
        case '_':
          if (allow_harmony_numeric_separator()) {
            ReportScannerError(Location(source_pos(), source_pos() + 1),
                               MessageTemplate::kZeroDigitNumericSeparator);
            return Token::ILLEGAL;
          }
          break;
        default:
          break;
      }
    }

    // Decimal digits, optionally followed by a fractional part.
    const bool is_decimal =
        kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO;
    if (is_decimal) {
      // Optimization: try to produce a Smi directly while scanning.
      if (try_smi) {
        uint64_t smi_candidate = 0;
        if (!ScanDecimalAsSmi(&smi_candidate)) {
          return Token::ILLEGAL;
        }

        if (next_.literal_chars->one_byte_literal().length() <= 10 &&
            smi_candidate <= Smi::kMaxValue && c0_ != '.' &&
            (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
          next_.smi_value_ = static_cast<uint32_t>(smi_candidate);
          literal.Complete();
          HandleLeadSurrogate();

          if (kind == DECIMAL_WITH_LEADING_ZERO) {
            octal_pos_ = Location(literal_start, source_pos());
            octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
          }
          return Token::SMI;
        }
        HandleLeadSurrogate();
      }

      if (!ScanDecimalDigits()) return Token::ILLEGAL;
      if (c0_ == '.') {
        seen_period = true;
        AddLiteralCharAdvance();
        if (allow_harmony_numeric_separator() && c0_ == '_') {
          return Token::ILLEGAL;
        }
        if (!ScanDecimalDigits()) return Token::ILLEGAL;
      }
    }
  }

  bool has_bigint_suffix = false;
  const bool kind_allows_bigint =
      kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY;
  if (allow_harmony_bigint() && c0_ == 'n' && !seen_period &&
      kind_allows_bigint) {
    // Check that the literal is within our limits for BigInt length.
    // For simplicity, use 4 bits per character to calculate the maximum
    // allowed literal length.
    static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
    int length = source_pos() - literal_start - (kind != DECIMAL ? 2 : 0);
    if (length > kMaxBigIntCharacters) {
      ReportScannerError(Location(literal_start, source_pos()),
                         MessageTemplate::kBigIntTooBig);
      return Token::ILLEGAL;
    }

    has_bigint_suffix = true;
    Advance();
  } else if (c0_ == 'e' || c0_ == 'E') {
    // An exponent follows. 'e'/'E' must have been scanned as part of a hex
    // number already.
    DCHECK(kind != HEX);

    if (kind != DECIMAL && kind != DECIMAL_WITH_LEADING_ZERO) {
      return Token::ILLEGAL;
    }

    AddLiteralCharAdvance();

    if (!ScanSignedInteger()) return Token::ILLEGAL;
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_))) {
    return Token::ILLEGAL;
  }

  literal.Complete();

  if (kind == DECIMAL_WITH_LEADING_ZERO) {
    octal_pos_ = Location(literal_start, source_pos());
    octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
  }

  if (has_bigint_suffix) return Token::BIGINT;
  return Token::NUMBER;
}


1524
// Scans a unicode escape inside an identifier. Skips the current character
// and requires a 'u' to follow; returns -1 if it does not, otherwise the
// result of ScanUnicodeEscape.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ == 'u') {
    Advance();
    return ScanUnicodeEscape<false>();
  }
  return -1;
}


1532
template <bool capture_raw>
marja's avatar
marja committed
1533
uc32 Scanner::ScanUnicodeEscape() {
adamk's avatar
adamk committed
1534 1535 1536
  // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
  // hex digits between { } is arbitrary. \ and u have already been read.
  if (c0_ == '{') {
1537
    int begin = source_pos() - 2;
1538
    Advance<capture_raw>();
1539
    uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
1540 1541 1542
    if (cp < 0 || c0_ != '}') {
      ReportScannerError(source_pos(),
                         MessageTemplate::kInvalidUnicodeEscapeSequence);
marja's avatar
marja committed
1543 1544
      return -1;
    }
1545
    Advance<capture_raw>();
marja's avatar
marja committed
1546 1547
    return cp;
  }
1548 1549
  const bool unicode = true;
  return ScanHexNumber<capture_raw, unicode>(4);
1550 1551 1552
}


1553 1554 1555
// ----------------------------------------------------------------------------
// Keyword Matcher

1556
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
1557
  KEYWORD_GROUP('a')                                        \
1558 1559
  KEYWORD("arguments", Token::ARGUMENTS)                    \
  KEYWORD("as", Token::AS)                                  \
1560
  KEYWORD("async", Token::ASYNC)                            \
1561
  KEYWORD("await", Token::AWAIT)                            \
1562
  KEYWORD("anonymous", Token::ANONYMOUS)                    \
1563 1564 1565 1566 1567 1568 1569
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
1570
  KEYWORD("constructor", Token::CONSTRUCTOR)                \
1571 1572 1573 1574 1575 1576 1577 1578
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
1579
  KEYWORD("enum", Token::ENUM)                              \
1580
  KEYWORD("eval", Token::EVAL)                              \
1581 1582 1583 1584 1585 1586
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
1587
  KEYWORD("from", Token::FROM)                              \
1588
  KEYWORD("function", Token::FUNCTION)                      \
1589 1590
  KEYWORD_GROUP('g')                                        \
  KEYWORD("get", Token::GET)                                \
1591 1592 1593 1594 1595 1596 1597 1598 1599
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
1600 1601
  KEYWORD_GROUP('m')                                        \
  KEYWORD("meta", Token::META)                              \
1602
  KEYWORD_GROUP('n')                                        \
1603
  KEYWORD("name", Token::NAME)                              \
1604 1605
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
1606 1607
  KEYWORD_GROUP('o')                                        \
  KEYWORD("of", Token::OF)                                  \
1608 1609 1610 1611
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
1612
  KEYWORD("prototype", Token::PROTOTYPE)                    \
1613 1614 1615 1616
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
1617
  KEYWORD("set", Token::SET)                                \
1618 1619 1620 1621
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
1622
  KEYWORD("target", Token::TARGET)                          \
1623 1624 1625 1626 1627
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
1628 1629
  KEYWORD_GROUP('u')                                        \
  KEYWORD("undefined", Token::UNDEFINED)                    \
1630 1631 1632 1633 1634 1635 1636
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
1637 1638
  KEYWORD("yield", Token::YIELD)                            \
  KEYWORD_GROUP('_')                                        \
1639 1640 1641
  KEYWORD("__proto__", Token::PROTO_UNDERSCORED)            \
  KEYWORD_GROUP('#')                                        \
  KEYWORD("#constructor", Token::PRIVATE_CONSTRUCTOR)
1642

1643
static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1644
                                             int input_length) {
1645
  DCHECK_GE(input_length, 1);
1646
  const int kMinLength = 2;
1647
  const int kMaxLength = 12;
1648 1649 1650 1651 1652 1653 1654 1655
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677
#define KEYWORD(keyword, token)                                           \
  {                                                                       \
    /* 'keyword' is a char array, so sizeof(keyword) is */                \
    /* strlen(keyword) plus 1 for the NUL char. */                        \
    const int keyword_length = sizeof(keyword) - 1;                       \
    STATIC_ASSERT(keyword_length >= kMinLength);                          \
    STATIC_ASSERT(keyword_length <= kMaxLength);                          \
    DCHECK_EQ(input[0], keyword[0]);                                      \
    DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD ||                 \
           0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \
    if (input_length == keyword_length && input[1] == keyword[1] &&       \
        (keyword_length <= 2 || input[2] == keyword[2]) &&                \
        (keyword_length <= 3 || input[3] == keyword[3]) &&                \
        (keyword_length <= 4 || input[4] == keyword[4]) &&                \
        (keyword_length <= 5 || input[5] == keyword[5]) &&                \
        (keyword_length <= 6 || input[6] == keyword[6]) &&                \
        (keyword_length <= 7 || input[7] == keyword[7]) &&                \
        (keyword_length <= 8 || input[8] == keyword[8]) &&                \
        (keyword_length <= 9 || input[9] == keyword[9]) &&                \
        (keyword_length <= 10 || input[10] == keyword[10])) {             \
      return token;                                                       \
    }                                                                     \
1678
  }
1679
      KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1680
  }
1681
  return Token::IDENTIFIER;
1682 1683 1684
#undef KEYWORDS
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
1685 1686
}

1687
// Scans an identifier or keyword starting at the current character, using a
// fresh literal scope for its characters.
Token::Value Scanner::ScanIdentifierOrKeyword() {
  LiteralScope literal(this);
  const Token::Value token = ScanIdentifierOrKeywordInner(&literal);
  return token;
}

// Scans an identifier or keyword into the literal guarded by |literal|. c0_
// must be an identifier start character. Pure-ASCII identifiers are handled
// on fast paths; identifiers containing escapes are handed over to
// ScanIdentifierSuffix.
Token::Value Scanner::ScanIdentifierOrKeywordInner(LiteralScope* literal) {
  DCHECK(unicode_cache_->IsIdentifierStart(c0_));

  // Moves the current (one-byte) character from the input into the literal.
  auto consume_one_byte = [this]() {
    char one_byte_char = static_cast<char>(c0_);
    Advance<false, false>();
    AddLiteralChar(one_byte_char);
  };

  // Looks the one-byte literal up in the keyword table. The literal is only
  // completed for identifiers, future strict reserved words and contextual
  // keywords.
  auto classify_one_byte_literal = [this, literal]() -> Token::Value {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    const bool keep_literal = token == Token::IDENTIFIER ||
                              token == Token::FUTURE_STRICT_RESERVED_WORD ||
                              Token::IsContextualKeyword(token);
    if (keep_literal) literal->Complete();
    return token;
  };

  if (IsInRange(c0_, 'a', 'z') || c0_ == '_') {
    // Lowercase letters and underscores: may still turn out to be a keyword.
    do {
      consume_one_byte();
    } while (IsInRange(c0_, 'a', 'z') || c0_ == '_');

    if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
        c0_ == '$') {
      // An identifier that merely starts with lowercase characters.
      consume_one_byte();
      while (IsAsciiIdentifier(c0_)) {
        consume_one_byte();
      }
      if (c0_ <= kMaxAscii && c0_ != '\\') {
        literal->Complete();
        return Token::IDENTIFIER;
      }
    } else if (c0_ <= kMaxAscii && c0_ != '\\') {
      // Only a-z+ or _: could be a keyword or identifier.
      return classify_one_byte_literal();
    }

    HandleLeadSurrogate();
  } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
    // Starts with an uppercase letter or '$': cannot be a keyword.
    do {
      consume_one_byte();
    } while (IsAsciiIdentifier(c0_));

    if (c0_ <= kMaxAscii && c0_ != '\\') {
      literal->Complete();
      return Token::IDENTIFIER;
    }

    HandleLeadSurrogate();
  } else if (c0_ == '\\') {
    // The identifier starts with a unicode escape.
    uc32 escaped_start = ScanIdentifierUnicodeEscape();
    // Only legal identifier start characters are allowed, and escapes must
    // not be recursive.
    const bool valid_start = escaped_start >= 0 && escaped_start != '\\' &&
                             unicode_cache_->IsIdentifierStart(escaped_start);
    if (!valid_start) return Token::ILLEGAL;
    AddLiteralChar(escaped_start);
    return ScanIdentifierSuffix(literal, true);
  } else {
    uc32 start_char = c0_;
    Advance();
    AddLiteralChar(start_char);
  }

  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ == '\\') {
      // An escape follows; continue in the escape-aware scanner.
      return ScanIdentifierSuffix(literal, false);
    }
    uc32 part_char = c0_;
    Advance();
    AddLiteralChar(part_char);
  }

  if (next_.literal_chars->is_one_byte()) {
    return classify_one_byte_literal();
  }
  literal->Complete();
  return Token::IDENTIFIER;
}

lrn@chromium.org's avatar
lrn@chromium.org committed
1785

1786 1787
// Scans the remaining characters of an identifier, decoding unicode escapes
// along the way. |escaped| tells whether an escape has already been seen in
// the part scanned so far. A one-byte identifier that contained an escape and
// spells a keyword is reported as an ESCAPED_* token instead of the keyword.
Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
                                           bool escaped) {
  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ != '\\') {
      AddLiteralChar(c0_);
      Advance();
      continue;
    }

    uc32 decoded = ScanIdentifierUnicodeEscape();
    escaped = true;
    // Only legal identifier part characters are allowed, and escapes must not
    // be recursive.
    const bool valid_part = decoded >= 0 && decoded != '\\' &&
                            unicode_cache_->IsIdentifierPart(decoded);
    if (!valid_part) return Token::ILLEGAL;
    AddLiteralChar(decoded);
  }
  literal->Complete();

  if (!escaped || !next_.literal_chars->is_one_byte()) {
    return Token::IDENTIFIER;
  }

  Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
  Token::Value token = KeywordOrIdentifierToken(chars.start(), chars.length());
  /* TODO(adamk): YIELD should be handled specially. */
  if (token == Token::IDENTIFIER || Token::IsContextualKeyword(token)) {
    return token;
  }
  if (token == Token::FUTURE_STRICT_RESERVED_WORD || token == Token::LET ||
      token == Token::STATIC) {
    return Token::ESCAPED_STRICT_RESERVED_WORD;
  }
  return Token::ESCAPED_KEYWORD;
}

1824 1825 1826
bool Scanner::ScanRegExpPattern() {
  DCHECK(next_next_.token == Token::UNINITIALIZED);
  DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
lrn@chromium.org's avatar
lrn@chromium.org committed
1827

1828 1829
  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  bool in_character_class = false;
1830
  bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1831 1832 1833 1834 1835

  // Previous token is either '/' or '/=', in the second case, the
  // pattern starts at =.
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1836

1837 1838 1839 1840 1841 1842 1843 1844 1845
  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  // the scanner should pass uninterpreted bodies to the RegExp
  // constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  while (c0_ != '/' || in_character_class) {
1846
    if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1847
      return false;
1848
    }
1849 1850
    if (c0_ == '\\') {  // Escape sequence.
      AddLiteralCharAdvance();
1851
      if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1852
        return false;
1853
      }
1854 1855 1856 1857 1858 1859 1860 1861 1862
      AddLiteralCharAdvance();
      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
      // only "safe" characters are allowed (letters, digits, underscore),
      // otherwise the escape isn't valid and the invalid character has
      // its normal meaning. I.e., we can just continue scanning without
      // worrying whether the following characters are part of the escape
      // or not, since any '/', '\\' or '[' is guaranteed to not be part
      // of the escape sequence.

1863
      // TODO(896): At some point, parse RegExps more thoroughly to capture
1864 1865 1866 1867 1868 1869 1870 1871 1872 1873
      // octal esacpes in strict mode.
    } else {  // Unescaped character.
      if (c0_ == '[') in_character_class = true;
      if (c0_ == ']') in_character_class = false;
      AddLiteralCharAdvance();
    }
  }
  Advance();  // consume '/'

  literal.Complete();
1874
  next_.token = Token::REGEXP_LITERAL;
1875
  next_.contextual_token = Token::UNINITIALIZED;
1876
  return true;
1877 1878 1879
}


1880
// Scans the flag characters that follow a regular expression literal.
//
// Consumes identifier-part characters, mapping each of 'g', 'i', 'm', 's',
// 'u' and 'y' to its RegExp flag. Returns Nothing if an unknown flag
// character is seen or a flag occurs twice; otherwise updates the end
// position of next_ and returns the combined flags.
Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
  DCHECK(next_.token == Token::REGEXP_LITERAL);

  // Bit set of the flags collected so far.
  int seen_flags = 0;
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    RegExp::Flags current = RegExp::kNone;
    switch (c0_) {
      case 'g': current = RegExp::kGlobal; break;
      case 'i': current = RegExp::kIgnoreCase; break;
      case 'm': current = RegExp::kMultiline; break;
      case 's': current = RegExp::kDotAll; break;
      case 'u': current = RegExp::kUnicode; break;
      case 'y': current = RegExp::kSticky; break;
      default:
        // Not a recognized flag character.
        return Nothing<RegExp::Flags>();
    }

    // Each flag may appear at most once.
    const bool already_seen = (seen_flags & current) != 0;
    if (already_seen) return Nothing<RegExp::Flags>();

    seen_flags |= current;
    Advance();
  }

  next_.location.end_pos = source_pos();
  return Just(RegExp::Flags(seen_flags));
}

1920 1921
// Returns the string that |ast_value_factory| provides for the current
// literal, using the one-byte or two-byte lookup depending on the literal's
// representation.
const AstRawString* Scanner::CurrentSymbol(
    AstValueFactory* ast_value_factory) const {
  if (!is_literal_one_byte()) {
    return ast_value_factory->GetTwoByteString(literal_two_byte_string());
  }
  return ast_value_factory->GetOneByteString(literal_one_byte_string());
}

1928 1929
// Returns the string that |ast_value_factory| provides for the literal of
// the next token, using the one-byte or two-byte lookup depending on the
// literal's representation.
const AstRawString* Scanner::NextSymbol(
    AstValueFactory* ast_value_factory) const {
  if (!is_next_literal_one_byte()) {
    return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
  }
  return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
}

1936
// Returns the string that |ast_value_factory| provides for the current raw
// literal, using the one-byte or two-byte lookup depending on the raw
// literal's representation.
const AstRawString* Scanner::CurrentRawSymbol(
    AstValueFactory* ast_value_factory) const {
  if (!is_raw_literal_one_byte()) {
    return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
  }
  return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
}


1945
double Scanner::DoubleValue() {
1946
  DCHECK(is_literal_one_byte());
1947
  return StringToDouble(
1948 1949
      unicode_cache_,
      literal_one_byte_string(),
1950 1951 1952
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
}

1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
// Copies the current literal, which must be one-byte, into a NUL-terminated
// character array allocated from |zone| and returns that array.
const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
  DCHECK(is_literal_one_byte());
  Vector<const uint8_t> literal = literal_one_byte_string();
  const int char_count = literal.length();
  // One extra slot for the terminating NUL.
  char* const c_string = zone->NewArray<char>(char_count + 1);
  memcpy(c_string, literal.start(), char_count);
  c_string[char_count] = '\0';
  return c_string;
}

1963 1964 1965 1966 1967 1968
// Records the current symbol in |duplicate_finder| and reports whether it was
// already there. Both arguments must be non-null.
bool Scanner::IsDuplicateSymbol(DuplicateFinder* duplicate_finder,
                                AstValueFactory* ast_value_factory) const {
  DCHECK_NOT_NULL(duplicate_finder);
  DCHECK_NOT_NULL(ast_value_factory);
  const AstRawString* symbol = CurrentSymbol(ast_value_factory);
  // The insertion result's |second| is taken to mean "newly added"
  // (std-style insert result assumed).
  const bool newly_added =
      duplicate_finder->known_symbols_.insert(symbol).second;
  return !newly_added;
}

1971 1972 1973 1974 1975 1976 1977 1978
// Repositions the scanner so that scanning resumes at character |position|,
// then scans one token so that next_ starts at that position.
void Scanner::SeekNext(size_t position) {
  // Use with care: This cleanly resets most, but not all scanner state.
  // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.

  // Re-scanning from a given character position takes three steps.
  //
  // Step 1: reset the current_, next_ and next_next_ tokens.
  // next_ and next_next_ will be overwritten by Next(), so only their token
  // fields are cleared; current_ will remain unchanged, so it is
  // overwritten fully.
  next_next_.contextual_token = Token::UNINITIALIZED;
  next_next_.token = Token::UNINITIALIZED;
  next_.contextual_token = Token::UNINITIALIZED;
  next_.token = Token::UNINITIALIZED;
  current_ = {{0, 0},
              nullptr,
              nullptr,
              0,
              Token::UNINITIALIZED,
              MessageTemplate::kNone,
              {0, 0},
              Token::UNINITIALIZED};

  // Step 2: move the source to the desired position.
  source_->Seek(position);

  // Step 3: re-scan, by reading the look-ahead char + 1 token (next_).
  c0_ = source_->Advance();
  Next();

  DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
}

1999 2000
}  // namespace internal
}  // namespace v8