// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
// Features shared by parsing and pre-parsing scanners.
6

7
#include "src/parsing/scanner.h"
8

9 10
#include <stdint.h>

11 12
#include <cmath>

13
#include "src/ast/ast-value-factory.h"
14 15
#include "src/char-predicates-inl.h"
#include "src/conversions-inl.h"
16
#include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
17
#include "src/unicode-cache-inl.h"
18

19 20
namespace v8 {
namespace internal {
21

22
// Scoped guard around a scanner error slot (a message plus its location).
//
// On construction the current contents of both slots are saved and the slots
// are cleared (MessageTemplate::kNone / Location::invalid()). On destruction
// the saved contents are written back. While the guard is alive, an error
// recorded in the slots can be handed over to a token with MoveErrorTo().
class Scanner::ErrorState {
 public:
  // |message_stack| and |location_stack| are dereferenced immediately and
  // again in the destructor, so both must be non-null and outlive the guard.
  ErrorState(MessageTemplate::Template* message_stack,
             Scanner::Location* location_stack)
      : message_stack_(message_stack),
        old_message_(*message_stack),
        location_stack_(location_stack),
        old_location_(*location_stack) {
    *message_stack_ = MessageTemplate::kNone;
    *location_stack_ = Location::invalid();
  }

  // Not copyable: every copy would write the saved values back again from its
  // own destructor, clobbering whatever was stored in the slots in between.
  ErrorState(const ErrorState&) = delete;
  ErrorState& operator=(const ErrorState&) = delete;

  // Restores the message and location that were current at construction.
  ~ErrorState() {
    *message_stack_ = old_message_;
    *location_stack_ = old_location_;
  }

  // Transfers a pending error (if any) from the guarded slots to |dest|.
  // An error already stored in |dest| is kept; in either case the guarded
  // slots are cleared afterwards. Does nothing when no error is pending.
  void MoveErrorTo(TokenDesc* dest) {
    if (*message_stack_ == MessageTemplate::kNone) {
      return;
    }
    if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
      dest->invalid_template_escape_message = *message_stack_;
      dest->invalid_template_escape_location = *location_stack_;
    }
    *message_stack_ = MessageTemplate::kNone;
    *location_stack_ = Location::invalid();
  }

 private:
  MessageTemplate::Template* const message_stack_;
  MessageTemplate::Template const old_message_;
  Scanner::Location* const location_stack_;
  Scanner::Location const old_location_;
};

58 59 60
// ----------------------------------------------------------------------------
// Scanner::LiteralBuffer

61
// Internalizes the buffered literal through |isolate|'s factory, choosing the
// one-byte or two-byte entry point according to the buffer's current encoding.
Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
  return is_one_byte()
             ? isolate->factory()->InternalizeOneByteString(one_byte_literal())
             : isolate->factory()->InternalizeTwoByteString(
                   two_byte_literal());
}

68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
// Computes the size of the next backing store. The starting point is the
// larger of |min_capacity| and the current store length; the result is that
// value either multiplied by kGrowthFactory or increased by kMaxGrowth,
// whichever is smaller.
int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
  const int base = Max(min_capacity, backing_store_.length());
  return Min(base * kGrowthFactory, base + kMaxGrowth);
}

void Scanner::LiteralBuffer::ExpandBuffer() {
  Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
  MemCopy(new_store.start(), backing_store_.start(), position_);
  backing_store_.Dispose();
  backing_store_ = new_store;
}

void Scanner::LiteralBuffer::ConvertToTwoByte() {
  DCHECK(is_one_byte_);
  Vector<byte> new_store;
  int new_content_size = position_ * kUC16Size;
  if (new_content_size >= backing_store_.length()) {
    // Ensure room for all currently read code units as UC16 as well
    // as the code unit about to be stored.
    new_store = Vector<byte>::New(NewCapacity(new_content_size));
  } else {
    new_store = backing_store_;
  }
  uint8_t* src = backing_store_.start();
  uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
  for (int i = position_ - 1; i >= 0; i--) {
    dst[i] = src[i];
  }
  if (new_store.start() != backing_store_.start()) {
    backing_store_.Dispose();
    backing_store_ = new_store;
  }
  position_ = new_content_size;
  is_one_byte_ = false;
}

// Appends |code_unit| to the buffer, growing the store and switching from
// one-byte to two-byte encoding as needed. Values above
// Utf16::kMaxNonSurrogateCharCode are stored as a lead/trail surrogate pair.
void Scanner::LiteralBuffer::AddCharSlow(uc32 code_unit) {
  if (position_ >= backing_store_.length()) ExpandBuffer();

  if (is_one_byte_) {
    const bool fits_one_byte =
        code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar);
    if (fits_one_byte) {
      backing_store_[position_] = static_cast<byte>(code_unit);
      position_ += kOneByteSize;
      return;
    }
    // First character that needs 16 bits: widen what has been stored so far.
    ConvertToTwoByte();
  }

  const bool single_unit =
      code_unit <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode);
  if (single_unit) {
    *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
    position_ += kUC16Size;
    return;
  }

  // Two units are required; the second one may again need a larger store.
  *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
      unibrow::Utf16::LeadSurrogate(code_unit);
  position_ += kUC16Size;
  if (position_ >= backing_store_.length()) ExpandBuffer();
  *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
      unibrow::Utf16::TrailSurrogate(code_unit);
  position_ += kUC16Size;
}

130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
// ----------------------------------------------------------------------------
// Scanner::BookmarkScope

// Sentinel values for BookmarkScope::bookmark_. They occupy the three largest
// size_t values; any other value of bookmark_ is a source position.

// Stored by Set() when the bookmark is taken before the first token.
const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
    std::numeric_limits<size_t>::max() - 2;
// The value Set() expects to find, i.e. no bookmark has been set.
const size_t Scanner::BookmarkScope::kNoBookmark =
    std::numeric_limits<size_t>::max() - 1;
// Stored by Apply() once the bookmark has been applied.
const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
    std::numeric_limits<size_t>::max();

// Records the scanner's current token position as the bookmark. Must be
// called at most once, and only while no lookahead token (next_next_) is
// buffered.
void Scanner::BookmarkScope::Set() {
  DCHECK_EQ(bookmark_, kNoBookmark);
  DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);

  // The first token is a bit special, since current_ will still be
  // uninitialized. In this case, store kBookmarkAtFirstPos and special-case
  // it when applying the bookmark.
  const bool at_first_token =
      scanner_->current_.token == Token::UNINITIALIZED;
  DCHECK_IMPLIES(
      at_first_token,
      scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
  if (at_first_token) {
    bookmark_ = kBookmarkAtFirstPos;
  } else {
    bookmark_ = scanner_->location().beg_pos;
  }
}

void Scanner::BookmarkScope::Apply() {
  DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
  if (bookmark_ == kBookmarkAtFirstPos) {
    scanner_->SeekNext(0);
  } else {
    scanner_->SeekNext(bookmark_);
    scanner_->Next();
163
    DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
164 165 166 167 168 169 170 171 172 173 174
  }
  bookmark_ = kBookmarkWasApplied;
}

// True while a bookmark is set and has not been applied yet.
bool Scanner::BookmarkScope::HasBeenSet() {
  const bool unset_or_used =
      bookmark_ == kNoBookmark || bookmark_ == kBookmarkWasApplied;
  return !unset_or_used;
}

// True once Apply() has run, which leaves kBookmarkWasApplied in bookmark_.
bool Scanner::BookmarkScope::HasBeenApplied() {
  return bookmark_ == kBookmarkWasApplied;
}
175

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
// Returns whether |c| is one of the four line terminators listed in
// ES#sec-line-terminators: LF (U+000A), CR (U+000D), LS (U+2028) and
// PS (U+2029). Every LS or PS seen here also bumps the
// kLineOrParagraphSeparatorAsLineTerminator use counter.
bool Scanner::IsLineTerminator(uc32 c) {
  switch (c) {
    case 0x000A:
    case 0x000D:
      return true;
    case 0x2028:
    case 0x2029:
      ++use_counts_[v8::Isolate::UseCounterFeature::
                        kLineOrParagraphSeparatorAsLineTerminator];
      return true;
    default:
      return false;
  }
}

191
// ----------------------------------------------------------------------------
192
// Scanner
193

194
// Stores the collaborators and resets the octal-escape, HTML-comment and
// BigInt bookkeeping. The character stream itself is supplied later through
// Initialize().
Scanner::Scanner(UnicodeCache* unicode_cache, int* use_counts)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      octal_message_(MessageTemplate::kNone),
      found_html_comment_(false),
      allow_harmony_bigint_(false),
      use_counts_(use_counts) {
  // Everything is set up by the initializer list.
}
201

202
// Attaches the scanner to |source| (must not be null) and scans the first
// token into next_. |is_module| is remembered for later checks such as the
// HTML-comment handling.
void Scanner::Initialize(Utf16CharacterStream* source, bool is_module) {
  DCHECK_NOT_NULL(source);
  is_module_ = is_module;
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // The very first token is treated as if a line terminator preceded it.
  has_line_terminator_before_next_ = true;
  Scan();
}

213
template <bool capture_raw, bool unicode>
214
uc32 Scanner::ScanHexNumber(int expected_length) {
215
  DCHECK_LE(expected_length, 4);  // prevent overflow
216

217
  int begin = source_pos() - 2;
218 219 220 221
  uc32 x = 0;
  for (int i = 0; i < expected_length; i++) {
    int d = HexValue(c0_);
    if (d < 0) {
222 223 224 225
      ReportScannerError(Location(begin, begin + expected_length + 2),
                         unicode
                             ? MessageTemplate::kInvalidUnicodeEscapeSequence
                             : MessageTemplate::kInvalidHexEscapeSequence);
226 227 228
      return -1;
    }
    x = x * 16 + d;
229
    Advance<capture_raw>();
230 231 232
  }

  return x;
233
}
template <bool capture_raw>
236
uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
marja's avatar
marja committed
237 238
  uc32 x = 0;
  int d = HexValue(c0_);
239 240
  if (d < 0) return -1;

marja's avatar
marja committed
241 242
  while (d >= 0) {
    x = x * 16 + d;
243 244 245 246 247
    if (x > max_value) {
      ReportScannerError(Location(beg_pos, source_pos() + 1),
                         MessageTemplate::kUndefinedUnicodeCodePoint);
      return -1;
    }
248
    Advance<capture_raw>();
marja's avatar
marja committed
249 250
    d = HexValue(c0_);
  }
251

marja's avatar
marja committed
252 253 254 255
  return x;
}


256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
// Ensure that tokens can be stored in a byte.
STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);

// Table of one-character tokens, indexed by character code (0x00..0x7f only).
// Token::ILLEGAL marks characters that are not resolved through this table.
// Four entries per row; the trailing comment gives the codes a row covers.
// clang-format off
static const byte one_char_tokens[] = {
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x00-0x03
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x04-0x07
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x08-0x0b
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x0c-0x0f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x10-0x13
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x14-0x17
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x18-0x1b
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x1c-0x1f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x20-0x23
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x24-0x27
  Token::LPAREN,  Token::RPAREN,  Token::ILLEGAL, Token::ILLEGAL,      // 0x28-0x2b
  Token::COMMA,   Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x2c-0x2f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x30-0x33
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x34-0x37
  Token::ILLEGAL, Token::ILLEGAL, Token::COLON,   Token::SEMICOLON,    // 0x38-0x3b
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::CONDITIONAL,  // 0x3c-0x3f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x40-0x43
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x44-0x47
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x48-0x4b
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x4c-0x4f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x50-0x53
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x54-0x57
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACK,       // 0x58-0x5b
  Token::ILLEGAL, Token::RBRACK,  Token::ILLEGAL, Token::ILLEGAL,      // 0x5c-0x5f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x60-0x63
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x64-0x67
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x68-0x6b
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x6c-0x6f
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x70-0x73
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x74-0x77
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACE,       // 0x78-0x7b
  Token::ILLEGAL, Token::RBRACE,  Token::BIT_NOT, Token::ILLEGAL       // 0x7c-0x7f
};
// clang-format on


392
// Advances the token stream by one: next_ becomes current_ and a new next_ is
// produced, either from the buffered lookahead (next_next_), from the
// one-character token table, or by a full Scan(). Returns the new current
// token.
Token::Value Scanner::Next() {
  // At end of input the EOS token takes over the location of the token
  // before it.
  if (next_.token == Token::EOS) {
    next_.location.beg_pos = current_.location.beg_pos;
    next_.location.end_pos = current_.location.end_pos;
  }
  current_ = next_;

  // A token was already scanned ahead by PeekAhead(): promote it.
  if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
    next_ = next_next_;
    next_next_.token = Token::UNINITIALIZED;
    next_next_.contextual_token = Token::UNINITIALIZED;
    has_line_terminator_before_next_ = has_line_terminator_after_next_;
    return current_.token;
  }

  has_line_terminator_before_next_ = false;
  has_multiline_comment_before_next_ = false;

  // Fast path: single-character tokens are resolved by table lookup.
  Token::Value single_char_token = Token::ILLEGAL;
  if (static_cast<unsigned>(c0_) <= 0x7f) {
    single_char_token = static_cast<Token::Value>(one_char_tokens[c0_]);
  }
  if (single_char_token == Token::ILLEGAL) {
    Scan();
    return current_.token;
  }

  const int start = source_pos();
  next_.token = single_char_token;
  next_.contextual_token = Token::UNINITIALIZED;
  next_.location.beg_pos = start;
  next_.location.end_pos = start + 1;
  next_.literal_chars = nullptr;
  next_.raw_literal_chars = nullptr;
  next_.invalid_template_escape_message = MessageTemplate::kNone;
  Advance();
  return current_.token;
}


// Returns the token that follows next_ without consuming anything: current_
// and next_ are the same after the call as before. The scanned token is
// buffered in next_next_ and handed out by the following Next(). Whether a
// line terminator (plain or inside a multi-line comment) separates next_ from
// the look-ahead token is recorded in has_line_terminator_after_next_.
//
// Must not be called while next_ is DIV or ASSIGN_DIV.
Token::Value Scanner::PeekAhead() {
  DCHECK(next_.token != Token::DIV);
  DCHECK(next_.token != Token::ASSIGN_DIV);

  // A previous lookahead is still buffered; reuse it.
  if (next_next_.token != Token::UNINITIALIZED) {
    return next_next_.token;
  }
  TokenDesc prev = current_;
  // Next() overwrites both line-terminator flags, so fold what is known about
  // the gap in front of next_ into a single value first.
  bool has_line_terminator_before_next =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;
  Next();
  // At this point the flags describe the gap in front of the look-ahead token.
  has_line_terminator_after_next_ =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;
  has_line_terminator_before_next_ = has_line_terminator_before_next;
  // The multi-line comment flag still refers to the look-ahead token. Clear
  // it so that a query about next_ made before the following Next() does not
  // pick up a comment located after next_. Nothing is lost: the saved flag
  // above already includes the old value, and has_line_terminator_after_next_
  // includes the new one.
  has_multiline_comment_before_next_ = false;
  Token::Value ret = next_.token;
  // Shift the tokens back into place: the freshly scanned token is parked in
  // next_next_, and current_/next_ get their previous contents.
  next_next_ = next_;
  next_ = current_;
  current_ = prev;
  return ret;
}


449
// Skips whitespace, line terminators and '-->' comment lines. Returns
// Token::WHITESPACE when the source position moved, Token::ILLEGAL when
// nothing was consumed or when skipping an HTML comment failed.
Token::Value Scanner::SkipWhiteSpace() {
  const int start_position = source_pos();

  for (;;) {
    // Consume WhiteSpace and LineTerminator characters, noting the latter.
    // Never advance past the end of input.
    while (c0_ != kEndOfInput) {
      if (IsLineTerminator(c0_)) {
        has_line_terminator_before_next_ = true;
      } else if (!unicode_cache_->IsWhiteSpace(c0_)) {
        break;
      }
      Advance();
    }

    // If there is an HTML comment end '-->' at the beginning of a
    // line (with only whitespace in front of it), we treat the rest
    // of the line as a comment. This is in line with the way
    // SpiderMonkey handles it.
    if (c0_ != '-' || !has_line_terminator_before_next_) break;

    Advance();
    if (c0_ != '-') {
      PushBack('-');  // undo Advance()
      break;
    }

    Advance();
    if (c0_ != '>') {
      PushBack2('-', '-');  // undo 2x Advance();
      break;
    }

    // Treat the rest of the line as a comment.
    const Token::Value comment_result = SkipSingleHTMLComment();
    if (comment_result == Token::ILLEGAL) return comment_result;
  }

  // Report whether or not any characters were skipped.
  return source_pos() == start_position ? Token::ILLEGAL : Token::WHITESPACE;
}

// Skips an HTML-style comment as a single-line comment. In module code this
// is an error instead: it is reported and Token::ILLEGAL is returned.
Token::Value Scanner::SkipSingleHTMLComment() {
  if (!is_module_) return SkipSingleLineComment();

  ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
  return Token::ILLEGAL;
}
// Consumes one character unconditionally and then everything up to, but not
// including, the next line terminator or the end of input.
Token::Value Scanner::SkipSingleLineComment() {
  // The line terminator at the end of the line is not considered
  // to be part of the single-line comment; it is recognized
  // separately by the lexical grammar and becomes part of the
  // stream of input elements for the syntactic grammar (see
  // ECMA-262, section 7.4).
  for (Advance(); c0_ != kEndOfInput && !IsLineTerminator(c0_); Advance()) {
  }

  return Token::WHITESPACE;
}


524 525
// Gives TryToParseSourceURLComment() a chance to interpret the comment, then
// skips whatever is left of the line.
Token::Value Scanner::SkipSourceURLComment() {
  TryToParseSourceURLComment();
  for (;;) {
    if (c0_ == kEndOfInput || IsLineTerminator(c0_)) break;
    Advance();
  }

  return Token::WHITESPACE;
}


void Scanner::TryToParseSourceURLComment() {
535
  // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
536
  // function will just return if it cannot parse a magic comment.
537
  if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
538 539
  Advance();
  LiteralBuffer name;
540 541
  while (c0_ != kEndOfInput &&
         !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
542 543 544 545 546 547
    name.AddChar(c0_);
    Advance();
  }
  if (!name.is_one_byte()) return;
  Vector<const uint8_t> name_literal = name.one_byte_literal();
  LiteralBuffer* value;
548
  if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
549
    value = &source_url_;
550
  } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
551 552 553 554 555 556 557 558
    value = &source_mapping_url_;
  } else {
    return;
  }
  if (c0_ != '=')
    return;
  Advance();
  value->Reset();
559
  while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
560 561
    Advance();
  }
562
  while (c0_ != kEndOfInput && !IsLineTerminator(c0_)) {
563 564 565 566 567 568 569 570 571 572 573 574
    // Disallowed characters.
    if (c0_ == '"' || c0_ == '\'') {
      value->Reset();
      return;
    }
    if (unicode_cache_->IsWhiteSpace(c0_)) {
      break;
    }
    value->AddChar(c0_);
    Advance();
  }
  // Allow whitespace at the end.
575
  while (c0_ != kEndOfInput && !IsLineTerminator(c0_)) {
576 577 578 579 580 581 582 583 584
    if (!unicode_cache_->IsWhiteSpace(c0_)) {
      value->Reset();
      break;
    }
    Advance();
  }
}


585
// Skips a multi-line comment; c0_ must be the '*' that follows the opening
// '/'. Returns Token::WHITESPACE for a terminated comment and Token::ILLEGAL
// when the input ends first.
Token::Value Scanner::SkipMultiLineComment() {
  DCHECK_EQ(c0_, '*');
  Advance();

  for (;;) {
    // Unterminated multi-line comment.
    if (c0_ == kEndOfInput) return Token::ILLEGAL;

    const uc32 previous = c0_;
    Advance();
    if (c0_ != kEndOfInput && IsLineTerminator(previous)) {
      // Following ECMA-262, section 7.4, a comment containing
      // a newline will make the comment count as a line-terminator.
      has_multiline_comment_before_next_ = true;
    }
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace.
    if (previous == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }
}

610
// Called with c0_ at the '!' that follows a '<'. If the input continues with
// "--", the HTML comment is skipped and found_html_comment_ is set; otherwise
// the consumed characters are pushed back and Token::LT is returned.
Token::Value Scanner::ScanHtmlComment() {
  // Check for <!-- comments.
  DCHECK_EQ(c0_, '!');
  Advance();
  if (c0_ == '-') {
    Advance();
    if (c0_ == '-') {
      found_html_comment_ = true;
      return SkipSingleHTMLComment();
    }
    PushBack2('-', '!');  // undo 2x Advance()
    return Token::LT;
  }
  PushBack('!');  // undo Advance()
  return Token::LT;
}
628

629
void Scanner::Scan() {
630 631
  next_.literal_chars = nullptr;
  next_.raw_literal_chars = nullptr;
632
  next_.invalid_template_escape_message = MessageTemplate::kNone;
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

651 652
      case '"':
      case '\'':
653 654 655 656 657 658 659 660 661 662
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
663
        } else if (c0_ == '!') {
664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
691
        // = == === =>
692 693 694
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
695 696
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
729
          if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
730 731
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
732
            token = SkipSingleHTMLComment();
733 734 735 736 737 738 739 740 741 742 743 744
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
745
        Advance();
746
        if (c0_ == '*') {
747 748 749 750 751 752
          token = Select('=', Token::ASSIGN_EXP, Token::EXP);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_MUL);
        } else {
          token = Token::MUL;
        }
753 754 755 756 757 758 759 760 761 762 763
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
764
          Advance();
765
          if (c0_ == '#' || c0_ == '@') {
766 767 768 769 770 771
            Advance();
            token = SkipSourceURLComment();
          } else {
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
817 818 819 820 821 822 823 824 825
          if (c0_ == '.') {
            Advance();
            if (c0_ == '.') {
              Advance();
              token = Token::ELLIPSIS;
            } else {
              PushBack('.');
            }
          }
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

873
      case '`':
874 875
        token = ScanTemplateStart();
        break;
876

877
      default:
878
        if (c0_ == kEndOfInput) {
879 880
          token = Token::EOS;
        } else if (unicode_cache_->IsIdentifierStart(c0_)) {
881 882 883 884
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else {
885 886 887 888
          token = SkipWhiteSpace();
          if (token == Token::ILLEGAL) {
            Advance();
          }
889 890 891 892 893 894 895 896 897
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
898 899 900 901 902 903 904
  if (Token::IsContextualKeyword(token)) {
    next_.token = Token::IDENTIFIER;
    next_.contextual_token = token;
  } else {
    next_.token = token;
    next_.contextual_token = Token::UNINITIALIZED;
  }
905 906 907 908 909 910

#ifdef DEBUG
  SanityCheckTokenDesc(current_);
  SanityCheckTokenDesc(next_);
  SanityCheckTokenDesc(next_next_);
#endif
911 912
}

913 914 915 916 917 918 919 920
#ifdef DEBUG
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Most tokens should not have literal_chars or even raw_literal chars.
  // The rules are:
  // - UNINITIALIZED: we don't care.
  // - TEMPLATE_*: need both literal + raw literal chars.
  // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
  // - all others: should have neither.
921 922
  // Furthermore, only TEMPLATE_* tokens can have a
  // invalid_template_escape_message.
923 924 925 926 927 928 929 930 931 932 933 934 935 936 937

  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      break;
    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      break;
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::IDENTIFIER:
    case Token::NUMBER:
938
    case Token::BIGINT:
939 940 941 942 943
    case Token::REGEXP_LITERAL:
    case Token::SMI:
    case Token::STRING:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
944
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
945 946 947 948
      break;
    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
949
      DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
950 951
      break;
  }
952 953 954 955 956 957 958

  DCHECK_IMPLIES(token.token != Token::IDENTIFIER,
                 token.contextual_token == Token::UNINITIALIZED);
  DCHECK_IMPLIES(token.contextual_token != Token::UNINITIALIZED,
                 token.token == Token::IDENTIFIER &&
                     Token::IsContextualKeyword(token.contextual_token));
  DCHECK(!Token::IsContextualKeyword(token.token));
959 960
}
#endif  // DEBUG
961

962
void Scanner::SeekForward(int pos) {
963 964 965 966
  // After this call, we will have the token at the given position as
  // the "next" token. The "current" token will be invalid.
  if (pos == next_.location.beg_pos) return;
  int current_pos = source_pos();
967
  DCHECK_EQ(next_.location.end_pos, current_pos);
968
  // Positions inside the lookahead token aren't supported.
969
  DCHECK(pos >= current_pos);
970
  if (pos != current_pos) {
971
    source_->Seek(pos);
972 973 974 975 976 977 978 979
    Advance();
    // This function is only called to seek to the location
    // of the end of a function (at the "}" token). It doesn't matter
    // whether there was a line terminator in the part we skip.
    has_line_terminator_before_next_ = false;
    has_multiline_comment_before_next_ = false;
  }
  Scan();
980 981 982
}


983
template <bool capture_raw, bool in_template_literal>
984
bool Scanner::ScanEscape() {
985
  uc32 c = c0_;
986
  Advance<capture_raw>();
987 988

  // Skip escaped newlines.
989
  if (!in_template_literal && c0_ != kEndOfInput && IsLineTerminator(c)) {
990
    // Allow escaped CR+LF newlines in multiline string literals.
991
    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
992
    return true;
993
  }
994

995 996 997 998 999 1000 1001 1002 1003 1004
  switch (c) {
    case '\'':  // fall through
    case '"' :  // fall through
    case '\\': break;
    case 'b' : c = '\b'; break;
    case 'f' : c = '\f'; break;
    case 'n' : c = '\n'; break;
    case 'r' : c = '\r'; break;
    case 't' : c = '\t'; break;
    case 'u' : {
1005
      c = ScanUnicodeEscape<capture_raw>();
1006
      if (c < 0) return false;
1007 1008
      break;
    }
1009 1010 1011 1012
    case 'v':
      c = '\v';
      break;
    case 'x': {
1013
      c = ScanHexNumber<capture_raw>(2);
1014
      if (c < 0) return false;
1015 1016
      break;
    }
1017
    case '0':  // Fall through.
1018 1019 1020 1021 1022 1023
    case '1':  // fall through
    case '2':  // fall through
    case '3':  // fall through
    case '4':  // fall through
    case '5':  // fall through
    case '6':  // fall through
1024
    case '7':
1025 1026
      c = ScanOctalEscape<capture_raw>(c, 2);
      break;
1027
  }
1028

1029
  // Other escaped characters are interpreted as their non-escaped version.
1030
  AddLiteralChar(c);
1031
  return true;
1032 1033 1034
}


1035
template <bool capture_raw>
1036
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
1037 1038 1039 1040 1041 1042 1043 1044
  uc32 x = c - '0';
  int i = 0;
  for (; i < length; i++) {
    int d = c0_ - '0';
    if (d < 0 || d > 7) break;
    int nx = x * 8 + d;
    if (nx >= 256) break;
    x = nx;
1045
    Advance<capture_raw>();
1046 1047 1048 1049 1050 1051
  }
  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
  // Remember the position of octal escape sequences so that an error
  // can be reported later (in strict mode).
  // We don't report the error immediately, because the octal escape can
  // occur before the "use strict" directive.
1052
  if (c != '0' || i > 0 || c0_ == '8' || c0_ == '9') {
1053
    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
1054
    octal_message_ = MessageTemplate::kStrictOctalEscape;
1055
  }
1056
  return x;
1057 1058 1059
}


1060
// Scans a string literal. On entry c0_ is the opening quote character.
// Returns Token::STRING once the matching quote has been consumed, or
// Token::ILLEGAL if the input ends, a line terminator is reached, or an
// escape sequence fails to scan.
Token::Value Scanner::ScanString() {
  const uc32 delimiter = c0_;
  Advance<false, false>();  // Consume the opening quote.

  LiteralScope literal(this);

  // First pass: plain ASCII characters without escapes.
  for (;;) {
    if (c0_ > kMaxAscii) {
      HandleLeadSurrogate();
      break;
    }
    if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') {
      return Token::ILLEGAL;
    }
    if (c0_ == delimiter) {
      literal.Complete();
      Advance<false, false>();
      return Token::STRING;
    }
    const char ascii = static_cast<char>(c0_);
    if (ascii == '\\') break;
    Advance<false, false>();
    AddLiteralChar(ascii);
  }

  // Second pass: handles escapes and non-ASCII characters.
  for (;;) {
    if (c0_ == delimiter) break;
    if (c0_ == kEndOfInput || IsLineTerminator(c0_)) return Token::ILLEGAL;

    const uc32 current = c0_;
    Advance();
    if (current != '\\') {
      AddLiteralChar(current);
      continue;
    }
    if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
      return Token::ILLEGAL;
    }
  }

  literal.Complete();
  Advance();  // Consume the closing quote.
  return Token::STRING;
}
1099 1100


1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114
Token::Value Scanner::ScanTemplateSpan() {
  // A template span is one of the following constructs:
  // TEMPLATE_SPAN ::
  //     ` LiteralChars* ${
  //   | } LiteralChars* ${
  //
  // TEMPLATE_TAIL ::
  //     ` LiteralChars* `
  //   | } LiteralChars* `
  //
  // A TEMPLATE_SPAN should always be followed by an Expression, whereas a
  // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
  // followed by an Expression.

  // Save the current error state and restore it on exit, so that invalid
  // escape sequences in templates can be treated specially (they are handled
  // by the parser).
  ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
  ErrorState octal_error_state(&octal_message_, &octal_pos_);

  Token::Value result = Token::TEMPLATE_SPAN;
  LiteralScope literal(this);
  StartRawLiteral();
  const bool kCaptureRaw = true;
  const bool kInTemplateLiteral = true;

  for (;;) {
    uc32 ch = c0_;
    Advance<kCaptureRaw>();

    if (ch == '`') {
      result = Token::TEMPLATE_TAIL;
      ReduceRawLiteralLength(1);
      break;
    }

    if (ch == '$' && c0_ == '{') {
      Advance<kCaptureRaw>();  // Consume '{'
      ReduceRawLiteralLength(2);
      break;
    }

    if (ch == '\\') {
      if (c0_ == kEndOfInput || !IsLineTerminator(c0_)) {
        bool success = ScanEscape<kCaptureRaw, kInTemplateLiteral>();
        USE(success);
        DCHECK_EQ(!success, has_error());
        // For templates, invalid escape sequence checking is handled in the
        // parser.
        scanner_error_state.MoveErrorTo(&next_);
        octal_error_state.MoveErrorTo(&next_);
        continue;
      }

      // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
      // code unit sequence.
      const uc32 terminator = c0_;
      Advance<kCaptureRaw>();
      if (terminator == '\r') {
        ReduceRawLiteralLength(1);  // Remove \r
        if (c0_ == '\n') {
          Advance<kCaptureRaw>();  // Adds \n
        } else {
          AddRawLiteralChar('\n');
        }
      }
      continue;
    }

    if (ch < 0) {
      // Unterminated template literal
      PushBack(ch);
      break;
    }

    // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
    // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
    // consisting of the CV 0x000A.
    if (ch == '\r') {
      ReduceRawLiteralLength(1);  // Remove \r
      if (c0_ == '\n') {
        Advance<kCaptureRaw>();  // Adds \n
      } else {
        AddRawLiteralChar('\n');
      }
      ch = '\n';
    }
    AddLiteralChar(ch);
  }

  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;
  next_.contextual_token = Token::UNINITIALIZED;

  return result;
}


1189
// Begins scanning a template literal. On entry c0_ is the opening backtick.
// Records the start position in next_, consumes the backtick and scans the
// first span.
Token::Value Scanner::ScanTemplateStart() {
  DCHECK_EQ(next_next_.token, Token::UNINITIALIZED);
  DCHECK_EQ(c0_, '`');

  const int template_start = source_pos();
  next_.location.beg_pos = template_start;

  Advance();  // Consume `
  return ScanTemplateSpan();
}

1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208
// Returns the internalized source URL, or a default-constructed handle if
// no source URL has been recorded.
Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
  if (source_url_.length() <= 0) return Handle<String>();
  return source_url_.Internalize(isolate);
}

// Returns the internalized source mapping URL, or a default-constructed
// handle if no source mapping URL has been recorded.
Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
  if (source_mapping_url_.length() <= 0) return Handle<String>();
  return source_mapping_url_.Internalize(isolate);
}
1209

1210
void Scanner::ScanDecimalDigits() {
1211 1212
  while (IsDecimalDigit(c0_))
    AddLiteralCharAdvance();
1213 1214 1215
}


1216
// Scans a numeric literal. |seen_period| is true when the caller has already
// consumed a leading '.', in which case c0_ is the first digit of the
// fraction; otherwise c0_ is the first digit of the number.
// Returns Token::SMI for short decimal integers that fit Smi::kMaxValue,
// Token::BIGINT for literals with an 'n' suffix (when enabled),
// Token::NUMBER for other well-formed literals, and Token::ILLEGAL otherwise.
Token::Value Scanner::ScanNumber(bool seen_period) {
  // c0_ is the first digit of the number or of the fraction.
  DCHECK(IsDecimalDigit(c0_));

  enum {
    DECIMAL,
    DECIMAL_WITH_LEADING_ZERO,
    HEX,
    OCTAL,
    IMPLICIT_OCTAL,
    BINARY
  } kind = DECIMAL;

  LiteralScope literal(this);
  bool at_start = !seen_period;
  const int start_pos = source_pos();  // For reporting octal positions.

  if (seen_period) {
    // The decimal point of the float has already been consumed; at least one
    // digit follows.
    AddLiteralChar('.');
    ScanDecimalDigits();
  } else {
    // A leading '0' may introduce a hex, octal, binary or legacy octal
    // literal, or simply be 0, 0exxx, 0Exxx or 0.xxx.
    if (c0_ == '0') {
      AddLiteralCharAdvance();

      switch (c0_) {
        case 'x':
        case 'X':
          kind = HEX;
          AddLiteralCharAdvance();
          // At least one hex digit is required after 'x'/'X'.
          if (!IsHexDigit(c0_)) return Token::ILLEGAL;
          do {
            AddLiteralCharAdvance();
          } while (IsHexDigit(c0_));
          break;

        case 'o':
        case 'O':
          kind = OCTAL;
          AddLiteralCharAdvance();
          // At least one octal digit is required after 'o'/'O'.
          if (!IsOctalDigit(c0_)) return Token::ILLEGAL;
          do {
            AddLiteralCharAdvance();
          } while (IsOctalDigit(c0_));
          break;

        case 'b':
        case 'B':
          kind = BINARY;
          AddLiteralCharAdvance();
          // At least one binary digit is required after 'b'/'B'.
          if (!IsBinaryDigit(c0_)) return Token::ILLEGAL;
          do {
            AddLiteralCharAdvance();
          } while (IsBinaryDigit(c0_));
          break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
          // (Possible) legacy octal number.
          kind = IMPLICIT_OCTAL;
          for (;;) {
            if (c0_ == '8' || c0_ == '9') {
              // Not octal after all: continue as a decimal literal.
              at_start = false;
              kind = DECIMAL_WITH_LEADING_ZERO;
              break;
            }
            if (c0_ < '0' || c0_ > '7') {
              // Octal literal finished.
              octal_pos_ = Location(start_pos, source_pos());
              octal_message_ = MessageTemplate::kStrictOctalLiteral;
              break;
            }
            AddLiteralCharAdvance();
          }
          break;

        case '8':
        case '9':
          kind = DECIMAL_WITH_LEADING_ZERO;
          break;

        default:
          break;
      }
    }

    // Parse decimal digits and allow a trailing fractional part.
    if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
      if (at_start) {
        // Fast path: accumulate the integer value while scanning, so that
        // small integers can be returned as SMI tokens.
        uint64_t value = 0;
        while (IsDecimalDigit(c0_)) {
          value = 10 * value + (c0_ - '0');

          uc32 digit_char = c0_;
          Advance<false, false>();
          AddLiteralChar(digit_char);
        }

        if (next_.literal_chars->one_byte_literal().length() <= 10 &&
            value <= Smi::kMaxValue && c0_ != '.' &&
            (c0_ == kEndOfInput || !unicode_cache_->IsIdentifierStart(c0_))) {
          next_.smi_value_ = static_cast<uint32_t>(value);
          literal.Complete();
          HandleLeadSurrogate();

          if (kind == DECIMAL_WITH_LEADING_ZERO) {
            octal_pos_ = Location(start_pos, source_pos());
            octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
          }
          return Token::SMI;
        }
        HandleLeadSurrogate();
      }

      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        seen_period = true;
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  bool is_bigint = false;
  if (allow_harmony_bigint() && c0_ == 'n' && !seen_period &&
      (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) {
    is_bigint = true;
    Advance();
  } else if (c0_ == 'e' || c0_ == 'E') {
    // Scan the exponent, if any.
    // 'e'/'E' must have been scanned as part of a hex number.
    DCHECK(kind != HEX);
    if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO)) {
      return Token::ILLEGAL;
    }
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-') {
      AddLiteralCharAdvance();
    }
    // At least one decimal digit is required after 'e'/'E'.
    if (!IsDecimalDigit(c0_)) return Token::ILLEGAL;
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_))) {
    return Token::ILLEGAL;
  }

  literal.Complete();

  if (kind == DECIMAL_WITH_LEADING_ZERO) {
    octal_pos_ = Location(start_pos, source_pos());
    octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
  }
  return is_bigint ? Token::BIGINT : Token::NUMBER;
}


1372
// Scans a unicode escape inside an identifier. Consumes the current
// character and requires the next one to be 'u'; returns the result of
// ScanUnicodeEscape in that case and -1 otherwise.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ == 'u') {
    Advance();
    return ScanUnicodeEscape<false>();
  }
  return -1;
}


1380
template <bool capture_raw>
marja's avatar
marja committed
1381
uc32 Scanner::ScanUnicodeEscape() {
adamk's avatar
adamk committed
1382 1383 1384
  // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
  // hex digits between { } is arbitrary. \ and u have already been read.
  if (c0_ == '{') {
1385
    int begin = source_pos() - 2;
1386
    Advance<capture_raw>();
1387 1388 1389 1390
    uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
    if (cp < 0 || c0_ != '}') {
      ReportScannerError(source_pos(),
                         MessageTemplate::kInvalidUnicodeEscapeSequence);
marja's avatar
marja committed
1391 1392
      return -1;
    }
1393
    Advance<capture_raw>();
marja's avatar
marja committed
1394 1395
    return cp;
  }
1396 1397
  const bool unicode = true;
  return ScanHexNumber<capture_raw, unicode>(4);
1398 1399 1400
}


1401 1402 1403
// ----------------------------------------------------------------------------
// Keyword Matcher

1404
// Table of keywords and contextual keywords, grouped by first character.
// KEYWORD_GROUP(ch) opens the group of keywords that start with |ch|, and
// KEYWORD(string, token) maps one keyword string to its token. The table is
// expanded inside KeywordOrIdentifierToken, which statically asserts that
// every keyword is between kMinLength and kMaxLength characters long.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("arguments", Token::ARGUMENTS)                    \
  KEYWORD("as", Token::AS)                                  \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD("anonymous", Token::ANONYMOUS)                    \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("constructor", Token::CONSTRUCTOR)                \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("eval", Token::EVAL)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("from", Token::FROM)                              \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('g')                                        \
  KEYWORD("get", Token::GET)                                \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('m')                                        \
  KEYWORD("meta", Token::META)                              \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("name", Token::NAME)                              \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('o')                                        \
  KEYWORD("of", Token::OF)                                  \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("prototype", Token::PROTOTYPE)                    \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("sent", Token::SENT)                              \
  KEYWORD("set", Token::SET)                                \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("target", Token::TARGET)                          \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('u')                                        \
  KEYWORD("undefined", Token::UNDEFINED)                    \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)                            \
  KEYWORD_GROUP('_')                                        \
  KEYWORD("__proto__", Token::PROTO_UNDERSCORED)
1489

1490
// Classifies a one-byte identifier of |input_length| characters starting at
// |input|. Returns the token of the matching KEYWORDS entry, or
// Token::IDENTIFIER if the input matches no keyword (including when its
// length lies outside [kMinLength, kMaxLength]).
//
// The KEYWORDS table is expanded into a switch on the first character: each
// KEYWORD_GROUP becomes a case label and each KEYWORD becomes a block that
// compares the length and the remaining characters one by one.
static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
                                             int input_length) {
  DCHECK_GE(input_length, 1);
  const int kMinLength = 2;
  const int kMaxLength = 11;
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                                           \
  {                                                                       \
    /* 'keyword' is a char array, so sizeof(keyword) is */                \
    /* strlen(keyword) plus 1 for the NUL char. */                        \
    const int keyword_length = sizeof(keyword) - 1;                       \
    STATIC_ASSERT(keyword_length >= kMinLength);                          \
    STATIC_ASSERT(keyword_length <= kMaxLength);                          \
    DCHECK_EQ(input[0], keyword[0]);                                      \
    DCHECK(token == Token::FUTURE_STRICT_RESERVED_WORD ||                 \
           0 == strncmp(keyword, Token::String(token), sizeof(keyword))); \
    if (input_length == keyword_length && input[1] == keyword[1] &&       \
        (keyword_length <= 2 || input[2] == keyword[2]) &&                \
        (keyword_length <= 3 || input[3] == keyword[3]) &&                \
        (keyword_length <= 4 || input[4] == keyword[4]) &&                \
        (keyword_length <= 5 || input[5] == keyword[5]) &&                \
        (keyword_length <= 6 || input[6] == keyword[6]) &&                \
        (keyword_length <= 7 || input[7] == keyword[7]) &&                \
        (keyword_length <= 8 || input[8] == keyword[8]) &&                \
        (keyword_length <= 9 || input[9] == keyword[9]) &&                \
        (keyword_length <= 10 || input[10] == keyword[10])) {             \
      return token;                                                       \
    }                                                                     \
  }
      KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
// The helper macros are only meaningful for the expansion above; undefine
// them so they do not leak into the rest of the translation unit.
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
  return Token::IDENTIFIER;
}


1532
// Scans an identifier or keyword starting at c0_. ASCII-only identifiers are
// handled by fast paths; identifiers that contain a backslash escape are
// handed over to ScanIdentifierSuffix.
Token::Value Scanner::ScanIdentifierOrKeyword() {
  DCHECK(unicode_cache_->IsIdentifierStart(c0_));
  LiteralScope literal(this);

  // Looks up the one-byte literal collected so far in the keyword table.
  // The literal is completed only for identifiers, future strict reserved
  // words and contextual keywords.
  auto classify_one_byte_literal = [this, &literal]() -> Token::Value {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    const Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    if (token == Token::IDENTIFIER ||
        token == Token::FUTURE_STRICT_RESERVED_WORD ||
        Token::IsContextualKeyword(token)) {
      literal.Complete();
    }
    return token;
  };

  if (IsInRange(c0_, 'a', 'z') || c0_ == '_') {
    // Leading run of a-z and '_': the only shape a keyword can have.
    do {
      const char ch = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(ch);
    } while (IsInRange(c0_, 'a', 'z') || c0_ == '_');

    if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
        c0_ == '$') {
      // Identifier starting with lowercase; it can no longer be a keyword.
      do {
        const char ch = static_cast<char>(c0_);
        Advance<false, false>();
        AddLiteralChar(ch);
      } while (IsAsciiIdentifier(c0_));

      if (c0_ <= kMaxAscii && c0_ != '\\') {
        literal.Complete();
        return Token::IDENTIFIER;
      }
    } else if (c0_ <= kMaxAscii && c0_ != '\\') {
      // Only a-z+ or _: could be a keyword or identifier.
      return classify_one_byte_literal();
    }

    HandleLeadSurrogate();
  } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
    do {
      const char ch = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(ch);
    } while (IsAsciiIdentifier(c0_));

    if (c0_ <= kMaxAscii && c0_ != '\\') {
      literal.Complete();
      return Token::IDENTIFIER;
    }

    HandleLeadSurrogate();
  } else if (c0_ == '\\') {
    // The identifier starts with an escape. Only legal identifier start
    // characters are allowed, and no recursive escapes.
    const uc32 start_char = ScanIdentifierUnicodeEscape();
    if (start_char < 0 || start_char == '\\' ||
        !unicode_cache_->IsIdentifierStart(start_char)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(start_char);
    return ScanIdentifierSuffix(&literal, true);
  } else {
    const uc32 start_char = c0_;
    Advance();
    AddLiteralChar(start_char);
  }

  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ == '\\') {
      // An escape follows: continue in the general suffix scanner.
      return ScanIdentifierSuffix(&literal, false);
    }
    const uc32 part_char = c0_;
    Advance();
    AddLiteralChar(part_char);
  }

  if (next_.literal_chars->is_one_byte()) {
    return classify_one_byte_literal();
  }
  literal.Complete();
  return Token::IDENTIFIER;
}

lrn@chromium.org's avatar
lrn@chromium.org committed
1626

1627 1628
// Scans the remaining characters of an identifier, decoding unicode escapes
// along the way, and completes |literal|. |escaped| says whether an escape
// has already been seen; it is also set when one is found here. An escaped
// one-byte identifier that spells a keyword is reported as an ESCAPED_*
// token rather than as the keyword itself.
Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
                                           bool escaped) {
  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ != '\\') {
      AddLiteralChar(c0_);
      Advance();
      continue;
    }

    const uc32 decoded = ScanIdentifierUnicodeEscape();
    escaped = true;
    // Only allow legal identifier part characters.
    if (decoded < 0 || decoded == '\\' ||
        !unicode_cache_->IsIdentifierPart(decoded)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(decoded);
  }
  literal->Complete();

  if (!escaped || !next_.literal_chars->is_one_byte()) {
    return Token::IDENTIFIER;
  }

  Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
  const Token::Value token =
      KeywordOrIdentifierToken(chars.start(), chars.length());
  /* TODO(adamk): YIELD should be handled specially. */
  if (token == Token::IDENTIFIER || Token::IsContextualKeyword(token)) {
    return token;
  }
  if (token == Token::FUTURE_STRICT_RESERVED_WORD || token == Token::LET ||
      token == Token::STATIC) {
    return Token::ESCAPED_STRICT_RESERVED_WORD;
  }
  return Token::ESCAPED_KEYWORD;
}

1665 1666 1667
bool Scanner::ScanRegExpPattern() {
  DCHECK(next_next_.token == Token::UNINITIALIZED);
  DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
lrn@chromium.org's avatar
lrn@chromium.org committed
1668

1669 1670
  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  bool in_character_class = false;
1671
  bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1672 1673 1674 1675 1676

  // Previous token is either '/' or '/=', in the second case, the
  // pattern starts at =.
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1677

1678 1679 1680 1681 1682 1683 1684 1685 1686
  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  // the scanner should pass uninterpreted bodies to the RegExp
  // constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  while (c0_ != '/' || in_character_class) {
1687
    if (c0_ == kEndOfInput || IsLineTerminator(c0_)) {
1688
      return false;
1689
    }
1690 1691
    if (c0_ == '\\') {  // Escape sequence.
      AddLiteralCharAdvance();
1692
      if (c0_ == kEndOfInput || IsLineTerminator(c0_)) {
1693
        return false;
1694
      }
1695 1696 1697 1698 1699 1700 1701 1702 1703
      AddLiteralCharAdvance();
      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
      // only "safe" characters are allowed (letters, digits, underscore),
      // otherwise the escape isn't valid and the invalid character has
      // its normal meaning. I.e., we can just continue scanning without
      // worrying whether the following characters are part of the escape
      // or not, since any '/', '\\' or '[' is guaranteed to not be part
      // of the escape sequence.

1704
      // TODO(896): At some point, parse RegExps more thoroughly to capture
1705 1706 1707 1708 1709 1710 1711 1712 1713 1714
      // octal esacpes in strict mode.
    } else {  // Unescaped character.
      if (c0_ == '[') in_character_class = true;
      if (c0_ == ']') in_character_class = false;
      AddLiteralCharAdvance();
    }
  }
  Advance();  // consume '/'

  literal.Complete();
1715
  next_.token = Token::REGEXP_LITERAL;
1716
  next_.contextual_token = Token::UNINITIALIZED;
1717
  return true;
1718 1719 1720
}


1721
// Scans the flags that follow a regular expression literal. Returns Nothing
// if a flag character is unknown, is repeated, or is 's' while
// FLAG_harmony_regexp_dotall is off; otherwise returns the combined flags
// and updates the end position of next_.
Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
  DCHECK(next_.token == Token::REGEXP_LITERAL);

  int seen_flags = 0;
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    RegExp::Flags current = RegExp::kNone;
    switch (c0_) {
      case 'g':
        current = RegExp::kGlobal;
        break;
      case 'i':
        current = RegExp::kIgnoreCase;
        break;
      case 'm':
        current = RegExp::kMultiline;
        break;
      case 's':
        if (!FLAG_harmony_regexp_dotall) {
          return Nothing<RegExp::Flags>();
        }
        current = RegExp::kDotAll;
        break;
      case 'u':
        current = RegExp::kUnicode;
        break;
      case 'y':
        current = RegExp::kSticky;
        break;
      default:
        return Nothing<RegExp::Flags>();
    }

    // Each flag may appear at most once.
    if (seen_flags & current) {
      return Nothing<RegExp::Flags>();
    }
    Advance();
    seen_flags |= current;
  }

  next_.location.end_pos = source_pos();
  return Just(RegExp::Flags(seen_flags));
}

1765 1766
// Returns the current literal as a string obtained from
// |ast_value_factory|, using the one-byte or two-byte representation as
// appropriate.
const AstRawString* Scanner::CurrentSymbol(
    AstValueFactory* ast_value_factory) const {
  return is_literal_one_byte()
             ? ast_value_factory->GetOneByteString(literal_one_byte_string())
             : ast_value_factory->GetTwoByteString(literal_two_byte_string());
}

1773 1774
// Returns the literal of the next token as a string obtained from
// |ast_value_factory|, using the one-byte or two-byte representation as
// appropriate.
const AstRawString* Scanner::NextSymbol(
    AstValueFactory* ast_value_factory) const {
  return is_next_literal_one_byte()
             ? ast_value_factory->GetOneByteString(
                   next_literal_one_byte_string())
             : ast_value_factory->GetTwoByteString(
                   next_literal_two_byte_string());
}

1781
// Returns the current raw literal as a string obtained from
// |ast_value_factory|, using the one-byte or two-byte representation as
// appropriate.
const AstRawString* Scanner::CurrentRawSymbol(
    AstValueFactory* ast_value_factory) const {
  return is_raw_literal_one_byte()
             ? ast_value_factory->GetOneByteString(
                   raw_literal_one_byte_string())
             : ast_value_factory->GetTwoByteString(
                   raw_literal_two_byte_string());
}


1790
double Scanner::DoubleValue() {
1791
  DCHECK(is_literal_one_byte());
1792
  return StringToDouble(
1793 1794
      unicode_cache_,
      literal_one_byte_string(),
1795 1796 1797
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
}

1798 1799 1800 1801 1802 1803 1804 1805 1806 1807
// Copies the current one-byte literal into a freshly allocated, NUL-terminated
// char array obtained from |zone| and returns a pointer to it. The literal
// must be one-byte (checked with a DCHECK).
const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
  DCHECK(is_literal_one_byte());
  Vector<const uint8_t> literal = literal_one_byte_string();
  const int char_count = literal.length();
  // One extra slot for the trailing terminator.
  char* c_string = zone->NewArray<char>(char_count + 1);
  memcpy(c_string, literal.start(), char_count);
  c_string[char_count] = '\0';
  return c_string;
}

1808 1809 1810 1811 1812 1813
// Inserts the current symbol into |duplicate_finder|'s known_symbols_ and
// reports whether that insertion did NOT take place, which is presumably the
// case when the symbol was already recorded there. Both arguments must be
// non-null (checked with DCHECKs).
bool Scanner::IsDuplicateSymbol(DuplicateFinder* duplicate_finder,
                                AstValueFactory* ast_value_factory) const {
  DCHECK_NOT_NULL(duplicate_finder);
  DCHECK_NOT_NULL(ast_value_factory);
  const AstRawString* symbol = CurrentSymbol(ast_value_factory);
  // insert() yields a result whose |second| tells whether a new entry was
  // added; no new entry means we are looking at a duplicate.
  const bool newly_added =
      duplicate_finder->known_symbols_.insert(symbol).second;
  return !newly_added;
}

1816 1817 1818 1819 1820 1821 1822 1823
void Scanner::SeekNext(size_t position) {
  // Use with care: This cleanly resets most, but not all scanner state.
  // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.

  // To re-scan from a given character position, we need to:
  // 1, Reset the current_, next_ and next_next_ tokens
  //    (next_ + next_next_ will be overwrittem by Next(),
  //     current_ will remain unchanged, so overwrite it fully.)
1824 1825 1826 1827 1828 1829 1830 1831
  current_ = {{0, 0},
              nullptr,
              nullptr,
              0,
              Token::UNINITIALIZED,
              MessageTemplate::kNone,
              {0, 0},
              Token::UNINITIALIZED};
1832
  next_.token = Token::UNINITIALIZED;
1833
  next_.contextual_token = Token::UNINITIALIZED;
1834
  next_next_.token = Token::UNINITIALIZED;
1835
  next_next_.contextual_token = Token::UNINITIALIZED;
1836 1837 1838 1839 1840
  // 2, reset the source to the desired position,
  source_->Seek(position);
  // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
  c0_ = source_->Advance();
  Next();
1841
  DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
1842 1843
}

1844 1845
}  // namespace internal
}  // namespace v8