// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5
// Features shared by parsing and pre-parsing scanners.
6

7
#include "src/parsing/scanner.h"
8

9 10
#include <stdint.h>

11 12
#include <cmath>

13
#include "src/ast/ast-value-factory.h"
14 15 16
#include "src/char-predicates-inl.h"
#include "src/conversions-inl.h"
#include "src/list-inl.h"
17
#include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
18

19 20
namespace v8 {
namespace internal {
21

22
// Internalizes the buffered literal through the isolate's factory, using the
// one-byte or two-byte entry point depending on how the buffer is currently
// encoded.
Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
  if (is_one_byte()) {
    return isolate->factory()->InternalizeOneByteString(one_byte_literal());
  }
  return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
}

29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
// ----------------------------------------------------------------------------
// Scanner::BookmarkScope

// Sentinel stored by Set() when the scanner's current token is still
// UNINITIALIZED; Apply() reacts to it by seeking to position 0.
const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
    std::numeric_limits<size_t>::max() - 2;
// Sentinel for "no bookmark set"; Set() asserts that bookmark_ holds this
// value before it stores a position.
const size_t Scanner::BookmarkScope::kNoBookmark =
    std::numeric_limits<size_t>::max() - 1;
// Sentinel written by Apply() once the bookmark has been consumed.
const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
    std::numeric_limits<size_t>::max();

// Records the scanner's current position so that Apply() can return to it.
// May only be called while no bookmark is set and no token is buffered in
// next_next_.
void Scanner::BookmarkScope::Set() {
  DCHECK_EQ(bookmark_, kNoBookmark);
  DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);

  // Before the first token has been consumed, current_ is still
  // uninitialized. That case is recorded with the kBookmarkAtFirstPos
  // sentinel and special-cased when the bookmark is applied.
  const bool at_first_token =
      scanner_->current_.token == Token::UNINITIALIZED;
  DCHECK_IMPLIES(
      at_first_token,
      scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
  if (at_first_token) {
    bookmark_ = kBookmarkAtFirstPos;
  } else {
    bookmark_ = scanner_->location().beg_pos;
  }
}

void Scanner::BookmarkScope::Apply() {
  DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
  if (bookmark_ == kBookmarkAtFirstPos) {
    scanner_->SeekNext(0);
  } else {
    scanner_->SeekNext(bookmark_);
    scanner_->Next();
62
    DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
63 64 65 66 67 68 69 70 71 72 73
  }
  bookmark_ = kBookmarkWasApplied;
}

// True while bookmark_ holds something Apply() can act on, i.e. it is neither
// the "no bookmark" sentinel nor the "already applied" sentinel.
bool Scanner::BookmarkScope::HasBeenSet() {
  const bool never_set = (bookmark_ == kNoBookmark);
  const bool already_applied = (bookmark_ == kBookmarkWasApplied);
  return !(never_set || already_applied);
}

// True once Apply() has replaced the bookmark with kBookmarkWasApplied.
bool Scanner::BookmarkScope::HasBeenApplied() {
  if (bookmark_ != kBookmarkWasApplied) return false;
  return true;
}
74

75
// ----------------------------------------------------------------------------
76
// Scanner
77

78
// Creates a scanner that uses |unicode_cache| for character classification.
// The recorded octal and leading-zero-decimal positions start out invalid and
// no HTML comment has been seen yet.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      decimal_with_leading_zero_pos_(Location::invalid()),
      found_html_comment_(false) {}
84 85


86
// Attaches the scanner to |source| and scans the first token into next_.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}

98
template <bool capture_raw, bool unicode>
99
uc32 Scanner::ScanHexNumber(int expected_length) {
100
  DCHECK(expected_length <= 4);  // prevent overflow
101

102
  int begin = source_pos() - 2;
103 104 105 106
  uc32 x = 0;
  for (int i = 0; i < expected_length; i++) {
    int d = HexValue(c0_);
    if (d < 0) {
107 108 109 110
      ReportScannerError(Location(begin, begin + expected_length + 2),
                         unicode
                             ? MessageTemplate::kInvalidUnicodeEscapeSequence
                             : MessageTemplate::kInvalidHexEscapeSequence);
111 112 113
      return -1;
    }
    x = x * 16 + d;
114
    Advance<capture_raw>();
115 116 117
  }

  return x;
118
}
lrn@chromium.org's avatar
lrn@chromium.org committed
119

120
template <bool capture_raw>
121
uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
marja's avatar
marja committed
122 123
  uc32 x = 0;
  int d = HexValue(c0_);
124 125
  if (d < 0) return -1;

marja's avatar
marja committed
126 127
  while (d >= 0) {
    x = x * 16 + d;
128 129 130 131 132
    if (x > max_value) {
      ReportScannerError(Location(beg_pos, source_pos() + 1),
                         MessageTemplate::kUndefinedUnicodeCodePoint);
      return -1;
    }
133
    Advance<capture_raw>();
marja's avatar
marja committed
134 135
    d = HexValue(c0_);
  }
136

marja's avatar
marja committed
137 138 139 140
  return x;
}


141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
// Ensure that tokens can be stored in a byte.
STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);

// Table of one-character tokens, by character (0x00..0x7f only).
// Token::ILLEGAL marks characters that are not a complete token on their own.
// Each row holds four entries; the trailing comment gives the character code
// of the first entry in the row.
static const byte one_char_tokens[] = {
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x00
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x04
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x08
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x0c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x10
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x14
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x18
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x1c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x20
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x24
  Token::LPAREN, Token::RPAREN, Token::ILLEGAL, Token::ILLEGAL,        // 0x28
  Token::COMMA, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,        // 0x2c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x30
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x34
  Token::ILLEGAL, Token::ILLEGAL, Token::COLON, Token::SEMICOLON,      // 0x38
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::CONDITIONAL,  // 0x3c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x40
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x44
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x48
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x4c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x50
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x54
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACK,       // 0x58
  Token::ILLEGAL, Token::RBRACK, Token::ILLEGAL, Token::ILLEGAL,       // 0x5c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x60
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x64
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x68
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x6c
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x70
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL,      // 0x74
  Token::ILLEGAL, Token::ILLEGAL, Token::ILLEGAL, Token::LBRACE,       // 0x78
  Token::ILLEGAL, Token::RBRACE, Token::BIT_NOT, Token::ILLEGAL        // 0x7c
};

// The table is indexed directly with any character <= 0x7f, so it must have
// exactly 0x80 entries.
STATIC_ASSERT(sizeof(one_char_tokens) / sizeof(one_char_tokens[0]) == 0x80);


277
// Advances the scanner by one token: next_ becomes current_ and a new next_
// is produced. Returns the new current token.
Token::Value Scanner::Next() {
  if (next_.token == Token::EOS) {
    // Once the end of input is reached, EOS is given the location of the
    // token that preceded it.
    next_.location.beg_pos = current_.location.beg_pos;
    next_.location.end_pos = current_.location.end_pos;
  }
  current_ = next_;
  if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
    // A token is already buffered in next_next_; use it instead of scanning.
    next_ = next_next_;
    next_next_.token = Token::UNINITIALIZED;
    has_line_terminator_before_next_ = has_line_terminator_after_next_;
    return current_.token;
  }
  has_line_terminator_before_next_ = false;
  has_multiline_comment_before_next_ = false;
  // Fast path: characters that form a complete token on their own are looked
  // up in one_char_tokens without going through Scan().
  if (static_cast<unsigned>(c0_) <= 0x7f) {
    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
    if (token != Token::ILLEGAL) {
      int pos = source_pos();
      next_.token = token;
      next_.location.beg_pos = pos;
      next_.location.end_pos = pos + 1;
      next_.literal_chars = nullptr;
      next_.raw_literal_chars = nullptr;
      Advance();
      return current_.token;
    }
  }
  Scan();
  return current_.token;
}


littledan's avatar
littledan committed
309
// Returns the token after next_ without consuming anything. The token is
// buffered in next_next_ so that a later Next() can reuse it.
Token::Value Scanner::PeekAhead() {
  // NOTE(review): presumably excluded because a '/' token may still have to
  // be rescanned as a regular expression literal — confirm.
  DCHECK(next_.token != Token::DIV);
  DCHECK(next_.token != Token::ASSIGN_DIV);

  if (next_next_.token != Token::UNINITIALIZED) {
    return next_next_.token;
  }
  TokenDesc prev = current_;
  // Next() overwrites the line-terminator flags, so save the state that
  // belongs to next_ and restore it afterwards.
  bool has_line_terminator_before_next =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;
  Next();
  has_line_terminator_after_next_ =
      has_line_terminator_before_next_ || has_multiline_comment_before_next_;
  has_line_terminator_before_next_ = has_line_terminator_before_next;
  // Shift the token window back by one: the freshly scanned token moves to
  // next_next_, and current_/next_ get their previous values again.
  Token::Value ret = next_.token;
  next_next_ = next_;
  next_ = current_;
  current_ = prev;
  return ret;
}


331 332
// TODO(yangguo): check whether this is actually necessary.
// Returns true if |c| is 0xFFFE.
static inline bool IsLittleEndianByteOrderMark(uc32 c) {
  // The Unicode value U+FFFE is guaranteed never to be assigned as a
  // Unicode character; this implies that in a Unicode context the
  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
  // character expressed in little-endian byte order (since it could
  // not be a U+FFFE character expressed in big-endian byte
  // order). Nevertheless, we check for it to be compatible with
  // Spidermonkey.
  return c == 0xFFFE;
}

343
bool Scanner::SkipWhiteSpace() {
344 345 346
  int start_position = source_pos();

  while (true) {
347
    while (true) {
348
      // Don't skip behind the end of input.
349
      if (c0_ == kEndOfInput) break;
350

351 352
      // Advance as long as character is a WhiteSpace or LineTerminator.
      // Remember if the latter is the case.
353 354
      if (unicode_cache_->IsLineTerminator(c0_)) {
        has_line_terminator_before_next_ = true;
355 356 357
      } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
                 !IsLittleEndianByteOrderMark(c0_)) {
        break;
358 359 360 361 362 363 364 365
      }
      Advance();
    }

    // If there is an HTML comment end '-->' at the beginning of a
    // line (with only whitespace in front of it), we treat the rest
    // of the line as a comment. This is in line with the way
    // SpiderMonkey handles it.
366 367 368 369
    if (c0_ != '-' || !has_line_terminator_before_next_) break;

    Advance();
    if (c0_ != '-') {
370
      PushBack('-');  // undo Advance()
371 372 373 374 375 376 377
      break;
    }

    Advance();
    if (c0_ != '>') {
      PushBack2('-', '-');  // undo 2x Advance();
      break;
378
    }
379 380 381

    // Treat the rest of the line as a comment.
    SkipSingleLineComment();
382
  }
383

384 385 386
  // Return whether or not we skipped any characters.
  return source_pos() != start_position;
}
lrn@chromium.org's avatar
lrn@chromium.org committed
387

388
// Consumes one character unconditionally and then everything up to (but not
// including) the next line terminator or the end of input. Always returns
// Token::WHITESPACE.
Token::Value Scanner::SkipSingleLineComment() {
  Advance();

  // The line terminator at the end of the line is not considered
  // to be part of the single-line comment; it is recognized
  // separately by the lexical grammar and becomes part of the
  // stream of input elements for the syntactic grammar (see
  // ECMA-262, section 7.4).
  while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    Advance();
  }

  return Token::WHITESPACE;
}


404 405
// Tries to parse a magic sourceURL / sourceMappingURL comment and then skips
// whatever remains of the line. Always returns Token::WHITESPACE.
Token::Value Scanner::SkipSourceURLComment() {
  TryToParseSourceURLComment();
  // The parser may stop early; consume the rest of the comment line.
  while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
    Advance();
  }

  return Token::WHITESPACE;
}


void Scanner::TryToParseSourceURLComment() {
415
  // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
416
  // function will just return if it cannot parse a magic comment.
417
  if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
418 419
  Advance();
  LiteralBuffer name;
420 421
  while (c0_ != kEndOfInput &&
         !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
422 423 424 425 426 427
    name.AddChar(c0_);
    Advance();
  }
  if (!name.is_one_byte()) return;
  Vector<const uint8_t> name_literal = name.one_byte_literal();
  LiteralBuffer* value;
428
  if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
429
    value = &source_url_;
430
  } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
431 432 433 434 435 436 437 438
    value = &source_mapping_url_;
  } else {
    return;
  }
  if (c0_ != '=')
    return;
  Advance();
  value->Reset();
439
  while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
440 441
    Advance();
  }
442
  while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
443 444 445 446 447 448 449 450 451 452 453 454
    // Disallowed characters.
    if (c0_ == '"' || c0_ == '\'') {
      value->Reset();
      return;
    }
    if (unicode_cache_->IsWhiteSpace(c0_)) {
      break;
    }
    value->AddChar(c0_);
    Advance();
  }
  // Allow whitespace at the end.
455
  while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
456 457 458 459 460 461 462 463 464
    if (!unicode_cache_->IsWhiteSpace(c0_)) {
      value->Reset();
      break;
    }
    Advance();
  }
}


465
// Skips a multi-line comment. Expects c0_ to be the '*' of the opening "/*".
// Returns Token::WHITESPACE for a terminated comment and Token::ILLEGAL if
// the input ends before "*/" is found.
Token::Value Scanner::SkipMultiLineComment() {
  DCHECK(c0_ == '*');
  Advance();

  while (c0_ != kEndOfInput) {
    uc32 ch = c0_;
    Advance();
    // |ch| was read while c0_ != kEndOfInput held, so it is always a real
    // character and needs no end-of-input guard of its own.
    if (unicode_cache_->IsLineTerminator(ch)) {
      // Following ECMA-262, section 7.4, a comment containing
      // a newline will make the comment count as a line-terminator.
      has_multiline_comment_before_next_ = true;
    }
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}

490
// Called with c0_ on the '!' that follows a '<'. If the input continues with
// "--", the rest of the line is skipped as a comment; otherwise the consumed
// characters are pushed back and Token::LT is returned.
Token::Value Scanner::ScanHtmlComment() {
  // Check for <!-- comments.
  DCHECK(c0_ == '!');
  Advance();
  if (c0_ != '-') {
    PushBack('!');  // undo Advance()
    return Token::LT;
  }

  Advance();
  if (c0_ != '-') {
    PushBack2('-', '!');  // undo 2x Advance()
    return Token::LT;
  }

  found_html_comment_ = true;
  return SkipSingleLineComment();
}
508

509
void Scanner::Scan() {
510
  next_.literal_chars = NULL;
511
  next_.raw_literal_chars = NULL;
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
541
        } else if (c0_ == '!') {
542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
569
        // = == === =>
570 571 572
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
573 574
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
607
          if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
623
        Advance();
624
        if (c0_ == '*') {
625 626 627 628 629 630
          token = Select('=', Token::ASSIGN_EXP, Token::EXP);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_MUL);
        } else {
          token = Token::MUL;
        }
631 632 633 634 635 636 637 638 639 640 641
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
642
          Advance();
643
          if (c0_ == '#' || c0_ == '@') {
644 645 646 647 648 649
            Advance();
            token = SkipSourceURLComment();
          } else {
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
695 696 697 698 699 700 701 702 703
          if (c0_ == '.') {
            Advance();
            if (c0_ == '.') {
              Advance();
              token = Token::ELLIPSIS;
            } else {
              PushBack('.');
            }
          }
704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

751
      case '`':
752 753
        token = ScanTemplateStart();
        break;
754

755
      default:
756
        if (c0_ == kEndOfInput) {
757 758
          token = Token::EOS;
        } else if (unicode_cache_->IsIdentifierStart(c0_)) {
759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
776 777 778 779 780 781

#ifdef DEBUG
  SanityCheckTokenDesc(current_);
  SanityCheckTokenDesc(next_);
  SanityCheckTokenDesc(next_next_);
#endif
782 783
}

784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
#ifdef DEBUG
// Debug-only consistency check: asserts that |token| carries literal and raw
// literal buffers exactly when its token type calls for them.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Most tokens should not have literal_chars or even raw_literal chars.
  // The rules are:
  // - UNINITIALIZED: we don't care.
  // - TEMPLATE_*: need both literal + raw literal chars.
  // - IDENTIFIERS, STRINGS, etc.: need a literal, but no raw literal.
  // - all others: should have neither.

  switch (token.token) {
    case Token::UNINITIALIZED:
      // token.literal_chars & other members might be garbage. That's ok.
      break;
    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      break;
    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::IDENTIFIER:
    case Token::NUMBER:
    case Token::REGEXP_LITERAL:
    case Token::SMI:
    case Token::STRING:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      break;
    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      break;
  }
}
#endif  // DEBUG
820

821
void Scanner::SeekForward(int pos) {
822 823 824 825
  // After this call, we will have the token at the given position as
  // the "next" token. The "current" token will be invalid.
  if (pos == next_.location.beg_pos) return;
  int current_pos = source_pos();
826
  DCHECK_EQ(next_.location.end_pos, current_pos);
827
  // Positions inside the lookahead token aren't supported.
828
  DCHECK(pos >= current_pos);
829
  if (pos != current_pos) {
830
    source_->Seek(pos);
831 832 833 834 835 836 837 838
    Advance();
    // This function is only called to seek to the location
    // of the end of a function (at the "}" token). It doesn't matter
    // whether there was a line terminator in the part we skip.
    has_line_terminator_before_next_ = false;
    has_multiline_comment_before_next_ = false;
  }
  Scan();
839 840 841
}


842
template <bool capture_raw, bool in_template_literal>
843
bool Scanner::ScanEscape() {
844
  uc32 c = c0_;
845
  Advance<capture_raw>();
846 847

  // Skip escaped newlines.
848 849
  if (!in_template_literal && c0_ != kEndOfInput &&
      unicode_cache_->IsLineTerminator(c)) {
850
    // Allow CR+LF newlines in multiline string literals.
851
    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
852
    // Allow LF+CR newlines in multiline string literals.
853
    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
854
    return true;
855
  }
856

857 858 859 860 861 862 863 864 865 866
  switch (c) {
    case '\'':  // fall through
    case '"' :  // fall through
    case '\\': break;
    case 'b' : c = '\b'; break;
    case 'f' : c = '\f'; break;
    case 'n' : c = '\n'; break;
    case 'r' : c = '\r'; break;
    case 't' : c = '\t'; break;
    case 'u' : {
867
      c = ScanUnicodeEscape<capture_raw>();
868
      if (c < 0) return false;
869 870
      break;
    }
871 872 873 874
    case 'v':
      c = '\v';
      break;
    case 'x': {
875
      c = ScanHexNumber<capture_raw>(2);
876
      if (c < 0) return false;
877 878
      break;
    }
879
    case '0':  // Fall through.
880 881 882 883 884 885
    case '1':  // fall through
    case '2':  // fall through
    case '3':  // fall through
    case '4':  // fall through
    case '5':  // fall through
    case '6':  // fall through
886
    case '7':
887 888
      c = ScanOctalEscape<capture_raw>(c, 2);
      break;
889
  }
890

891 892 893
  // According to ECMA-262, section 7.8.4, characters not covered by the
  // above cases should be illegal, but they are commonly handled as
  // non-escaped characters by JS VMs.
894
  AddLiteralChar(c);
895
  return true;
896 897 898
}


899 900
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
// ECMA-262. Other JS VMs support them.
901
template <bool capture_raw>
902
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
903 904 905 906 907 908 909 910
  uc32 x = c - '0';
  int i = 0;
  for (; i < length; i++) {
    int d = c0_ - '0';
    if (d < 0 || d > 7) break;
    int nx = x * 8 + d;
    if (nx >= 256) break;
    x = nx;
911
    Advance<capture_raw>();
912 913 914 915 916 917 918 919
  }
  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
  // Remember the position of octal escape sequences so that an error
  // can be reported later (in strict mode).
  // We don't report the error immediately, because the octal escape can
  // occur before the "use strict" directive.
  if (c != '0' || i > 0) {
    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
920
  }
921
  return x;
922 923 924
}


925
// Scans a string literal. Expects c0_ to be the opening quote. Returns
// Token::STRING on success and Token::ILLEGAL if the literal is unterminated,
// spans a line terminator, or contains an invalid escape sequence.
Token::Value Scanner::ScanString() {
  uc32 quote = c0_;
  Advance<false, false>();  // consume quote

  LiteralScope literal(this);
  // First pass: handles characters up to kMaxAscii that need no escape
  // processing. It is left on the first backslash or the first character
  // above kMaxAscii.
  while (true) {
    if (c0_ > kMaxAscii) {
      HandleLeadSurrogate();
      break;
    }
    if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
    if (c0_ == quote) {
      literal.Complete();
      Advance<false, false>();
      return Token::STRING;
    }
    char c = static_cast<char>(c0_);
    if (c == '\\') break;
    Advance<false, false>();
    AddLiteralChar(c);
  }

  // General pass: handles escapes and all remaining characters.
  while (c0_ != quote && c0_ != kEndOfInput &&
         !unicode_cache_->IsLineTerminator(c0_)) {
    uc32 c = c0_;
    Advance();
    if (c == '\\') {
      if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
        return Token::ILLEGAL;
      }
    } else {
      AddLiteralChar(c);
    }
  }
  if (c0_ != quote) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // consume quote
  return Token::STRING;
}
965 966


967 968 969 970 971 972 973 974 975 976 977 978 979 980 981
// Scans one span of a template literal, collecting both the cooked and the
// raw literal. Sets next_.token and next_.location.end_pos and returns the
// token, except for an invalid escape, where Token::ILLEGAL is returned
// immediately.
Token::Value Scanner::ScanTemplateSpan() {
  // When scanning a TemplateSpan, we are looking for the following construct:
  // TEMPLATE_SPAN ::
  //     ` LiteralChars* ${
  //   | } LiteralChars* ${
  //
  // TEMPLATE_TAIL ::
  //     ` LiteralChars* `
  //   | } LiteralChars* `
  //
  // A TEMPLATE_SPAN should always be followed by an Expression, while a
  // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
  // followed by an Expression.

  Token::Value result = Token::TEMPLATE_SPAN;
  LiteralScope literal(this);
  StartRawLiteral();
  const bool capture_raw = true;
  const bool in_template_literal = true;
  while (true) {
    uc32 c = c0_;
    Advance<capture_raw>();
    if (c == '`') {
      result = Token::TEMPLATE_TAIL;
      // The closing '`' is not part of the raw literal.
      ReduceRawLiteralLength(1);
      break;
    } else if (c == '$' && c0_ == '{') {
      Advance<capture_raw>();  // Consume '{'
      // Neither '$' nor '{' is part of the raw literal.
      ReduceRawLiteralLength(2);
      break;
    } else if (c == '\\') {
      if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) {
        // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
        // code unit sequence.
        uc32 lastChar = c0_;
        Advance<capture_raw>();
        if (lastChar == '\r') {
          ReduceRawLiteralLength(1);  // Remove \r
          if (c0_ == '\n') {
            Advance<capture_raw>();  // Adds \n
          } else {
            AddRawLiteralChar('\n');
          }
        }
      } else if (!ScanEscape<capture_raw, in_template_literal>()) {
        return Token::ILLEGAL;
      }
    } else if (c < 0) {
      // Unterminated template literal
      PushBack(c);
      break;
    } else {
      // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
      // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
      // consisting of the CV 0x000A.
      if (c == '\r') {
        ReduceRawLiteralLength(1);  // Remove \r
        if (c0_ == '\n') {
          Advance<capture_raw>();  // Adds \n
        } else {
          AddRawLiteralChar('\n');
        }
        c = '\n';
      }
      AddLiteralChar(c);
    }
  }
  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;
  return result;
}


1041
// Scans the first span of a template literal. Expects c0_ to be the opening
// '`' and no token to be buffered in next_next_.
Token::Value Scanner::ScanTemplateStart() {
  DCHECK_EQ(next_next_.token, Token::UNINITIALIZED);
  DCHECK(c0_ == '`');
  next_.location.beg_pos = source_pos();
  Advance();  // Consume `
  return ScanTemplateSpan();
}


// Resumes scanning a template literal after a substitution. Requires next_ to
// be the '}' that closed the substitution.
Token::Value Scanner::ScanTemplateContinuation() {
  DCHECK_EQ(next_.token, Token::RBRACE);
  // The '}' has already been consumed, so the span starts one position back.
  const int rbrace_pos = source_pos() - 1;
  next_.location.beg_pos = rbrace_pos;
  const Token::Value span_token = ScanTemplateSpan();
  return span_token;
}


1057
void Scanner::ScanDecimalDigits() {
1058 1059
  while (IsDecimalDigit(c0_))
    AddLiteralCharAdvance();
1060 1061 1062
}


1063
// Scans a numeric literal into the current literal buffer.
//
// |seen_period| is true when the caller has already consumed a leading '.',
// in which case c0_ is the first digit of the fraction.
//
// Returns:
//   Token::SMI     for a plain decimal integer of at most 10 characters whose
//                  value fits in Smi::kMaxValue and that is not followed by
//                  '.', 'e' or 'E'; the value is stored in next_.smi_value_.
//   Token::NUMBER  for every other well-formed literal.
//   Token::ILLEGAL for a malformed literal (missing digits after a radix
//                  prefix or exponent, an exponent on a non-decimal literal,
//                  or a literal directly followed by a digit or an
//                  identifier start character).
//
// Side effects: records octal_pos_ for implicit (legacy) octal literals and
// decimal_with_leading_zero_pos_ for decimals written with a leading zero.
Token::Value Scanner::ScanNumber(bool seen_period) {
  DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum {
    DECIMAL,
    DECIMAL_WITH_LEADING_ZERO,
    HEX,
    OCTAL,
    IMPLICIT_OCTAL,
    BINARY
  } kind = DECIMAL;

  LiteralScope literal(this);
  // True while the Smi fast path below is still applicable.
  bool at_start = !seen_period;
  int start_pos = source_pos();  // For reporting octal positions.
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'o' || c0_ == 'O') {
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (c0_ == 'b' || c0_ == 'B') {
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          if (c0_ == '8' || c0_ == '9') {
            // Not an octal after all; the digits consumed so far were not
            // accumulated into a value, so the Smi fast path is disabled.
            at_start = false;
            kind = DECIMAL_WITH_LEADING_ZERO;
            break;
          }
          if (c0_  < '0' || '7'  < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      } else if (c0_ == '8' || c0_ == '9') {
        kind = DECIMAL_WITH_LEADING_ZERO;
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
      if (at_start) {
        // Fast path: accumulate the integer value while scanning so that
        // small integers can be returned as Token::SMI.
        uint64_t value = 0;
        while (IsDecimalDigit(c0_)) {
          value = 10 * value + (c0_ - '0');

          uc32 first_char = c0_;
          Advance<false, false>();
          AddLiteralChar(first_char);
        }

        // The length check bounds the digit count, so a |value| that wrapped
        // around on a very long digit run is never accepted here.
        if (next_.literal_chars->one_byte_literal().length() <= 10 &&
            value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
          next_.smi_value_ = static_cast<uint32_t>(value);
          literal.Complete();
          HandleLeadSurrogate();

          if (kind == DECIMAL_WITH_LEADING_ZERO)
            decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
          return Token::SMI;
        }
        HandleLeadSurrogate();
      }

      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
      return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
    return Token::ILLEGAL;

  literal.Complete();

  if (kind == DECIMAL_WITH_LEADING_ZERO)
    decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
  return Token::NUMBER;
}


1207
// Scans a unicode escape inside an identifier. Advances past the current
// character (the callers in this file invoke it with c0_ at the backslash)
// and requires a 'u' next. Returns the escaped code point, or -1 if the 'u'
// is missing; otherwise the result is whatever ScanUnicodeEscape() yields.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ != 'u') return -1;
  Advance();
  return ScanUnicodeEscape<false>();
}


1215
template <bool capture_raw>
marja's avatar
marja committed
1216
uc32 Scanner::ScanUnicodeEscape() {
adamk's avatar
adamk committed
1217 1218 1219
  // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
  // hex digits between { } is arbitrary. \ and u have already been read.
  if (c0_ == '{') {
1220
    int begin = source_pos() - 2;
1221
    Advance<capture_raw>();
1222 1223 1224 1225
    uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
    if (cp < 0 || c0_ != '}') {
      ReportScannerError(source_pos(),
                         MessageTemplate::kInvalidUnicodeEscapeSequence);
marja's avatar
marja committed
1226 1227
      return -1;
    }
1228
    Advance<capture_raw>();
marja's avatar
marja committed
1229 1230
    return cp;
  }
1231 1232
  const bool unicode = true;
  return ScanHexNumber<capture_raw, unicode>(4);
1233 1234 1235
}


1236 1237 1238
// ----------------------------------------------------------------------------
// Keyword Matcher

1239
// X-macro listing every keyword the scanner recognizes, grouped by first
// character. KEYWORD_GROUP(ch) introduces the group of keywords starting with
// |ch|; KEYWORD(text, token) maps the keyword spelling to its token.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)
1303

1304
// Classifies a one-byte identifier spelling.
//
// |input| points at |input_length| (>= 1) characters. Returns the token of
// the keyword in KEYWORDS whose spelling equals the input exactly, or
// Token::IDENTIFIER if there is none.
//
// The switch dispatches on the first character; within a group each keyword
// is compared by length first and then character by character, with the
// comparisons beyond the keyword's own length short-circuited by the
// compile-time constant |keyword_length|.
static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
                                             int input_length) {
  DCHECK(input_length >= 1);
  // Bounds on keyword lengths; verified per keyword by the STATIC_ASSERTs
  // below, so anything outside this range cannot be a keyword.
  const int kMinLength = 2;
  const int kMaxLength = 10;
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                                     \
  {                                                                 \
    /* 'keyword' is a char array, so sizeof(keyword) is */          \
    /* strlen(keyword) plus 1 for the NUL char. */                  \
    const int keyword_length = sizeof(keyword) - 1;                 \
    STATIC_ASSERT(keyword_length >= kMinLength);                    \
    STATIC_ASSERT(keyword_length <= kMaxLength);                    \
    if (input_length == keyword_length && input[1] == keyword[1] && \
        (keyword_length <= 2 || input[2] == keyword[2]) &&          \
        (keyword_length <= 3 || input[3] == keyword[3]) &&          \
        (keyword_length <= 4 || input[4] == keyword[4]) &&          \
        (keyword_length <= 5 || input[5] == keyword[5]) &&          \
        (keyword_length <= 6 || input[6] == keyword[6]) &&          \
        (keyword_length <= 7 || input[7] == keyword[7]) &&          \
        (keyword_length <= 8 || input[8] == keyword[8]) &&          \
        (keyword_length <= 9 || input[9] == keyword[9])) {          \
      return token;                                                 \
    }                                                               \
  }
    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
// The helper macros are only meaningful inside this switch; do not let them
// leak into the rest of the translation unit.
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
  }
  return Token::IDENTIFIER;
}


1342 1343 1344
bool Scanner::IdentifierIsFutureStrictReserved(
    const AstRawString* string) const {
  // Keywords are always 1-byte strings.
1345 1346 1347 1348 1349 1350
  if (!string->is_one_byte()) return false;
  if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
      string->IsOneByteEqualTo("yield")) {
    return true;
  }
  return Token::FUTURE_STRICT_RESERVED_WORD ==
1351
         KeywordOrIdentifierToken(string->raw_data(), string->length());
1352 1353 1354
}


1355
// Scans an identifier or keyword starting at c0_, which must be an
// identifier start character.
//
// ASCII-only input is handled by fast paths: a run of lowercase letters may
// be a keyword, while anything containing digits, uppercase letters, '_' or
// '$' can only be an identifier. Input containing a backslash escape is
// handed to ScanIdentifierSuffix(); other non-ASCII input falls through to
// the generic loop at the end.
//
// Returns Token::IDENTIFIER, a keyword token, or Token::ILLEGAL for an
// invalid escape. The literal is kept (completed) for identifiers and
// future-strict-reserved words, and dropped for other keywords.
Token::Value Scanner::ScanIdentifierOrKeyword() {
  DCHECK(unicode_cache_->IsIdentifierStart(c0_));
  LiteralScope literal(this);
  if (IsInRange(c0_, 'a', 'z')) {
    do {
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
    } while (IsInRange(c0_, 'a', 'z'));

    if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
        c0_ == '$') {
      // Identifier starting with lowercase.
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
      while (IsAsciiIdentifier(c0_)) {
        char first_char = static_cast<char>(c0_);
        Advance<false, false>();
        AddLiteralChar(first_char);
      }
      if (c0_ <= kMaxAscii && c0_ != '\\') {
        literal.Complete();
        return Token::IDENTIFIER;
      }
    } else if (c0_ <= kMaxAscii && c0_ != '\\') {
      // Only a-z+: could be a keyword or identifier.
      Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
      Token::Value token =
          KeywordOrIdentifierToken(chars.start(), chars.length());
      if (token == Token::IDENTIFIER ||
          token == Token::FUTURE_STRICT_RESERVED_WORD)
        literal.Complete();
      return token;
    }

    // The next character is a backslash or non-ASCII; continue on the
    // generic path below.
    HandleLeadSurrogate();
  } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
    do {
      char first_char = static_cast<char>(c0_);
      Advance<false, false>();
      AddLiteralChar(first_char);
    } while (IsAsciiIdentifier(c0_));

    if (c0_ <= kMaxAscii && c0_ != '\\') {
      literal.Complete();
      return Token::IDENTIFIER;
    }

    HandleLeadSurrogate();
  } else if (c0_ == '\\') {
    // Scan identifier start character.
    uc32 c = ScanIdentifierUnicodeEscape();
    // Only allow legal identifier start characters.
    if (c < 0 ||
        c == '\\' ||  // No recursive escapes.
        !unicode_cache_->IsIdentifierStart(c)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(c);
    return ScanIdentifierSuffix(&literal, true);
  } else {
    uc32 first_char = c0_;
    Advance();
    AddLiteralChar(first_char);
  }

  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ != '\\') {
      uc32 next_char = c0_;
      Advance();
      AddLiteralChar(next_char);
      continue;
    }
    // Fallthrough if no longer able to complete keyword.
    return ScanIdentifierSuffix(&literal, false);
  }

  if (next_.literal_chars->is_one_byte()) {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    // Keep the literal for the same tokens as the a-z fast path above. This
    // path is reached, for example, when an all-lowercase word is directly
    // followed by a non-ASCII character that is not an identifier part.
    if (token == Token::IDENTIFIER ||
        token == Token::FUTURE_STRICT_RESERVED_WORD)
      literal.Complete();
    return token;
  }
  literal.Complete();
  return Token::IDENTIFIER;
}

lrn@chromium.org's avatar
lrn@chromium.org committed
1445

1446 1447
// Scans the remainder of an identifier that may contain unicode escapes and
// completes |literal|.
//
// |escaped| tells whether an escape has already been seen in the identifier;
// it is set to true here as soon as one is scanned.
//
// Returns Token::ILLEGAL for an invalid escape or an escape that does not
// denote an identifier part. Otherwise returns Token::IDENTIFIER unless the
// identifier contained an escape and spells a keyword, in which case
// Token::ESCAPED_STRICT_RESERVED_WORD (future-strict-reserved words, "let",
// "static") or Token::ESCAPED_KEYWORD (all other keywords) is returned.
Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
                                           bool escaped) {
  // Scan the rest of the identifier characters.
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    if (c0_ == '\\') {
      uc32 c = ScanIdentifierUnicodeEscape();
      escaped = true;
      // Only allow legal identifier part characters.
      if (c < 0 ||
          c == '\\' ||
          !unicode_cache_->IsIdentifierPart(c)) {
        return Token::ILLEGAL;
      }
      AddLiteralChar(c);
    } else {
      AddLiteralChar(c0_);
      Advance();
    }
  }
  literal->Complete();

  if (escaped && next_.literal_chars->is_one_byte()) {
    Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
    Token::Value token =
        KeywordOrIdentifierToken(chars.start(), chars.length());
    /* TODO(adamk): YIELD should be handled specially. */
    if (token == Token::IDENTIFIER) {
      return Token::IDENTIFIER;
    } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
               token == Token::LET || token == Token::STATIC) {
      return Token::ESCAPED_STRICT_RESERVED_WORD;
    } else {
      return Token::ESCAPED_KEYWORD;
    }
  }
  return Token::IDENTIFIER;
}

1484 1485 1486
bool Scanner::ScanRegExpPattern() {
  DCHECK(next_next_.token == Token::UNINITIALIZED);
  DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
lrn@chromium.org's avatar
lrn@chromium.org committed
1487

1488 1489
  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  bool in_character_class = false;
1490
  bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1491 1492 1493 1494 1495

  // Previous token is either '/' or '/=', in the second case, the
  // pattern starts at =.
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1496

1497 1498 1499 1500 1501 1502 1503 1504 1505
  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  // the scanner should pass uninterpreted bodies to the RegExp
  // constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  while (c0_ != '/' || in_character_class) {
1506 1507
    if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
      return false;
1508 1509
    if (c0_ == '\\') {  // Escape sequence.
      AddLiteralCharAdvance();
1510 1511
      if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
        return false;
1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531
      AddLiteralCharAdvance();
      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
      // only "safe" characters are allowed (letters, digits, underscore),
      // otherwise the escape isn't valid and the invalid character has
      // its normal meaning. I.e., we can just continue scanning without
      // worrying whether the following characters are part of the escape
      // or not, since any '/', '\\' or '[' is guaranteed to not be part
      // of the escape sequence.

      // TODO(896): At some point, parse RegExps more throughly to capture
      // octal esacpes in strict mode.
    } else {  // Unescaped character.
      if (c0_ == '[') in_character_class = true;
      if (c0_ == ']') in_character_class = false;
      AddLiteralCharAdvance();
    }
  }
  Advance();  // consume '/'

  literal.Complete();
1532
  next_.token = Token::REGEXP_LITERAL;
1533
  return true;
1534 1535 1536
}


1537
// Scans the flags that follow a regular expression pattern; must be called
// right after a successful ScanRegExpPattern().
//
// Consumes identifier-part characters and accepts only 'g', 'i', 'm', 'u' and
// 'y', each at most once. Returns Nothing for any other character or for a
// repeated flag; otherwise returns the combined flags and extends the end
// position of next_ to cover them.
Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
  DCHECK(next_.token == Token::REGEXP_LITERAL);

  // Scan regular expression flags.
  int flags = 0;
  while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
    RegExp::Flags flag = RegExp::kNone;
    switch (c0_) {
      case 'g':
        flag = RegExp::kGlobal;
        break;
      case 'i':
        flag = RegExp::kIgnoreCase;
        break;
      case 'm':
        flag = RegExp::kMultiline;
        break;
      case 'u':
        flag = RegExp::kUnicode;
        break;
      case 'y':
        flag = RegExp::kSticky;
        break;
      default:
        return Nothing<RegExp::Flags>();
    }
    // Reject a flag that has already been seen.
    if (flags & flag) {
      return Nothing<RegExp::Flags>();
    }
    Advance();
    flags |= flag;
  }

  next_.location.end_pos = source_pos();
  return Just(RegExp::Flags(flags));
}

1574

1575 1576 1577
// Returns the AstRawString for the current token's literal, obtained from
// |ast_value_factory| as a one-byte or two-byte string depending on the
// literal's representation.
const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
  if (is_literal_one_byte()) {
    return ast_value_factory->GetOneByteString(literal_one_byte_string());
  }
  return ast_value_factory->GetTwoByteString(literal_two_byte_string());
}


1583 1584 1585
// Returns the AstRawString for the next token's literal, obtained from
// |ast_value_factory| as a one-byte or two-byte string depending on the
// literal's representation.
const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
  if (is_next_literal_one_byte()) {
    return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
  }
  return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
}


1591 1592 1593 1594 1595 1596 1597 1598 1599
// Returns the AstRawString for the current token's raw literal, choosing the
// one-byte or two-byte factory entry point to match the raw literal's
// representation.
const AstRawString* Scanner::CurrentRawSymbol(
    AstValueFactory* ast_value_factory) {
  return is_raw_literal_one_byte()
             ? ast_value_factory->GetOneByteString(
                   raw_literal_one_byte_string())
             : ast_value_factory->GetTwoByteString(
                   raw_literal_two_byte_string());
}


1600
double Scanner::DoubleValue() {
1601
  DCHECK(is_literal_one_byte());
1602
  return StringToDouble(
1603 1604
      unicode_cache_,
      literal_one_byte_string(),
1605 1606 1607 1608
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
}


1609 1610 1611 1612 1613 1614 1615
bool Scanner::ContainsDot() {
  DCHECK(is_literal_one_byte());
  Vector<const uint8_t> str = literal_one_byte_string();
  return std::find(str.begin(), str.end(), '.') != str.end();
}


1616
// Adds the current token's literal to |finder| together with |value| and
// returns the finder's result, using the one-byte or two-byte entry point to
// match the literal's representation.
int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
  // TODO(vogelheim): Move this logic into the calling class; this can be fully
  //                  implemented using the public interface.
  if (is_literal_one_byte()) {
    return finder->AddOneByteSymbol(literal_one_byte_string(), value);
  }
  return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
}

1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640
void Scanner::SeekNext(size_t position) {
  // Use with care: This cleanly resets most, but not all scanner state.
  // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.

  // To re-scan from a given character position, we need to:
  // 1, Reset the current_, next_ and next_next_ tokens
  //    (next_ + next_next_ will be overwrittem by Next(),
  //     current_ will remain unchanged, so overwrite it fully.)
  current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
  next_.token = Token::UNINITIALIZED;
  next_next_.token = Token::UNINITIALIZED;
  // 2, reset the source to the desired position,
  source_->Seek(position);
  // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
  c0_ = source_->Advance();
  Next();
1641
  DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
1642 1643
}

1644 1645
}  // namespace internal
}  // namespace v8