// Copyright 2018 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_PARSING_SCANNER_INL_H_ #define V8_PARSING_SCANNER_INL_H_ #include "src/parsing/keywords-gen.h" #include "src/parsing/scanner.h" #include "src/strings/char-predicates-inl.h" #include "src/utils/utils.h" namespace v8 { namespace internal { // ---------------------------------------------------------------------------- // Keyword Matcher #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ KEYWORD_GROUP('a') \ KEYWORD("async", Token::ASYNC) \ KEYWORD("await", Token::AWAIT) \ KEYWORD_GROUP('b') \ KEYWORD("break", Token::BREAK) \ KEYWORD_GROUP('c') \ KEYWORD("case", Token::CASE) \ KEYWORD("catch", Token::CATCH) \ KEYWORD("class", Token::CLASS) \ KEYWORD("const", Token::CONST) \ KEYWORD("continue", Token::CONTINUE) \ KEYWORD_GROUP('d') \ KEYWORD("debugger", Token::DEBUGGER) \ KEYWORD("default", Token::DEFAULT) \ KEYWORD("delete", Token::DELETE) \ KEYWORD("do", Token::DO) \ KEYWORD_GROUP('e') \ KEYWORD("else", Token::ELSE) \ KEYWORD("enum", Token::ENUM) \ KEYWORD("export", Token::EXPORT) \ KEYWORD("extends", Token::EXTENDS) \ KEYWORD_GROUP('f') \ KEYWORD("false", Token::FALSE_LITERAL) \ KEYWORD("finally", Token::FINALLY) \ KEYWORD("for", Token::FOR) \ KEYWORD("function", Token::FUNCTION) \ KEYWORD_GROUP('g') \ KEYWORD("get", Token::GET) \ KEYWORD_GROUP('i') \ KEYWORD("if", Token::IF) \ KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD("import", Token::IMPORT) \ KEYWORD("in", Token::IN) \ KEYWORD("instanceof", Token::INSTANCEOF) \ KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD_GROUP('l') \ KEYWORD("let", Token::LET) \ KEYWORD_GROUP('n') \ KEYWORD("new", Token::NEW) \ KEYWORD("null", Token::NULL_LITERAL) \ KEYWORD_GROUP('p') \ KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ KEYWORD_GROUP('r') \ KEYWORD("return", Token::RETURN) \ KEYWORD_GROUP('s') \ KEYWORD("set", Token::SET) \ KEYWORD("static", Token::STATIC) \ KEYWORD("super", Token::SUPER) \ KEYWORD("switch", Token::SWITCH) \ KEYWORD_GROUP('t') \ KEYWORD("this", Token::THIS) \ KEYWORD("throw", Token::THROW) \ KEYWORD("true", Token::TRUE_LITERAL) \ KEYWORD("try", Token::TRY) \ KEYWORD("typeof", Token::TYPEOF) \ KEYWORD_GROUP('v') \ KEYWORD("var", Token::VAR) \ KEYWORD("void", Token::VOID) \ KEYWORD_GROUP('w') \ KEYWORD("while", Token::WHILE) \ KEYWORD("with", Token::WITH) \ KEYWORD_GROUP('y') \ KEYWORD("yield", Token::YIELD) constexpr bool IsKeywordStart(char c) { #define KEYWORD_GROUP_CHECK(ch) c == ch || #define KEYWORD_CHECK(keyword, token) return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false; #undef KEYWORD_GROUP_CHECK #undef KEYWORD_CHECK } V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input, int input_length) { DCHECK_GE(input_length, 1); return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input), input_length); } // Recursive constexpr template magic to check if a character is in a given // string. template <int N> constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) { return i >= N ? false : s[i] == c ? true : IsInString(s, c, i + 1); } inline constexpr bool CanBeKeywordCharacter(char c) { return IsInString( #define KEYWORD_GROUP_CASE(ch) // Nothing #define KEYWORD(keyword, token) keyword // Use C string literal concatenation ("a" "b" becomes "ab") to build one // giant string containing all the keywords. KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) #undef KEYWORD #undef KEYWORD_GROUP_CASE , c); } // Make sure tokens are stored as a single byte. STATIC_ASSERT(sizeof(Token::Value) == 1); // Get the shortest token that this character starts, the token may change // depending on subsequent characters. constexpr Token::Value GetOneCharToken(char c) { // clang-format off return c == '(' ? Token::LPAREN : c == ')' ? Token::RPAREN : c == '{' ? Token::LBRACE : c == '}' ? Token::RBRACE : c == '[' ? Token::LBRACK : c == ']' ? Token::RBRACK : c == '?' ? Token::CONDITIONAL : c == ':' ? Token::COLON : c == ';' ? Token::SEMICOLON : c == ',' ? Token::COMMA : c == '.' ? Token::PERIOD : c == '|' ? Token::BIT_OR : c == '&' ? Token::BIT_AND : c == '^' ? Token::BIT_XOR : c == '~' ? Token::BIT_NOT : c == '!' ? Token::NOT : c == '<' ? Token::LT : c == '>' ? Token::GT : c == '%' ? Token::MOD : c == '=' ? Token::ASSIGN : c == '+' ? Token::ADD : c == '-' ? Token::SUB : c == '*' ? Token::MUL : c == '/' ? Token::DIV : c == '#' ? Token::PRIVATE_NAME : c == '"' ? Token::STRING : c == '\'' ? Token::STRING : c == '`' ? Token::TEMPLATE_SPAN : c == '\\' ? Token::IDENTIFIER : // Whitespace or line terminator c == ' ' ? Token::WHITESPACE : c == '\t' ? Token::WHITESPACE : c == '\v' ? Token::WHITESPACE : c == '\f' ? Token::WHITESPACE : c == '\r' ? Token::WHITESPACE : c == '\n' ? Token::WHITESPACE : // IsDecimalDigit must be tested before IsAsciiIdentifier IsDecimalDigit(c) ? Token::NUMBER : IsAsciiIdentifier(c) ? Token::IDENTIFIER : Token::ILLEGAL; // clang-format on } // Table of one-character tokens, by character (0x00..0x7F only). static const constexpr Token::Value one_char_tokens[128] = { #define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N), INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS) #undef CALL_GET_SCAN_FLAGS }; #undef KEYWORDS V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() { next().literal_chars.Start(); return ScanIdentifierOrKeywordInner(); } // Character flags for the fast path of scanning a keyword or identifier token. enum class ScanFlags : uint8_t { kTerminatesLiteral = 1 << 0, // "Cannot" rather than "can" so that this flag can be ORed together across // multiple characters. kCannotBeKeyword = 1 << 1, kCannotBeKeywordStart = 1 << 2, kStringTerminator = 1 << 3, kIdentifierNeedsSlowPath = 1 << 4, kMultilineCommentCharacterNeedsSlowPath = 1 << 5, }; constexpr uint8_t GetScanFlags(char c) { return // Keywords are all lowercase and only contain letters. // Note that non-identifier characters do not set this flag, so // that it plays well with kTerminatesLiteral. (IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c) ? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) : 0) | (IsKeywordStart(c) ? 0 : static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) | // Anything that isn't an identifier character will terminate the // literal, or at least terminates the literal fast path processing // (like an escape). (!IsAsciiIdentifier(c) ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral) : 0) | // Possible string termination characters. ((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\') ? static_cast<uint8_t>(ScanFlags::kStringTerminator) : 0) | // Escapes are processed on the slow path. (c == '\\' ? static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath) : 0) | // Newlines and * are interesting characters for multiline comment // scanning. (c == '\n' || c == '\r' || c == '*' ? static_cast<uint8_t>( ScanFlags::kMultilineCommentCharacterNeedsSlowPath) : 0); } inline bool TerminatesLiteral(uint8_t scan_flags) { return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)); } inline bool CanBeKeyword(uint8_t scan_flags) { return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)); } inline bool IdentifierNeedsSlowPath(uint8_t scan_flags) { return (scan_flags & static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath)); } inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags) { return (scan_flags & static_cast<uint8_t>( ScanFlags::kMultilineCommentCharacterNeedsSlowPath)); } inline bool MayTerminateString(uint8_t scan_flags) { return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator)); } // Table of precomputed scan flags for the 128 ASCII characters, for branchless // flag calculation during the scan. static constexpr const uint8_t character_scan_flags[128] = { #define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N), INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS) #undef CALL_GET_SCAN_FLAGS }; inline bool CharCanBeKeyword(uc32 c) { return static_cast<uint32_t>(c) < arraysize(character_scan_flags) && CanBeKeyword(character_scan_flags[c]); } V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() { DCHECK(IsIdentifierStart(c0_)); bool escaped = false; bool can_be_keyword = true; STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1); if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) { if (V8_LIKELY(c0_ != '\\')) { uint8_t scan_flags = character_scan_flags[c0_]; DCHECK(!TerminatesLiteral(scan_flags)); STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) == static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1); scan_flags >>= 1; // Make sure the shifting above doesn't set IdentifierNeedsSlowPath. // Otherwise we'll fall into the slow path after scanning the identifier. DCHECK(!IdentifierNeedsSlowPath(scan_flags)); AddLiteralChar(static_cast<char>(c0_)); AdvanceUntil([this, &scan_flags](uc32 c0) { if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) { // A non-ascii character means we need to drop through to the slow // path. // TODO(leszeks): This would be most efficient as a goto to the slow // path, check codegen and maybe use a bool instead. scan_flags |= static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath); return true; } uint8_t char_flags = character_scan_flags[c0]; scan_flags |= char_flags; if (TerminatesLiteral(char_flags)) { return true; } else { AddLiteralChar(static_cast<char>(c0)); return false; } }); if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) { if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER; // Could be a keyword or identifier. Vector<const uint8_t> chars = next().literal_chars.one_byte_literal(); return KeywordOrIdentifierToken(chars.begin(), chars.length()); } can_be_keyword = CanBeKeyword(scan_flags); } else { // Special case for escapes at the start of an identifier. escaped = true; uc32 c = ScanIdentifierUnicodeEscape(); DCHECK(!IsIdentifierStart(Invalid())); if (c == '\\' || !IsIdentifierStart(c)) { return Token::ILLEGAL; } AddLiteralChar(c); can_be_keyword = CharCanBeKeyword(c); } } return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword); } V8_INLINE Token::Value Scanner::SkipWhiteSpace() { int start_position = source_pos(); // We won't skip behind the end of input. DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput)); // Advance as long as character is a WhiteSpace or LineTerminator. while (IsWhiteSpaceOrLineTerminator(c0_)) { if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) { next().after_line_terminator = true; } Advance(); } // Return whether or not we skipped any characters. if (source_pos() == start_position) { DCHECK_NE('0', c0_); return Token::ILLEGAL; } return Token::WHITESPACE; } V8_INLINE Token::Value Scanner::ScanSingleToken() { Token::Value token; do { next().location.beg_pos = source_pos(); if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) { token = one_char_tokens[c0_]; switch (token) { case Token::LPAREN: case Token::RPAREN: case Token::LBRACE: case Token::RBRACE: case Token::LBRACK: case Token::RBRACK: case Token::COLON: case Token::SEMICOLON: case Token::COMMA: case Token::BIT_NOT: case Token::ILLEGAL: // One character tokens. return Select(token); case Token::CONDITIONAL: // ? ?. ?? ??= Advance(); if (c0_ == '.') { Advance(); if (!IsDecimalDigit(c0_)) return Token::QUESTION_PERIOD; PushBack('.'); } else if (c0_ == '?') { return Select('=', Token::ASSIGN_NULLISH, Token::NULLISH); } return Token::CONDITIONAL; case Token::STRING: return ScanString(); case Token::LT: // < <= << <<= <!-- Advance(); if (c0_ == '=') return Select(Token::LTE); if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL); if (c0_ == '!') { token = ScanHtmlComment(); continue; } return Token::LT; case Token::GT: // > >= >> >>= >>> >>>= Advance(); if (c0_ == '=') return Select(Token::GTE); if (c0_ == '>') { // >> >>= >>> >>>= Advance(); if (c0_ == '=') return Select(Token::ASSIGN_SAR); if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR); return Token::SAR; } return Token::GT; case Token::ASSIGN: // = == === => Advance(); if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ); if (c0_ == '>') return Select(Token::ARROW); return Token::ASSIGN; case Token::NOT: // ! != !== Advance(); if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE); return Token::NOT; case Token::ADD: // + ++ += Advance(); if (c0_ == '+') return Select(Token::INC); if (c0_ == '=') return Select(Token::ASSIGN_ADD); return Token::ADD; case Token::SUB: // - -- --> -= Advance(); if (c0_ == '-') { Advance(); if (c0_ == '>' && next().after_line_terminator) { // For compatibility with SpiderMonkey, we skip lines that // start with an HTML comment end '-->'. token = SkipSingleHTMLComment(); continue; } return Token::DEC; } if (c0_ == '=') return Select(Token::ASSIGN_SUB); return Token::SUB; case Token::MUL: // * *= Advance(); if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP); if (c0_ == '=') return Select(Token::ASSIGN_MUL); return Token::MUL; case Token::MOD: // % %= return Select('=', Token::ASSIGN_MOD, Token::MOD); case Token::DIV: // / // /* /= Advance(); if (c0_ == '/') { uc32 c = Peek(); if (c == '#' || c == '@') { Advance(); Advance(); token = SkipSourceURLComment(); continue; } token = SkipSingleLineComment(); continue; } if (c0_ == '*') { token = SkipMultiLineComment(); continue; } if (c0_ == '=') return Select(Token::ASSIGN_DIV); return Token::DIV; case Token::BIT_AND: // & && &= &&= Advance(); if (c0_ == '&') return Select('=', Token::ASSIGN_AND, Token::AND); if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND); return Token::BIT_AND; case Token::BIT_OR: // | || |= ||= Advance(); if (c0_ == '|') return Select('=', Token::ASSIGN_OR, Token::OR); if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR); return Token::BIT_OR; case Token::BIT_XOR: // ^ ^= return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); case Token::PERIOD: // . Number Advance(); if (IsDecimalDigit(c0_)) return ScanNumber(true); if (c0_ == '.') { if (Peek() == '.') { Advance(); Advance(); return Token::ELLIPSIS; } } return Token::PERIOD; case Token::TEMPLATE_SPAN: Advance(); return ScanTemplateSpan(); case Token::PRIVATE_NAME: if (source_pos() == 0 && Peek() == '!') { token = SkipSingleLineComment(); continue; } return ScanPrivateName(); case Token::WHITESPACE: token = SkipWhiteSpace(); continue; case Token::NUMBER: return ScanNumber(false); case Token::IDENTIFIER: return ScanIdentifierOrKeyword(); default: UNREACHABLE(); } } if (IsIdentifierStart(c0_) || (CombineSurrogatePair() && IsIdentifierStart(c0_))) { return ScanIdentifierOrKeyword(); } if (c0_ == kEndOfInput) { return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS; } token = SkipWhiteSpace(); // Continue scanning for tokens as long as we're just skipping whitespace. } while (token == Token::WHITESPACE); return token; } void Scanner::Scan(TokenDesc* next_desc) { DCHECK_EQ(next_desc, &next()); next_desc->token = ScanSingleToken(); DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL); next_desc->location.end_pos = source_pos(); #ifdef DEBUG SanityCheckTokenDesc(current()); SanityCheckTokenDesc(next()); SanityCheckTokenDesc(next_next()); #endif } void Scanner::Scan() { Scan(next_); } } // namespace internal } // namespace v8 #endif // V8_PARSING_SCANNER_INL_H_