Commit 7698d8d6 authored by Leszek Swirski, committed by Commit Bot

[parser] Use a lookup table for identifier scanning

Change the keyword/identifier scan to a single loop that branchlessly
collects information on whether this is a possible keyword, identifier
terminator or slow path (i.e. escapes) by looking up the value in a
flags table (as long as the character is ascii).

Also rewrites that loop as an AdvanceUntil, and sprinkles in some
V8_LIKELY magic which is 'likely' to improve things.

Change-Id: If06b0fff23630e7593b515308e5ffeca2d65daa8
Reviewed-on: https://chromium-review.googlesource.com/c/1328943
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Toon Verwaest <verwaest@chromium.org>
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57391}
parent 631dd28d
......@@ -14,63 +14,53 @@ namespace internal {
// If c is in 'A'-'Z' or 'a'-'z', return its lower-case.
// Else, return something outside of 'A'-'Z' and 'a'-'z'.
// Note: it ignores LOCALE.
inline int AsciiAlphaToLower(uc32 c) {
return c | 0x20;
}
// Sets the single bit in which 'a' and 'A' differ (0x20), which lower-cases an
// ASCII letter. Locale is not consulted.
inline constexpr int AsciiAlphaToLower(uc32 c) {
  return c | ('a' ^ 'A');
}
inline bool IsCarriageReturn(uc32 c) {
return c == 0x000D;
}
// True when c is the carriage-return character (code point 0x000D).
inline constexpr bool IsCarriageReturn(uc32 c) {
  return 0x000D == c;
}
inline bool IsLineFeed(uc32 c) {
return c == 0x000A;
}
// True when c is the line-feed character (code point 0x000A).
inline constexpr bool IsLineFeed(uc32 c) {
  return 0x000A == c;
}
inline bool IsAsciiIdentifier(uc32 c) {
// True when c is '$', '_', or an ASCII letter or decimal digit.
inline constexpr bool IsAsciiIdentifier(uc32 c) {
  return c == '$' || c == '_' || IsAlphaNumeric(c);
}
inline bool IsAlphaNumeric(uc32 c) {
// True when c is an ASCII decimal digit or an ASCII letter. Letters are
// recognised case-insensitively by folding to lower case before the range test.
inline constexpr bool IsAlphaNumeric(uc32 c) {
  return IsDecimalDigit(c) || IsInRange(AsciiAlphaToLower(c), 'a', 'z');
}
inline bool IsDecimalDigit(uc32 c) {
// Returns true when c lies in the inclusive range ['0', '9'].
inline constexpr bool IsDecimalDigit(uc32 c) {
// ECMA-262, 3rd, 7.8.3 (p 16)
return IsInRange(c, '0', '9');
}
inline bool IsHexDigit(uc32 c) {
// True when c is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
inline constexpr bool IsHexDigit(uc32 c) {
  // ECMA-262, 3rd, 7.6 (p 15)
  // Letters are folded to lower case so one range test covers both cases.
  return IsInRange(AsciiAlphaToLower(c), 'a', 'f') || IsDecimalDigit(c);
}
inline bool IsOctalDigit(uc32 c) {
// Returns true when c lies in the inclusive range ['0', '7'].
inline constexpr bool IsOctalDigit(uc32 c) {
// ECMA-262, 6th, 7.8.3
return IsInRange(c, '0', '7');
}
inline bool IsNonOctalDecimalDigit(uc32 c) { return IsInRange(c, '8', '9'); }
// Returns true when c is '8' or '9', i.e. a decimal digit that is not also an
// octal digit.
inline constexpr bool IsNonOctalDecimalDigit(uc32 c) {
return IsInRange(c, '8', '9');
}
inline bool IsBinaryDigit(uc32 c) {
// True when c is one of the two binary digits, '0' or '1'.
inline constexpr bool IsBinaryDigit(uc32 c) {
  // ECMA-262, 6th, 7.8.3
  return '0' == c || '1' == c;
}
inline bool IsRegExpWord(uc16 c) {
inline constexpr bool IsRegExpWord(uc16 c) {
return IsInRange(AsciiAlphaToLower(c), 'a', 'z')
|| IsDecimalDigit(c)
|| (c == '_');
}
inline bool IsRegExpNewline(uc16 c) {
switch (c) {
// CR LF LS PS
case 0x000A: case 0x000D: case 0x2028: case 0x2029:
return false;
default:
return true;
}
inline constexpr bool IsRegExpNewline(uc16 c) {
// CR LF LS PS
return c != 0x000A && c != 0x000D && c != 0x2028 && c != 0x2029;
}
......
......@@ -14,17 +14,17 @@ namespace internal {
// Unicode character predicates as defined by ECMA-262, 3rd,
// used for lexical analysis.
inline int AsciiAlphaToLower(uc32 c);
inline bool IsCarriageReturn(uc32 c);
inline bool IsLineFeed(uc32 c);
inline bool IsAsciiIdentifier(uc32 c);
inline bool IsAlphaNumeric(uc32 c);
inline bool IsDecimalDigit(uc32 c);
inline bool IsHexDigit(uc32 c);
inline bool IsOctalDigit(uc32 c);
inline bool IsBinaryDigit(uc32 c);
inline bool IsRegExpWord(uc32 c);
inline bool IsRegExpNewline(uc32 c);
// Forward declarations only. Because these functions are constexpr, a
// translation unit needs their definitions in view before it can use any of
// them in a constant expression.
inline constexpr int AsciiAlphaToLower(uc32 c);
inline constexpr bool IsCarriageReturn(uc32 c);
inline constexpr bool IsLineFeed(uc32 c);
inline constexpr bool IsAsciiIdentifier(uc32 c);
inline constexpr bool IsAlphaNumeric(uc32 c);
inline constexpr bool IsDecimalDigit(uc32 c);
inline constexpr bool IsHexDigit(uc32 c);
inline constexpr bool IsOctalDigit(uc32 c);
inline constexpr bool IsBinaryDigit(uc32 c);
inline constexpr bool IsRegExpWord(uc32 c);
inline constexpr bool IsRegExpNewline(uc32 c);
// ES#sec-names-and-keywords
// This includes '_', '$' and '\', and ID_Start according to
......
......@@ -288,56 +288,106 @@ V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
return ScanIdentifierOrKeywordInner(&literal);
}
// Per-character flag bits used by the fast path that scans a keyword or
// identifier token. Every enumerator occupies its own bit of a uint8_t.
enum class ScanFlags : uint8_t {
  kTerminatesLiteral = 0x01,
  // Phrased as "cannot" instead of "can" so that the flags of several
  // characters can be combined with a plain bitwise OR.
  kCannotBeKeyword = 0x02,
  kNeedsSlowPath = 0x04,
};
// Computes the ScanFlags bits (as a raw uint8_t) for the single character c.
// Written as one return expression so it remains usable as a constexpr
// function.
constexpr uint8_t GetScanFlags(char c) {
  return static_cast<uint8_t>(
      // Anything that isn't an identifier character will terminate the
      // literal, or at least terminates the literal fast path processing
      // (like an escape). Such characters never get kCannotBeKeyword, so that
      // flag plays well with kTerminatesLiteral.
      (!IsAsciiIdentifier(c)
           ? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
           // Keywords are all lowercase and only contain letters and '_', so
           // any other identifier character rules a keyword out.
           // TODO(leszeks): We could probably get an even tighter measure
           // here if not all letters are present in keywords.
           : (IsInRange(c, 'a', 'z') || c == '_')
                 ? 0
                 : static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)) |
      // Escapes are processed on the slow path.
      (c == '\\' ? static_cast<uint8_t>(ScanFlags::kNeedsSlowPath) : 0));
}
// True when the kTerminatesLiteral bit is set in scan_flags.
inline bool TerminatesLiteral(uint8_t scan_flags) {
  return (scan_flags &
          static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)) != 0;
}
// True when the kCannotBeKeyword bit is clear in scan_flags.
inline bool CanBeKeyword(uint8_t scan_flags) {
  return (scan_flags &
          static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)) == 0;
}
// True when the kNeedsSlowPath bit is set in scan_flags.
inline bool NeedsSlowPath(uint8_t scan_flags) {
  return (scan_flags &
          static_cast<uint8_t>(ScanFlags::kNeedsSlowPath)) != 0;
}
// Table of precomputed scan flags for the 128 ASCII characters, for branchless
// flag calculation during the scan.
static constexpr const uint8_t character_scan_flags[128] = {
#define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
#undef CALL_GET_SCAN_FLAGS
};
V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner(
LiteralScope* literal) {
DCHECK(unicode_cache_->IsIdentifierStart(c0_));
bool escaped = false;
if (IsInRange(c0_, 'a', 'z') || c0_ == '_') {
do {
AddLiteralChar(static_cast<char>(c0_));
Advance();
} while (IsInRange(c0_, 'a', 'z') || c0_ == '_');
if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '$') {
// Identifier starting with lowercase or _.
do {
AddLiteralChar(static_cast<char>(c0_));
Advance();
} while (IsAsciiIdentifier(c0_));
if (c0_ <= kMaxAscii && c0_ != '\\') {
literal->Complete();
return Token::IDENTIFIER;
}
} else if (c0_ <= kMaxAscii && c0_ != '\\') {
// Only a-z+ or _: could be a keyword or identifier.
Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
Token::Value token =
KeywordOrIdentifierToken(chars.start(), chars.length());
if (token == Token::IDENTIFIER ||
token == Token::FUTURE_STRICT_RESERVED_WORD ||
Token::IsContextualKeyword(token))
literal->Complete();
return token;
}
} else if (IsInRange(c0_, 'A', 'Z') || c0_ == '$') {
do {
STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
if (V8_LIKELY(c0_ != '\\')) {
uint8_t scan_flags = character_scan_flags[c0_];
DCHECK(!TerminatesLiteral(scan_flags));
AddLiteralChar(static_cast<char>(c0_));
Advance();
} while (IsAsciiIdentifier(c0_));
if (c0_ <= kMaxAscii && c0_ != '\\') {
literal->Complete();
return Token::IDENTIFIER;
}
} else if (c0_ == '\\') {
escaped = true;
uc32 c = ScanIdentifierUnicodeEscape();
DCHECK(!unicode_cache_->IsIdentifierStart(-1));
if (c == '\\' || !unicode_cache_->IsIdentifierStart(c)) {
return Token::ILLEGAL;
AdvanceUntil([this, &scan_flags](uc32 c0) {
if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
// A non-ascii character means we need to drop through to the slow
// path.
// TODO(leszeks): This would be most efficient as a goto to the slow
// path, check codegen and maybe use a bool instead.
scan_flags |= static_cast<uint8_t>(ScanFlags::kNeedsSlowPath);
return true;
}
uint8_t char_flags = character_scan_flags[c0];
scan_flags |= char_flags;
if (TerminatesLiteral(char_flags)) {
return true;
} else {
AddLiteralChar(static_cast<char>(c0));
return false;
}
});
if (V8_LIKELY(!NeedsSlowPath(scan_flags))) {
if (CanBeKeyword(scan_flags)) {
// Could be a keyword or identifier.
Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
Token::Value token =
KeywordOrIdentifierToken(chars.start(), chars.length());
if (token == Token::IDENTIFIER ||
token == Token::FUTURE_STRICT_RESERVED_WORD ||
Token::IsContextualKeyword(token))
literal->Complete();
return token;
} else {
literal->Complete();
return Token::IDENTIFIER;
}
}
} else {
// Special case for escapes at the start of an identifier.
escaped = true;
uc32 c = ScanIdentifierUnicodeEscape();
DCHECK(!unicode_cache_->IsIdentifierStart(-1));
if (c == '\\' || !unicode_cache_->IsIdentifierStart(c)) {
return Token::ILLEGAL;
}
AddLiteralChar(c);
}
AddLiteralChar(c);
}
return ScanIdentifierOrKeywordInnerSlow(literal, escaped);
......
......@@ -61,8 +61,10 @@ inline bool CStringEquals(const char* s1, const char* s2) {
// Checks if value is in range [lower_limit, higher_limit] using a single
// branch.
template <typename T, typename U>
inline bool IsInRange(T value, U lower_limit, U higher_limit) {
DCHECK_LE(lower_limit, higher_limit);
inline constexpr bool IsInRange(T value, U lower_limit, U higher_limit) {
#if V8_CAN_HAVE_DCHECK_IN_CONSTEXPR
DCHECK(lower_limit <= higher_limit);
#endif
STATIC_ASSERT(sizeof(U) <= sizeof(T));
typedef typename std::make_unsigned<T>::type unsigned_T;
// Use static_cast to support enum classes.
......@@ -961,6 +963,23 @@ INT_1_TO_63_LIST(DECLARE_TRUNCATE_TO_INT_N)
#undef DECLARE_IS_UINT_N
#undef DECLARE_TRUNCATE_TO_INT_N
// Applies the macro V to each integer from 0 to 127, in ascending order, with
// nothing between the invocations. Laid out eight values per row.
// clang-format off
#define INT_0_TO_127_LIST(V)                                     \
  V(0)   V(1)   V(2)   V(3)   V(4)   V(5)   V(6)   V(7)          \
  V(8)   V(9)   V(10)  V(11)  V(12)  V(13)  V(14)  V(15)         \
  V(16)  V(17)  V(18)  V(19)  V(20)  V(21)  V(22)  V(23)         \
  V(24)  V(25)  V(26)  V(27)  V(28)  V(29)  V(30)  V(31)         \
  V(32)  V(33)  V(34)  V(35)  V(36)  V(37)  V(38)  V(39)         \
  V(40)  V(41)  V(42)  V(43)  V(44)  V(45)  V(46)  V(47)         \
  V(48)  V(49)  V(50)  V(51)  V(52)  V(53)  V(54)  V(55)         \
  V(56)  V(57)  V(58)  V(59)  V(60)  V(61)  V(62)  V(63)         \
  V(64)  V(65)  V(66)  V(67)  V(68)  V(69)  V(70)  V(71)         \
  V(72)  V(73)  V(74)  V(75)  V(76)  V(77)  V(78)  V(79)         \
  V(80)  V(81)  V(82)  V(83)  V(84)  V(85)  V(86)  V(87)         \
  V(88)  V(89)  V(90)  V(91)  V(92)  V(93)  V(94)  V(95)         \
  V(96)  V(97)  V(98)  V(99)  V(100) V(101) V(102) V(103)        \
  V(104) V(105) V(106) V(107) V(108) V(109) V(110) V(111)        \
  V(112) V(113) V(114) V(115) V(116) V(117) V(118) V(119)        \
  V(120) V(121) V(122) V(123) V(124) V(125) V(126) V(127)
// clang-format on
class FeedbackSlot {
public:
FeedbackSlot() : id_(kInvalidSlot) {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment