Commit df91f6b3 authored by vitalyr@chromium.org's avatar vitalyr@chromium.org

Simpler (and a bit faster) keyword matcher.

Replaced the keyword matching state machine with a switch on the first char followed up by inlined char comparisons.

R=lrn@chromium.org
TEST=cctest/test-parsing/ScanKeywords

Review URL: http://codereview.chromium.org/7558017

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@8866 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 7adb10a4
This diff is collapsed.
......@@ -542,155 +542,6 @@ class JavaScriptScanner : public Scanner {
bool has_multiline_comment_before_next_;
};
// ----------------------------------------------------------------------------
// Keyword matching state machine.
class KeywordMatcher {
// Incrementally recognize keywords.
//
// We distinguish between normal future reserved words and words that are
// considered to be future reserved words only in strict mode as required by
// ECMA-262 7.6.1.2.
//
// Recognized as keywords:
// break, case, catch, const*, continue, debugger, default, delete, do,
// else, finally, false, for, function, if, in, instanceof, new, null,
// return, switch, this, throw, true, try, typeof, var, void, while, with.
//
// Recognized as Future Reserved Keywords:
// class, enum, export, extends, import, super.
//
// Recognized as Future Reserved Keywords (strict mode only):
// implements, interface, let, package, private, protected, public,
// static, yield.
//
// *: Actually a "future reserved keyword". It's the only one we are
// recognizing outside of ES5 strict mode, the remaining are allowed
// as identifiers.
//
public:
KeywordMatcher()
: state_(INITIAL),
token_(Token::IDENTIFIER),
keyword_(NULL),
counter_(0),
keyword_token_(Token::ILLEGAL) {}
Token::Value token() { return token_; }
inline bool AddChar(unibrow::uchar input) {
if (state_ != UNMATCHABLE) {
Step(input);
}
return state_ != UNMATCHABLE;
}
void Fail() {
token_ = Token::IDENTIFIER;
state_ = UNMATCHABLE;
}
private:
enum State {
UNMATCHABLE,
INITIAL,
KEYWORD_PREFIX,
KEYWORD_MATCHED,
C,
CA,
CO,
CON,
D,
DE,
E,
EX,
F,
I,
IM,
IMP,
IN,
N,
P,
PR,
S,
T,
TH,
TR,
V,
W,
LAST_STATE = W
};
STATIC_ASSERT(LAST_STATE <= 0xFF);
STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
struct FirstState {
const char* keyword;
State state : 8;
Token::Value token : 8;
};
// Range of possible first characters of a keyword.
static const unsigned int kFirstCharRangeMin = 'b';
static const unsigned int kFirstCharRangeMax = 'y';
static const unsigned int kFirstCharRangeLength =
kFirstCharRangeMax - kFirstCharRangeMin + 1;
// State map for first keyword character range.
static const FirstState first_states_[kFirstCharRangeLength];
// If input equals keyword's character at position, continue matching keyword
// from that position.
inline bool MatchKeywordStart(unibrow::uchar input,
const char* keyword,
int position,
Token::Value token_if_match) {
if (input != static_cast<unibrow::uchar>(keyword[position])) {
return false;
}
state_ = KEYWORD_PREFIX;
this->keyword_ = keyword;
this->counter_ = position + 1;
this->keyword_token_ = token_if_match;
return true;
}
// If input equals match character, transition to new state and return true.
inline bool MatchState(unibrow::uchar input, char match, State new_state) {
if (input != static_cast<unibrow::uchar>(match)) {
return false;
}
state_ = new_state;
return true;
}
inline bool MatchKeyword(unibrow::uchar input,
char match,
State new_state,
Token::Value keyword_token) {
if (input != static_cast<unibrow::uchar>(match)) {
return false;
}
state_ = new_state;
token_ = keyword_token;
return true;
}
void Step(unibrow::uchar input);
// Current state.
State state_;
// Token for currently added characters.
Token::Value token_;
// Matching a specific keyword string (there is only one possible valid
// keyword with the current prefix).
const char* keyword_;
int counter_;
Token::Value keyword_token_;
};
} } // namespace v8::internal
#endif // V8_SCANNER_BASE_H_
......@@ -33,21 +33,21 @@ namespace internal {
#define T(name, string, precedence) #name,
const char* const Token::name_[NUM_TOKENS] = {
TOKEN_LIST(T, T, IGNORE_TOKEN)
TOKEN_LIST(T, T)
};
#undef T
#define T(name, string, precedence) string,
const char* const Token::string_[NUM_TOKENS] = {
TOKEN_LIST(T, T, IGNORE_TOKEN)
TOKEN_LIST(T, T)
};
#undef T
#define T(name, string, precedence) precedence,
const int8_t Token::precedence_[NUM_TOKENS] = {
TOKEN_LIST(T, T, IGNORE_TOKEN)
TOKEN_LIST(T, T)
};
#undef T
......@@ -55,7 +55,7 @@ const int8_t Token::precedence_[NUM_TOKENS] = {
#define KT(a, b, c) 'T',
#define KK(a, b, c) 'K',
const char Token::token_type[] = {
TOKEN_LIST(KT, KK, IGNORE_TOKEN)
TOKEN_LIST(KT, KK)
};
#undef KT
#undef KK
......
......@@ -41,7 +41,6 @@ namespace internal {
//
// T: Non-keyword tokens
// K: Keyword tokens
// F: Future (reserved) keyword tokens
// IGNORE_TOKEN is a convenience macro that can be supplied as
// an argument (at any position) for a TOKEN_LIST call. It does
......@@ -49,7 +48,7 @@ namespace internal {
#define IGNORE_TOKEN(name, string, precedence)
#define TOKEN_LIST(T, K, F) \
#define TOKEN_LIST(T, K) \
/* End of source indicator. */ \
T(EOS, "EOS", 0) \
\
......@@ -182,7 +181,7 @@ class Token {
// All token values.
#define T(name, string, precedence) name,
enum Value {
TOKEN_LIST(T, T, IGNORE_TOKEN)
TOKEN_LIST(T, T)
NUM_TOKENS
};
#undef T
......
......@@ -42,7 +42,7 @@
namespace i = ::v8::internal;
TEST(KeywordMatcher) {
TEST(ScanKeywords) {
struct KeywordToken {
const char* keyword;
i::Token::Value token;
......@@ -50,86 +50,56 @@ TEST(KeywordMatcher) {
static const KeywordToken keywords[] = {
#define KEYWORD(t, s, d) { s, i::Token::t },
#define IGNORE(t, s, d) /* */
TOKEN_LIST(IGNORE, KEYWORD, IGNORE)
TOKEN_LIST(IGNORE_TOKEN, KEYWORD)
#undef KEYWORD
{ NULL, i::Token::IDENTIFIER }
};
static const char* future_keywords[] = {
#define FUTURE(t, s, d) s,
TOKEN_LIST(IGNORE, IGNORE, FUTURE)
#undef FUTURE
#undef IGNORE
NULL
};
KeywordToken key_token;
i::UnicodeCache unicode_cache;
i::byte buffer[32];
for (int i = 0; (key_token = keywords[i]).keyword != NULL; i++) {
i::KeywordMatcher matcher;
const char* keyword = key_token.keyword;
int length = i::StrLength(keyword);
for (int j = 0; j < length; j++) {
if (key_token.token == i::Token::INSTANCEOF && j == 2) {
// "in" is a prefix of "instanceof". It's the only keyword
// that is a prefix of another.
CHECK_EQ(i::Token::IN, matcher.token());
} else {
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
matcher.AddChar(keyword[j]);
const i::byte* keyword =
reinterpret_cast<const i::byte*>(key_token.keyword);
int length = i::StrLength(key_token.keyword);
CHECK(static_cast<int>(sizeof(buffer)) >= length);
{
i::Utf8ToUC16CharacterStream stream(keyword, length);
i::JavaScriptScanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(key_token.token, scanner.Next());
CHECK_EQ(i::Token::EOS, scanner.Next());
}
CHECK_EQ(key_token.token, matcher.token());
// Adding more characters will make keyword matching fail.
matcher.AddChar('z');
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
// Adding a keyword later will not make it match again.
matcher.AddChar('i');
matcher.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
// Future keywords are not recognized.
const char* future_keyword;
for (int i = 0; (future_keyword = future_keywords[i]) != NULL; i++) {
i::KeywordMatcher matcher;
int length = i::StrLength(future_keyword);
for (int j = 0; j < length; j++) {
matcher.AddChar(future_keyword[j]);
// Removing characters will make keyword matching fail.
{
i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
i::JavaScriptScanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
CHECK_EQ(i::Token::EOS, scanner.Next());
}
// Adding characters will make keyword matching fail.
static const char chars_to_append[] = { 'z', '0', '_' };
for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
memmove(buffer, keyword, length);
buffer[length] = chars_to_append[j];
i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
i::JavaScriptScanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
CHECK_EQ(i::Token::EOS, scanner.Next());
}
// Replacing characters will make keyword matching fail.
{
memmove(buffer, keyword, length);
buffer[length - 1] = '_';
i::Utf8ToUC16CharacterStream stream(buffer, length);
i::JavaScriptScanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
CHECK_EQ(i::Token::EOS, scanner.Next());
}
CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
// Zero isn't ignored at first.
i::KeywordMatcher bad_start;
bad_start.AddChar(0);
CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
bad_start.AddChar('i');
bad_start.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
// Zero isn't ignored at end.
i::KeywordMatcher bad_end;
bad_end.AddChar('i');
bad_end.AddChar('f');
CHECK_EQ(i::Token::IF, bad_end.token());
bad_end.AddChar(0);
CHECK_EQ(i::Token::IDENTIFIER, bad_end.token());
// Case isn't ignored.
i::KeywordMatcher bad_case;
bad_case.AddChar('i');
bad_case.AddChar('F');
CHECK_EQ(i::Token::IDENTIFIER, bad_case.token());
// If we mark it as failure, continuing won't help.
i::KeywordMatcher full_stop;
full_stop.AddChar('i');
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
full_stop.Fail();
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
full_stop.AddChar('f');
CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment