Commit 2d4aa629 authored by lrn@chromium.org's avatar lrn@chromium.org

Extract scanner base/JS/JSON and move base and JS to scanner-base.

Remove templates from prescanner.

Review URL: http://codereview.chromium.org/5136002

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5854 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 7c5cca29
......@@ -39,16 +39,6 @@
namespace v8 {
namespace internal {
// Maps a hexadecimal digit character to its numeric value (0 .. 15).
// Any character that is not a hexadecimal digit yields -1.
int HexValue(uc32 c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'a' && c <= 'f') {
    return 10 + (c - 'a');
  }
  if (c >= 'A' && c <= 'F') {
    return 10 + (c - 'A');
  }
  return -1;
}
namespace {
// C++-style iterator adaptor for StringInputBuffer
......
......@@ -75,11 +75,6 @@ static inline uint32_t DoubleToUint32(double x) {
}
// Returns the value (0 .. 15) of a hexadecimal character c.
// If c is not a legal hexadecimal character, returns a value < 0.
int HexValue(uc32 c);
// Enumeration for allowing octals and ignoring junk when converting
// strings to numbers.
enum ConversionFlags {
......
......@@ -728,7 +728,7 @@ FunctionLiteral* Parser::ParseProgram(Handle<String> source,
// Initialize parser state.
source->TryFlatten();
scanner_.Initialize(source, JAVASCRIPT);
scanner_.Initialize(source);
ASSERT(target_stack_ == NULL);
if (pre_data_ != NULL) pre_data_->Initialize();
......@@ -791,8 +791,7 @@ FunctionLiteral* Parser::ParseLazy(Handle<SharedFunctionInfo> info) {
// Initialize parser state.
source->TryFlatten();
scanner_.Initialize(source, info->start_position(), info->end_position(),
JAVASCRIPT);
scanner_.Initialize(source, info->start_position(), info->end_position());
ASSERT(target_stack_ == NULL);
mode_ = PARSE_EAGERLY;
......@@ -3613,7 +3612,7 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
Handle<Object> JsonParser::ParseJson(Handle<String> source) {
source->TryFlatten();
scanner_.Initialize(source, JSON);
scanner_.Initialize(source);
Handle<Object> result = ParseJsonValue();
if (result.is_null() || scanner_.Next() != Token::EOS) {
if (scanner_.stack_overflow()) {
......@@ -4641,10 +4640,9 @@ int ScriptDataImpl::ReadNumber(byte** source) {
static ScriptDataImpl* DoPreParse(UTF16Buffer* stream,
bool allow_lazy,
PartialParserRecorder* recorder) {
typedef preparser::Scanner<UTF16Buffer, UTF8Buffer> PreScanner;
PreScanner scanner;
preparser::Scanner scanner;
scanner.Initialize(stream);
preparser::PreParser<PreScanner, PartialParserRecorder> preparser;
preparser::PreParser<preparser::Scanner, PartialParserRecorder> preparser;
if (!preparser.PreParseProgram(&scanner, recorder, allow_lazy)) {
Top::StackOverflow();
return NULL;
......
......@@ -682,7 +682,7 @@ class Parser {
Expression* ParseV8Intrinsic(bool* ok);
INLINE(Token::Value peek()) { return scanner_.peek(); }
INLINE(Token::Value Next()) { return scanner_.Next(); }
INLINE(Token::Value Next()) { return scanner_.NextCheckStack(); }
INLINE(void Consume(Token::Value token));
void Expect(Token::Value token, bool* ok);
bool Check(Token::Value token);
......@@ -760,7 +760,7 @@ class Parser {
ZoneList<Handle<String> > symbol_cache_;
Handle<Script> script_;
Scanner scanner_;
V8JavaScriptScanner scanner_;
Scope* top_scope_;
int with_nesting_level_;
......@@ -852,7 +852,7 @@ class JsonParser BASE_EMBEDDED {
// Converts the currently parsed literal to a JavaScript String.
Handle<String> GetString();
Scanner scanner_;
JsonScanner scanner_;
};
} } // namespace v8::internal
......
......@@ -40,19 +40,6 @@ namespace i = v8::internal;
typedef int uc32;

// Returns the value (0 .. 15) of the hexadecimal digit character c.
// If c is not a legal hexadecimal character, returns -1, so callers can
// reject it with a "< 0" test.
int HexValue(uc32 c) {
  if ('0' <= c && c <= '9') return c - '0';
  // Setting bit 0x20 folds the upper-case ASCII letters onto lower case,
  // so a single range check covers both 'A'-'F' and 'a'-'f'.
  int lower = c | 0x20;
  if ('a' <= lower && lower <= 'f') return lower - 'a' + 10;
  return -1;
}
class PreScannerStackGuard {
public:
explicit PreScannerStackGuard(int max_size)
......@@ -72,7 +59,6 @@ class PreScannerStackGuard {
// Scanner for preparsing.
// InputStream is a source of UC16 characters with limited push-back.
// LiteralsBuffer is a collector of (UTF-8) characters used to capture literals.
template <typename InputStream, typename LiteralsBuffer>
class Scanner {
public:
enum LiteralType {
......@@ -96,7 +82,7 @@ class Scanner {
Scanner();
void Initialize(InputStream* stream);
void Initialize(i::UTF16Buffer* stream);
// Returns the next token.
i::Token::Value Next();
......@@ -151,7 +137,6 @@ class Scanner {
return next_.literal_chars;
}
// Returns the length of the next token (that would be returned if
// Next() were called).
int next_literal_length() const {
......@@ -250,15 +235,15 @@ class Scanner {
bool has_line_terminator_before_next_;
// Source.
InputStream* source_;
i::UTF16Buffer* source_;
// Buffer to hold literal values (identifiers, strings, numerals, regexps and
// regexp flags) using '\x00'-terminated UTF-8 encoding.
// Handles allocation internally.
// Notice that the '\x00' termination is meaningless for strings and regexps
// which may contain the zero-character, but can be used as terminator for
// identifiers, numerals and regexp flags.
LiteralsBuffer literal_buffer_;
// identifiers, numerals and regexp flags.
i::LiteralCollector literal_buffer_;
bool stack_overflow_;
......@@ -270,21 +255,18 @@ class Scanner {
// ----------------------------------------------------------------------------
// Scanner::LiteralScope
template <typename InputStream, typename LiteralsBuffer>
Scanner<InputStream, LiteralsBuffer>::LiteralScope::LiteralScope(
Scanner::LiteralScope::LiteralScope(
Scanner* self, LiteralType type)
: scanner_(self), complete_(false) {
self->StartLiteral(type);
}
template <typename InputStream, typename LiteralsBuffer>
Scanner<InputStream, LiteralsBuffer>::LiteralScope::~LiteralScope() {
Scanner::LiteralScope::~LiteralScope() {
if (!complete_) scanner_->DropLiteral();
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::LiteralScope::Complete() {
void Scanner::LiteralScope::Complete() {
scanner_->TerminateLiteral();
complete_ = true;
}
......@@ -292,16 +274,14 @@ void Scanner<InputStream, LiteralsBuffer>::LiteralScope::Complete() {
// ----------------------------------------------------------------------------
// Scanner.
template <typename InputStream, typename LiteralsBuffer>
Scanner<InputStream, LiteralsBuffer>::Scanner()
Scanner::Scanner()
: stack_guard_(kMaxStackSize),
has_line_terminator_before_next_(false),
source_(NULL),
stack_overflow_(false) {}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::Initialize(InputStream* stream) {
void Scanner::Initialize(i::UTF16Buffer* stream) {
source_ = stream;
// Initialize current_ to not refer to a literal.
......@@ -321,8 +301,7 @@ void Scanner<InputStream, LiteralsBuffer>::Initialize(InputStream* stream) {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::Next() {
i::Token::Value Scanner::Next() {
// BUG 1215673: Find a thread safe way to set a stack limit in
// pre-parse mode. Otherwise, we cannot safely pre-parse from other
// threads.
......@@ -339,8 +318,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::Next() {
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::StartLiteral(LiteralType type) {
void Scanner::StartLiteral(LiteralType type) {
// Only record string and literal identifiers when preparsing.
// Those are the ones that are recorded as symbols. Numbers and
// regexps are not recorded.
......@@ -350,28 +328,24 @@ void Scanner<InputStream, LiteralsBuffer>::StartLiteral(LiteralType type) {
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::AddLiteralChar(uc32 c) {
void Scanner::AddLiteralChar(uc32 c) {
literal_buffer_.AddChar(c);
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::TerminateLiteral() {
void Scanner::TerminateLiteral() {
i::Vector<const char> chars = literal_buffer_.EndLiteral();
next_.literal_chars = chars.start();
next_.literal_length = chars.length();
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::DropLiteral() {
void Scanner::DropLiteral() {
literal_buffer_.DropLiteral();
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::AddLiteralCharAdvance() {
void Scanner::AddLiteralCharAdvance() {
AddLiteralChar(c0_);
Advance();
}
......@@ -389,8 +363,7 @@ static inline bool IsByteOrderMark(uc32 c) {
}
template <typename InputStream, typename LiteralsBuffer>
bool Scanner<InputStream, LiteralsBuffer>::SkipWhiteSpace() {
bool Scanner::SkipWhiteSpace() {
int start_position = source_pos();
while (true) {
......@@ -431,8 +404,7 @@ bool Scanner<InputStream, LiteralsBuffer>::SkipWhiteSpace() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipSingleLineComment() {
i::Token::Value Scanner::SkipSingleLineComment() {
Advance();
// The line terminator at the end of the line is not considered
......@@ -448,8 +420,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipSingleLineComment() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipMultiLineComment() {
i::Token::Value Scanner::SkipMultiLineComment() {
ASSERT(c0_ == '*');
Advance();
......@@ -474,8 +445,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::SkipMultiLineComment() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanHtmlComment() {
i::Token::Value Scanner::ScanHtmlComment() {
// Check for <!-- comments.
ASSERT(c0_ == '!');
Advance();
......@@ -490,8 +460,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanHtmlComment() {
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::Scan() {
void Scanner::Scan() {
next_.literal_length = 0;
i::Token::Value token;
do {
......@@ -731,8 +700,7 @@ void Scanner<InputStream, LiteralsBuffer>::Scan() {
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::SeekForward(int pos) {
void Scanner::SeekForward(int pos) {
source_->SeekForward(pos - 1);
Advance();
// This function is only called to seek to the location
......@@ -743,15 +711,14 @@ void Scanner<InputStream, LiteralsBuffer>::SeekForward(int pos) {
}
template <typename InputStream, typename LiteralsBuffer>
uc32 Scanner<InputStream, LiteralsBuffer>::ScanHexEscape(uc32 c, int length) {
uc32 Scanner::ScanHexEscape(uc32 c, int length) {
ASSERT(length <= 4); // prevent overflow
uc32 digits[4];
uc32 x = 0;
for (int i = 0; i < length; i++) {
digits[i] = c0_;
int d = HexValue(c0_);
int d = i::HexValue(c0_);
if (d < 0) {
// According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
// should be illegal, but other JS VMs just return the
......@@ -774,8 +741,7 @@ uc32 Scanner<InputStream, LiteralsBuffer>::ScanHexEscape(uc32 c, int length) {
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
// ECMA-262. Other JS VMs support them.
template <typename InputStream, typename LiteralsBuffer>
uc32 Scanner<InputStream, LiteralsBuffer>::ScanOctalEscape(
uc32 Scanner::ScanOctalEscape(
uc32 c, int length) {
uc32 x = c - '0';
for (int i = 0; i < length; i++) {
......@@ -790,8 +756,7 @@ uc32 Scanner<InputStream, LiteralsBuffer>::ScanOctalEscape(
}
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::ScanEscape() {
void Scanner::ScanEscape() {
uc32 c = c0_;
Advance();
......@@ -833,8 +798,7 @@ void Scanner<InputStream, LiteralsBuffer>::ScanEscape() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanString() {
i::Token::Value Scanner::ScanString() {
uc32 quote = c0_;
Advance(); // consume quote
......@@ -858,16 +822,14 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanString() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(
i::Token::Value Scanner::Select(
i::Token::Value tok) {
Advance();
return tok;
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(
i::Token::Value Scanner::Select(
uc32 next,
i::Token::Value then,
i::Token::Value else_) {
......@@ -882,15 +844,13 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::Select(
// Scans a (possibly empty) run of decimal digits, adding each one to the
// current literal.
template <typename InputStream, typename LiteralsBuffer>
void Scanner<InputStream, LiteralsBuffer>::ScanDecimalDigits() {
void Scanner::ScanDecimalDigits() {
while (i::IsDecimalDigit(c0_))
AddLiteralCharAdvance();
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanNumber(
i::Token::Value Scanner::ScanNumber(
bool seen_period) {
// c0_ is the first digit of the number or the fraction.
ASSERT(i::IsDecimalDigit(c0_));
......@@ -973,8 +933,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanNumber(
}
template <typename InputStream, typename LiteralsBuffer>
uc32 Scanner<InputStream, LiteralsBuffer>::ScanIdentifierUnicodeEscape() {
uc32 Scanner::ScanIdentifierUnicodeEscape() {
Advance();
if (c0_ != 'u') return unibrow::Utf8::kBadChar;
Advance();
......@@ -986,8 +945,7 @@ uc32 Scanner<InputStream, LiteralsBuffer>::ScanIdentifierUnicodeEscape() {
}
template <typename InputStream, typename LiteralsBuffer>
i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanIdentifier() {
i::Token::Value Scanner::ScanIdentifier() {
ASSERT(i::ScannerConstants::kIsIdentifierStart.get(c0_));
LiteralScope literal(this, kLiteralIdentifier);
......@@ -1030,8 +988,7 @@ i::Token::Value Scanner<InputStream, LiteralsBuffer>::ScanIdentifier() {
}
template <typename InputStream, typename LiteralsBuffer>
bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpPattern(bool seen_equal) {
bool Scanner::ScanRegExpPattern(bool seen_equal) {
// Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
bool in_character_class = false;
......@@ -1070,8 +1027,7 @@ bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpPattern(bool seen_equal) {
return true;
}
template <typename InputStream, typename LiteralsBuffer>
bool Scanner<InputStream, LiteralsBuffer>::ScanRegExpFlags() {
bool Scanner::ScanRegExpFlags() {
// Scan regular expression flags.
LiteralScope literal(this, kLiteralRegExpFlags);
while (i::ScannerConstants::kIsIdentifierPart.get(c0_)) {
......
......@@ -29,10 +29,39 @@
#include "../include/v8stdint.h"
#include "scanner-base.h"
#include "char-predicates-inl.h"
namespace v8 {
namespace internal {
// ----------------------------------------------------------------------------
// UTF16Buffer
// Creates a buffer whose current position is 0 and whose end position is
// the kNoEndPosition placeholder.
UTF16Buffer::UTF16Buffer()
: pos_(0), end_(kNoEndPosition) { }
// ----------------------------------------------------------------------------
// LiteralCollector
// Creates a collector that is not recording; the character buffer is
// constructed with kInitialCapacity.
LiteralCollector::LiteralCollector()
: buffer_(kInitialCapacity), recording_(false) { }
// No explicit cleanup is performed here.
LiteralCollector::~LiteralCollector() {}
// Appends a character that does not fit in a single UTF-8 byte: reserves a
// zero-filled block of the encoded length in the buffer and encodes c into
// it. Debug builds additionally check that the encoder wrote exactly the
// number of bytes that was reserved.
void LiteralCollector::AddCharSlow(uc32 c) {
  ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
  int utf8_length = unibrow::Utf8::Length(c);
  Vector<char> destination = buffer_.AddBlock(utf8_length, '\0');
#ifdef DEBUG
  int encoded_length = unibrow::Utf8::Encode(destination.start(), c);
  CHECK_EQ(utf8_length, encoded_length);
#else
  unibrow::Utf8::Encode(destination.start(), c);
#endif
}
// ----------------------------------------------------------------------------
// Character predicates
......@@ -60,6 +89,690 @@ bool ScannerConstants::IsIdentifier(unibrow::CharacterStream* buffer) {
return true;
}
// ----------------------------------------------------------------------------
// Scanner
// Creates a scanner with no input source attached and the stack-overflow
// flag cleared.
Scanner::Scanner() : source_(NULL), stack_overflow_(false) {}
// Reads `length` hexadecimal digits starting at c0_ and returns the value
// they encode. If a non-hex character is encountered, the digits consumed
// so far are pushed back and `c` (the escape letter) is returned unchanged.
uc32 Scanner::ScanHexEscape(uc32 c, int length) {
  ASSERT(length <= 4);  // the buffer of consumed digits holds four entries
  uc32 consumed[4];
  uc32 value = 0;
  for (int index = 0; index < length; index++) {
    consumed[index] = c0_;
    int digit = HexValue(c0_);
    if (digit < 0) {
      // ECMA-262, 3rd, 7.8.4, page 18 says such escapes should be illegal,
      // but other JS VMs just yield the non-escaped original character.
      // Push back the digits already read, newest first; the offending
      // character is still in c0_ and is not pushed.
      int back = index;
      while (back-- > 0) {
        PushBack(consumed[back]);
      }
      // Notice: no error handling - "\u" is simply treated as "u".
      return c;
    }
    value = value * 16 + digit;
    Advance();
  }

  return value;
}
// Octal escapes ('\0xx' and '\xxx') are not part of ECMA-262, but other
// JS VMs support them. `c` is the first octal digit (already consumed);
// up to `length` further octal digits are consumed as long as the
// accumulated value stays below 256.
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  uc32 value = c - '0';
  int remaining = length;
  while (remaining-- > 0) {
    int digit = c0_ - '0';
    if (digit < 0 || digit > 7) break;
    int candidate = value * 8 + digit;
    if (candidate >= 256) break;
    value = candidate;
    Advance();
  }
  return value;
}
// ----------------------------------------------------------------------------
// JavaScriptScanner
// Creates a scanner with the "line terminator before next token" flag
// cleared.
JavaScriptScanner::JavaScriptScanner()
: has_line_terminator_before_next_(false) {}
// Advances by one token: the previously scanned look-ahead token (next_)
// becomes the current token, the line-terminator flag is cleared, and a new
// look-ahead token is scanned. Returns the token that is now current.
Token::Value JavaScriptScanner::Next() {
current_ = next_;
// Reset before scanning so that Scan() records only line terminators that
// precede the new look-ahead token.
has_line_terminator_before_next_ = false;
Scan();
return current_.token;
}
// Returns true for U+FEFF and for U+FFFE.
// U+FFFE is guaranteed never to be assigned as a Unicode character, so in a
// Unicode context the 0xFF, 0xFE byte pattern can only be U+FEFF expressed
// in little-endian byte order. It is accepted as well for compatibility
// with Spidermonkey.
static inline bool IsByteOrderMark(uc32 c) {
  switch (c) {
    case 0xFEFF:
    case 0xFFFE:
      return true;
    default:
      return false;
  }
}
// Skips white space, byte-order marks and, at the start of a line, the rest
// of any line beginning with an HTML comment end '-->'.
// Sets has_line_terminator_before_next_ when a line terminator is skipped.
// Returns true if the source position moved, i.e. something was skipped.
bool JavaScriptScanner::SkipWhiteSpace() {
int start_position = source_pos();
while (true) {
// We treat byte-order marks (BOMs) as whitespace for better
// compatibility with Spidermonkey and other JavaScript engines.
while (ScannerConstants::kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
// IsWhiteSpace() includes line terminators!
if (ScannerConstants::kIsLineTerminator.get(c0_)) {
// Ignore line terminators, but remember them. This is necessary
// for automatic semicolon insertion.
has_line_terminator_before_next_ = true;
}
Advance();
}
// If there is an HTML comment end '-->' at the beginning of a
// line (with only whitespace in front of it), we treat the rest
// of the line as a comment. This is in line with the way
// SpiderMonkey handles it.
if (c0_ == '-' && has_line_terminator_before_next_) {
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '>') {
// Treat the rest of the line as a comment.
SkipSingleLineComment();
// Continue skipping white space after the comment.
continue;
}
// Not '-->': restore the second '-' that was consumed above.
PushBack('-'); // undo Advance()
}
// Restore the first '-' so the caller sees the input unchanged.
PushBack('-'); // undo Advance()
}
// Return whether or not we skipped any characters.
return source_pos() != start_position;
}
}
// Consumes the remainder of the current line and reports it as whitespace.
// Scanning stops at end of input (c0_ < 0) or in front of a line
// terminator.
Token::Value JavaScriptScanner::SkipSingleLineComment() {
  Advance();

  // The terminating line terminator is deliberately left unconsumed: it is
  // not part of the single-line comment but is recognized separately by the
  // lexical grammar and becomes part of the stream of input elements for
  // the syntactic grammar (see ECMA-262, section 7.4, page 12).
  for (;;) {
    if (c0_ < 0) break;
    if (ScannerConstants::kIsLineTerminator.get(c0_)) break;
    Advance();
  }

  return Token::WHITESPACE;
}
// Skips a multi-line comment. On entry c0_ is the '*' that follows the
// opening '/'. Returns Token::WHITESPACE once the closing "*/" has been
// found, or Token::ILLEGAL if the input ends before the comment is closed.
Token::Value JavaScriptScanner::SkipMultiLineComment() {
  ASSERT(c0_ == '*');
  Advance();

  while (c0_ >= 0) {
    // Keep the full character value. Storing it in a `char` would truncate
    // it, so a character such as U+012A (low byte 0x2A == '*') followed by
    // '/' would be mistaken for the end of the comment.
    uc32 ch = c0_;
    Advance();
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace - even the ones
    // containing line terminators. This contradicts ECMA-262, section
    // 7.4, page 12, that says that multi-line comments containing
    // line terminators should be treated as a line terminator, but it
    // matches the behaviour of SpiderMonkey and KJS.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}
// Called with c0_ == '!' (see the ASSERT below) from the '<' case of Scan().
// If the input continues with "--", i.e. "<!--", the rest of the line is
// skipped as a single-line comment. Otherwise the consumed characters are
// pushed back, c0_ is '!' again, and Token::LT is returned.
Token::Value JavaScriptScanner::ScanHtmlComment() {
// Check for <!-- comments.
ASSERT(c0_ == '!');
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '-') return SkipSingleLineComment();
PushBack('-'); // undo Advance()
}
PushBack('!'); // undo Advance()
ASSERT(c0_ == '!');
return Token::LT;
}
// Scans the next token into next_.
// Clears next_.literal_chars, then dispatches on the current character c0_.
// Whitespace and comments produce Token::WHITESPACE, which makes the loop
// scan again; any other token ends the loop. next_.location receives the
// start and end source positions of the token, and next_.token its kind.
void JavaScriptScanner::Scan() {
next_.literal_chars = Vector<const char>();
Token::Value token;
do {
// Remember the position of the next token
next_.location.beg_pos = source_pos();
switch (c0_) {
case ' ':
case '\t':
Advance();
token = Token::WHITESPACE;
break;
case '\n':
Advance();
// Recorded here because this newline does not go through SkipWhiteSpace().
has_line_terminator_before_next_ = true;
token = Token::WHITESPACE;
break;
case '"': case '\'':
token = ScanString();
break;
case '<':
// < <= << <<= <!--
Advance();
if (c0_ == '=') {
token = Select(Token::LTE);
} else if (c0_ == '<') {
token = Select('=', Token::ASSIGN_SHL, Token::SHL);
} else if (c0_ == '!') {
token = ScanHtmlComment();
} else {
token = Token::LT;
}
break;
case '>':
// > >= >> >>= >>> >>>=
Advance();
if (c0_ == '=') {
token = Select(Token::GTE);
} else if (c0_ == '>') {
// >> >>= >>> >>>=
Advance();
if (c0_ == '=') {
token = Select(Token::ASSIGN_SAR);
} else if (c0_ == '>') {
token = Select('=', Token::ASSIGN_SHR, Token::SHR);
} else {
token = Token::SAR;
}
} else {
token = Token::GT;
}
break;
case '=':
// = == ===
Advance();
if (c0_ == '=') {
token = Select('=', Token::EQ_STRICT, Token::EQ);
} else {
token = Token::ASSIGN;
}
break;
case '!':
// ! != !==
Advance();
if (c0_ == '=') {
token = Select('=', Token::NE_STRICT, Token::NE);
} else {
token = Token::NOT;
}
break;
case '+':
// + ++ +=
Advance();
if (c0_ == '+') {
token = Select(Token::INC);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_ADD);
} else {
token = Token::ADD;
}
break;
case '-':
// - -- --> -=
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '>' && has_line_terminator_before_next_) {
// For compatibility with SpiderMonkey, we skip lines that
// start with an HTML comment end '-->'.
token = SkipSingleLineComment();
} else {
token = Token::DEC;
}
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_SUB);
} else {
token = Token::SUB;
}
break;
case '*':
// * *=
token = Select('=', Token::ASSIGN_MUL, Token::MUL);
break;
case '%':
// % %=
token = Select('=', Token::ASSIGN_MOD, Token::MOD);
break;
case '/':
// / // /* /=
Advance();
if (c0_ == '/') {
token = SkipSingleLineComment();
} else if (c0_ == '*') {
token = SkipMultiLineComment();
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_DIV);
} else {
token = Token::DIV;
}
break;
case '&':
// & && &=
Advance();
if (c0_ == '&') {
token = Select(Token::AND);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_BIT_AND);
} else {
token = Token::BIT_AND;
}
break;
case '|':
// | || |=
Advance();
if (c0_ == '|') {
token = Select(Token::OR);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_BIT_OR);
} else {
token = Token::BIT_OR;
}
break;
case '^':
// ^ ^=
token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
break;
case '.':
// . Number
Advance();
if (IsDecimalDigit(c0_)) {
token = ScanNumber(true);
} else {
token = Token::PERIOD;
}
break;
case ':':
token = Select(Token::COLON);
break;
case ';':
token = Select(Token::SEMICOLON);
break;
case ',':
token = Select(Token::COMMA);
break;
case '(':
token = Select(Token::LPAREN);
break;
case ')':
token = Select(Token::RPAREN);
break;
case '[':
token = Select(Token::LBRACK);
break;
case ']':
token = Select(Token::RBRACK);
break;
case '{':
token = Select(Token::LBRACE);
break;
case '}':
token = Select(Token::RBRACE);
break;
case '?':
token = Select(Token::CONDITIONAL);
break;
case '~':
token = Select(Token::BIT_NOT);
break;
default:
// Everything without a dedicated case: identifiers, numbers starting with
// a digit, remaining whitespace, end of input (c0_ < 0), or an illegal
// character.
if (ScannerConstants::kIsIdentifierStart.get(c0_)) {
token = ScanIdentifier();
} else if (IsDecimalDigit(c0_)) {
token = ScanNumber(false);
} else if (SkipWhiteSpace()) {
token = Token::WHITESPACE;
} else if (c0_ < 0) {
token = Token::EOS;
} else {
token = Select(Token::ILLEGAL);
}
break;
}
// Continue scanning for tokens as long as we're just skipping
// whitespace.
} while (token == Token::WHITESPACE);
next_.location.end_pos = source_pos();
next_.token = token;
}
// Repositions the scanner: asks the source to seek to pos - 1, advances
// once, clears the line-terminator flag and scans a fresh look-ahead token.
void JavaScriptScanner::SeekForward(int pos) {
source_->SeekForward(pos - 1);
Advance();
// This function is only called to seek to the location
// of the end of a function (at the "}" token). It doesn't matter
// whether there was a line terminator in the part we skip.
has_line_terminator_before_next_ = false;
Scan();
}
void JavaScriptScanner::ScanEscape() {
uc32 c = c0_;
Advance();
// Skip escaped newlines.
if (ScannerConstants::kIsLineTerminator.get(c)) {
// Allow CR+LF newlines in multiline string literals.
if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
// Allow LF+CR newlines in multiline string literals.
if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
return;
}
switch (c) {
case '\'': // fall through
case '"' : // fall through
case '\\': break;
case 'b' : c = '\b'; break;
case 'f' : c = '\f'; break;
case 'n' : c = '\n'; break;
case 'r' : c = '\r'; break;
case 't' : c = '\t'; break;
case 'u' : c = ScanHexEscape(c, 4); break;
case 'v' : c = '\v'; break;
case 'x' : c = ScanHexEscape(c, 2); break;
case '0' : // fall through
case '1' : // fall through
case '2' : // fall through
case '3' : // fall through
case '4' : // fall through
case '5' : // fall through
case '6' : // fall through
case '7' : c = ScanOctalEscape(c, 2); break;
}
// According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
// should be illegal, but they are commonly handled
// as non-escaped characters by JS VMs.
AddLiteralChar(c);
}
// Scans a string literal whose opening quote is the current character.
// Returns Token::STRING when the matching closing quote is found, and
// Token::ILLEGAL if a line terminator or the end of input comes first.
Token::Value JavaScriptScanner::ScanString() {
  uc32 quote = c0_;
  Advance();  // consume the opening quote

  LiteralScope literal(this);
  for (;;) {
    if (c0_ == quote || c0_ < 0) break;
    if (ScannerConstants::kIsLineTerminator.get(c0_)) break;

    uc32 current = c0_;
    Advance();
    if (current != '\\') {
      AddLiteralChar(current);
      continue;
    }
    // A backslash at the very end of the input cannot start an escape.
    if (c0_ < 0) return Token::ILLEGAL;
    ScanEscape();
  }
  if (c0_ != quote) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // consume the closing quote
  return Token::STRING;
}
// Consumes a (possibly empty) run of decimal digits, adding each one to the
// current literal.
void JavaScriptScanner::ScanDecimalDigits() {
  for (; IsDecimalDigit(c0_); AddLiteralCharAdvance()) {
  }
}
// Scans a numeric literal starting at c0_, which must be a decimal digit.
// seen_period is true when a leading '.' has already been consumed, in
// which case the digits form the fraction. Returns Token::NUMBER for a
// well-formed literal and Token::ILLEGAL otherwise (hex prefix without
// digits, exponent on an octal literal, exponent without digits, or a
// literal directly followed by a digit or an identifier start).
Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
LiteralScope literal(this);
if (seen_period) {
// we have already seen a decimal point of the float
AddLiteralChar('.');
ScanDecimalDigits(); // we know we have at least one digit
} else {
// if the first character is '0' we must check for octals and hex
if (c0_ == '0') {
AddLiteralCharAdvance();
// either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
if (c0_ == 'x' || c0_ == 'X') {
// hex number
kind = HEX;
AddLiteralCharAdvance();
if (!IsHexDigit(c0_)) {
// we must have at least one hex digit after 'x'/'X'
return Token::ILLEGAL;
}
while (IsHexDigit(c0_)) {
AddLiteralCharAdvance();
}
} else if ('0' <= c0_ && c0_ <= '7') {
// (possible) octal number
kind = OCTAL;
while (true) {
// An '8' or '9' turns the literal back into a decimal number; the
// remaining digits are consumed by the DECIMAL branch below.
if (c0_ == '8' || c0_ == '9') {
kind = DECIMAL;
break;
}
if (c0_ < '0' || '7' < c0_) break;
AddLiteralCharAdvance();
}
}
}
// Parse decimal digits and allow trailing fractional part.
if (kind == DECIMAL) {
ScanDecimalDigits(); // optional
if (c0_ == '.') {
AddLiteralCharAdvance();
ScanDecimalDigits(); // optional
}
}
}
// scan exponent, if any
if (c0_ == 'e' || c0_ == 'E') {
ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
// scan exponent
AddLiteralCharAdvance();
if (c0_ == '+' || c0_ == '-')
AddLiteralCharAdvance();
if (!IsDecimalDigit(c0_)) {
// we must have at least one decimal digit after 'e'/'E'
return Token::ILLEGAL;
}
ScanDecimalDigits();
}
// The source character immediately following a numeric literal must
// not be an identifier start or a decimal digit; see ECMA-262
// section 7.8.3, page 17 (note that we read only one decimal digit
// if the value is 0).
if (IsDecimalDigit(c0_) || ScannerConstants::kIsIdentifierStart.get(c0_))
return Token::ILLEGAL;
literal.Complete();
return Token::NUMBER;
}
// Scans a unicode escape inside an identifier. The character after the
// current one must be 'u'; four hex digits are then read with
// ScanHexEscape. Returns unibrow::Utf8::kBadChar when the 'u' is missing
// or when the scanned result is a backslash.
uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ != 'u') {
    return unibrow::Utf8::kBadChar;
  }
  Advance();
  uc32 decoded = ScanHexEscape('u', 4);
  // A unicode escape sequence is not allowed to start another unicode
  // escape sequence.
  if (decoded == '\\') {
    return unibrow::Utf8::kBadChar;
  }
  return decoded;
}
// Scans an identifier or keyword starting at c0_. Characters are collected
// into the literal and fed to a KeywordMatcher; a unicode escape anywhere in
// the name makes the matcher fail. Returns the matcher's token, or
// Token::ILLEGAL if an escape decodes to a character that is not allowed at
// its position.
Token::Value JavaScriptScanner::ScanIdentifier() {
  ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_));

  LiteralScope literal(this);
  KeywordMatcher matcher;

  // First character of the identifier.
  if (c0_ != '\\') {
    AddLiteralChar(c0_);
    matcher.AddChar(c0_);
    Advance();
  } else {
    uc32 escaped = ScanIdentifierUnicodeEscape();
    // Only legal identifier start characters are accepted here.
    if (!ScannerConstants::kIsIdentifierStart.get(escaped)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(escaped);
    matcher.Fail();
  }

  // Remaining characters of the identifier.
  while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
    if (c0_ != '\\') {
      AddLiteralChar(c0_);
      matcher.AddChar(c0_);
      Advance();
      continue;
    }
    uc32 escaped = ScanIdentifierUnicodeEscape();
    // Only legal identifier part characters are accepted here.
    if (!ScannerConstants::kIsIdentifierPart.get(escaped)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(escaped);
    matcher.Fail();
  }
  literal.Complete();

  return matcher.token();
}
// Scans the body of a regular expression literal:
//   ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
// seen_equal is true when the previous token was '/=', in which case the
// pattern starts at the '='. Returns false if a line terminator or the end
// of input is reached before the closing '/'.
bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
  bool inside_class = false;

  // The previous token is either '/' or '/=', so the literal starts one or
  // two characters before the current position.
  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);

  // According to ECMA-262, 3rd, 7.8.5, the scanner should pass
  // uninterpreted bodies to the RegExp constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  // A '/' only ends the pattern outside of a character class.
  while (inside_class || c0_ != '/') {
    if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
      return false;
    }
    if (c0_ != '\\') {
      // Unescaped character: track character class brackets.
      if (c0_ == '[') {
        inside_class = true;
      } else if (c0_ == ']') {
        inside_class = false;
      }
      AddLiteralCharAdvance();
      continue;
    }
    // Escaped character: copy the backslash and the character after it.
    AddLiteralCharAdvance();
    if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) {
      return false;
    }
    AddLiteralCharAdvance();
  }
  Advance();  // consume '/'

  literal.Complete();

  return true;
}
bool JavaScriptScanner::ScanRegExpFlags() {
// Scan regular expression flags.
LiteralScope literal(this);
while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
// We allow any escaped character, unlike the restriction on
// IdentifierPart when it is used to build an IdentifierName.
AddLiteralChar(c);
continue;
}
}
AddLiteralCharAdvance();
}
literal.Complete();
next_.location.end_pos = source_pos() - 1;
return true;
}
// ----------------------------------------------------------------------------
// Keyword Matcher
......
......@@ -37,11 +37,24 @@
#include "unicode-inl.h"
#include "char-predicates.h"
#include "utils.h"
#include "list-inl.h"
namespace v8 {
namespace internal {
// Interface through which the scanner reads characters from the input source.
// Returns the value (0 .. 15) of a hexadecimal character c.
// If c is not a legal hexadecimal character, returns a value < 0.
inline int HexValue(uc32 c) {
  c -= '0';
  // The unsigned comparison also rejects characters below '0', which wrap
  // around to large values.
  if (static_cast<unsigned>(c) <= 9) return c;
  // After the subtraction 'A'..'F' are 0x11..0x16 and 'a'..'f' are
  // 0x31..0x36. Setting bit 0x20 folds both ranges onto 0x31..0x36, and
  // subtracting 'a' - '0' maps them to 0..5.
  c = (c | 0x20) - ('a' - '0');
  // Only six letters are valid, so the upper bound is 5; a bound of 6 would
  // accept 'g' and 'G' and return 16.
  if (static_cast<unsigned>(c) <= 5) return c + 10;
  return -1;
}
// ----------------------------------------------------------------------------
// UTF16Buffer - scanner input source with pushback.
class UTF16Buffer {
public:
UTF16Buffer();
......@@ -54,7 +67,11 @@ class UTF16Buffer {
int pos() const { return pos_; }
static const int kNoEndPosition = 1;
protected:
// Initial value of end_ before the input stream is initialized.
int pos_; // Current position in the buffer.
int end_; // Position where scanning should stop (EOF).
};
......@@ -79,6 +96,292 @@ class ScannerConstants : AllStatic {
static StaticResource<Utf8Decoder> utf8_decoder_;
};
// ----------------------------------------------------------------------------
// LiteralCollector - Collector of chars of literals.
class LiteralCollector {
 public:
  LiteralCollector();
  ~LiteralCollector();

  // Appends c to the literal currently being recorded. Does nothing when
  // no literal is being recorded.
  inline void AddChar(uc32 c) {
    if (!recording_) return;
    if (static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar) {
      // Characters above the one-byte limit take the out-of-line path.
      AddCharSlow(c);
      return;
    }
    buffer_.Add(static_cast<char>(c));
  }

  // Opens a new sequence in the buffer and switches recording on.
  void StartLiteral() {
    buffer_.StartSequence();
    recording_ = true;
  }

  // Finishes the literal being recorded: appends kEndMarker, closes the
  // sequence and returns its characters (end marker included). Returns an
  // empty vector when no literal was being recorded.
  Vector<const char> EndLiteral() {
    if (!recording_) return Vector<const char>();
    recording_ = false;
    buffer_.Add(kEndMarker);
    Vector<char> chars = buffer_.EndSequence();
    return Vector<const char>(chars.start(), chars.length());
  }

  // Abandons the literal being recorded, if any.
  void DropLiteral() {
    if (!recording_) return;
    recording_ = false;
    buffer_.DropSequence();
  }

  // Resets the underlying buffer.
  void Reset() { buffer_.Reset(); }

  // The end marker added after a parsed literal.
  // Using zero allows the usage of strlen and similar functions on
  // identifiers and numbers (but not strings, since they may contain zero
  // bytes).
  static const char kEndMarker = '\x00';

 private:
  static const int kInitialCapacity = 256;
  SequenceCollector<char, 4> buffer_;
  bool recording_;

  // Out-of-line path of AddChar for characters above kMaxOneByteChar.
  void AddCharSlow(uc32 c);
};
// ----------------------------------------------------------------------------
// Scanner base-class.
// Generic functionality used by both JSON and JavaScript scanners.
class Scanner {
public:
typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
// Scope object tied to a Scanner for the duration of scanning one literal.
// NOTE(review): the member definitions are not visible here; presumably the
// constructor starts a literal and the destructor drops it unless
// Complete() was called -- confirm against the implementation.
class LiteralScope {
public:
explicit LiteralScope(Scanner* self);
~LiteralScope();
void Complete();
private:
Scanner* scanner_;
bool complete_;
};
Scanner();
// Returns the current token again.
Token::Value current_token() { return current_.token; }
// One token look-ahead (past the token returned by Next()).
Token::Value peek() const { return next_.token; }
// Source range of a token, expressed as begin and end positions.
struct Location {
Location(int b, int e) : beg_pos(b), end_pos(e) { }
Location() : beg_pos(0), end_pos(0) { }
int beg_pos;
int end_pos;
};
// Returns the location information for the current token
// (the token returned by Next()).
Location location() const { return current_.location; }
// Returns the location information for the look-ahead token.
Location peek_location() const { return next_.location; }
// Returns the literal string, if any, for the current token (the
// token returned by Next()). The string is 0-terminated and in
// UTF-8 format; it may contain 0-characters. Literal strings are
// collected for identifiers, strings, and numbers.
// These functions only give the correct result if the literal
// was scanned between calls to StartLiteral() and TerminateLiteral().
const char* literal_string() const {
return current_.literal_chars.start();
}
int literal_length() const {
// Excluding terminal '\x00' added by TerminateLiteral().
return current_.literal_chars.length() - 1;
}
// The current token's literal as a vector, without the end marker.
Vector<const char> literal() const {
return Vector<const char>(literal_string(), literal_length());
}
// Returns the literal string for the next token (the token that
// would be returned if Next() were called).
const char* next_literal_string() const {
return next_.literal_chars.start();
}
// Returns the length of the next token (that would be returned if
// Next() were called).
int next_literal_length() const {
// Excluding terminal '\x00' added by TerminateLiteral().
return next_.literal_chars.length() - 1;
}
// The look-ahead token's literal as a vector, without the end marker.
Vector<const char> next_literal() const {
return Vector<const char>(next_literal_string(), next_literal_length());
}
// Returns the value of the stack-overflow flag.
bool stack_overflow() { return stack_overflow_; }
// Number of characters of look-ahead kept in c0_; source_pos() subtracts it.
static const int kCharacterLookaheadBufferSize = 1;
protected:
// The current and look-ahead token.
struct TokenDesc {
Token::Value token;
Location location;
Vector<const char> literal_chars;
};
// Call this after setting source_ to the input.
void Init() {
// Set c0_ (one character ahead)
ASSERT(kCharacterLookaheadBufferSize == 1);
Advance();
// Initialize current_ to not refer to a literal.
current_.literal_chars = Vector<const char>();
// Reset literal buffer.
literal_buffer_.Reset();
}
// Literal buffer support
// Begins recording a new literal in literal_buffer_.
inline void StartLiteral() {
literal_buffer_.StartLiteral();
}
// Adds c to the literal buffer.
inline void AddLiteralChar(uc32 c) {
literal_buffer_.AddChar(c);
}
// Complete scanning of a literal.
inline void TerminateLiteral() {
next_.literal_chars = literal_buffer_.EndLiteral();
}
// Stops scanning of a literal and drop the collected characters,
// e.g., due to an encountered error.
inline void DropLiteral() {
literal_buffer_.DropLiteral();
}
// Adds the look-ahead character to the literal, then advances past it.
inline void AddLiteralCharAdvance() {
AddLiteralChar(c0_);
Advance();
}
// Low-level scanning support.
// Loads the next character from the input into c0_.
void Advance() { c0_ = source_->Advance(); }
// Hands ch back to the input and makes it the look-ahead character again.
void PushBack(uc32 ch) {
source_->PushBack(ch);
c0_ = ch;
}
// Consumes the look-ahead character and returns tok.
inline Token::Value Select(Token::Value tok) {
Advance();
return tok;
}
// Consumes the look-ahead character. If the character after it equals
// next, consumes that one too and returns then; otherwise returns else_.
inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
Advance();
if (c0_ == next) {
Advance();
return then;
} else {
return else_;
}
}
uc32 ScanHexEscape(uc32 c, int length);
uc32 ScanOctalEscape(uc32 c, int length);
// Return the current source position.
int source_pos() {
return source_->pos() - kCharacterLookaheadBufferSize;
}
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
// Input stream. Must be initialized to a UTF16Buffer.
UTF16Buffer* source_;
// Buffer to hold literal values (identifiers, strings, numbers)
// using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
LiteralCollector literal_buffer_;
bool stack_overflow_;
// One Unicode character look-ahead; c0_ < 0 at the end of the input.
uc32 c0_;
};
// ----------------------------------------------------------------------------
// JavaScriptScanner - base logic for JavaScript scanning.
class JavaScriptScanner : public Scanner {
public:
JavaScriptScanner();
// Returns the next token.
Token::Value Next();
// Returns true if there was a line terminator before the peek'ed token.
bool has_line_terminator_before_next() const {
return has_line_terminator_before_next_;
}
// Scans the input as a regular expression pattern, previous
// character(s) must be /(=). Returns true if a pattern is scanned.
bool ScanRegExpPattern(bool seen_equal);
// Returns true if regexp flags are scanned (always since flags can
// be empty).
bool ScanRegExpFlags();
// Tells whether the buffer contains an identifier (no escapes).
// Used for checking if a property name is an identifier.
static bool IsIdentifier(unibrow::CharacterStream* buffer);
// Seek forward to the given position. This operation does not
// work in general, for instance when there are pushed back
// characters, but works for seeking forward until simple delimiter
// tokens, which is what it is used for.
void SeekForward(int pos);
protected:
// NOTE(review): presumably returns whether any characters were skipped;
// the definition is not visible here -- confirm.
bool SkipWhiteSpace();
// Comment skippers; judging by the return type they report the token kind
// the comment should be treated as -- confirm against the definitions.
Token::Value SkipSingleLineComment();
Token::Value SkipMultiLineComment();
// Scans a single JavaScript token.
void Scan();
// Helpers for the individual literal and identifier forms.
void ScanDecimalDigits();
Token::Value ScanNumber(bool seen_period);
Token::Value ScanIdentifier();
void ScanEscape();
Token::Value ScanString();
// Scans a possible HTML comment -- begins with '<!'.
Token::Value ScanHtmlComment();
// Decodes a unicode escape-sequence which is part of an identifier.
// If the escape sequence cannot be decoded the result is kBadChar.
uc32 ScanIdentifierUnicodeEscape();
// Backing field for has_line_terminator_before_next().
bool has_line_terminator_before_next_;
};
// ----------------------------------------------------------------------------
// Keyword matching state machine.
class KeywordMatcher {
// Incrementally recognize keywords.
......
......@@ -35,36 +35,9 @@
namespace v8 {
namespace internal {
// ----------------------------------------------------------------------------
// UTF8Buffer
// Constructs the character buffer with kInitialCapacity and starts with
// recording switched off.
UTF8Buffer::UTF8Buffer()
    : buffer_(kInitialCapacity),
      recording_(false) {
}

// No explicit cleanup is performed here.
UTF8Buffer::~UTF8Buffer() {
}
// Appends the UTF-8 encoding of c to the buffer. Only meant for characters
// above unibrow::Utf8::kMaxOneByteChar (asserted).
void UTF8Buffer::AddCharSlow(uc32 c) {
  ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
  const int encoded_size = unibrow::Utf8::Length(c);
  // Reserve encoded_size bytes, pre-filled with '\0', then encode in place.
  Vector<char> reserved = buffer_.AddBlock(encoded_size, '\0');
  char* destination = reserved.start();
#ifdef DEBUG
  // Debug builds verify the encoder wrote exactly the reserved byte count.
  int bytes_written = unibrow::Utf8::Encode(destination, c);
  CHECK_EQ(encoded_size, bytes_written);
#else
  unibrow::Utf8::Encode(destination, c);
#endif
}
// ----------------------------------------------------------------------------
// UTF16Buffer
// Starts at position 0 with end_ set to the kNoEndPosition sentinel, i.e.
// the value end_ holds before the input stream is initialized. The constant
// is the one declared in UTF16Buffer itself, so it is named unqualified.
UTF16Buffer::UTF16Buffer()
    : pos_(0), end_(kNoEndPosition) { }
// CharacterStreamUTF16Buffer
// Starts with a pushback buffer constructed from 0, last_ set to 0 and no
// stream attached.
CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
    : pushback_buffer_(0),
      last_(0),
      stream_(NULL) {
}
......@@ -78,7 +51,7 @@ void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
if (start_position > 0) {
SeekForward(start_position);
}
end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt;
end_ = end_position != kNoEndPosition ? end_position : kMaxInt;
}
......@@ -90,7 +63,7 @@ void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
uc32 CharacterStreamUTF16Buffer::Advance() {
ASSERT(end_ != Scanner::kNoEndPosition);
ASSERT(end_ != kNoEndPosition);
ASSERT(end_ >= 0);
// NOTE: It is of importance to Persian / Farsi resources that we do
// *not* strip format control characters in the scanner; see
......@@ -143,41 +116,66 @@ void Scanner::LiteralScope::Complete() {
}
// ----------------------------------------------------------------------------
// Scanner
// V8JavaScriptScanner
// Creates a scanner with no input attached and the stack-overflow flag
// cleared. Only members declared by Scanner itself are initialized here;
// the line-terminator flag belongs to JavaScriptScanner and there is no
// JSON-mode flag in this class.
Scanner::Scanner()
    : source_(NULL),
      stack_overflow_(false) {}
// Prepares the scanner to scan all of source, from position 0 up to
// source->length(), and preloads the first token.
void V8JavaScriptScanner::Initialize(Handle<String> source) {
  // The ranged overload performs the same steps: it initializes the input
  // without a character stream, calls Init(), skips initial whitespace
  // (allowing HTML comment ends just like after a newline) and scans the
  // first token.
  Initialize(source, 0, source->length());
}
void Scanner::Initialize(Handle<String> source,
ParserLanguage language) {
Init(source, NULL, 0, source->length(), language);
// Prepares the scanner to read from source or stream, starting at position
// 0 with no end position, and preloads the first token.
void V8JavaScriptScanner::Initialize(Handle<String> source,
                                     unibrow::CharacterStream* stream) {
  const int first_position = 0;
  const int last_position = UTF16Buffer::kNoEndPosition;
  source_ = stream_initializer_.Init(source, stream,
                                     first_position, last_position);
  Init();
  // Behave as if a newline preceded the input so that an HTML comment end
  // at the very start is skipped like one at the start of a line.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  // Scan the first token.
  Scan();
}
void Scanner::Initialize(Handle<String> source,
unibrow::CharacterStream* stream,
ParserLanguage language) {
Init(source, stream, 0, kNoEndPosition, language);
// Prepares the scanner to scan the part of source between start_position
// and end_position, and preloads the first token.
void V8JavaScriptScanner::Initialize(Handle<String> source,
                                     int start_position,
                                     int end_position) {
  unibrow::CharacterStream* const no_stream = NULL;
  source_ = stream_initializer_.Init(source, no_stream,
                                     start_position, end_position);
  Init();
  // Behave as if a newline preceded the input so that an HTML comment end
  // at the very start is skipped like one at the start of a line.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  // Scan the first token.
  Scan();
}
void Scanner::Initialize(Handle<String> source,
int start_position,
int end_position,
ParserLanguage language) {
Init(source, NULL, start_position, end_position, language);
// Like Next(), but checks the stack limit first. On overflow the flag is
// recorded, the look-ahead token becomes the current token, and the new
// look-ahead token is set to ILLEGAL.
Token::Value V8JavaScriptScanner::NextCheckStack() {
  // BUG 1215673: Find a thread safe way to set a stack limit in
  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
  // threads.
  StackLimitCheck stack_guard;
  if (!stack_guard.HasOverflowed()) return Next();

  stack_overflow_ = true;
  current_ = next_;
  next_.token = Token::ILLEGAL;
  return current_.token;
}
void Scanner::Init(Handle<String> source,
unibrow::CharacterStream* stream,
int start_position,
int end_position,
ParserLanguage language) {
UTF16Buffer* StreamInitializer::Init(Handle<String> source,
unibrow::CharacterStream* stream,
int start_position,
int end_position) {
// Either initialize the scanner from a character stream or from a
// string.
ASSERT(source.is_null() || stream == NULL);
......@@ -188,13 +186,13 @@ void Scanner::Init(Handle<String> source,
Handle<ExternalTwoByteString>::cast(source),
start_position,
end_position);
source_ = &two_byte_string_buffer_;
return &two_byte_string_buffer_;
} else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
ascii_string_buffer_.Initialize(
Handle<ExternalAsciiString>::cast(source),
start_position,
end_position);
source_ = &ascii_string_buffer_;
return &ascii_string_buffer_;
} else {
if (!source.is_null()) {
safe_string_input_buffer_.Reset(source.location());
......@@ -204,28 +202,27 @@ void Scanner::Init(Handle<String> source,
stream,
start_position,
end_position);
source_ = &char_stream_buffer_;
return &char_stream_buffer_;
}
}
is_parsing_json_ = (language == JSON);
// ----------------------------------------------------------------------------
// JsonScanner
// Set c0_ (one character ahead)
ASSERT(kCharacterLookaheadBufferSize == 1);
Advance();
// Initialize current_ to not refer to a literal.
current_.literal_chars = Vector<const char>();
// Reset literal buffer.
literal_buffer_.Reset();
// Default constructor; does no work of its own.
JsonScanner::JsonScanner() {}
// Skip initial whitespace allowing HTML comment ends just like
// after a newline and scan first token.
has_line_terminator_before_next_ = true;
SkipWhiteSpace();
Scan();
// Prepares the scanner to scan all of source as JSON and preloads the
// first token as look-ahead.
void JsonScanner::Initialize(Handle<String> source) {
  unibrow::CharacterStream* const no_stream = NULL;
  const int first_position = 0;
  const int last_position = source->length();
  source_ = stream_initializer_.Init(source, no_stream,
                                     first_position, last_position);
  Init();
  // Skip initial whitespace.
  SkipJsonWhiteSpace();
  // Preload first token as look-ahead.
  ScanJson();
}
Token::Value Scanner::Next() {
Token::Value JsonScanner::Next() {
// BUG 1215673: Find a thread safe way to set a stack limit in
// pre-parse mode. Otherwise, we cannot safely pre-parse from other
// threads.
......@@ -236,52 +233,13 @@ Token::Value Scanner::Next() {
stack_overflow_ = true;
next_.token = Token::ILLEGAL;
} else {
has_line_terminator_before_next_ = false;
Scan();
ScanJson();
}
return current_.token;
}
// Begins recording a new literal in the literal buffer.
void Scanner::StartLiteral() {
literal_buffer_.StartLiteral();
}
// Forwards c to the literal buffer.
void Scanner::AddLiteralChar(uc32 c) {
literal_buffer_.AddChar(c);
}
// Ends the literal being recorded and stores its characters as the literal
// of the look-ahead token.
void Scanner::TerminateLiteral() {
next_.literal_chars = literal_buffer_.EndLiteral();
}
// Tells the literal buffer to drop the literal being recorded.
void Scanner::DropLiteral() {
literal_buffer_.DropLiteral();
}
// Adds the look-ahead character to the literal, then advances past it.
void Scanner::AddLiteralCharAdvance() {
AddLiteralChar(c0_);
Advance();
}
// Returns true when c is U+FEFF or U+FFFE.
static inline bool IsByteOrderMark(uc32 c) {
  // U+FFFE is guaranteed never to be assigned as a Unicode character, so in
  // a Unicode context the byte pattern 0xFF, 0xFE can only be U+FEFF in
  // little-endian byte order (it cannot be U+FFFE in big-endian order).
  // It is accepted anyway, for compatibility with Spidermonkey.
  switch (c) {
    case 0xFEFF:
    case 0xFFFE:
      return true;
    default:
      return false;
  }
}
bool Scanner::SkipJsonWhiteSpace() {
bool JsonScanner::SkipJsonWhiteSpace() {
int start_position = source_pos();
// JSON WhiteSpace is tab, carriage-return, newline and space.
while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
......@@ -291,107 +249,9 @@ bool Scanner::SkipJsonWhiteSpace() {
}
// Skips whitespace, byte-order marks and '-->' comment lines that appear at
// the start of a line. Records in has_line_terminator_before_next_ whether a
// line terminator was passed. Returns true if the source position moved.
bool Scanner::SkipJavaScriptWhiteSpace() {
int start_position = source_pos();
while (true) {
// We treat byte-order marks (BOMs) as whitespace for better
// compatibility with Spidermonkey and other JavaScript engines.
while (ScannerConstants::kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
// IsWhiteSpace() includes line terminators!
if (ScannerConstants::kIsLineTerminator.get(c0_)) {
// Ignore line terminators, but remember them. This is necessary
// for automatic semicolon insertion.
has_line_terminator_before_next_ = true;
}
Advance();
}
// If there is an HTML comment end '-->' at the beginning of a
// line (with only whitespace in front of it), we treat the rest
// of the line as a comment. This is in line with the way
// SpiderMonkey handles it.
if (c0_ == '-' && has_line_terminator_before_next_) {
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '>') {
// Treat the rest of the line as a comment.
SkipSingleLineComment();
// Continue skipping white space after the comment.
continue;
}
// Not '-->': hand back the second '-'.
PushBack('-'); // undo Advance()
}
// Hand back the first '-' so the input looks untouched to the caller.
PushBack('-'); // undo Advance()
}
// Return whether or not we skipped any characters.
return source_pos() != start_position;
}
}
// Skips the rest of the current line and reports it as whitespace.
Token::Value Scanner::SkipSingleLineComment() {
  Advance();

  // The line terminator at the end of the line is not considered
  // to be part of the single-line comment; it is recognized
  // separately by the lexical grammar and becomes part of the
  // stream of input elements for the syntactic grammar (see
  // ECMA-262, section 7.4, page 12).
  for (;;) {
    if (c0_ < 0) break;  // end of input
    if (ScannerConstants::kIsLineTerminator.get(c0_)) break;
    Advance();
  }

  return Token::WHITESPACE;
}
// Skips a multi-line comment. On entry the look-ahead character must be the
// '*' that follows the opening '/'. Returns Token::WHITESPACE once the
// closing "*/" has been found, or Token::ILLEGAL if the input ends first.
Token::Value Scanner::SkipMultiLineComment() {
  ASSERT(c0_ == '*');
  Advance();

  while (c0_ >= 0) {
    // Keep the full character value. Narrowing to char would make any
    // character whose low byte is 0x2A (for example U+012A) compare equal
    // to '*' and end the comment too early.
    uc32 ch = c0_;
    Advance();
    // If we have reached the end of the multi-line comment, we
    // consume the '/' and insert a whitespace. This way all
    // multi-line comments are treated as whitespace - even the ones
    // containing line terminators. This contradicts ECMA-262, section
    // 7.4, page 12, that says that multi-line comments containing
    // line terminators should be treated as a line terminator, but it
    // matches the behaviour of SpiderMonkey and KJS.
    if (ch == '*' && c0_ == '/') {
      c0_ = ' ';
      return Token::WHITESPACE;
    }
  }

  // Unterminated multi-line comment.
  return Token::ILLEGAL;
}
// Called with the look-ahead at the '!' following a '<'. If the input
// continues with "--" the rest of the line is skipped as a comment;
// otherwise the consumed characters are pushed back and Token::LT is
// returned with the look-ahead at '!' again.
Token::Value Scanner::ScanHtmlComment() {
  ASSERT(c0_ == '!');
  Advance();

  if (c0_ != '-') {
    PushBack('!');  // undo Advance()
    ASSERT(c0_ == '!');
    return Token::LT;
  }

  Advance();
  if (c0_ == '-') return SkipSingleLineComment();

  // Only a single '-' followed the '!': restore both characters.
  PushBack('-');  // undo Advance()
  PushBack('!');  // undo Advance()
  ASSERT(c0_ == '!');
  return Token::LT;
}
void Scanner::ScanJson() {
void JsonScanner::ScanJson() {
next_.literal_chars = Vector<const char>();
Token::Value token;
has_line_terminator_before_next_ = false;
do {
// Remember the position of the next token
next_.location.beg_pos = source_pos();
......@@ -468,7 +328,7 @@ void Scanner::ScanJson() {
}
Token::Value Scanner::ScanJsonString() {
Token::Value JsonScanner::ScanJsonString() {
ASSERT_EQ('"', c0_);
Advance();
LiteralScope literal(this);
......@@ -528,7 +388,7 @@ Token::Value Scanner::ScanJsonString() {
}
Token::Value Scanner::ScanJsonNumber() {
Token::Value JsonScanner::ScanJsonNumber() {
LiteralScope literal(this);
if (c0_ == '-') AddLiteralCharAdvance();
if (c0_ == '0') {
......@@ -562,8 +422,8 @@ Token::Value Scanner::ScanJsonNumber() {
}
Token::Value Scanner::ScanJsonIdentifier(const char* text,
Token::Value token) {
Token::Value JsonScanner::ScanJsonIdentifier(const char* text,
Token::Value token) {
LiteralScope literal(this);
while (*text != '\0') {
if (c0_ != *text) return Token::ILLEGAL;
......@@ -576,577 +436,5 @@ Token::Value Scanner::ScanJsonIdentifier(const char* text,
}
// Scans one JavaScript token into next_ (token kind and source location).
// Whitespace and comments are skipped: the loop repeats for as long as the
// scanned item is Token::WHITESPACE. The literal of next_ is cleared first;
// the literal-scanning helpers called from here are expected to fill it in.
void Scanner::ScanJavaScript() {
next_.literal_chars = Vector<const char>();
Token::Value token;
do {
// Remember the position of the next token
next_.location.beg_pos = source_pos();
switch (c0_) {
case ' ':
case '\t':
Advance();
token = Token::WHITESPACE;
break;
case '\n':
Advance();
has_line_terminator_before_next_ = true;
token = Token::WHITESPACE;
break;
case '"': case '\'':
token = ScanString();
break;
case '<':
// < <= << <<= <!--
Advance();
if (c0_ == '=') {
token = Select(Token::LTE);
} else if (c0_ == '<') {
token = Select('=', Token::ASSIGN_SHL, Token::SHL);
} else if (c0_ == '!') {
token = ScanHtmlComment();
} else {
token = Token::LT;
}
break;
case '>':
// > >= >> >>= >>> >>>=
Advance();
if (c0_ == '=') {
token = Select(Token::GTE);
} else if (c0_ == '>') {
// >> >>= >>> >>>=
Advance();
if (c0_ == '=') {
token = Select(Token::ASSIGN_SAR);
} else if (c0_ == '>') {
token = Select('=', Token::ASSIGN_SHR, Token::SHR);
} else {
token = Token::SAR;
}
} else {
token = Token::GT;
}
break;
case '=':
// = == ===
Advance();
if (c0_ == '=') {
token = Select('=', Token::EQ_STRICT, Token::EQ);
} else {
token = Token::ASSIGN;
}
break;
case '!':
// ! != !==
Advance();
if (c0_ == '=') {
token = Select('=', Token::NE_STRICT, Token::NE);
} else {
token = Token::NOT;
}
break;
case '+':
// + ++ +=
Advance();
if (c0_ == '+') {
token = Select(Token::INC);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_ADD);
} else {
token = Token::ADD;
}
break;
case '-':
// - -- --> -=
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '>' && has_line_terminator_before_next_) {
// For compatibility with SpiderMonkey, we skip lines that
// start with an HTML comment end '-->'.
token = SkipSingleLineComment();
} else {
token = Token::DEC;
}
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_SUB);
} else {
token = Token::SUB;
}
break;
case '*':
// * *=
token = Select('=', Token::ASSIGN_MUL, Token::MUL);
break;
case '%':
// % %=
token = Select('=', Token::ASSIGN_MOD, Token::MOD);
break;
case '/':
// / // /* /=
Advance();
if (c0_ == '/') {
token = SkipSingleLineComment();
} else if (c0_ == '*') {
token = SkipMultiLineComment();
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_DIV);
} else {
token = Token::DIV;
}
break;
case '&':
// & && &=
Advance();
if (c0_ == '&') {
token = Select(Token::AND);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_BIT_AND);
} else {
token = Token::BIT_AND;
}
break;
case '|':
// | || |=
Advance();
if (c0_ == '|') {
token = Select(Token::OR);
} else if (c0_ == '=') {
token = Select(Token::ASSIGN_BIT_OR);
} else {
token = Token::BIT_OR;
}
break;
case '^':
// ^ ^=
token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
break;
case '.':
// . Number
Advance();
if (IsDecimalDigit(c0_)) {
token = ScanNumber(true);
} else {
token = Token::PERIOD;
}
break;
case ':':
token = Select(Token::COLON);
break;
case ';':
token = Select(Token::SEMICOLON);
break;
case ',':
token = Select(Token::COMMA);
break;
case '(':
token = Select(Token::LPAREN);
break;
case ')':
token = Select(Token::RPAREN);
break;
case '[':
token = Select(Token::LBRACK);
break;
case ']':
token = Select(Token::RBRACK);
break;
case '{':
token = Select(Token::LBRACE);
break;
case '}':
token = Select(Token::RBRACE);
break;
case '?':
token = Select(Token::CONDITIONAL);
break;
case '~':
token = Select(Token::BIT_NOT);
break;
default:
// Everything without a dedicated case: identifiers, numbers, other
// whitespace, end of input (c0_ < 0), and finally illegal characters.
if (ScannerConstants::kIsIdentifierStart.get(c0_)) {
token = ScanIdentifier();
} else if (IsDecimalDigit(c0_)) {
token = ScanNumber(false);
} else if (SkipWhiteSpace()) {
token = Token::WHITESPACE;
} else if (c0_ < 0) {
token = Token::EOS;
} else {
token = Select(Token::ILLEGAL);
}
break;
}
// Continue scanning for tokens as long as we're just skipping
// whitespace.
} while (token == Token::WHITESPACE);
next_.location.end_pos = source_pos();
next_.token = token;
}
// Moves the input forward so that the look-ahead character is the one read
// after seeking the source to pos - 1, then scans the token found there.
void Scanner::SeekForward(int pos) {
  const int lookahead_position = pos - 1;
  source_->SeekForward(lookahead_position);
  Advance();
  // This function is only called to seek to the location
  // of the end of a function (at the "}" token). It doesn't matter
  // whether there was a line terminator in the part we skip.
  has_line_terminator_before_next_ = false;
  Scan();
}
// Reads exactly `length` hexadecimal digits (at most 4) and returns their
// value. If a non-hex character is met first, the digits consumed so far
// are pushed back and the escape character c is returned unchanged.
uc32 Scanner::ScanHexEscape(uc32 c, int length) {
  ASSERT(length <= 4);  // prevent overflow

  uc32 consumed[4];
  uc32 value = 0;
  int count = 0;
  while (count < length) {
    consumed[count] = c0_;
    const int nibble = HexValue(c0_);
    if (nibble < 0) {
      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
      // should be illegal, but other JS VMs just return the
      // non-escaped version of the original character.

      // Push back digits read, except the last one (in c0_), most recently
      // read digit first.
      while (--count >= 0) {
        PushBack(consumed[count]);
      }
      // Notice: No handling of error - treat it as "\u"->"u".
      return c;
    }
    value = value * 16 + nibble;
    Advance();
    ++count;
  }

  return value;
}
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
// ECMA-262. Other JS VMs support them.
//
// c is the first octal digit (already consumed). Up to `length` further
// octal digits are read, stopping early at a non-octal character or when
// the value would reach 256. Returns the accumulated value.
uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  uc32 value = c - '0';
  for (int remaining = length; remaining > 0; --remaining) {
    const int digit = c0_ - '0';
    const bool is_octal_digit = (0 <= digit && digit <= 7);
    if (!is_octal_digit) break;
    const int candidate = value * 8 + digit;
    if (candidate > 255) break;
    value = candidate;
    Advance();
  }
  return value;
}
void Scanner::ScanEscape() {
uc32 c = c0_;
Advance();
// Skip escaped newlines.
if (ScannerConstants::kIsLineTerminator.get(c)) {
// Allow CR+LF newlines in multiline string literals.
if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
// Allow LF+CR newlines in multiline string literals.
if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
return;
}
switch (c) {
case '\'': // fall through
case '"' : // fall through
case '\\': break;
case 'b' : c = '\b'; break;
case 'f' : c = '\f'; break;
case 'n' : c = '\n'; break;
case 'r' : c = '\r'; break;
case 't' : c = '\t'; break;
case 'u' : c = ScanHexEscape(c, 4); break;
case 'v' : c = '\v'; break;
case 'x' : c = ScanHexEscape(c, 2); break;
case '0' : // fall through
case '1' : // fall through
case '2' : // fall through
case '3' : // fall through
case '4' : // fall through
case '5' : // fall through
case '6' : // fall through
case '7' : c = ScanOctalEscape(c, 2); break;
}
// According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
// should be illegal, but they are commonly handled
// as non-escaped characters by JS VMs.
AddLiteralChar(c);
}
// Scans a string literal delimited by the quote character in the
// look-ahead. Returns Token::STRING on success and Token::ILLEGAL if the
// string is cut short by a line terminator or the end of the input.
Token::Value Scanner::ScanString() {
  const uc32 delimiter = c0_;
  Advance();  // consume quote

  LiteralScope literal(this);
  for (;;) {
    if (c0_ == delimiter) break;
    if (c0_ < 0) break;
    if (ScannerConstants::kIsLineTerminator.get(c0_)) break;

    const uc32 ch = c0_;
    Advance();
    if (ch != '\\') {
      AddLiteralChar(ch);
      continue;
    }
    // A backslash as the very last character cannot start an escape.
    if (c0_ < 0) return Token::ILLEGAL;
    ScanEscape();
  }
  if (c0_ != delimiter) return Token::ILLEGAL;
  literal.Complete();

  Advance();  // consume quote
  return Token::STRING;
}
// Consumes the look-ahead character and returns tok.
Token::Value Scanner::Select(Token::Value tok) {
Advance();
return tok;
}
// Consumes the look-ahead character. If the character that follows equals
// next, it is consumed too and `then` is returned; otherwise `else_` is
// returned and that character stays in the look-ahead.
Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
  Advance();
  if (c0_ != next) return else_;
  Advance();
  return then;
}
// Adds consecutive decimal digits from the input to the current literal,
// stopping at the first non-digit. May consume nothing; there is no return
// value, so callers that require a digit must check the look-ahead first.
void Scanner::ScanDecimalDigits() {
while (IsDecimalDigit(c0_))
AddLiteralCharAdvance();
}
// Scans a numeric literal (decimal, hexadecimal or octal, with optional
// fraction and exponent for decimals) into the current literal.
// seen_period is true when the caller has already consumed a leading '.';
// in that case the look-ahead is the first fraction digit.
// Returns Token::NUMBER on success and Token::ILLEGAL for malformed input.
Token::Value Scanner::ScanNumber(bool seen_period) {
ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
LiteralScope literal(this);
if (seen_period) {
// we have already seen a decimal point of the float
AddLiteralChar('.');
ScanDecimalDigits(); // we know we have at least one digit
} else {
// if the first character is '0' we must check for octals and hex
if (c0_ == '0') {
AddLiteralCharAdvance();
// either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
if (c0_ == 'x' || c0_ == 'X') {
// hex number
kind = HEX;
AddLiteralCharAdvance();
if (!IsHexDigit(c0_)) {
// we must have at least one hex digit after 'x'/'X'
return Token::ILLEGAL;
}
while (IsHexDigit(c0_)) {
AddLiteralCharAdvance();
}
} else if ('0' <= c0_ && c0_ <= '7') {
// (possible) octal number
kind = OCTAL;
while (true) {
// An 8 or 9 means this is a decimal number with a leading zero after all.
if (c0_ == '8' || c0_ == '9') {
kind = DECIMAL;
break;
}
if (c0_ < '0' || '7' < c0_) break;
AddLiteralCharAdvance();
}
}
}
// Parse decimal digits and allow trailing fractional part.
if (kind == DECIMAL) {
ScanDecimalDigits(); // optional
if (c0_ == '.') {
AddLiteralCharAdvance();
ScanDecimalDigits(); // optional
}
}
}
// scan exponent, if any
if (c0_ == 'e' || c0_ == 'E') {
ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
// scan exponent
AddLiteralCharAdvance();
if (c0_ == '+' || c0_ == '-')
AddLiteralCharAdvance();
if (!IsDecimalDigit(c0_)) {
// we must have at least one decimal digit after 'e'/'E'
return Token::ILLEGAL;
}
ScanDecimalDigits();
}
// The source character immediately following a numeric literal must
// not be an identifier start or a decimal digit; see ECMA-262
// section 7.8.3, page 17 (note that we read only one decimal digit
// if the value is 0).
if (IsDecimalDigit(c0_) || ScannerConstants::kIsIdentifierStart.get(c0_))
return Token::ILLEGAL;
literal.Complete();
return Token::NUMBER;
}
// Decodes a unicode escape inside an identifier. The first character is
// consumed unconditionally and a 'u' must follow, then ScanHexEscape reads
// four hex digits. Returns the decoded character, or
// unibrow::Utf8::kBadChar when the 'u' is missing or the result is a
// backslash.
uc32 Scanner::ScanIdentifierUnicodeEscape() {
  Advance();
  if (c0_ == 'u') {
    Advance();
    const uc32 decoded = ScanHexEscape('u', 4);
    // We do not allow a unicode escape sequence to start another
    // unicode escape sequence.
    if (decoded != '\\') return decoded;
  }
  return unibrow::Utf8::kBadChar;
}
// Scans an identifier or keyword into the current literal and returns the
// token reported by the keyword matcher. An identifier that contains a
// unicode escape is never matched as a keyword. Returns Token::ILLEGAL if
// an escape decodes to a character that is not allowed at its position.
Token::Value Scanner::ScanIdentifier() {
  ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_));

  LiteralScope literal(this);
  KeywordMatcher matcher;

  // Scan identifier start character.
  if (c0_ != '\\') {
    AddLiteralChar(c0_);
    matcher.AddChar(c0_);
    Advance();
  } else {
    const uc32 first = ScanIdentifierUnicodeEscape();
    // Only allow legal identifier start characters.
    if (!ScannerConstants::kIsIdentifierStart.get(first)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(first);
    matcher.Fail();
  }

  // Scan the rest of the identifier characters.
  while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
    if (c0_ != '\\') {
      AddLiteralChar(c0_);
      matcher.AddChar(c0_);
      Advance();
      continue;
    }
    const uc32 part = ScanIdentifierUnicodeEscape();
    // Only allow legal identifier part characters.
    if (!ScannerConstants::kIsIdentifierPart.get(part)) {
      return Token::ILLEGAL;
    }
    AddLiteralChar(part);
    matcher.Fail();
  }
  literal.Complete();

  return matcher.token();
}
// Scans the body of a regular expression literal, up to and including the
// closing '/', into the current literal. seen_equal tells whether the
// previous token was '/=' rather than '/'; in that case the '=' is part of
// the pattern. Returns false if a line terminator or the end of the input
// is reached before the closing '/'.
bool Scanner::ScanRegExpPattern(bool seen_equal) {
  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
  bool inside_class = false;

  // Previous token is either '/' or '/=', in the second case, the
  // pattern starts at =.
  const int begin_offset = seen_equal ? 2 : 1;
  const int end_offset = seen_equal ? 1 : 0;
  next_.location.beg_pos = source_pos() - begin_offset;
  next_.location.end_pos = source_pos() - end_offset;

  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
  // the scanner should pass uninterpreted bodies to the RegExp
  // constructor.
  LiteralScope literal(this);
  if (seen_equal) {
    AddLiteralChar('=');
  }

  // A '/' only ends the pattern outside of a character class.
  while (c0_ != '/' || inside_class) {
    if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
    if (c0_ == '\\') {
      // Escaped character: record the backslash, then fall through to
      // record the character it escapes.
      AddLiteralCharAdvance();
      if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
    } else if (c0_ == '[') {
      inside_class = true;
    } else if (c0_ == ']') {
      inside_class = false;
    }
    AddLiteralCharAdvance();
  }
  Advance();  // consume '/'

  literal.Complete();

  return true;
}
bool Scanner::ScanRegExpFlags() {
// Scan regular expression flags.
LiteralScope literal(this);
while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
// We allow any escaped character, unlike the restriction on
// IdentifierPart when it is used to build an IdentifierName.
AddLiteralChar(c);
continue;
}
}
AddLiteralCharAdvance();
}
literal.Complete();
next_.location.end_pos = source_pos() - 1;
return true;
}
} } // namespace v8::internal
......@@ -35,65 +35,6 @@
namespace v8 {
namespace internal {
// Collects the characters of a literal into a char buffer.
class UTF8Buffer {
 public:
  UTF8Buffer();
  ~UTF8Buffer();

  // Appends c to the literal currently being recorded. Does nothing when
  // no literal is being recorded.
  inline void AddChar(uc32 c) {
    if (!recording_) return;
    if (static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar) {
      // Characters above the one-byte limit take the out-of-line path.
      AddCharSlow(c);
      return;
    }
    buffer_.Add(static_cast<char>(c));
  }

  // Opens a new sequence in the buffer and switches recording on.
  void StartLiteral() {
    buffer_.StartSequence();
    recording_ = true;
  }

  // Finishes the literal being recorded: appends kEndMarker, closes the
  // sequence and returns its characters (end marker included). Returns an
  // empty vector when no literal was being recorded.
  Vector<const char> EndLiteral() {
    if (!recording_) return Vector<const char>();
    recording_ = false;
    buffer_.Add(kEndMarker);
    Vector<char> chars = buffer_.EndSequence();
    return Vector<const char>(chars.start(), chars.length());
  }

  // Abandons the literal being recorded, if any.
  void DropLiteral() {
    if (!recording_) return;
    recording_ = false;
    buffer_.DropSequence();
  }

  // Resets the underlying buffer.
  void Reset() { buffer_.Reset(); }

  // The end marker added after a parsed literal.
  // Using zero allows the usage of strlen and similar functions on
  // identifiers and numbers (but not strings, since they may contain zero
  // bytes).
  // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
  // an utf-8 string. This requires changes in all places that uses
  // str-functions on the literals, but allows a single pointer to represent
  // the literal, even if it contains embedded zeros.
  static const char kEndMarker = '\x00';

 private:
  static const int kInitialCapacity = 256;
  SequenceCollector<char, 4> buffer_;
  bool recording_;

  // Out-of-line path of AddChar for characters above kMaxOneByteChar.
  void AddCharSlow(uc32 c);
};
// UTF16 buffer to read characters from a character stream.
class CharacterStreamUTF16Buffer: public UTF16Buffer {
public:
......@@ -134,175 +75,63 @@ class ExternalStringUTF16Buffer: public UTF16Buffer {
};
enum ParserLanguage { JAVASCRIPT, JSON };
class Scanner {
// Initializes a UTF16Buffer as input stream, using one of a number
// of strategies depending on the available character sources.
class StreamInitializer {
public:
typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
UTF16Buffer* Init(Handle<String> source,
unibrow::CharacterStream* stream,
int start_position,
int end_position);
private:
// Different UTF16 buffers used to pull characters from. Based on input one of
// these will be initialized as the actual data source.
CharacterStreamUTF16Buffer char_stream_buffer_;
ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
two_byte_string_buffer_;
ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
class LiteralScope {
public:
explicit LiteralScope(Scanner* self);
~LiteralScope();
void Complete();
// Used to convert the source string into a character stream when a stream
// is not passed to the scanner.
SafeStringInputBuffer safe_string_input_buffer_;
};
// ----------------------------------------------------------------------------
// V8JavaScriptScanner
// JavaScript scanner getting its input from either a V8 String or a unicode
// CharacterStream.
private:
Scanner* scanner_;
bool complete_;
};
class V8JavaScriptScanner : public JavaScriptScanner {
public:
V8JavaScriptScanner() {}
Scanner();
Token::Value NextCheckStack();
// Initialize the Scanner to scan source.
void Initialize(Handle<String> source);
void Initialize(Handle<String> source,
ParserLanguage language);
void Initialize(Handle<String> source,
unibrow::CharacterStream* stream,
ParserLanguage language);
unibrow::CharacterStream* stream);
void Initialize(Handle<String> source,
int start_position, int end_position,
ParserLanguage language);
int start_position, int end_position);
// Returns the next token.
Token::Value Next();
// Re-reads the token stored in the current-token descriptor.
Token::Value current_token() {
  return current_.token;
}
// Look-ahead by one token: reports the token that comes after the one
// returned by Next().
Token::Value peek() const {
  return next_.token;
}
// Tells whether a line terminator preceded the token reported by peek().
bool has_line_terminator_before_next() const {
  const bool terminator_seen = has_line_terminator_before_next_;
  return terminator_seen;
}
// Begin and end source positions of a token.
struct Location {
  // Default location: both positions are 0.
  Location()
      : beg_pos(0),
        end_pos(0) { }

  // Location running from position b to position e.
  Location(int b, int e)
      : beg_pos(b),
        end_pos(e) { }

  int beg_pos;
  int end_pos;
};
// Source location of the current token, i.e. the token returned by Next().
Location location() const {
  return current_.location;
}
// Source location of the look-ahead token, i.e. the token reported by peek().
Location peek_location() const {
  return next_.location;
}
// Returns the literal string, if any, for the current token (the
// token returned by Next()). The string is 0-terminated and in
// UTF-8 format; they may contain 0-characters. Literal strings are
// collected for identifiers, strings, and numbers.
// These functions only give the correct result if the literal
// was scanned between calls to StartLiteral() and TerminateLiteral().
const char* literal_string() const {
return current_.literal_chars.start();
}
// Length of the current token's literal.
int literal_length() const {
  // The stored characters end with the '\x00' added by TerminateLiteral();
  // that terminator is not counted.
  const int stored_length = current_.literal_chars.length();
  return stored_length - 1;
}
// The current token's literal as a vector built from literal_string() and
// literal_length().
Vector<const char> literal() const {
  const char* chars = literal_string();
  const int length = literal_length();
  return Vector<const char>(chars, length);
}
// Returns the literal string for the next token (the token that
// would be returned if Next() were called).
const char* next_literal_string() const {
return next_.literal_chars.start();
}
// Length of the literal of the next token, i.e. the token that Next() would
// return if it were called.
int next_literal_length() const {
  // The stored characters end with the '\x00' added by TerminateLiteral();
  // that terminator is not counted.
  const int stored_length = next_.literal_chars.length();
  return stored_length - 1;
}
// The next token's literal as a vector built from next_literal_string() and
// next_literal_length().
Vector<const char> next_literal() const {
  const char* chars = next_literal_string();
  const int length = next_literal_length();
  return Vector<const char>(chars, length);
}
// Scans the input as a regular expression pattern, previous
// character(s) must be /(=). Returns true if a pattern is scanned.
bool ScanRegExpPattern(bool seen_equal);
// Returns true if regexp flags are scanned (always since flags can
// be empty).
bool ScanRegExpFlags();
// Seek forward to the given position. This operation does not
// work in general, for instance when there are pushed back
// characters, but works for seeking forward until simple delimiter
// tokens, which is what it is used for.
void SeekForward(int pos);
// Reports the scanner's stack-overflow flag.
bool stack_overflow() {
  return stack_overflow_;
}
protected:
StreamInitializer stream_initializer_;
};
// Tells whether the buffer contains an identifier (no escapes).
// Used for checking if a property name is an identifier.
static bool IsIdentifier(unibrow::CharacterStream* buffer);
static const int kCharacterLookaheadBufferSize = 1;
static const int kNoEndPosition = 1;
class JsonScanner : public Scanner {
public:
JsonScanner();
private:
// Descriptor of a scanned token. The scanner keeps one descriptor for the
// current token and one for the look-ahead token.
struct TokenDesc {
// The token value.
Token::Value token;
// Source location of the token.
Location location;
// Literal characters collected for the token, if any. The literal length
// accessors subtract one for the terminal '\x00' added by TerminateLiteral().
Vector<const char> literal_chars;
};
void Init(Handle<String> source,
unibrow::CharacterStream* stream,
int start_position, int end_position,
ParserLanguage language);
// Literal buffer support
inline void StartLiteral();
inline void AddLiteralChar(uc32 ch);
inline void AddLiteralCharAdvance();
inline void TerminateLiteral();
// Stops scanning of a literal, e.g., due to an encountered error.
inline void DropLiteral();
// Low-level scanning support.
// Fetches the next character from the source into the look-ahead character.
void Advance() {
  c0_ = source_->Advance();
}
// Un-reads a character: ch is handed back to the source and also becomes
// the look-ahead character again.
void PushBack(uc32 ch) {
  source_->PushBack(ch);
  c0_ = ch;
}
// Initialize the Scanner to scan source.
void Initialize(Handle<String> source);
// Skips whitespace using the skipper that matches the language being
// scanned (JSON or JavaScript) and forwards that skipper's result.
bool SkipWhiteSpace() {
  return is_parsing_json_ ? SkipJsonWhiteSpace()
                          : SkipJavaScriptWhiteSpace();
}
// Returns the next token.
Token::Value Next();
bool SkipJavaScriptWhiteSpace();
protected:
// Skip past JSON whitespace (only space, tab, newline and carriage-return).
bool SkipJsonWhiteSpace();
Token::Value SkipSingleLineComment();
Token::Value SkipMultiLineComment();
inline Token::Value Select(Token::Value tok);
inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
// Scans a single token with the scanner routine for the active language.
inline void Scan() {
  if (!is_parsing_json_) {
    ScanJavaScript();
    return;
  }
  ScanJson();
}
// Scans a single JavaScript token.
void ScanJavaScript();
// Scan a single JSON token. The JSON lexical grammar is specified in the
// ECMAScript 5 standard, section 15.12.1.1.
......@@ -331,53 +160,7 @@ class Scanner {
// JSONNullLiteral).
Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
void ScanDecimalDigits();
Token::Value ScanNumber(bool seen_period);
Token::Value ScanIdentifier();
uc32 ScanHexEscape(uc32 c, int length);
uc32 ScanOctalEscape(uc32 c, int length);
void ScanEscape();
Token::Value ScanString();
// Scans a possible HTML comment -- begins with '<!'.
Token::Value ScanHtmlComment();
// Current source position: the position reported by the source, less the
// size of the character look-ahead buffer.
int source_pos() {
  const int raw_pos = source_->pos();
  return raw_pos - kCharacterLookaheadBufferSize;
}
// Decodes a unicode escape-sequence which is part of an identifier.
// If the escape sequence cannot be decoded the result is kBadChar.
uc32 ScanIdentifierUnicodeEscape();
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
bool has_line_terminator_before_next_;
bool is_parsing_json_;
// Different UTF16 buffers used to pull characters from. Based on input one of
// these will be initialized as the actual data source.
CharacterStreamUTF16Buffer char_stream_buffer_;
ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
two_byte_string_buffer_;
ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
// Source. Will point to one of the buffers declared above.
UTF16Buffer* source_;
// Used to convert the source string into a character stream when a stream
// is not passed to the scanner.
SafeStringInputBuffer safe_string_input_buffer_;
// Buffer to hold literal values (identifiers, strings, numbers)
// using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
UTF8Buffer literal_buffer_;
bool stack_overflow_;
// One Unicode character look-ahead; c0_ < 0 at the end of the input.
uc32 c0_;
StreamInitializer stream_initializer_;
};
......@@ -400,7 +183,7 @@ void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
SeekForward(start_position);
}
end_ =
end_position != Scanner::kNoEndPosition ? end_position : data->length();
end_position != kNoEndPosition ? end_position : data->length();
}
......
......@@ -36,7 +36,6 @@
#include "parser.h"
#include "utils.h"
#include "execution.h"
#include "scanner.h"
#include "preparser.h"
#include "cctest.h"
......@@ -262,9 +261,10 @@ TEST(StandAlonePreParser) {
const char* program = programs[i];
unibrow::Utf8InputBuffer<256> stream(program, strlen(program));
i::CompleteParserRecorder log;
i::Scanner scanner;
scanner.Initialize(i::Handle<i::String>::null(), &stream, i::JAVASCRIPT);
v8::preparser::PreParser<i::Scanner, i::CompleteParserRecorder> preparser;
i::V8JavaScriptScanner scanner;
scanner.Initialize(i::Handle<i::String>::null(), &stream);
v8::preparser::PreParser<i::V8JavaScriptScanner,
i::CompleteParserRecorder> preparser;
bool result = preparser.PreParseProgram(&scanner, &log, true);
CHECK(result);
i::ScriptDataImpl data(log.ExtractData());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment