Refactor the scanner interface

No need to create substrings for lazy compiles. The scanner will start from the start position provided. Moved the creation of character streams into the scanner where possible. This uses an input buffer in the scanner class instead of a stack-allocated one. Added a UTF16 buffer for reading external ASCII strings (by templating the external two-byte string one), as all the sources for the builtins are exposed as external ASCII strings. Review URL: http://codereview.chromium.org/661367 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

Refactor the scanner interface
No need to create substrings for lazy compiles. The scanner will start from the start position provided. Moved the creation of character streams into the scanner where possible. This uses an input buffer in the scanner class instead of a stack-allocated one. Added a UTF16 buffer for reading external ASCII strings (by templating the external two-byte string one), as all the sources for the builtins are exposed as external ASCII strings. Review URL: http://codereview.chromium.org/661367 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
fca7b2ce · sgjesse@chromium.org · 30a8fc3e · fca7b2ce · fca7b2ce · fca7b2ce
Commit fca7b2ce authored Mar 03, 2010 by sgjesse@chromium.org
Hide whitespace changes
Inline Side-by-side

Showing with 135 additions and 75 deletions

objects.cc src/objects.cc +1 -1

parser.cc src/parser.cc +16 -20

scanner.cc src/scanner.cc +73 -32

scanner.h src/scanner.h +45 -22

No files found.
--- a/src/objects.cc
+++ b/src/objects.cc
@@ -5248,7 +5248,7 @@ Object* JSObject::SetSlowElements(Object* len) {
    case DICTIONARY_ELEMENTS: {
      if (IsJSArray()) {
        uint32_t old_length =
-        static_cast<uint32_t>(JSArray::cast(this)->length()->Number());
+            static_cast<uint32_t>(JSArray::cast(this)->length()->Number());
        element_dictionary()->RemoveNumberEntries(new_length, old_length),
        JSArray::cast(this)->set_length(len);
      }

--- a/src/parser.cc
+++ b/src/parser.cc
@@ -107,13 +107,13 @@ class Parser {
  // Returns NULL if parsing failed.
  FunctionLiteral* ParseProgram(Handle<String> source,
-                                unibrow::CharacterStream* stream,
                                bool in_global_context);
  FunctionLiteral* ParseLazy(Handle<String> source,
                             Handle<String> name,
-                             int start_position, bool is_expression);
+                             int start_position,
-  FunctionLiteral* ParseJson(Handle<String> source,
+                             int end_position,
-                             unibrow::CharacterStream* stream);
+                             bool is_expression);
+  FunctionLiteral* ParseJson(Handle<String> source);
  // The minimum number of contiguous assignment that will
  // be treated as an initialization block. Benchmarks show that
@@ -1212,7 +1212,7 @@ bool Parser::PreParseProgram(Handle<String> source,
  AssertNoZoneAllocation assert_no_zone_allocation;
  AssertNoAllocation assert_no_allocation;
  NoHandleAllocation no_handle_allocation;
-  scanner_.Init(source, stream, 0, JAVASCRIPT);
+  scanner_.Initialize(source, stream, JAVASCRIPT);
  ASSERT(target_stack_ == NULL);
  mode_ = PARSE_EAGERLY;
  DummyScope top_scope;
@@ -1226,7 +1226,6 @@ bool Parser::PreParseProgram(Handle<String> source,
 FunctionLiteral* Parser::ParseProgram(Handle<String> source,
-                                      unibrow::CharacterStream* stream,
                                      bool in_global_context) {
  CompilationZoneScope zone_scope(DONT_DELETE_ON_EXIT);
@@ -1235,7 +1234,7 @@ FunctionLiteral* Parser::ParseProgram(Handle<String> source,
  // Initialize parser state.
  source->TryFlatten();
-  scanner_.Init(source, stream, 0, JAVASCRIPT);
+  scanner_.Initialize(source, JAVASCRIPT);
  ASSERT(target_stack_ == NULL);
  // Compute the parsing mode.
@@ -1286,15 +1285,15 @@ FunctionLiteral* Parser::ParseProgram(Handle<String> source,
 FunctionLiteral* Parser::ParseLazy(Handle<String> source,
                                   Handle<String> name,
                                   int start_position,
+                                   int end_position,
                                   bool is_expression) {
  CompilationZoneScope zone_scope(DONT_DELETE_ON_EXIT);
  HistogramTimerScope timer(&Counters::parse_lazy);
-  source->TryFlatten();
  Counters::total_parse_size.Increment(source->length());
-  SafeStringInputBuffer buffer(source.location());
  // Initialize parser state.
-  scanner_.Init(source, &buffer, start_position, JAVASCRIPT);
+  source->TryFlatten();
+  scanner_.Initialize(source, start_position, end_position, JAVASCRIPT);
  ASSERT(target_stack_ == NULL);
  mode_ = PARSE_EAGERLY;
@@ -1330,8 +1329,7 @@ FunctionLiteral* Parser::ParseLazy(Handle<String> source,
  return result;
 }
-FunctionLiteral* Parser::ParseJson(Handle<String> source,
+FunctionLiteral* Parser::ParseJson(Handle<String> source) {
-                                   unibrow::CharacterStream* stream) {
  CompilationZoneScope zone_scope(DONT_DELETE_ON_EXIT);
  HistogramTimerScope timer(&Counters::parse);
@@ -1339,7 +1337,7 @@ FunctionLiteral* Parser::ParseJson(Handle<String> source,
  // Initialize parser state.
  source->TryFlatten(TENURED);
-  scanner_.Init(source, stream, 0, JSON);
+  scanner_.Initialize(source, JSON);
  ASSERT(target_stack_ == NULL);
  FunctionLiteral* result = NULL;
@@ -5065,13 +5063,12 @@ FunctionLiteral* MakeAST(bool compile_in_global_context,
    return NULL;
  }
  Handle<String> source = Handle<String>(String::cast(script->source()));
-  SafeStringInputBuffer input(source.location());
  FunctionLiteral* result;
  if (is_json) {
    ASSERT(compile_in_global_context);
-    result = parser.ParseJson(source, &input);
+    result = parser.ParseJson(source);
  } else {
-    result = parser.ParseProgram(source, &input, compile_in_global_context);
+    result = parser.ParseProgram(source, compile_in_global_context);
  }
  return result;
 }
@@ -5086,12 +5083,11 @@ FunctionLiteral* MakeLazyAST(Handle<Script> script,
  always_allow_natives_syntax = true;
  AstBuildingParser parser(script, true, NULL, NULL);  // always allow
  always_allow_natives_syntax = allow_natives_syntax_before;
-  // Parse the function by pulling the function source from the script source.
+  // Parse the function by pointing to the function source in the script source.
  Handle<String> script_source(String::cast(script->source()));
-  Handle<String> function_source =
-      SubString(script_source, start_position, end_position, TENURED);
  FunctionLiteral* result =
-      parser.ParseLazy(function_source, name, start_position, is_expression);
+      parser.ParseLazy(script_source, name,
+                       start_position, end_position, is_expression);
  return result;
 }

--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -28,6 +28,7 @@
 #include "v8.h"
 #include "ast.h"
+#include "handles.h"
 #include "scanner.h"
 namespace v8 {
@@ -86,12 +87,7 @@ void UTF8Buffer::AddCharSlow(uc32 c) {
 UTF16Buffer::UTF16Buffer()
-    : pos_(0), size_(0) { }
+    : pos_(0), end_(Scanner::kNoEndPosition) { }
-Handle<String> UTF16Buffer::SubString(int start, int end) {
-  return internal::SubString(data_, start, end);
-}
 // CharacterStreamUTF16Buffer
@@ -100,10 +96,14 @@ CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
-                                            unibrow::CharacterStream* input) {
+                                            unibrow::CharacterStream* input,
-  data_ = data;
+                                            int start_position,
-  pos_ = 0;
+                                            int end_position) {
  stream_ = input;
+  if (start_position > 0) {
+    SeekForward(start_position);
+  }
+  end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt;
 }
@@ -115,6 +115,8 @@ void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
 uc32 CharacterStreamUTF16Buffer::Advance() {
+  ASSERT(end_ != Scanner::kNoEndPosition);
+  ASSERT(end_ >= 0);
  // NOTE: It is of importance to Persian / Farsi resources that we do
  // *not* strip format control characters in the scanner; see
  //
@@ -126,7 +128,7 @@ uc32 CharacterStreamUTF16Buffer::Advance() {
  if (!pushback_buffer()->is_empty()) {
    pos_++;
    return last_ = pushback_buffer()->RemoveLast();
-  } else if (stream_->has_more()) {
+  } else if (stream_->has_more() && pos_ < end_) {
    pos_++;
    uc32 next = stream_->GetNext();
    return last_ = next;
@@ -146,25 +148,32 @@ void CharacterStreamUTF16Buffer::SeekForward(int pos) {
 }
-// TwoByteStringUTF16Buffer
+// ExternalStringUTF16Buffer
-TwoByteStringUTF16Buffer::TwoByteStringUTF16Buffer()
+template <typename StringType, typename CharType>
+ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()
    : raw_data_(NULL) { }
-void TwoByteStringUTF16Buffer::Initialize(
+template <typename StringType, typename CharType>
-     Handle<ExternalTwoByteString> data) {
+void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
+     Handle<StringType> data,
+     int start_position,
+     int end_position) {
  ASSERT(!data.is_null());
-  data_ = data;
-  pos_ = 0;
  raw_data_ = data->resource()->data();
-  size_ = data->length();
+  ASSERT(end_position <= data->length());
+  if (start_position > 0) {
+    SeekForward(start_position);
+  }
+  end_ =
+      end_position != Scanner::kNoEndPosition ? end_position : data->length();
 }
-uc32 TwoByteStringUTF16Buffer::Advance() {
+template <typename StringType, typename CharType>
-  if (pos_ < size_) {
+uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {
+  if (pos_ < end_) {
    return raw_data_[pos_++];
  } else {
    // note: currently the following increment is necessary to avoid a
@@ -175,14 +184,16 @@ uc32 TwoByteStringUTF16Buffer::Advance() {
 }
-void TwoByteStringUTF16Buffer::PushBack(uc32 ch) {
+template <typename StringType, typename CharType>
+void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) {
  pos_--;
  ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
  ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
 }
-void TwoByteStringUTF16Buffer::SeekForward(int pos) {
+template <typename StringType, typename CharType>
+void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {
  pos_ = pos;
 }
@@ -327,21 +338,56 @@ Scanner::Scanner(ParserMode pre)
    : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { }
+void Scanner::Initialize(Handle<String> source,
+                         ParserLanguage language) {
+  safe_string_input_buffer_.Reset(source.location());
+  Init(source, &safe_string_input_buffer_, 0, source->length(), language);
+}
+void Scanner::Initialize(Handle<String> source,
+                         unibrow::CharacterStream* stream,
+                         ParserLanguage language) {
+  Init(source, stream, 0, kNoEndPosition, language);
+}
+void Scanner::Initialize(Handle<String> source,
+                         int start_position,
+                         int end_position,
+                         ParserLanguage language) {
+  safe_string_input_buffer_.Reset(source.location());
+  Init(source, &safe_string_input_buffer_,
+       start_position, end_position, language);
+}
 void Scanner::Init(Handle<String> source,
                   unibrow::CharacterStream* stream,
-                   int position,
+                   int start_position,
+                   int end_position,
                   ParserLanguage language) {
  // Initialize the source buffer.
  if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
    two_byte_string_buffer_.Initialize(
-        Handle<ExternalTwoByteString>::cast(source));
+        Handle<ExternalTwoByteString>::cast(source),
+        start_position,
+        end_position);
    source_ = &two_byte_string_buffer_;
+  } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
+    ascii_string_buffer_.Initialize(
+        Handle<ExternalAsciiString>::cast(source),
+        start_position,
+        end_position);
+    source_ = &ascii_string_buffer_;
  } else {
-    char_stream_buffer_.Initialize(source, stream);
+    char_stream_buffer_.Initialize(source,
+                                   stream,
+                                   start_position,
+                                   end_position);
    source_ = &char_stream_buffer_;
  }
-  position_ = position;
  is_parsing_json_ = (language == JSON);
  // Set c0_ (one character ahead)
@@ -358,11 +404,6 @@ void Scanner::Init(Handle<String> source,
 }
-Handle<String> Scanner::SubString(int start, int end) {
-  return source_->SubString(start - position_, end - position_);
-}
 Token::Value Scanner::Next() {
  // BUG 1215673: Find a thread safe way to set a stack limit in
  // pre-parse mode. Otherwise, we cannot safely pre-parse from other

--- a/src/scanner.h
+++ b/src/scanner.h
@@ -84,32 +84,34 @@ class UTF8Buffer {
 };
+// Interface through which the scanner reads characters from the input source.
 class UTF16Buffer {
 public:
  UTF16Buffer();
  virtual ~UTF16Buffer() {}
  virtual void PushBack(uc32 ch) = 0;
-  // returns a value < 0 when the buffer end is reached
+  // Returns a value < 0 when the buffer end is reached.
  virtual uc32 Advance() = 0;
  virtual void SeekForward(int pos) = 0;
  int pos() const { return pos_; }
-  int size() const { return size_; }
-  Handle<String> SubString(int start, int end);
 protected:
-  Handle<String> data_;
+  int pos_;  // Current position in the buffer.
-  int pos_;
+  int end_;  // Position where scanning should stop (EOF).
-  int size_;
 };
+// UTF16 buffer to read characters from a character stream.
 class CharacterStreamUTF16Buffer: public UTF16Buffer {
 public:
  CharacterStreamUTF16Buffer();
  virtual ~CharacterStreamUTF16Buffer() {}
-  void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
+  void Initialize(Handle<String> data,
+                  unibrow::CharacterStream* stream,
+                  int start_position,
+                  int end_position);
  virtual void PushBack(uc32 ch);
  virtual uc32 Advance();
  virtual void SeekForward(int pos);
@@ -123,17 +125,21 @@ class CharacterStreamUTF16Buffer: public UTF16Buffer {
 };
-class TwoByteStringUTF16Buffer: public UTF16Buffer {
+// UTF16 buffer to read characters from an external string.
+template <typename StringType, typename CharType>
+class ExternalStringUTF16Buffer: public UTF16Buffer {
 public:
-  TwoByteStringUTF16Buffer();
+  ExternalStringUTF16Buffer();
-  virtual ~TwoByteStringUTF16Buffer() {}
+  virtual ~ExternalStringUTF16Buffer() {}
-  void Initialize(Handle<ExternalTwoByteString> data);
+  void Initialize(Handle<StringType> data,
+                  int start_position,
+                  int end_position);
  virtual void PushBack(uc32 ch);
  virtual uc32 Advance();
  virtual void SeekForward(int pos);
 private:
-  const uint16_t* raw_data_;
+  const CharType* raw_data_;  // Pointer to the actual array of characters.
 };
@@ -263,11 +269,15 @@ class Scanner {
  // Construction
  explicit Scanner(ParserMode parse_mode);
-  // Initialize the Scanner to scan source:
+  // Initialize the Scanner to scan source.
-  void Init(Handle<String> source,
+  void Initialize(Handle<String> source,
-            unibrow::CharacterStream* stream,
+                  ParserLanguage language);
-            int position,
+  void Initialize(Handle<String> source,
-            ParserLanguage language);
+                  unibrow::CharacterStream* stream,
+                  ParserLanguage language);
+  void Initialize(Handle<String> source,
+                  int start_position, int end_position,
+                  ParserLanguage language);
  // Returns the next token.
  Token::Value Next();
@@ -335,7 +345,6 @@ class Scanner {
  // tokens, which is what it is used for.
  void SeekForward(int pos);
-  Handle<String> SubString(int start_pos, int end_pos);
  bool stack_overflow() { return stack_overflow_; }
  static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
@@ -350,14 +359,28 @@ class Scanner {
  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
  static const int kCharacterLookaheadBufferSize = 1;
+  static const int kNoEndPosition = 1;
 private:
+  void Init(Handle<String> source,
+            unibrow::CharacterStream* stream,
+            int start_position, int end_position,
+            ParserLanguage language);
+  // Different UTF16 buffers used to pull characters from. Based on input one of
+  // these will be initialized as the actual data source.
  CharacterStreamUTF16Buffer char_stream_buffer_;
-  TwoByteStringUTF16Buffer two_byte_string_buffer_;
+  ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
+      two_byte_string_buffer_;
+  ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
-  // Source.
+  // Source. Will point to one of the buffers declared above.
  UTF16Buffer* source_;
-  int position_;
+  // Used to convert the source string into a character stream when a stream
+  // is not passed to the scanner.
+  SafeStringInputBuffer safe_string_input_buffer_;
  // Buffer to hold literal values (identifiers, strings, numbers)
  // using 0-terminated UTF-8 encoding.
@@ -460,7 +483,7 @@ class Scanner {
  // Return the current source position.
  int source_pos() {
-    return source_->pos() - kCharacterLookaheadBufferSize + position_;
+    return source_->pos() - kCharacterLookaheadBufferSize;
  }
  // Decodes a unicode escape-sequence which is part of an identifier.