Commit b1e27e2c authored by lrn@chromium.org's avatar lrn@chromium.org

Created collector class and used it to collect identifiers during scanning.

The collector class automatically expands to hold the values added to it,
like a List, but doesn't ensure that the backing store is contiguous, which
allows it to avoid copying back and forth as the buffer grows.

This is in preparation for identifyng identical symbols during preparsing.

Review URL: http://codereview.chromium.org/3181036

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5325 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 3527e29e
......@@ -733,11 +733,18 @@ double StringToInt(String* str, int radix) {
double StringToDouble(const char* str, int flags, double empty_string_val) {
const char* end = str + StrLength(str);
return InternalStringToDouble(str, end, flags, empty_string_val);
}
double StringToDouble(Vector<const char> str,
int flags,
double empty_string_val) {
const char* end = str.start() + str.length();
return InternalStringToDouble(str.start(), end, flags, empty_string_val);
}
extern "C" char* dtoa(double d, int mode, int ndigits,
int* decpt, int* sign, char** rve);
......
......@@ -96,8 +96,12 @@ static inline uint32_t NumberToUint32(Object* number);
// Converts a string into a double value according to ECMA-262 9.3.1
double StringToDouble(const char* str, int flags, double empty_string_val = 0);
double StringToDouble(String* str, int flags, double empty_string_val = 0);
double StringToDouble(Vector<const char> str,
int flags,
double empty_string_val = 0);
// This version expects a zero-terminated character array.
double StringToDouble(const char* str, int flags, double empty_string_val = 0);
// Converts a string into an integer.
double StringToInt(String* str, int radix);
......
......@@ -3378,7 +3378,7 @@ Expression* Parser::ParsePrimaryExpression(bool* ok) {
case Token::NUMBER: {
Consume(Token::NUMBER);
double value =
StringToDouble(scanner_.literal_string(), ALLOW_HEX | ALLOW_OCTALS);
StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
result = NewNumberLiteral(value);
break;
}
......@@ -3736,7 +3736,7 @@ Expression* Parser::ParseObjectLiteral(bool* ok) {
case Token::NUMBER: {
Consume(Token::NUMBER);
double value =
StringToDouble(scanner_.literal_string(), ALLOW_HEX | ALLOW_OCTALS);
StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
key = NewNumberLiteral(value);
break;
}
......@@ -3989,7 +3989,7 @@ Expression* Parser::ParseV8Intrinsic(bool* ok) {
Expect(Token::MOD, CHECK_OK);
Handle<String> name = ParseIdentifier(CHECK_OK);
Runtime::Function* function =
Runtime::FunctionForName(scanner_.literal_string());
Runtime::FunctionForName(scanner_.literal());
ZoneList<Expression*>* args = ParseArguments(CHECK_OK);
if (function == NULL && extension_ != NULL) {
// The extension structures are only accessible while parsing the
......@@ -4276,7 +4276,7 @@ Expression* Parser::ParseJsonValue(bool* ok) {
case Token::NUMBER: {
Consume(Token::NUMBER);
ASSERT(scanner_.literal_length() > 0);
double value = StringToDouble(scanner_.literal_string(),
double value = StringToDouble(scanner_.literal(),
NO_FLAGS, // Hex, octal or trailing junk.
OS::nan_value());
return NewNumberLiteral(value);
......
......@@ -10688,9 +10688,10 @@ Runtime::Function* Runtime::FunctionForId(FunctionId fid) {
}
Runtime::Function* Runtime::FunctionForName(const char* name) {
Runtime::Function* Runtime::FunctionForName(Vector<const char> name) {
for (Function* f = Runtime_functions; f->name != NULL; f++) {
if (strcmp(f->name, name) == 0) {
if (strncmp(f->name, name.start(), name.length()) == 0
&& f->name[name.length()] == 0) {
return f;
}
}
......
......@@ -419,7 +419,7 @@ class Runtime : public AllStatic {
static Function* FunctionForId(FunctionId fid);
// Get the runtime function with the given name.
static Function* FunctionForName(const char* name);
static Function* FunctionForName(Vector<const char> name);
static int StringMatch(Handle<String> sub, Handle<String> pat, int index);
......
......@@ -50,35 +50,22 @@ StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
// ----------------------------------------------------------------------------
// UTF8Buffer
UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity) { }
UTF8Buffer::~UTF8Buffer() {
if (data_ != NULL) DeleteArray(data_);
}
UTF8Buffer::~UTF8Buffer() {}
void UTF8Buffer::AddCharSlow(uc32 c) {
static const int kCapacityGrowthLimit = 1 * MB;
if (cursor_ > limit_) {
int old_capacity = Capacity();
int old_position = pos();
int new_capacity =
Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
char* new_data = NewArray<char>(new_capacity);
memcpy(new_data, data_, old_position);
DeleteArray(data_);
data_ = new_data;
cursor_ = new_data + old_position;
limit_ = ComputeLimit(new_data, new_capacity);
ASSERT(Capacity() == new_capacity && pos() == old_position);
}
if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
*cursor_++ = c; // Common case: 7-bit ASCII.
} else {
cursor_ += unibrow::Utf8::Encode(cursor_, c);
}
ASSERT(pos() <= Capacity());
ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
int length = unibrow::Utf8::Length(c);
Vector<char> block = buffer_.AddBlock(length, '\0');
#ifdef DEBUG
int written_length = unibrow::Utf8::Encode(block.start(), c);
CHECK_EQ(length, written_length);
#else
unibrow::Utf8::Encode(block.start(), c);
#endif
}
......@@ -399,8 +386,8 @@ void Scanner::Init(Handle<String> source,
// Set c0_ (one character ahead)
ASSERT(kCharacterLookaheadBufferSize == 1);
Advance();
// Initializer current_ to not refer to a literal buffer.
current_.literal_buffer = NULL;
// Initialise current_ to not refer to a literal.
current_.literal_chars = Vector<const char>();
// Skip initial whitespace allowing HTML comment ends just like
// after a newline and scan first token.
......@@ -428,24 +415,16 @@ Token::Value Scanner::Next() {
void Scanner::StartLiteral() {
// Use the first buffer unless it's currently in use by the current_ token.
// In most cases we won't have two literals/identifiers in a row, so
// the second buffer won't be used very often and is unlikely to grow much.
UTF8Buffer* free_buffer =
(current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
: &literal_buffer_2_;
next_.literal_buffer = free_buffer;
free_buffer->Reset();
literal_buffer_.StartLiteral();
}
void Scanner::AddChar(uc32 c) {
next_.literal_buffer->AddChar(c);
literal_buffer_.AddChar(c);
}
void Scanner::TerminateLiteral() {
AddChar(0);
next_.literal_chars = literal_buffer_.EndLiteral();
}
......@@ -575,7 +554,7 @@ Token::Value Scanner::ScanHtmlComment() {
void Scanner::ScanJson() {
next_.literal_buffer = NULL;
next_.literal_chars = Vector<const char>();
Token::Value token;
has_line_terminator_before_next_ = false;
do {
......@@ -761,7 +740,7 @@ Token::Value Scanner::ScanJsonIdentifier(const char* text,
void Scanner::ScanJavaScript() {
next_.literal_buffer = NULL;
next_.literal_chars = Vector<const char>();
Token::Value token;
has_line_terminator_before_next_ = false;
do {
......
......@@ -40,45 +40,36 @@ class UTF8Buffer {
UTF8Buffer();
~UTF8Buffer();
void AddChar(uc32 c) {
ASSERT_NOT_NULL(data_);
if (cursor_ <= limit_ &&
static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
*cursor_++ = static_cast<char>(c);
inline void AddChar(uc32 c) {
if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
buffer_.Add(static_cast<char>(c));
} else {
AddCharSlow(c);
}
}
void Reset() {
if (data_ == NULL) {
data_ = NewArray<char>(kInitialCapacity);
limit_ = ComputeLimit(data_, kInitialCapacity);
}
cursor_ = data_;
void StartLiteral() {
buffer_.StartSequence();
}
int pos() const {
ASSERT_NOT_NULL(data_);
return static_cast<int>(cursor_ - data_);
Vector<const char> EndLiteral() {
buffer_.Add(kEndMarker);
Vector<char> sequence = buffer_.EndSequence();
return Vector<const char>(sequence.start(), sequence.length());
}
char* data() const { return data_; }
// The end marker added after a parsed literal.
// Using zero allows the usage of strlen and similar functions on
// identifiers and numbers (but not strings, since they may contain zero
// bytes).
// TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
// an utf-8 string. This requires changes in all places that uses
// str-functions on the literals, but allows a single pointer to represent
// the literal, even if it contains embedded zeros.
static const char kEndMarker = '\x00';
private:
static const int kInitialCapacity = 256;
char* data_;
char* cursor_;
char* limit_;
int Capacity() const {
ASSERT_NOT_NULL(data_);
return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
}
static char* ComputeLimit(char* data, int capacity) {
return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
}
SequenceCollector<char> buffer_;
void AddCharSlow(uc32 c);
};
......@@ -314,27 +305,34 @@ class Scanner {
// These functions only give the correct result if the literal
// was scanned between calls to StartLiteral() and TerminateLiteral().
const char* literal_string() const {
return current_.literal_buffer->data();
return current_.literal_chars.start();
}
int literal_length() const {
// Excluding terminal '\0' added by TerminateLiteral().
return current_.literal_buffer->pos() - 1;
// Excluding terminal '\x00' added by TerminateLiteral().
return current_.literal_chars.length() - 1;
}
Vector<const char> literal() const {
return Vector<const char>(literal_string(), literal_length());
}
// Returns the literal string for the next token (the token that
// would be returned if Next() were called).
const char* next_literal_string() const {
return next_.literal_buffer->data();
return next_.literal_chars.start();
}
// Returns the length of the next token (that would be returned if
// Next() were called).
int next_literal_length() const {
return next_.literal_buffer->pos() - 1;
// Excluding terminal '\x00' added by TerminateLiteral().
return next_.literal_chars.length() - 1;
}
Vector<const char> next_literal() const {
return Vector<const char>(next_literal_string(),
next_literal_length());
return Vector<const char>(next_literal_string(), next_literal_length());
}
// Scans the input as a regular expression pattern, previous
......@@ -371,7 +369,7 @@ class Scanner {
struct TokenDesc {
Token::Value token;
Location location;
UTF8Buffer* literal_buffer;
Vector<const char> literal_chars;
};
void Init(Handle<String> source,
......@@ -380,10 +378,10 @@ class Scanner {
ParserLanguage language);
// Literal buffer support
void StartLiteral();
void AddChar(uc32 ch);
void AddCharAdvance();
void TerminateLiteral();
inline void StartLiteral();
inline void AddChar(uc32 ch);
inline void AddCharAdvance();
inline void TerminateLiteral();
// Low-level scanning support.
void Advance() { c0_ = source_->Advance(); }
......@@ -487,9 +485,8 @@ class Scanner {
SafeStringInputBuffer safe_string_input_buffer_;
// Buffer to hold literal values (identifiers, strings, numbers)
// using 0-terminated UTF-8 encoding.
UTF8Buffer literal_buffer_1_;
UTF8Buffer literal_buffer_2_;
// using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
UTF8Buffer literal_buffer_;
bool stack_overflow_;
static StaticResource<Utf8Decoder> utf8_decoder_;
......
......@@ -476,6 +476,185 @@ inline Vector< Handle<Object> > HandleVector(v8::internal::Handle<T>* elms,
}
/*
* A class that collects values into a backing store.
* Specialized versions of the class can allow access to the backing store
* in different ways.
* There is no guarantee that the backing store is contiguous (and, as a
* consequence, no guarantees that consecutively added elements are adjacent
* in memory). The collector may move elements unless it has guaranteed not
* to.
*/
template <typename T>
class Collector {
public:
Collector(int initial_capacity = kMinCapacity,
int growth_factor = 2,
int max_growth = 1 * MB)
: growth_factor_(growth_factor), max_growth_(max_growth) {
if (initial_capacity < kMinCapacity) {
initial_capacity = kMinCapacity;
}
current_chunk_ = NewArray<T>(initial_capacity);
current_capacity_ = initial_capacity;
index_ = 0;
}
virtual ~Collector() {
// Free backing store (in reverse allocation order).
DeleteArray(current_chunk_);
for (int i = chunks_.length() - 1; i >= 0; i--) {
chunks_.at(i).Dispose();
}
}
// Add a single element.
inline void Add(T value) {
if (index_ >= current_capacity_) {
Grow(1);
}
current_chunk_[index_] = value;
index_++;
}
// Add a block of contiguous elements and return a Vector backed by the
// memory area.
// A basic Collector will keep this vector valid as long as the Collector
// is alive.
inline Vector<T> AddBlock(int size, T initial_value) {
if (index_ + size > current_capacity_) {
Grow(size);
}
T* position = current_chunk_ + index_;
index_ += size;
for (int i = 0; i < size; i++) {
position[i] = initial_value;
}
return Vector<T>(position, size);
}
// Allocate a single contiguous vector, copy all the collected
// elements to the vector, and return it.
// The caller is responsible for freeing the memory of the returned
// vector (e.g., using Vector::Dispose).
Vector<T> ToVector() {
// Find the total length.
int total_length = index_;
for (int i = 0; i < chunks_.length(); i++) {
total_length += chunks_.at(i).length();
}
T* new_store = NewArray<T>(total_length);
int position = 0;
for (int i = 0; i < chunks_.length(); i++) {
Vector<T> chunk = chunks_.at(i);
for (int j = 0; j < chunk.length(); j++) {
new_store[position] = chunk[j];
position++;
}
}
for (int i = 0; i < index_; i++) {
new_store[position] = current_chunk_[i];
position++;
}
return Vector<T>(new_store, total_length);
}
protected:
static const int kMinCapacity = 16;
List<Vector<T> > chunks_;
T* current_chunk_;
int growth_factor_;
int max_growth_;
int current_capacity_;
int index_;
// Creates a new current chunk, and stores the old chunk in the chunks_ list.
void Grow(int min_capacity) {
ASSERT(growth_factor_ > 1);
int growth = current_capacity_ * (growth_factor_ - 1);
if (growth > max_growth_) {
growth = max_growth_;
}
int new_capacity = current_capacity_ + growth;
if (new_capacity < min_capacity) {
new_capacity = min_capacity;
}
T* new_chunk = NewArray<T>(new_capacity);
int new_index = PrepareGrow(Vector<T>(new_chunk, new_capacity));
chunks_.Add(Vector<T>(current_chunk_, index_));
current_chunk_ = new_chunk;
current_capacity_ = new_capacity;
index_ = new_index;
ASSERT(index_ + min_capacity <= current_capacity_);
}
// Before replacing the current chunk, give a subclass the option to move
// some of the current data into the new chunk. The function may update
// the current index_ value to represent data no longer in the current chunk.
// Returns the initial index of the new chunk (after copied data).
virtual int PrepareGrow(Vector<T> new_chunk) {
return 0;
}
};
/*
* A collector that allows sequences of values to be guaranteed to
* stay consecutive.
* If the backing store grows while a sequence is active, the current
* sequence might be moved, but after the sequence is ended, it will
* not move again.
* NOTICE: Blocks allocated using Collector::AddBlock(int) can move
* as well, if inside an active sequence where another element is added.
*/
template <typename T>
class SequenceCollector : public Collector<T> {
public:
SequenceCollector(int initial_capacity,
int growth_factor = 2,
int max_growth = 1 * MB)
: Collector<T>(initial_capacity, growth_factor, max_growth),
sequence_start_(kNoSequence) { }
virtual ~SequenceCollector() {}
void StartSequence() {
ASSERT(sequence_start_ == kNoSequence);
sequence_start_ = this->index_;
}
Vector<T> EndSequence() {
ASSERT(sequence_start_ != kNoSequence);
int sequence_start = sequence_start_;
sequence_start_ = kNoSequence;
return Vector<T>(this->current_chunk_ + sequence_start,
this->index_ - sequence_start);
}
private:
static const int kNoSequence = -1;
int sequence_start_;
// Move the currently active sequence to the new chunk.
virtual int PrepareGrow(Vector<T> new_chunk) {
if (sequence_start_ != kNoSequence) {
int sequence_length = this->index_ - sequence_start_;
// The new chunk is always larger than the current chunk, so there
// is room for the copy.
ASSERT(sequence_length < new_chunk.length());
for (int i = 0; i < sequence_length; i++) {
new_chunk[i] = this->current_chunk_[sequence_start_ + i];
}
this->index_ = sequence_start_;
sequence_start_ = 0;
return sequence_length;
}
return 0;
}
};
// Simple support to read a file into a 0-terminated C-string.
// The returned buffer must be freed by the caller.
// On return, *exits tells whether the file existed.
......
......@@ -131,3 +131,64 @@ TEST(MemCopy) {
buffer2.Dispose();
buffer1.Dispose();
}
TEST(Collector) {
Collector<int> collector(8);
const int kLoops = 5;
const int kSequentialSize = 1000;
const int kBlockSize = 7;
for (int loop = 0; loop < kLoops; loop++) {
Vector<int> block = collector.AddBlock(7, 0xbadcafe);
for (int i = 0; i < kSequentialSize; i++) {
collector.Add(i);
}
for (int i = 0; i < kBlockSize - 1; i++) {
block[i] = i * 7;
}
}
Vector<int> result = collector.ToVector();
CHECK_EQ(kLoops * (kBlockSize + kSequentialSize), result.length());
for (int i = 0; i < kLoops; i++) {
int offset = i * (kSequentialSize + kBlockSize);
for (int j = 0; j < kBlockSize - 1; j++) {
CHECK_EQ(j * 7, result[offset + j]);
}
CHECK_EQ(0xbadcafe, result[offset + kBlockSize - 1]);
for (int j = 0; j < kSequentialSize; j++) {
CHECK_EQ(j, result[offset + kBlockSize + j]);
}
}
result.Dispose();
}
TEST(SequenceCollector) {
SequenceCollector<int> collector(8);
const int kLoops = 5000;
const int kMaxSequenceSize = 13;
int total_length = 0;
for (int loop = 0; loop < kLoops; loop++) {
int seq_length = loop % kMaxSequenceSize;
collector.StartSequence();
for (int j = 0; j < seq_length; j++) {
collector.Add(j);
}
Vector<int> sequence = collector.EndSequence();
for (int j = 0; j < seq_length; j++) {
CHECK_EQ(j, sequence[j]);
}
total_length += seq_length;
}
Vector<int> result = collector.ToVector();
CHECK_EQ(total_length, result.length());
int offset = 0;
for (int loop = 0; loop < kLoops; loop++) {
int seq_length = loop % kMaxSequenceSize;
for (int j = 0; j < seq_length; j++) {
CHECK_EQ(j, result[offset]);
offset++;
}
}
result.Dispose();
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment