Commit dac0b853 authored by erikcorry's avatar erikcorry

Improve speed of Utf8Write by always flattening the string first and

detecting the ASCII case.  Also rewrite Utf8Length to work on an
unflattened string.  Bug: http://code.google.com/p/v8/issues/detail?id=1665
Review URL: http://codereview.chromium.org/8304021

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@9661 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 2bbf3bbe
......@@ -3634,13 +3634,25 @@ int String::WriteUtf8(char* buffer,
if (IsDeadCheck(isolate, "v8::String::WriteUtf8()")) return 0;
LOG_API(isolate, "String::WriteUtf8");
ENTER_V8(isolate);
i::StringInputBuffer& write_input_buffer = *isolate->write_input_buffer();
i::Handle<i::String> str = Utils::OpenHandle(this);
if (str->IsAsciiRepresentation()) {
if (capacity == -1) capacity = str->length() + 1;
int len = i::Min(capacity, str->length());
i::String::WriteToFlat(*str, buffer, 0, len);
if (nchars_ref != NULL) *nchars_ref = len;
if (!(options & NO_NULL_TERMINATION) && capacity > len) {
buffer[len] = '\0';
return len + 1;
}
return len;
}
i::StringInputBuffer& write_input_buffer = *isolate->write_input_buffer();
isolate->string_tracker()->RecordWrite(str);
if (options & HINT_MANY_WRITES_EXPECTED) {
// Flatten the string for efficiency. This applies whether we are
// using StringInputBuffer or Get(i) to access the characters.
str->TryFlatten();
FlattenString(str);
}
write_input_buffer.Reset(0, *str);
int len = str->length();
......
......@@ -5519,24 +5519,6 @@ bool String::LooksValid() {
}
// Returns the number of bytes needed to encode this string as UTF-8.
// ASCII strings are answered directly from length(); any other string is
// read character by character through the isolate's shared
// objects_string_input_buffer, summing the UTF-8 length of each character.
int String::Utf8Length() {
// Fast path: for an ASCII string the result is simply its length.
if (IsAsciiRepresentation()) return length();
// Attempt to flatten before accessing the string. It probably
// doesn't make Utf8Length faster, but it is very likely that
// the string will be accessed later (for example by WriteUtf8)
// so it's still a good idea.
// The heap is looked up before the flatten attempt; the result of
// TryFlatten() is deliberately ignored, since the buffer below is reset on
// this string whether or not flattening happened.
Heap* heap = GetHeap();
TryFlatten();
// Access<> wraps the isolate-wide input buffer for the rest of this
// function.
Access<StringInputBuffer> buffer(
heap->isolate()->objects_string_input_buffer());
// Start reading this string from position 0.
buffer->Reset(0, this);
int result = 0;
// Accumulate the UTF-8 size of every character the buffer yields.
while (buffer->has_more())
result += unibrow::Utf8::Length(buffer->GetNext());
return result;
}
String::FlatContent String::GetFlatContent() {
int length = this->length();
StringShape shape(this);
......@@ -5960,6 +5942,73 @@ const unibrow::byte* String::ReadBlock(String* input,
}
// Computes the number of bytes that the characters in the half-open range
// [from, to) of |input| occupy when encoded as UTF-8.
//
// The string is inspected in place and never flattened.  The representation
// of |input| is examined on every pass of the loop:
//   - ASCII strings contribute exactly one byte per character.
//   - Sliced strings are replaced by their parent, with the range shifted by
//     the slice offset.
//   - Cons strings are split at the end of their first half; the shorter side
//     of the requested range is handled by a recursive call and the longer
//     side by another pass of the loop, which keeps the recursion depth at
//     log(n) for a string of length n.
//   - Sequential and external (two-byte) strings are scanned directly.
int String::Utf8Length(String* input, int from, int to) {
  if (from == to) return 0;
  int total = 0;
  for (;;) {
    // Every ASCII character encodes to a single UTF-8 byte.
    if (input->IsAsciiRepresentation()) return total + to - from;
    switch (StringShape(input).representation_tag()) {
      case kSlicedStringTag: {
        // Translate the range into the coordinates of the parent string.
        SlicedString* slice = SlicedString::cast(input);
        int shift = slice->offset();
        input = slice->parent();
        from += shift;
        to += shift;
        continue;
      }
      case kExternalStringTag:
      case kSeqStringTag: {
        // Flat two-byte data: add up the encoded size of each code unit.
        Vector<const uc16> chars = input->GetFlatContent().ToUC16Vector();
        const uc16* base = chars.start();
        const uc16* cursor = base + from;
        const uc16* limit = base + to;
        while (cursor < limit) {
          total += unibrow::Utf8::Length(*cursor);
          cursor++;
        }
        return total;
      }
      case kConsStringTag: {
        ConsString* cons = ConsString::cast(input);
        String* left = cons->first();
        String* right = cons->second();
        // Index at which the right half begins.
        int split = left->length();
        if (split - from < to - split) {
          // The part of the range inside the left half is the shorter one.
          if (from < split) {
            // Recurse into the left part, then carry on with the right half
            // from its beginning.
            total += Utf8Length(left, from, split);
            from = 0;
          } else {
            // The range lies entirely in the right half.
            from -= split;
          }
          input = right;
          to -= split;
        } else {
          // The part of the range inside the right half is the shorter one.
          if (to >= split) {
            // Recurse into the right part, then carry on with the left half
            // up to its end.
            total += Utf8Length(right, 0, to - split);
            to = split;
          }
          // Otherwise the range lies entirely in the left half.
          input = left;
        }
        continue;
      }
      default:
        // No other representation tag is expected here.
        UNREACHABLE();
        return 0;
    }
  }
  return 0;
}
void Relocatable::PostGarbageCollectionProcessing() {
Isolate* isolate = Isolate::Current();
Relocatable* current = isolate->relocatable_top();
......
......@@ -6073,7 +6073,8 @@ class String: public HeapObject {
RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
int* length_output = 0);
int Utf8Length();
// Convenience overload: returns the UTF-8 length of the entire string by
// forwarding to the static overload with the range [0, length()).
inline int Utf8Length() { return Utf8Length(this, 0, length()); }
static int Utf8Length(String* input, int from, int to);
// Return a 16 bit Unicode representation of the string.
// The string should be nearly flat, otherwise the performance of
......
......@@ -5438,67 +5438,109 @@ static int StrNCmp16(uint16_t* a, uint16_t* b, int n) {
THREADED_TEST(StringWrite) {
LocalContext context;
v8::HandleScope scope;
v8::Handle<String> str = v8_str("abcde");
// abc<Icelandic eth><Unicode snowman>.
v8::Handle<String> str2 = v8_str("abc\303\260\342\230\203");
const int kStride = 4; // Must match stride in for loops in JS below.
CompileRun(
"var left = '';"
"for (var i = 0; i < 0xd800; i += 4) {"
" left = left + String.fromCharCode(i);"
"}");
CompileRun(
"var right = '';"
"for (var i = 0; i < 0xd800; i += 4) {"
" right = String.fromCharCode(i) + right;"
"}");
v8::Handle<v8::Object> global = Context::GetCurrent()->Global();
Handle<String> left_tree = global->Get(v8_str("left")).As<String>();
Handle<String> right_tree = global->Get(v8_str("right")).As<String>();
CHECK_EQ(5, str2->Length());
CHECK_EQ(0xd800 / kStride, left_tree->Length());
CHECK_EQ(0xd800 / kStride, right_tree->Length());
char buf[100];
char utf8buf[100];
char utf8buf[0xd800 * 3];
uint16_t wbuf[100];
int len;
int charlen;
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, sizeof(utf8buf), &charlen);
CHECK_EQ(9, len);
CHECK_EQ(5, charlen);
CHECK_EQ(0, strcmp(utf8buf, "abc\303\260\342\230\203"));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 8, &charlen);
CHECK_EQ(8, len);
CHECK_EQ(5, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\303\260\342\230\203\1", 9));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 7, &charlen);
CHECK_EQ(5, len);
CHECK_EQ(4, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\303\260\1", 5));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 6, &charlen);
CHECK_EQ(5, len);
CHECK_EQ(4, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\303\260\1", 5));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 5, &charlen);
CHECK_EQ(5, len);
CHECK_EQ(4, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\303\260\1", 5));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 4, &charlen);
CHECK_EQ(3, len);
CHECK_EQ(3, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\1", 4));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 3, &charlen);
CHECK_EQ(3, len);
CHECK_EQ(3, charlen);
CHECK_EQ(0, strncmp(utf8buf, "abc\1", 4));
memset(utf8buf, 0x1, sizeof(utf8buf));
memset(utf8buf, 0x1, 1000);
len = str2->WriteUtf8(utf8buf, 2, &charlen);
CHECK_EQ(2, len);
CHECK_EQ(2, charlen);
CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
memset(utf8buf, 0x1, sizeof(utf8buf));
len = left_tree->Utf8Length();
int utf8_expected =
(0x80 + (0x800 - 0x80) * 2 + (0xd800 - 0x800) * 3) / kStride;
CHECK_EQ(utf8_expected, len);
len = left_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
CHECK_EQ(utf8_expected, len);
CHECK_EQ(0xd800 / kStride, charlen);
CHECK_EQ(0xed, static_cast<unsigned char>(utf8buf[utf8_expected - 3]));
CHECK_EQ(0x9f, static_cast<unsigned char>(utf8buf[utf8_expected - 2]));
CHECK_EQ(0xc0 - kStride,
static_cast<unsigned char>(utf8buf[utf8_expected - 1]));
CHECK_EQ(1, utf8buf[utf8_expected]);
memset(utf8buf, 0x1, sizeof(utf8buf));
len = right_tree->Utf8Length();
CHECK_EQ(utf8_expected, len);
len = right_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
CHECK_EQ(utf8_expected, len);
CHECK_EQ(0xd800 / kStride, charlen);
CHECK_EQ(0xed, static_cast<unsigned char>(utf8buf[0]));
CHECK_EQ(0x9f, static_cast<unsigned char>(utf8buf[1]));
CHECK_EQ(0xc0 - kStride, static_cast<unsigned char>(utf8buf[2]));
CHECK_EQ(1, utf8buf[utf8_expected]);
memset(buf, 0x1, sizeof(buf));
memset(wbuf, 0x1, sizeof(wbuf));
len = str->WriteAscii(buf);
......@@ -11440,6 +11482,7 @@ static void MorphAString(i::String* string,
// Test that we can still flatten a string if the components it is built up
// from have been turned into 16 bit strings in the mean time.
THREADED_TEST(MorphCompositeStringTest) {
char utf_buffer[129];
const char* c_string = "Now is the time for all good men"
" to come to the aid of the party";
uint16_t* two_byte_string = AsciiToTwoByteString(c_string);
......@@ -11468,6 +11511,17 @@ THREADED_TEST(MorphCompositeStringTest) {
MorphAString(*v8::Utils::OpenHandle(*lhs), &ascii_resource, &uc16_resource);
MorphAString(*v8::Utils::OpenHandle(*rhs), &ascii_resource, &uc16_resource);
// This should convert to UTF-8 without flattening, since everything is ASCII.
Handle<String> cons = v8_compile("cons")->Run().As<String>();
CHECK_EQ(128, cons->Utf8Length());
int nchars = -1;
CHECK_EQ(129, cons->WriteUtf8(utf_buffer, -1, &nchars));
CHECK_EQ(128, nchars);
CHECK_EQ(0, strcmp(
utf_buffer,
"Now is the time for all good men to come to the aid of the party"
"Now is the time for all good men to come to the aid of the party"));
// Now do some stuff to make sure the strings are flattened, etc.
CompileRun(
"/[^a-z]/.test(cons);"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment