A new instance of the utf-8 conversion changelist, this time against

bleeding_edge.



git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@170 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 388c1094
......@@ -485,7 +485,7 @@ class EXPORT Data {
/**
* Pre-compilation data that can be associated with a script. This
* data can be calculated for a script in advance of actually
* compiling it, and can bestored between compilations. When script
* compiling it, and can be stored between compilations. When script
* data is given to the compile method compilation will be faster.
*/
class EXPORT ScriptData { // NOLINT
......@@ -696,8 +696,18 @@ class EXPORT Boolean : public Primitive {
*/
class EXPORT String : public Primitive {
public:
/**
* Returns the number of characters in this string.
*/
int Length();
/**
* Returns the number of bytes in the UTF-8 encoded
* representation of this string.
*/
int Utf8Length();
/**
* Write the contents of the string to an external buffer.
* If no arguments are given, expects the buffer to be large
......@@ -716,9 +726,8 @@ class EXPORT String : public Primitive {
* excluding the NULL terminator.
*/
int Write(uint16_t* buffer, int start = 0, int length = -1); // UTF-16
int WriteAscii(char* buffer,
int start = 0,
int length = -1); // literally ascii
int WriteAscii(char* buffer, int start = 0, int length = -1); // ASCII
int WriteUtf8(char* buffer, int length = -1); // UTF-8
/**
* Returns true if the string is external
......
......@@ -1925,6 +1925,53 @@ int String::Length() {
}
int String::Utf8Length() {
if (IsDeadCheck("v8::String::Utf8Length()")) return 0;
return Utils::OpenHandle(this)->Utf8Length();
}
int String::WriteUtf8(char* buffer, int capacity) {
if (IsDeadCheck("v8::String::WriteUtf8()")) return 0;
LOG_API("String::WriteUtf8");
i::Handle<i::String> str = Utils::OpenHandle(this);
write_input_buffer.Reset(0, *str);
int len = str->length();
// Encode the first K - 3 bytes directly into the buffer since we
// know there's room for them. If no capacity is given we copy all
// of them here.
int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
int i;
int pos = 0;
for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
i::uc32 c = write_input_buffer.GetNext();
int written = unibrow::Utf8::Encode(buffer + pos, c);
pos += written;
}
if (i < len) {
// For the last characters we need to check the length for each one
// because they may be longer than the remaining space in the
// buffer.
char intermediate[unibrow::Utf8::kMaxEncodedSize];
for (; i < len && pos < capacity; i++) {
i::uc32 c = write_input_buffer.GetNext();
int written = unibrow::Utf8::Encode(intermediate, c);
if (pos + written <= capacity) {
for (int j = 0; j < written; j++)
buffer[pos + j] = intermediate[j];
pos += written;
} else {
// We've reached the end of the buffer
break;
}
}
}
if (i == len && (capacity == -1 || pos < capacity))
buffer[pos++] = '\0';
return pos;
}
int String::WriteAscii(char* buffer, int start, int length) {
if (IsDeadCheck("v8::String::WriteAscii()")) return 0;
LOG_API("String::WriteAscii");
......
......@@ -2915,6 +2915,22 @@ bool String::LooksValid() {
}
// Computes how many bytes the UTF-8 encoding of this string occupies by
// summing unibrow::Utf8::Length over every character.
int String::Utf8Length() {
  // For an ascii string the character count is returned directly.
  if (is_ascii()) {
    return length();
  }

  // Flattening is attempted up front.  It is unlikely to speed up the
  // length computation itself, but the string will most likely be read
  // again right afterwards (for example by WriteUtf8), so doing it now is
  // still worthwhile.
  TryFlatten();

  Access<StringInputBuffer> buffer(&string_input_buffer);
  buffer->Reset(0, this);

  int byte_count = 0;
  while (buffer->has_more()) {
    byte_count += unibrow::Utf8::Length(buffer->GetNext());
  }
  return byte_count;
}
SmartPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
RobustnessFlag robust_flag,
int offset,
......
......@@ -2842,6 +2842,8 @@ class String: public HeapObject {
RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
int* length_output = 0);
int Utf8Length();
// Return a 16 bit Unicode representation of the string.
// The string should be nearly flat, otherwise the performance
// of this method may be very bad. Setting robustness_flag to
......
......@@ -333,3 +333,43 @@ TEST(DeepAscii) {
TraverseFirst(flat_string, string, DEEP_ASCII_DEPTH);
}
}
// Smoke test for converting strings to utf-8: checks Utf8Length() on an
// ascii and a mixed string, then checks WriteUtf8() against every buffer
// capacity from 0 up to the full encoded size plus terminator.
TEST(Utf8Conversion) {
  InitializeVM();
  v8::HandleScope handle_scope;
  // A simple ascii string: its utf-8 length equals its character count.
  const char* ascii_string = "abcdef12345";
  const int ascii_length = static_cast<int>(strlen(ascii_string));
  int len = v8::String::New(ascii_string, ascii_length)->Utf8Length();
  CHECK_EQ(ascii_length, len);
  // A mixed ascii and non-ascii string
  // U+02E4 -> CB A4
  // U+0064 -> 64
  // U+12E4 -> E1 8B A4
  // U+0030 -> 30
  // U+3045 -> E3 81 85
  const uint16_t mixed_string[] = {0x02E4, 0x0064, 0x12E4, 0x0030, 0x3045};
  // The bytes we expect to be output.  Stored as unsigned char because
  // most of them are above 0x7F and would not fit a signed char without a
  // narrowing conversion.
  const unsigned char as_utf8[11] = {0xCB, 0xA4, 0x64, 0xE1, 0x8B, 0xA4, 0x30,
                                     0xE3, 0x81, 0x85, 0x00};
  // The number of bytes expected to be written for each capacity.  Only
  // whole characters are written, and the terminating zero is added only
  // if the entire string fit and one more byte is available.
  const int lengths[12] = {0, 0, 2, 3, 3, 3, 6, 7, 7, 7, 10, 11};
  v8::Handle<v8::String> mixed = v8::String::New(mixed_string, 5);
  CHECK_EQ(10, mixed->Utf8Length());
  // Try encoding the string with all capacities
  char buffer[11];
  // Sentinel marking untouched bytes.  It is kept as a char so that the
  // comparison below holds whether plain char is signed or unsigned.
  const char kNoChar = static_cast<char>(-1);
  for (int i = 0; i <= 11; i++) {
    // Clear the buffer before reusing it
    for (int j = 0; j < 11; j++)
      buffer[j] = kNoChar;
    int written = mixed->WriteUtf8(buffer, i);
    CHECK_EQ(lengths[i], written);
    // Check that the contents are correct
    for (int j = 0; j < lengths[i]; j++)
      CHECK_EQ(as_utf8[j], static_cast<unsigned char>(buffer[j]));
    // Check that the rest of the buffer hasn't been touched
    for (int j = lengths[i]; j < 11; j++)
      CHECK_EQ(kNoChar, buffer[j]);
  }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment