Commit c5701e16, authored by jbroman; committed by Commit bot

Blink-compatible serialization of strings.

This includes UTF-8 strings and two-byte strings, both length-delimited
(in bytes, not characters). Two-byte strings are written/read in host byte
order.

BUG=chromium:148757

Review-Url: https://codereview.chromium.org/2245753002
Cr-Commit-Position: refs/heads/master@{#38636}
parent cba03ba8
...@@ -18,6 +18,18 @@ namespace internal { ...@@ -18,6 +18,18 @@ namespace internal {
static const uint32_t kLatestVersion = 9; static const uint32_t kLatestVersion = 9;
// Returns how many bytes the base-128 varint encoding of |value| occupies.
// Each output byte carries seven bits of payload, and even zero takes one
// byte, so the result is always at least 1.
template <typename T>
static size_t BytesNeededForVarint(T value) {
  static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value,
                "Only unsigned integer types can be written as varints.");
  // The first byte is always emitted; every further non-empty group of seven
  // bits costs one more.
  size_t byte_count = 1;
  for (value >>= 7; value != 0; value >>= 7) {
    ++byte_count;
  }
  return byte_count;
}
enum class SerializationTag : uint8_t { enum class SerializationTag : uint8_t {
// version:uint32_t (if at beginning of data, sets version > 0) // version:uint32_t (if at beginning of data, sets version > 0)
kVersion = 0xFF, kVersion = 0xFF,
...@@ -39,6 +51,9 @@ enum class SerializationTag : uint8_t { ...@@ -39,6 +51,9 @@ enum class SerializationTag : uint8_t {
// Number represented as a 64-bit double. // Number represented as a 64-bit double.
// Host byte order is used (N.B. this makes the format non-portable). // Host byte order is used (N.B. this makes the format non-portable).
kDouble = 'N', kDouble = 'N',
// byteLength:uint32_t, then raw data
kUtf8String = 'S',
kTwoByteString = 'c',
}; };
ValueSerializer::ValueSerializer() {} ValueSerializer::ValueSerializer() {}
...@@ -92,6 +107,24 @@ void ValueSerializer::WriteDouble(double value) { ...@@ -92,6 +107,24 @@ void ValueSerializer::WriteDouble(double value) {
reinterpret_cast<const uint8_t*>(&value + 1)); reinterpret_cast<const uint8_t*>(&value + 1));
} }
// Appends a length-delimited run of one-byte characters to the buffer: the
// length (in bytes) as a varint, followed by the bytes themselves, unmodified.
void ValueSerializer::WriteOneByteString(Vector<const uint8_t> chars) {
  const uint8_t* const first = chars.begin();
  const uint8_t* const last = chars.end();
  WriteVarint<uint32_t>(chars.length());
  buffer_.insert(buffer_.end(), first, last);
}
// Appends a length-delimited run of two-byte characters to the buffer: the
// length in bytes (not characters) as a varint, followed by the raw memory of
// the characters.
void ValueSerializer::WriteTwoByteString(Vector<const uc16> chars) {
  // Warning: this uses host endianness.
  const uint8_t* const raw_begin =
      reinterpret_cast<const uint8_t*>(chars.begin());
  const uint8_t* const raw_end = reinterpret_cast<const uint8_t*>(chars.end());
  WriteVarint<uint32_t>(chars.length() * sizeof(uc16));
  buffer_.insert(buffer_.end(), raw_begin, raw_end);
}
// Grows the output buffer by |bytes| and returns a pointer to the first of the
// newly added bytes, so the caller can fill the region in directly.
//
// The returned pointer is only valid until the next write to the buffer, since
// any further growth may reallocate the underlying storage.
uint8_t* ValueSerializer::ReserveRawBytes(size_t bytes) {
  const size_t old_size = buffer_.size();
  buffer_.resize(old_size + bytes);
  // Compute the address from data() rather than &buffer_[old_size]: when
  // |bytes| is zero, old_size equals size() and operator[] would be indexed
  // out of range.
  return buffer_.data() + old_size;
}
Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) { Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
if (object->IsSmi()) { if (object->IsSmi()) {
WriteSmi(Smi::cast(*object)); WriteSmi(Smi::cast(*object));
...@@ -108,6 +141,10 @@ Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) { ...@@ -108,6 +141,10 @@ Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
WriteHeapNumber(HeapNumber::cast(*object)); WriteHeapNumber(HeapNumber::cast(*object));
return Just(true); return Just(true);
default: default:
if (object->IsString()) {
WriteString(Handle<String>::cast(object));
return Just(true);
}
UNIMPLEMENTED(); UNIMPLEMENTED();
return Nothing<bool>(); return Nothing<bool>();
} }
...@@ -146,6 +183,41 @@ void ValueSerializer::WriteHeapNumber(HeapNumber* number) { ...@@ -146,6 +183,41 @@ void ValueSerializer::WriteHeapNumber(HeapNumber* number) {
WriteDouble(number->value()); WriteDouble(number->value());
} }
// Serializes |string|. One-byte strings are emitted as kUtf8String; two-byte
// strings are emitted as kTwoByteString, preceded by a padding tag when that
// is needed to keep the character data two-byte aligned within the buffer.
void ValueSerializer::WriteString(Handle<String> string) {
  string = String::Flatten(string);
  DisallowHeapAllocation no_gc;
  String::FlatContent flat = string->GetFlatContent();
  DCHECK(flat.IsFlat());

  if (flat.IsTwoByte()) {
    Vector<const uc16> code_units = flat.ToUC16Vector();
    uint32_t payload_size = code_units.length() * sizeof(uc16);
    // The existing reading code expects 16-bit strings to be aligned. The
    // characters start after the tag byte and the varint length, so emit one
    // padding byte first if that position would otherwise be odd.
    size_t payload_offset =
        buffer_.size() + 1 + BytesNeededForVarint(payload_size);
    if (payload_offset % 2 != 0) WriteTag(SerializationTag::kPadding);
    WriteTag(SerializationTag::kTwoByteString);
    WriteTwoByteString(code_units);
    return;
  }

  if (!flat.IsOneByte()) {
    UNREACHABLE();
    return;
  }

  // The existing format uses UTF-8, rather than Latin-1. As a result we must
  // do extra work to encode strings that have characters outside ASCII.
  // TODO(jbroman): In a future format version, consider adding a tag for
  // Latin-1 strings, so that this can be skipped.
  WriteTag(SerializationTag::kUtf8String);
  Vector<const uint8_t> latin1_chars = flat.ToOneByteVector();
  if (String::IsAscii(latin1_chars.begin(), latin1_chars.length())) {
    // Pure ASCII is already valid UTF-8 and can be copied through unchanged.
    WriteOneByteString(latin1_chars);
    return;
  }

  // Otherwise transcode to UTF-8 directly into freshly reserved buffer space.
  v8::Local<v8::String> api_string = Utils::ToLocal(string);
  uint32_t utf8_length = api_string->Utf8Length();
  WriteVarint(utf8_length);
  char* destination = reinterpret_cast<char*>(ReserveRawBytes(utf8_length));
  api_string->WriteUtf8(destination, utf8_length, nullptr,
                        v8::String::NO_NULL_TERMINATION);
}
ValueDeserializer::ValueDeserializer(Isolate* isolate, ValueDeserializer::ValueDeserializer(Isolate* isolate,
Vector<const uint8_t> data) Vector<const uint8_t> data)
: isolate_(isolate), : isolate_(isolate),
...@@ -223,6 +295,13 @@ Maybe<double> ValueDeserializer::ReadDouble() { ...@@ -223,6 +295,13 @@ Maybe<double> ValueDeserializer::ReadDouble() {
return Just(value); return Just(value);
} }
// Consumes |size| bytes from the input and returns a view of them. The view
// points into the input data; nothing is copied.
//
// Returns Nothing, without advancing the read position, if |size| is negative
// or exceeds the number of bytes remaining.
Maybe<Vector<const uint8_t>> ValueDeserializer::ReadRawBytes(int size) {
  // A negative size must be rejected explicitly: it would pass the
  // "too large" comparison below and move the read position backwards.
  if (size < 0 || size > end_ - position_) {
    return Nothing<Vector<const uint8_t>>();
  }
  const uint8_t* start = position_;
  position_ += size;
  return Just(Vector<const uint8_t>(start, size));
}
MaybeHandle<Object> ValueDeserializer::ReadObject() { MaybeHandle<Object> ValueDeserializer::ReadObject() {
SerializationTag tag; SerializationTag tag;
if (!ReadTag().To(&tag)) return MaybeHandle<Object>(); if (!ReadTag().To(&tag)) return MaybeHandle<Object>();
...@@ -254,10 +333,49 @@ MaybeHandle<Object> ValueDeserializer::ReadObject() { ...@@ -254,10 +333,49 @@ MaybeHandle<Object> ValueDeserializer::ReadObject() {
if (number.IsNothing()) return MaybeHandle<Object>(); if (number.IsNothing()) return MaybeHandle<Object>();
return isolate_->factory()->NewNumber(number.FromJust()); return isolate_->factory()->NewNumber(number.FromJust());
} }
case SerializationTag::kUtf8String:
return ReadUtf8String();
case SerializationTag::kTwoByteString:
return ReadTwoByteString();
default: default:
return MaybeHandle<Object>(); return MaybeHandle<Object>();
} }
} }
// Reads a UTF-8 string: a varint byte length followed by that many bytes of
// UTF-8 data. The tag is assumed to have been consumed already. Returns an
// empty MaybeHandle if the length cannot be read, does not fit in an int32_t,
// or exceeds the remaining input.
MaybeHandle<String> ValueDeserializer::ReadUtf8String() {
  uint32_t utf8_length;
  if (!ReadVarint<uint32_t>().To(&utf8_length)) return MaybeHandle<String>();

  // ReadRawBytes takes a signed int, so larger lengths cannot be represented.
  const uint32_t max_length =
      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
  if (utf8_length > max_length) return MaybeHandle<String>();

  Vector<const uint8_t> utf8_bytes;
  if (!ReadRawBytes(utf8_length).To(&utf8_bytes)) return MaybeHandle<String>();

  return isolate_->factory()->NewStringFromUtf8(
      Vector<const char>::cast(utf8_bytes));
}
// Reads a two-byte string: a varint byte length followed by that many bytes of
// 16-bit characters in host byte order. The tag is assumed to have been
// consumed already. Returns an empty MaybeHandle if the length cannot be read,
// does not fit in an int32_t, is not a multiple of the character size, or
// exceeds the remaining input, or if the string cannot be allocated.
MaybeHandle<String> ValueDeserializer::ReadTwoByteString() {
  uint32_t byte_length;
  Vector<const uint8_t> bytes;
  if (!ReadVarint<uint32_t>().To(&byte_length) ||
      byte_length >
          static_cast<uint32_t>(std::numeric_limits<int32_t>::max()) ||
      byte_length % sizeof(uc16) != 0 || !ReadRawBytes(byte_length).To(&bytes))
    return MaybeHandle<String>();

  // An empty payload maps to the canonical empty string. This avoids
  // allocating a zero-length sequential string and calling memcpy with
  // nothing to copy.
  if (byte_length == 0) return isolate_->factory()->empty_string();

  // Allocate an uninitialized string so that we can do a raw memcpy into the
  // string on the heap (regardless of alignment).
  Handle<SeqTwoByteString> string;
  if (!isolate_->factory()
           ->NewRawTwoByteString(byte_length / sizeof(uc16))
           .ToHandle(&string))
    return MaybeHandle<String>();

  // Copy the bytes directly into the new string.
  // Warning: this uses host endianness.
  memcpy(string->GetChars(), bytes.begin(), bytes.length());
  return string;
}
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
...@@ -59,11 +59,15 @@ class ValueSerializer { ...@@ -59,11 +59,15 @@ class ValueSerializer {
template <typename T> template <typename T>
void WriteZigZag(T value); void WriteZigZag(T value);
void WriteDouble(double value); void WriteDouble(double value);
// Writes the length in bytes as a varint, followed by the characters as-is.
void WriteOneByteString(Vector<const uint8_t> chars);
// Writes the length in bytes (not characters) as a varint, followed by the
// characters in host byte order.
void WriteTwoByteString(Vector<const uc16> chars);
// Extends the buffer by |bytes| and returns a pointer to the start of the
// added region, for the caller to fill in.
uint8_t* ReserveRawBytes(size_t bytes);
// Writing V8 objects of various kinds. // Writing V8 objects of various kinds.
void WriteOddball(Oddball* oddball); void WriteOddball(Oddball* oddball);
void WriteSmi(Smi* smi); void WriteSmi(Smi* smi);
void WriteHeapNumber(HeapNumber* number); void WriteHeapNumber(HeapNumber* number);
// Flattens |string| and writes it as either a UTF-8 string (one-byte content)
// or a two-byte string, including the leading tag.
void WriteString(Handle<String> string);
std::vector<uint8_t> buffer_; std::vector<uint8_t> buffer_;
...@@ -90,12 +94,19 @@ class ValueDeserializer { ...@@ -90,12 +94,19 @@ class ValueDeserializer {
MaybeHandle<Object> ReadObject() WARN_UNUSED_RESULT; MaybeHandle<Object> ReadObject() WARN_UNUSED_RESULT;
private: private:
// Reading the wire format.
Maybe<SerializationTag> ReadTag() WARN_UNUSED_RESULT; Maybe<SerializationTag> ReadTag() WARN_UNUSED_RESULT;
template <typename T> template <typename T>
Maybe<T> ReadVarint() WARN_UNUSED_RESULT; Maybe<T> ReadVarint() WARN_UNUSED_RESULT;
template <typename T> template <typename T>
Maybe<T> ReadZigZag() WARN_UNUSED_RESULT; Maybe<T> ReadZigZag() WARN_UNUSED_RESULT;
Maybe<double> ReadDouble() WARN_UNUSED_RESULT; Maybe<double> ReadDouble() WARN_UNUSED_RESULT;
// Consumes |size| bytes and returns a view into the input data (no copy).
// Yields Nothing if fewer than |size| bytes remain.
Maybe<Vector<const uint8_t>> ReadRawBytes(int size) WARN_UNUSED_RESULT;
// Reading V8 objects of specific kinds.
// The tag is assumed to have already been read.
// Both readers expect a varint byte length followed by the string data, and
// return an empty handle if the data is malformed or truncated.
MaybeHandle<String> ReadUtf8String() WARN_UNUSED_RESULT;
MaybeHandle<String> ReadTwoByteString() WARN_UNUSED_RESULT;
Isolate* const isolate_; Isolate* const isolate_;
const uint8_t* position_; const uint8_t* position_;
......
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#include "src/value-serializer.h" #include "src/value-serializer.h"
#include <algorithm>
#include <string>
#include "include/v8.h" #include "include/v8.h"
#include "src/api.h" #include "src/api.h"
#include "src/base/build_config.h" #include "src/base/build_config.h"
...@@ -29,22 +32,27 @@ class ValueSerializerTest : public TestWithIsolate { ...@@ -29,22 +32,27 @@ class ValueSerializerTest : public TestWithIsolate {
template <typename InputFunctor, typename OutputFunctor> template <typename InputFunctor, typename OutputFunctor>
void RoundTripTest(const InputFunctor& input_functor, void RoundTripTest(const InputFunctor& input_functor,
const OutputFunctor& output_functor) { const OutputFunctor& output_functor) {
std::vector<uint8_t> data; EncodeTest(input_functor,
{ [this, &output_functor](const std::vector<uint8_t>& data) {
Context::Scope scope(serialization_context()); DecodeTest(data, output_functor);
TryCatch try_catch(isolate()); });
// TODO(jbroman): Use the public API once it exists. }
Local<Value> input_value = input_functor();
i::Isolate* internal_isolate = reinterpret_cast<i::Isolate*>(isolate()); template <typename InputFunctor, typename EncodedDataFunctor>
i::HandleScope handle_scope(internal_isolate); void EncodeTest(const InputFunctor& input_functor,
i::ValueSerializer serializer; const EncodedDataFunctor& encoded_data_functor) {
serializer.WriteHeader(); Context::Scope scope(serialization_context());
ASSERT_TRUE(serializer.WriteObject(Utils::OpenHandle(*input_value)) TryCatch try_catch(isolate());
.FromMaybe(false)); // TODO(jbroman): Use the public API once it exists.
ASSERT_FALSE(try_catch.HasCaught()); Local<Value> input_value = input_functor();
data = serializer.ReleaseBuffer(); i::Isolate* internal_isolate = reinterpret_cast<i::Isolate*>(isolate());
} i::HandleScope handle_scope(internal_isolate);
DecodeTest(data, output_functor); i::ValueSerializer serializer;
serializer.WriteHeader();
ASSERT_TRUE(serializer.WriteObject(Utils::OpenHandle(*input_value))
.FromMaybe(false));
ASSERT_FALSE(try_catch.HasCaught());
encoded_data_functor(serializer.ReleaseBuffer());
} }
template <typename OutputFunctor> template <typename OutputFunctor>
...@@ -106,6 +114,11 @@ class ValueSerializerTest : public TestWithIsolate { ...@@ -106,6 +114,11 @@ class ValueSerializerTest : public TestWithIsolate {
.ToLocalChecked(); .ToLocalChecked();
} }
// Test helper: returns the UTF-8 encoding of |value| as a std::string, using
// the explicit length so the result can be compared against the string
// constants with EXPECT_EQ.
static std::string Utf8Value(Local<Value> value) {
  String::Utf8Value converted(value);
  std::string result;
  result.assign(*converted, converted.length());
  return result;
}
private: private:
Local<Context> serialization_context_; Local<Context> serialization_context_;
Local<Context> deserialization_context_; Local<Context> deserialization_context_;
...@@ -248,5 +261,132 @@ TEST_F(ValueSerializerTest, DecodeNumber) { ...@@ -248,5 +261,132 @@ TEST_F(ValueSerializerTest, DecodeNumber) {
// TODO(jbroman): Equivalent test for big-endian machines. // TODO(jbroman): Equivalent test for big-endian machines.
} }
// String constants (in UTF-8) used for string encoding tests.
// Pure ASCII: five characters, five bytes.
static const char kHelloString[] = "Hello";
// "Qu\u00e9bec": six characters, seven UTF-8 bytes. The e-acute (U+00E9) is
// within Latin-1 but outside ASCII.
static const char kQuebecString[] = "\x51\x75\xC3\xA9\x62\x65\x63";
// U+1F44A: a single code point outside the BMP. It takes four UTF-8 bytes and
// two UTF-16 code units (the surrogate pair 0xD83D 0xDC4A).
static const char kEmojiString[] = "\xF0\x9F\x91\x8A";
// Serializes and then deserializes strings of each encoding class (empty,
// ASCII, Latin-1 but non-ASCII, and requiring two-byte characters) and checks
// that the length in UTF-16 code units and the UTF-8 contents survive.
TEST_F(ValueSerializerTest, RoundTripString) {
RoundTripTest([this]() { return String::Empty(isolate()); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
// Inside ASCII.
RoundTripTest([this]() { return StringFromUtf8(kHelloString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
// Inside Latin-1 (i.e. one-byte string), but not ASCII.
RoundTripTest([this]() { return StringFromUtf8(kQuebecString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
// An emoji (decodes to two 16-bit chars).
RoundTripTest([this]() { return StringFromUtf8(kEmojiString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
}
// Decodes hand-written wire data for each test string. Every input starts with
// 0xff 0x09 (the version tag followed by version 9), then the string tag
// (0x53 'S' for UTF-8, 0x63 'c' for two-byte), the byte length as a varint,
// and finally the string data.
TEST_F(ValueSerializerTest, DecodeString) {
// Decoding the strings above from UTF-8.
DecodeTest({0xff, 0x09, 0x53, 0x00},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
DecodeTest({0xff, 0x09, 0x53, 0x05, 'H', 'e', 'l', 'l', 'o'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
DecodeTest({0xff, 0x09, 0x53, 0x07, 'Q', 'u', 0xc3, 0xa9, 'b', 'e', 'c'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
DecodeTest({0xff, 0x09, 0x53, 0x04, 0xf0, 0x9f, 0x91, 0x8a},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
// And from two-byte strings (endianness dependent).
// The character data below is laid out least-significant byte first, so these
// cases are only compiled for little-endian targets.
#if defined(V8_TARGET_LITTLE_ENDIAN)
DecodeTest({0xff, 0x09, 0x63, 0x00},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
DecodeTest({0xff, 0x09, 0x63, 0x0a, 'H', '\0', 'e', '\0', 'l', '\0', 'l',
'\0', 'o', '\0'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
DecodeTest({0xff, 0x09, 0x63, 0x0c, 'Q', '\0', 'u', '\0', 0xe9, '\0', 'b',
'\0', 'e', '\0', 'c', '\0'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
// The surrogate pair 0xD83D 0xDC4A, i.e. the emoji in kEmojiString.
DecodeTest({0xff, 0x09, 0x63, 0x04, 0x3d, 0xd8, 0x4a, 0xdc},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
#endif
// TODO(jbroman): The same for big-endian systems.
}
// Checks that malformed string payloads are rejected. Each input declares a
// byte length (the byte after the string tag) that the following data cannot
// satisfy.
TEST_F(ValueSerializerTest, DecodeInvalidString) {
// UTF-8 string with too few bytes available.
// (Declares 0x10 = 16 bytes, but only 2 follow.)
InvalidDecodeTest({0xff, 0x09, 0x53, 0x10, 'v', '8'});
#if defined(V8_TARGET_LITTLE_ENDIAN)
// Two-byte string with too few bytes available.
// (Declares 0x10 = 16 bytes, but only 4 follow.)
InvalidDecodeTest({0xff, 0x09, 0x63, 0x10, 'v', '\0', '8', '\0'});
// Two-byte string with an odd byte length.
// (3 bytes is not a whole number of two-byte characters.)
InvalidDecodeTest({0xff, 0x09, 0x63, 0x03, 'v', '\0', '8'});
#endif
// TODO(jbroman): The same for big-endian systems.
}
// Checks that the serializer inserts a padding byte before a two-byte string
// when the character data would otherwise start at an odd offset.
TEST_F(ValueSerializerTest, EncodeTwoByteStringUsesPadding) {
// As long as the output has a version that Blink expects to be able to read,
// we must respect its alignment requirements. It requires that two-byte
// characters be aligned.
EncodeTest(
[this]() {
// We need a string whose length will take two bytes to encode, so that
// a padding byte is needed to keep the characters aligned. The string
// must also have a two-byte character, so that it gets the two-byte
// encoding.
// 200 spaces plus the emoji's two code units is 202 code units, i.e. 404
// bytes of character data.
std::string string(200, ' ');
string += kEmojiString;
return StringFromUtf8(string.c_str());
},
[](const std::vector<uint8_t>& data) {
// This is a sufficient but not necessary condition to be aligned.
// Note that the third byte (0x00) is padding.
// Layout: version tag (0xff), version (0x09), padding (0x00), two-byte
// string tag (0x63), then 404 as a varint (0x94 0x03). The characters
// therefore begin at offset 6, which is even.
const uint8_t expected_prefix[] = {0xff, 0x09, 0x00, 0x63, 0x94, 0x03};
ASSERT_GT(data.size(), sizeof(expected_prefix) / sizeof(uint8_t));
EXPECT_TRUE(std::equal(std::begin(expected_prefix),
std::end(expected_prefix), data.begin()));
});
}
} // namespace } // namespace
} // namespace v8 } // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment