Commit c5701e16 authored by jbroman's avatar jbroman Committed by Commit bot

Blink-compatible serialization of strings.

This includes UTF-8 strings and two-byte strings, both length-delimited
(in bytes, not characters). Two-byte strings are written/read in host byte
order.

BUG=chromium:148757

Review-Url: https://codereview.chromium.org/2245753002
Cr-Commit-Position: refs/heads/master@{#38636}
parent cba03ba8
......@@ -18,6 +18,18 @@ namespace internal {
static const uint32_t kLatestVersion = 9;
// Returns how many bytes |value| occupies when split into 7-bit groups, which
// is the unit this function counts per output byte. Zero still takes one byte.
template <typename T>
static size_t BytesNeededForVarint(T value) {
  static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value,
                "Only unsigned integer types can be written as varints.");
  // The first byte is always present; every further non-empty 7-bit group
  // costs one more.
  size_t byte_count = 1;
  while ((value >>= 7) != 0) {
    ++byte_count;
  }
  return byte_count;
}
enum class SerializationTag : uint8_t {
// version:uint32_t (if at beginning of data, sets version > 0)
kVersion = 0xFF,
......@@ -39,6 +51,9 @@ enum class SerializationTag : uint8_t {
// Number represented as a 64-bit double.
// Host byte order is used (N.B. this makes the format non-portable).
kDouble = 'N',
// byteLength:uint32_t, then raw data
kUtf8String = 'S',
kTwoByteString = 'c',
};
// Constructs a serializer. The constructor body is empty and there is no
// member initializer list, so members are default-initialized.
ValueSerializer::ValueSerializer() {}
......@@ -92,6 +107,24 @@ void ValueSerializer::WriteDouble(double value) {
reinterpret_cast<const uint8_t*>(&value + 1));
}
// Appends a length-delimited one-byte character sequence to the output buffer:
// the character count as a uint32_t varint, followed by the characters
// themselves, copied unchanged.
void ValueSerializer::WriteOneByteString(Vector<const uint8_t> chars) {
  const uint32_t byte_count = chars.length();
  WriteVarint<uint32_t>(byte_count);
  buffer_.insert(buffer_.end(), chars.begin(), chars.end());
}
// Appends a length-delimited two-byte character sequence to the output buffer.
// The length prefix is expressed in bytes (not characters) and is written as a
// uint32_t varint.
void ValueSerializer::WriteTwoByteString(Vector<const uc16> chars) {
  WriteVarint<uint32_t>(chars.length() * sizeof(uc16));

  // Warning: the characters are copied byte-for-byte exactly as they are laid
  // out in memory, so the result uses host endianness.
  const uint8_t* const first_byte =
      reinterpret_cast<const uint8_t*>(chars.begin());
  const uint8_t* const past_last_byte =
      reinterpret_cast<const uint8_t*>(chars.end());
  buffer_.insert(buffer_.end(), first_byte, past_last_byte);
}
// Grows the output buffer by |bytes| and returns a pointer to the start of the
// newly added region, which the caller is expected to fill in.
//
// The returned pointer is only valid until the buffer is next resized.
uint8_t* ValueSerializer::ReserveRawBytes(size_t bytes) {
  const size_t old_size = buffer_.size();
  buffer_.resize(old_size + bytes);
  // Use data() + offset rather than &buffer_[old_size]: when |bytes| is zero,
  // old_size equals size() and operator[] would index one past the end, which
  // is undefined behaviour. Pointer arithmetic up to one-past-the-end is fine.
  return buffer_.data() + old_size;
}
Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
if (object->IsSmi()) {
WriteSmi(Smi::cast(*object));
......@@ -108,6 +141,10 @@ Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
WriteHeapNumber(HeapNumber::cast(*object));
return Just(true);
default:
if (object->IsString()) {
WriteString(Handle<String>::cast(object));
return Just(true);
}
UNIMPLEMENTED();
return Nothing<bool>();
}
......@@ -146,6 +183,41 @@ void ValueSerializer::WriteHeapNumber(HeapNumber* number) {
WriteDouble(number->value());
}
// Serializes |string|, choosing the wire representation from its flattened
// content: one-byte content is written under the UTF-8 tag, two-byte content
// under the two-byte tag (with an optional leading padding tag).
void ValueSerializer::WriteString(Handle<String> string) {
  string = String::Flatten(string);
  DisallowHeapAllocation no_gc;
  String::FlatContent flat = string->GetFlatContent();
  DCHECK(flat.IsFlat());

  if (flat.IsOneByte()) {
    // The existing format uses UTF-8 rather than Latin-1, so strings with
    // characters outside ASCII have to be re-encoded before being written.
    // TODO(jbroman): In a future format version, consider adding a tag for
    // Latin-1 strings, so that this can be skipped.
    WriteTag(SerializationTag::kUtf8String);
    Vector<const uint8_t> latin1_chars = flat.ToOneByteVector();
    if (!String::IsAscii(latin1_chars.begin(), latin1_chars.length())) {
      // Non-ASCII content: write the UTF-8 length, then have the API string
      // encode itself directly into freshly reserved buffer space.
      v8::Local<v8::String> api_string = Utils::ToLocal(string);
      uint32_t encoded_size = api_string->Utf8Length();
      WriteVarint(encoded_size);
      char* destination = reinterpret_cast<char*>(ReserveRawBytes(encoded_size));
      api_string->WriteUtf8(destination, encoded_size, nullptr,
                            v8::String::NO_NULL_TERMINATION);
      return;
    }
    // Pure ASCII: the bytes can be written out as they are.
    WriteOneByteString(latin1_chars);
    return;
  }

  if (flat.IsTwoByte()) {
    Vector<const uc16> wide_chars = flat.ToUC16Vector();
    uint32_t payload_size = wide_chars.length() * sizeof(uc16);
    // The existing reading code expects two-byte strings to be aligned. The
    // characters start after one tag byte plus the length varint; if that
    // offset would be odd, emit a padding tag first.
    const size_t payload_offset =
        buffer_.size() + 1 + BytesNeededForVarint(payload_size);
    if ((payload_offset & 1) != 0) {
      WriteTag(SerializationTag::kPadding);
    }
    WriteTag(SerializationTag::kTwoByteString);
    WriteTwoByteString(wide_chars);
    return;
  }

  UNREACHABLE();
}
ValueDeserializer::ValueDeserializer(Isolate* isolate,
Vector<const uint8_t> data)
: isolate_(isolate),
......@@ -223,6 +295,13 @@ Maybe<double> ValueDeserializer::ReadDouble() {
return Just(value);
}
// Consumes |size| bytes from the input and returns a view over them without
// copying. Returns Nothing, leaving the read position untouched, if |size| is
// negative or exceeds the number of bytes remaining.
Maybe<Vector<const uint8_t>> ValueDeserializer::ReadRawBytes(int size) {
  // A negative size would pass the "fits in the remaining input" comparison
  // and then move position_ backwards, so it must be rejected explicitly.
  if (size < 0 || size > end_ - position_) {
    return Nothing<Vector<const uint8_t>>();
  }
  const uint8_t* start = position_;
  position_ += size;
  return Just(Vector<const uint8_t>(start, size));
}
MaybeHandle<Object> ValueDeserializer::ReadObject() {
SerializationTag tag;
if (!ReadTag().To(&tag)) return MaybeHandle<Object>();
......@@ -254,10 +333,49 @@ MaybeHandle<Object> ValueDeserializer::ReadObject() {
if (number.IsNothing()) return MaybeHandle<Object>();
return isolate_->factory()->NewNumber(number.FromJust());
}
case SerializationTag::kUtf8String:
return ReadUtf8String();
case SerializationTag::kTwoByteString:
return ReadTwoByteString();
default:
return MaybeHandle<Object>();
}
}
// Reads a UTF-8 string: a uint32_t varint byte length followed by that many
// bytes. Returns an empty MaybeHandle if the length cannot be read, is larger
// than the int32_t maximum, or exceeds the remaining input.
MaybeHandle<String> ValueDeserializer::ReadUtf8String() {
  uint32_t utf8_length;
  if (!ReadVarint<uint32_t>().To(&utf8_length)) {
    return MaybeHandle<String>();
  }

  // ReadRawBytes takes an int, so lengths beyond the int32_t range are
  // rejected before the conversion.
  const uint32_t max_length =
      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
  if (utf8_length > max_length) {
    return MaybeHandle<String>();
  }

  Vector<const uint8_t> utf8_bytes;
  if (!ReadRawBytes(utf8_length).To(&utf8_bytes)) {
    return MaybeHandle<String>();
  }

  Vector<const char> utf8_chars = Vector<const char>::cast(utf8_bytes);
  return isolate_->factory()->NewStringFromUtf8(utf8_chars);
}
// Reads a two-byte string: a uint32_t varint byte length followed by that many
// bytes of character data. Returns an empty MaybeHandle if the length cannot
// be read, is larger than the int32_t maximum, is not a multiple of
// sizeof(uc16), exceeds the remaining input, or if allocation fails.
MaybeHandle<String> ValueDeserializer::ReadTwoByteString() {
  uint32_t byte_length;
  if (!ReadVarint<uint32_t>().To(&byte_length)) {
    return MaybeHandle<String>();
  }

  const uint32_t max_length =
      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
  if (byte_length > max_length) {
    return MaybeHandle<String>();
  }
  // Every character takes sizeof(uc16) bytes, so a partial trailing character
  // means the data is malformed.
  if (byte_length % sizeof(uc16) != 0) {
    return MaybeHandle<String>();
  }

  Vector<const uint8_t> bytes;
  if (!ReadRawBytes(byte_length).To(&bytes)) {
    return MaybeHandle<String>();
  }

  // Allocate an uninitialized string so the payload can be copied into it with
  // a raw memcpy, which works regardless of how the input is aligned.
  Handle<SeqTwoByteString> string;
  const bool allocated = isolate_->factory()
                             ->NewRawTwoByteString(byte_length / sizeof(uc16))
                             .ToHandle(&string);
  if (!allocated) {
    return MaybeHandle<String>();
  }

  // Warning: the bytes are copied verbatim, so this uses host endianness.
  memcpy(string->GetChars(), bytes.begin(), bytes.length());
  return string;
}
} // namespace internal
} // namespace v8
......@@ -59,11 +59,15 @@ class ValueSerializer {
template <typename T>
void WriteZigZag(T value);
void WriteDouble(double value);
void WriteOneByteString(Vector<const uint8_t> chars);
void WriteTwoByteString(Vector<const uc16> chars);
uint8_t* ReserveRawBytes(size_t bytes);
// Writing V8 objects of various kinds.
void WriteOddball(Oddball* oddball);
void WriteSmi(Smi* smi);
void WriteHeapNumber(HeapNumber* number);
void WriteString(Handle<String> string);
std::vector<uint8_t> buffer_;
......@@ -90,12 +94,19 @@ class ValueDeserializer {
MaybeHandle<Object> ReadObject() WARN_UNUSED_RESULT;
private:
// Reading the wire format.
Maybe<SerializationTag> ReadTag() WARN_UNUSED_RESULT;
template <typename T>
Maybe<T> ReadVarint() WARN_UNUSED_RESULT;
template <typename T>
Maybe<T> ReadZigZag() WARN_UNUSED_RESULT;
Maybe<double> ReadDouble() WARN_UNUSED_RESULT;
Maybe<Vector<const uint8_t>> ReadRawBytes(int size) WARN_UNUSED_RESULT;
// Reading V8 objects of specific kinds.
// The tag is assumed to have already been read.
MaybeHandle<String> ReadUtf8String() WARN_UNUSED_RESULT;
MaybeHandle<String> ReadTwoByteString() WARN_UNUSED_RESULT;
Isolate* const isolate_;
const uint8_t* position_;
......
......@@ -4,6 +4,9 @@
#include "src/value-serializer.h"
#include <algorithm>
#include <string>
#include "include/v8.h"
#include "src/api.h"
#include "src/base/build_config.h"
......@@ -29,8 +32,15 @@ class ValueSerializerTest : public TestWithIsolate {
template <typename InputFunctor, typename OutputFunctor>
void RoundTripTest(const InputFunctor& input_functor,
const OutputFunctor& output_functor) {
std::vector<uint8_t> data;
{
EncodeTest(input_functor,
[this, &output_functor](const std::vector<uint8_t>& data) {
DecodeTest(data, output_functor);
});
}
template <typename InputFunctor, typename EncodedDataFunctor>
void EncodeTest(const InputFunctor& input_functor,
const EncodedDataFunctor& encoded_data_functor) {
Context::Scope scope(serialization_context());
TryCatch try_catch(isolate());
// TODO(jbroman): Use the public API once it exists.
......@@ -42,9 +52,7 @@ class ValueSerializerTest : public TestWithIsolate {
ASSERT_TRUE(serializer.WriteObject(Utils::OpenHandle(*input_value))
.FromMaybe(false));
ASSERT_FALSE(try_catch.HasCaught());
data = serializer.ReleaseBuffer();
}
DecodeTest(data, output_functor);
encoded_data_functor(serializer.ReleaseBuffer());
}
template <typename OutputFunctor>
......@@ -106,6 +114,11 @@ class ValueSerializerTest : public TestWithIsolate {
.ToLocalChecked();
}
// Test helper: converts |value| through String::Utf8Value and returns the
// result as a std::string built from the converted buffer and its length.
static std::string Utf8Value(Local<Value> value) {
  String::Utf8Value converted(value);
  const char* const bytes = *converted;
  return std::string(bytes, converted.length());
}
private:
Local<Context> serialization_context_;
Local<Context> deserialization_context_;
......@@ -248,5 +261,132 @@ TEST_F(ValueSerializerTest, DecodeNumber) {
// TODO(jbroman): Equivalent test for big-endian machines.
}
// String constants (in UTF-8) used for string encoding tests.
static const char kHelloString[] = "Hello";
static const char kQuebecString[] = "\x51\x75\xC3\xA9\x62\x65\x63";
static const char kEmojiString[] = "\xF0\x9F\x91\x8A";
// Serializes and then deserializes several strings via RoundTripTest, checking
// for each that the result is a string with the expected length (in 16-bit
// characters) and, where non-empty, the expected UTF-8 contents.
TEST_F(ValueSerializerTest, RoundTripString) {
// The empty string.
RoundTripTest([this]() { return String::Empty(isolate()); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
// Inside ASCII.
RoundTripTest([this]() { return StringFromUtf8(kHelloString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
// Inside Latin-1 (i.e. one-byte string), but not ASCII.
// kQuebecString is seven UTF-8 bytes but six characters, because the
// two-byte sequence C3 A9 encodes a single character.
RoundTripTest([this]() { return StringFromUtf8(kQuebecString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
// An emoji (decodes to two 16-bit chars).
RoundTripTest([this]() { return StringFromUtf8(kEmojiString); },
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
}
// Decodes hand-written byte sequences and checks the resulting strings.
// Every input starts with 0xff 0x09: the version tag (kVersion = 0xFF)
// followed by version 9. Next comes the string tag, 0x53 ('S', kUtf8String) or
// 0x63 ('c', kTwoByteString), then the byte length, then the raw data.
TEST_F(ValueSerializerTest, DecodeString) {
// Decoding the strings above from UTF-8.
// Empty string: byte length 0, no payload.
DecodeTest({0xff, 0x09, 0x53, 0x00},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
// "Hello": 5 bytes of ASCII.
DecodeTest({0xff, 0x09, 0x53, 0x05, 'H', 'e', 'l', 'l', 'o'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
// kQuebecString: 7 UTF-8 bytes that decode to 6 characters.
DecodeTest({0xff, 0x09, 0x53, 0x07, 'Q', 'u', 0xc3, 0xa9, 'b', 'e', 'c'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
// kEmojiString: 4 UTF-8 bytes that decode to two 16-bit characters.
DecodeTest({0xff, 0x09, 0x53, 0x04, 0xf0, 0x9f, 0x91, 0x8a},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
// And from two-byte strings (endianness dependent).
// The payloads below store each 16-bit character low byte first, so they are
// only compiled for little-endian targets.
#if defined(V8_TARGET_LITTLE_ENDIAN)
// Empty two-byte string.
DecodeTest({0xff, 0x09, 0x63, 0x00},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(0, String::Cast(*value)->Length());
});
// "Hello": 5 characters, 10 (0x0a) bytes.
DecodeTest({0xff, 0x09, 0x63, 0x0a, 'H', '\0', 'e', '\0', 'l', '\0', 'l',
'\0', 'o', '\0'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(5, String::Cast(*value)->Length());
EXPECT_EQ(kHelloString, Utf8Value(value));
});
// kQuebecString: 6 characters, 12 (0x0c) bytes; 0xe9 is the accented letter.
DecodeTest({0xff, 0x09, 0x63, 0x0c, 'Q', '\0', 'u', '\0', 0xe9, '\0', 'b',
'\0', 'e', '\0', 'c', '\0'},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(6, String::Cast(*value)->Length());
EXPECT_EQ(kQuebecString, Utf8Value(value));
});
// kEmojiString as the surrogate pair 0xD83D 0xDC4A, 4 bytes.
DecodeTest({0xff, 0x09, 0x63, 0x04, 0x3d, 0xd8, 0x4a, 0xdc},
[](Local<Value> value) {
ASSERT_TRUE(value->IsString());
EXPECT_EQ(2, String::Cast(*value)->Length());
EXPECT_EQ(kEmojiString, Utf8Value(value));
});
#endif
// TODO(jbroman): The same for big-endian systems.
}
// Feeds malformed string encodings to InvalidDecodeTest, which is defined
// elsewhere in this fixture and presumably asserts that decoding fails
// (TODO confirm against its definition).
TEST_F(ValueSerializerTest, DecodeInvalidString) {
// UTF-8 string with too few bytes available.
// The length byte claims 0x10 (16) bytes but only 2 follow.
InvalidDecodeTest({0xff, 0x09, 0x53, 0x10, 'v', '8'});
#if defined(V8_TARGET_LITTLE_ENDIAN)
// Two-byte string with too few bytes available.
// The length byte claims 0x10 (16) bytes but only 4 follow.
InvalidDecodeTest({0xff, 0x09, 0x63, 0x10, 'v', '\0', '8', '\0'});
// Two-byte string with an odd byte length.
// A length of 3 is not a multiple of the two-byte character size.
InvalidDecodeTest({0xff, 0x09, 0x63, 0x03, 'v', '\0', '8'});
#endif
// TODO(jbroman): The same for big-endian systems.
}
// Checks that encoding a two-byte string emits a padding byte when needed so
// that the character data starts at an even offset in the output.
TEST_F(ValueSerializerTest, EncodeTwoByteStringUsesPadding) {
// As long as the output has a version that Blink expects to be able to read,
// we must respect its alignment requirements. It requires that two-byte
// characters be aligned.
EncodeTest(
[this]() {
// We need a string whose length will take two bytes to encode, so that
// a padding byte is needed to keep the characters aligned. The string
// must also have a two-byte character, so that it gets the two-byte
// encoding.
// 200 spaces plus the emoji (two 16-bit characters) is 202 characters,
// i.e. 404 bytes, which needs a two-byte varint.
std::string string(200, ' ');
string += kEmojiString;
return StringFromUtf8(string.c_str());
},
[](const std::vector<uint8_t>& data) {
// This is a sufficient but not necessary condition to be aligned.
// Note that the third byte (0x00) is padding.
// Layout: 0xff 0x09 (version tag, version 9), 0x00 (padding),
// 0x63 (two-byte string tag), 0x94 0x03 (varint for 404).
const uint8_t expected_prefix[] = {0xff, 0x09, 0x00, 0x63, 0x94, 0x03};
// The output must be strictly longer than the prefix before comparing.
ASSERT_GT(data.size(), sizeof(expected_prefix) / sizeof(uint8_t));
EXPECT_TRUE(std::equal(std::begin(expected_prefix),
std::end(expected_prefix), data.begin()));
});
}
} // namespace
} // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment