Blink-compatible serialization of strings.

This includes UTF-8 strings and two-byte strings, both length-delimited (in bytes, not characters). Two-byte strings are written/read in host byte order. BUG=chromium:148757 Review-Url: https://codereview.chromium.org/2245753002 Cr-Commit-Position: refs/heads/master@{#38636}

Blink-compatible serialization of strings.
This includes UTF-8 strings and two-byte strings, both length-delimited (in bytes, not characters). Two-byte strings are written/read in host byte order. BUG=chromium:148757 Review-Url: https://codereview.chromium.org/2245753002 Cr-Commit-Position: refs/heads/master@{#38636}
c5701e16 · jbroman · Commit bot · cba03ba8 · c5701e16 · c5701e16
Commit c5701e16 authored Aug 15, 2016 by jbroman Committed by Commit bot Aug 15, 2016
Showing with 285 additions and 16 deletions

value-serializer.cc src/value-serializer.cc +118 -0

value-serializer.h src/value-serializer.h +11 -0

value-serializer-unittest.cc test/unittests/value-serializer-unittest.cc +156 -16

No files found.
--- a/src/value-serializer.cc
+++ b/src/value-serializer.cc
@@ -18,6 +18,18 @@ namespace internal {
 static const uint32_t kLatestVersion = 9;
+// Returns the number of bytes needed to write |value| when each output byte
+// carries 7 bits of it. The result is always at least one, even for zero.
+template <typename T>
+static size_t BytesNeededForVarint(T value) {
+  static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value,
+                "Only unsigned integer types can be written as varints.");
+  // The first byte is always needed; every further 7-bit group that still has
+  // bits left in it costs one more.
+  size_t byte_count = 1;
+  for (value >>= 7; value != 0; value >>= 7) {
+    ++byte_count;
+  }
+  return byte_count;
+}
 enum class SerializationTag : uint8_t {
  // version:uint32_t (if at beginning of data, sets version > 0)
  kVersion = 0xFF,
@@ -39,6 +51,9 @@ enum class SerializationTag : uint8_t {
  // Number represented as a 64-bit double.
  // Host byte order is used (N.B. this makes the format non-portable).
  kDouble = 'N',
+  // byteLength:uint32_t, then raw data
+  kUtf8String = 'S',
+  kTwoByteString = 'c',
 };
 ValueSerializer::ValueSerializer() {}
@@ -92,6 +107,24 @@ void ValueSerializer::WriteDouble(double value) {
                 reinterpret_cast<const uint8_t*>(&value + 1));
 }
+// Appends a one-byte string to the buffer: its length as a varint, followed
+// by the characters themselves.
+void ValueSerializer::WriteOneByteString(Vector<const uint8_t> chars) {
+  const uint8_t* const first = chars.begin();
+  const uint8_t* const last = chars.end();
+  WriteVarint<uint32_t>(chars.length());
+  buffer_.insert(buffer_.end(), first, last);
+}
+// Appends a two-byte string to the buffer: its length in bytes (not
+// characters) as a varint, followed by the raw character data.
+void ValueSerializer::WriteTwoByteString(Vector<const uc16> chars) {
+  // Warning: this uses host endianness.
+  const uint8_t* const raw_begin =
+      reinterpret_cast<const uint8_t*>(chars.begin());
+  const uint8_t* const raw_end = reinterpret_cast<const uint8_t*>(chars.end());
+  WriteVarint<uint32_t>(chars.length() * sizeof(uc16));
+  buffer_.insert(buffer_.end(), raw_begin, raw_end);
+}
+// Grows the output buffer by |bytes| bytes and returns a pointer to the start
+// of the newly added region, for the caller to fill in. The pointer is only
+// valid until the buffer is next resized.
+uint8_t* ValueSerializer::ReserveRawBytes(size_t bytes) {
+  const size_t old_size = buffer_.size();
+  buffer_.resize(old_size + bytes);
+  // Use data() rather than operator[]: when |bytes| is zero, old_size equals
+  // the new size, and indexing one past the last element is undefined.
+  return buffer_.data() + old_size;
+}
 Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
  if (object->IsSmi()) {
    WriteSmi(Smi::cast(*object));
@@ -108,6 +141,10 @@ Maybe<bool> ValueSerializer::WriteObject(Handle<Object> object) {
      WriteHeapNumber(HeapNumber::cast(*object));
      return Just(true);
    default:
+      if (object->IsString()) {
+        WriteString(Handle<String>::cast(object));
+        return Just(true);
+      }
      UNIMPLEMENTED();
      return Nothing<bool>();
  }
@@ -146,6 +183,41 @@ void ValueSerializer::WriteHeapNumber(HeapNumber* number) {
  WriteDouble(number->value());
 }
+void ValueSerializer::WriteString(Handle<String> string) {
+  string = String::Flatten(string);
+  DisallowHeapAllocation no_gc;
+  String::FlatContent flat = string->GetFlatContent();
+  DCHECK(flat.IsFlat());
+  if (flat.IsOneByte()) {
+    // The existing format uses UTF-8, rather than Latin-1. As a result we must
+    // do work to encode strings that have characters outside ASCII.
+    // TODO(jbroman): In a future format version, consider adding a tag for
+    // Latin-1 strings, so that this can be skipped.
+    WriteTag(SerializationTag::kUtf8String);
+    Vector<const uint8_t> chars = flat.ToOneByteVector();
+    if (String::IsAscii(chars.begin(), chars.length())) {
+      WriteOneByteString(chars);  // Pure ASCII: bytes are written unchanged.
+    } else {
+      v8::Local<v8::String> api_string = Utils::ToLocal(string);
+      uint32_t utf8_length = api_string->Utf8Length();
+      WriteVarint(utf8_length);  // Length prefix is in UTF-8 bytes.
+      api_string->WriteUtf8(
+          reinterpret_cast<char*>(ReserveRawBytes(utf8_length)), utf8_length,
+          nullptr, v8::String::NO_NULL_TERMINATION);
+    }
+  } else if (flat.IsTwoByte()) {
+    Vector<const uc16> chars = flat.ToUC16Vector();
+    uint32_t byte_length = chars.length() * sizeof(uc16);
+    // The existing reading code expects two-byte strings to be aligned.
+    if ((buffer_.size() + 1 + BytesNeededForVarint(byte_length)) & 1)
+      WriteTag(SerializationTag::kPadding);  // Data would start at odd offset.
+    WriteTag(SerializationTag::kTwoByteString);
+    WriteTwoByteString(chars);
+  } else {
+    UNREACHABLE();
+  }
+}
 ValueDeserializer::ValueDeserializer(Isolate* isolate,
                                     Vector<const uint8_t> data)
    : isolate_(isolate),
@@ -223,6 +295,13 @@ Maybe<double> ValueDeserializer::ReadDouble() {
  return Just(value);
 }
+// Consumes |size| bytes from the input and returns a view of them (nothing is
+// copied). Returns Nothing, leaving the read position untouched, if |size| is
+// negative or exceeds the number of bytes remaining.
+Maybe<Vector<const uint8_t>> ValueDeserializer::ReadRawBytes(int size) {
+  // A negative size would otherwise pass the bounds check and move the read
+  // position backwards.
+  if (size < 0 || size > end_ - position_) {
+    return Nothing<Vector<const uint8_t>>();
+  }
+  const uint8_t* start = position_;
+  position_ += size;
+  return Just(Vector<const uint8_t>(start, size));
+}
 MaybeHandle<Object> ValueDeserializer::ReadObject() {
  SerializationTag tag;
  if (!ReadTag().To(&tag)) return MaybeHandle<Object>();
@@ -254,10 +333,49 @@ MaybeHandle<Object> ValueDeserializer::ReadObject() {
      if (number.IsNothing()) return MaybeHandle<Object>();
      return isolate_->factory()->NewNumber(number.FromJust());
    }
+    case SerializationTag::kUtf8String:
+      return ReadUtf8String();
+    case SerializationTag::kTwoByteString:
+      return ReadTwoByteString();
    default:
      return MaybeHandle<Object>();
  }
 }
+// Reads a UTF-8 string: a varint byte length followed by that many bytes of
+// data. Yields an empty MaybeHandle if the length cannot be read, exceeds the
+// int32_t range, or is larger than the remaining input.
+MaybeHandle<String> ValueDeserializer::ReadUtf8String() {
+  const uint32_t max_byte_length =
+      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
+  uint32_t declared_length;
+  Vector<const uint8_t> payload;
+  if (!ReadVarint<uint32_t>().To(&declared_length)) {
+    return MaybeHandle<String>();
+  }
+  if (declared_length > max_byte_length) {
+    return MaybeHandle<String>();
+  }
+  if (!ReadRawBytes(declared_length).To(&payload)) {
+    return MaybeHandle<String>();
+  }
+  return isolate_->factory()->NewStringFromUtf8(
+      Vector<const char>::cast(payload));
+}
+// Reads a two-byte string: a varint byte length followed by that many bytes of
+// raw uc16 data. Yields an empty MaybeHandle if the length cannot be read,
+// exceeds the int32_t range, is not a multiple of sizeof(uc16), is larger than
+// the remaining input, or if the string cannot be allocated.
+MaybeHandle<String> ValueDeserializer::ReadTwoByteString() {
+  const uint32_t max_byte_length =
+      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
+  uint32_t declared_length;
+  Vector<const uint8_t> payload;
+  if (!ReadVarint<uint32_t>().To(&declared_length)) {
+    return MaybeHandle<String>();
+  }
+  if (declared_length > max_byte_length) {
+    return MaybeHandle<String>();
+  }
+  if (declared_length % sizeof(uc16) != 0) {
+    return MaybeHandle<String>();
+  }
+  if (!ReadRawBytes(declared_length).To(&payload)) {
+    return MaybeHandle<String>();
+  }
+
+  // The string is allocated uninitialized so that the characters can be
+  // memcpy'd straight into it, whatever the alignment of the input.
+  Handle<SeqTwoByteString> result;
+  if (!isolate_->factory()
+           ->NewRawTwoByteString(declared_length / sizeof(uc16))
+           .ToHandle(&result)) {
+    return MaybeHandle<String>();
+  }
+
+  // Warning: this uses host endianness.
+  memcpy(result->GetChars(), payload.begin(), payload.length());
+  return result;
+}
 }  // namespace internal
 }  // namespace v8
--- a/src/value-serializer.h
+++ b/src/value-serializer.h
@@ -59,11 +59,15 @@ class ValueSerializer {
  template <typename T>
  void WriteZigZag(T value);
  void WriteDouble(double value);
+  void WriteOneByteString(Vector<const uint8_t> chars);
+  void WriteTwoByteString(Vector<const uc16> chars);
+  uint8_t* ReserveRawBytes(size_t bytes);
  // Writing V8 objects of various kinds.
  void WriteOddball(Oddball* oddball);
  void WriteSmi(Smi* smi);
  void WriteHeapNumber(HeapNumber* number);
+  void WriteString(Handle<String> string);
  std::vector<uint8_t> buffer_;
@@ -90,12 +94,19 @@ class ValueDeserializer {
  MaybeHandle<Object> ReadObject() WARN_UNUSED_RESULT;
 private:
+  // Reading the wire format.
  Maybe<SerializationTag> ReadTag() WARN_UNUSED_RESULT;
  template <typename T>
  Maybe<T> ReadVarint() WARN_UNUSED_RESULT;
  template <typename T>
  Maybe<T> ReadZigZag() WARN_UNUSED_RESULT;
  Maybe<double> ReadDouble() WARN_UNUSED_RESULT;
+  Maybe<Vector<const uint8_t>> ReadRawBytes(int size) WARN_UNUSED_RESULT;
+  // Reading V8 objects of specific kinds.
+  // The tag is assumed to have already been read.
+  MaybeHandle<String> ReadUtf8String() WARN_UNUSED_RESULT;
+  MaybeHandle<String> ReadTwoByteString() WARN_UNUSED_RESULT;
  Isolate* const isolate_;
  const uint8_t* position_;

--- a/test/unittests/value-serializer-unittest.cc
+++ b/test/unittests/value-serializer-unittest.cc
@@ -4,6 +4,9 @@
 #include "src/value-serializer.h"
+#include <algorithm>
+#include <string>
 #include "include/v8.h"
 #include "src/api.h"
 #include "src/base/build_config.h"
@@ -29,22 +32,27 @@ class ValueSerializerTest : public TestWithIsolate {
  template <typename InputFunctor, typename OutputFunctor>
  void RoundTripTest(const InputFunctor& input_functor,
                     const OutputFunctor& output_functor) {
-    std::vector<uint8_t> data;
+    EncodeTest(input_functor,
-    {
+               [this, &output_functor](const std::vector<uint8_t>& data) {
-      Context::Scope scope(serialization_context());
+                 DecodeTest(data, output_functor);
-      TryCatch try_catch(isolate());
+               });
-      // TODO(jbroman): Use the public API once it exists.
+  }
-      Local<Value> input_value = input_functor();
-      i::Isolate* internal_isolate = reinterpret_cast<i::Isolate*>(isolate());
+  template <typename InputFunctor, typename EncodedDataFunctor>
-      i::HandleScope handle_scope(internal_isolate);
+  void EncodeTest(const InputFunctor& input_functor,
-      i::ValueSerializer serializer;
+                  const EncodedDataFunctor& encoded_data_functor) {
-      serializer.WriteHeader();
+    Context::Scope scope(serialization_context());
-      ASSERT_TRUE(serializer.WriteObject(Utils::OpenHandle(*input_value))
+    TryCatch try_catch(isolate());
-                      .FromMaybe(false));
+    // TODO(jbroman): Use the public API once it exists.
-      ASSERT_FALSE(try_catch.HasCaught());
+    Local<Value> input_value = input_functor();
-      data = serializer.ReleaseBuffer();
+    i::Isolate* internal_isolate = reinterpret_cast<i::Isolate*>(isolate());
-    }
+    i::HandleScope handle_scope(internal_isolate);
-    DecodeTest(data, output_functor);
+    i::ValueSerializer serializer;
+    serializer.WriteHeader();
+    ASSERT_TRUE(serializer.WriteObject(Utils::OpenHandle(*input_value))
+                    .FromMaybe(false));
+    ASSERT_FALSE(try_catch.HasCaught());
+    encoded_data_functor(serializer.ReleaseBuffer());
  }
  template <typename OutputFunctor>
@@ -106,6 +114,11 @@ class ValueSerializerTest : public TestWithIsolate {
        .ToLocalChecked();
  }
+  // Converts |value| to a std::string holding its UTF-8 representation.
+  static std::string Utf8Value(Local<Value> value) {
+    String::Utf8Value converted(value);
+    const char* chars = *converted;
+    return std::string(chars, converted.length());
+  }
 private:
  Local<Context> serialization_context_;
  Local<Context> deserialization_context_;
@@ -248,5 +261,132 @@ TEST_F(ValueSerializerTest, DecodeNumber) {
  // TODO(jbroman): Equivalent test for big-endian machines.
 }
+// String constants (in UTF-8) used for string encoding tests.
+static const char kHelloString[] = "Hello";  // ASCII only.
+static const char kQuebecString[] = "\x51\x75\xC3\xA9\x62\x65\x63";  // "Qu", U+00E9 (C3 A9), "bec".
+static const char kEmojiString[] = "\xF0\x9F\x91\x8A";  // U+1F44A (outside the BMP).
+TEST_F(ValueSerializerTest, RoundTripString) {
+  RoundTripTest([this]() { return String::Empty(isolate()); },  // Empty string.
+                [](Local<Value> value) {
+                  ASSERT_TRUE(value->IsString());
+                  EXPECT_EQ(0, String::Cast(*value)->Length());
+                });
+  // Inside ASCII.
+  RoundTripTest([this]() { return StringFromUtf8(kHelloString); },
+                [](Local<Value> value) {
+                  ASSERT_TRUE(value->IsString());
+                  EXPECT_EQ(5, String::Cast(*value)->Length());
+                  EXPECT_EQ(kHelloString, Utf8Value(value));
+                });
+  // Inside Latin-1 (i.e. one-byte string), but not ASCII.
+  RoundTripTest([this]() { return StringFromUtf8(kQuebecString); },
+                [](Local<Value> value) {
+                  ASSERT_TRUE(value->IsString());
+                  EXPECT_EQ(6, String::Cast(*value)->Length());  // 7 UTF-8 bytes.
+                  EXPECT_EQ(kQuebecString, Utf8Value(value));
+                });
+  // An emoji (decodes to two 16-bit chars).
+  RoundTripTest([this]() { return StringFromUtf8(kEmojiString); },
+                [](Local<Value> value) {
+                  ASSERT_TRUE(value->IsString());
+                  EXPECT_EQ(2, String::Cast(*value)->Length());  // Surrogate pair.
+                  EXPECT_EQ(kEmojiString, Utf8Value(value));
+                });
+}
+TEST_F(ValueSerializerTest, DecodeString) {
+  // Decoding the strings above from UTF-8.
+  DecodeTest({0xff, 0x09, 0x53, 0x00},  // Version 9; tag 'S' (UTF-8); length 0.
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(0, String::Cast(*value)->Length());
+             });
+  DecodeTest({0xff, 0x09, 0x53, 0x05, 'H', 'e', 'l', 'l', 'o'},
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(5, String::Cast(*value)->Length());
+               EXPECT_EQ(kHelloString, Utf8Value(value));
+             });
+  DecodeTest({0xff, 0x09, 0x53, 0x07, 'Q', 'u', 0xc3, 0xa9, 'b', 'e', 'c'},
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(6, String::Cast(*value)->Length());  // 7 bytes, 6 chars.
+               EXPECT_EQ(kQuebecString, Utf8Value(value));
+             });
+  DecodeTest({0xff, 0x09, 0x53, 0x04, 0xf0, 0x9f, 0x91, 0x8a},
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(2, String::Cast(*value)->Length());  // Surrogate pair.
+               EXPECT_EQ(kEmojiString, Utf8Value(value));
+             });
+// And from two-byte strings (endianness dependent).
+#if defined(V8_TARGET_LITTLE_ENDIAN)
+  DecodeTest({0xff, 0x09, 0x63, 0x00},  // Tag 'c' (two-byte); length 0.
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(0, String::Cast(*value)->Length());
+             });
+  DecodeTest({0xff, 0x09, 0x63, 0x0a, 'H', '\0', 'e', '\0', 'l', '\0', 'l',
+              '\0', 'o', '\0'},  // 0x0a = 10 bytes = 5 characters.
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(5, String::Cast(*value)->Length());
+               EXPECT_EQ(kHelloString, Utf8Value(value));
+             });
+  DecodeTest({0xff, 0x09, 0x63, 0x0c, 'Q', '\0', 'u', '\0', 0xe9, '\0', 'b',
+              '\0', 'e', '\0', 'c', '\0'},  // 0x0c = 12 bytes = 6 characters.
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(6, String::Cast(*value)->Length());
+               EXPECT_EQ(kQuebecString, Utf8Value(value));
+             });
+  DecodeTest({0xff, 0x09, 0x63, 0x04, 0x3d, 0xd8, 0x4a, 0xdc},  // D83D DC4A.
+             [](Local<Value> value) {
+               ASSERT_TRUE(value->IsString());
+               EXPECT_EQ(2, String::Cast(*value)->Length());
+               EXPECT_EQ(kEmojiString, Utf8Value(value));
+             });
+#endif
+  // TODO(jbroman): The same for big-endian systems.
+}
+TEST_F(ValueSerializerTest, DecodeInvalidString) {
+  // UTF-8 string with too few bytes available.
+  InvalidDecodeTest({0xff, 0x09, 0x53, 0x10, 'v', '8'});  // Claims 16, has 2.
+#if defined(V8_TARGET_LITTLE_ENDIAN)
+  // Two-byte string with too few bytes available.
+  InvalidDecodeTest({0xff, 0x09, 0x63, 0x10, 'v', '\0', '8', '\0'});  // 16 vs 4.
+  // Two-byte string with an odd byte length.
+  InvalidDecodeTest({0xff, 0x09, 0x63, 0x03, 'v', '\0', '8'});  // Length 3.
+#endif
+  // TODO(jbroman): The same for big-endian systems.
+}
+TEST_F(ValueSerializerTest, EncodeTwoByteStringUsesPadding) {
+  // As long as the output has a version that Blink expects to be able to read,
+  // we must respect its alignment requirements. It requires that two-byte
+  // characters be aligned.
+  EncodeTest(
+      [this]() {
+        // We need a string whose length will take two bytes to encode, so that
+        // a padding byte is needed to keep the characters aligned. The string
+        // must also have a two-byte character, so that it gets the two-byte
+        // encoding.
+        std::string string(200, ' ');  // 200 + 2 UTF-16 units = 404 bytes.
+        string += kEmojiString;
+        return StringFromUtf8(string.c_str());
+      },
+      [](const std::vector<uint8_t>& data) {
+        // This is a sufficient but not necessary condition to be aligned.
+        // The third byte (0x00) is padding; 0x94 0x03 is the varint for 404.
+        const uint8_t expected_prefix[] = {0xff, 0x09, 0x00, 0x63, 0x94, 0x03};
+        ASSERT_GT(data.size(), sizeof(expected_prefix) / sizeof(uint8_t));
+        EXPECT_TRUE(std::equal(std::begin(expected_prefix),
+                               std::end(expected_prefix), data.begin()));
+      });
+}
 }  // namespace
 }  // namespace v8