Commit ab1b0ed8 authored by Johannes Henkel, committed by Commit Bot

[DevTools] Roll inspector_protocol.

Fixes edge cases for parsing / serializing codepoint 0xffff in JSON
with UTF16.

New Rev: 0213a8545f6362cd1cd5091cedf29747736552e8

Change-Id: I48b174cf1bd9263ace002996094f7143a1248766
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2020502
Auto-Submit: Johannes Henkel <johannes@chromium.org>
Reviewed-by: Dmitry Gozman <dgozman@chromium.org>
Commit-Queue: Dmitry Gozman <dgozman@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65985}
parent cfb2d89a
......@@ -2,7 +2,7 @@ Name: inspector protocol
Short Name: inspector_protocol
URL: https://chromium.googlesource.com/deps/inspector_protocol/
Version: 0
-Revision: a84e91f6696a0b76e1a73286c9c2765154de9889
+Revision: 0213a8545f6362cd1cd5091cedf29747736552e8
License: BSD
License File: LICENSE
Security Critical: no
......
......@@ -249,7 +249,7 @@ class JSONEncoder : public ParserHandler {
// So, now we transcode to UTF16,
// using the math described at https://en.wikipedia.org/wiki/UTF-16,
// for either one or two 16 bit characters.
if (codepoint < 0xffff) {
if (codepoint <= 0xffff) {
Emit("\\u");
PrintHex(static_cast<uint16_t>(codepoint), out_);
continue;
......@@ -755,7 +755,7 @@ class JsonParser {
// So, now we transcode to UTF16,
// using the math described at https://en.wikipedia.org/wiki/UTF-16,
// for either one or two 16 bit characters.
if (codepoint < 0xffff) {
if (codepoint <= 0xffff) {
output->push_back(codepoint);
continue;
}
......
......@@ -81,6 +81,43 @@ TEST(JsonEncoder, NotAContinuationByte) {
EXPECT_EQ("\"Hello\"", out); // "Hello" shows we restarted at 'H'.
}
TEST(JsonEncoder, EscapesLoneHighSurrogates) {
  // Lone high surrogates (code units 0xD800 through 0xDBFF) are not valid
  // code points and have no well-formed unescaped UTF-8 or UTF-16
  // representation, so the encoder is expected to emit each one as a \uXXXX
  // escape. The input covers both ends of the range plus a value in between.
  std::string json;
  Status status;
  std::unique_ptr<ParserHandler> encoder = NewJSONEncoder(&json, &status);
  std::vector<uint16_t> utf16 = {'a', 0xd800, 'b', 0xdada, 'c', 0xdbff, 'd'};
  encoder->HandleString16(span<uint16_t>(utf16.data(), utf16.size()));
  EXPECT_EQ("\"a\\ud800b\\udadac\\udbffd\"", json);
}
TEST(JsonEncoder, EscapesLoneLowSurrogates) {
  // Lone low surrogates (code units 0xDC00 through 0xDFFF) are not valid
  // code points and have no well-formed unescaped UTF-8 or UTF-16
  // representation, so the encoder is expected to emit each one as a \uXXXX
  // escape. The input covers both ends of the range plus a value in between.
  std::string json;
  Status status;
  std::unique_ptr<ParserHandler> encoder = NewJSONEncoder(&json, &status);
  std::vector<uint16_t> utf16 = {'a', 0xdc00, 'b', 0xdede, 'c', 0xdfff, 'd'};
  encoder->HandleString16(span<uint16_t>(utf16.data(), utf16.size()));
  EXPECT_EQ("\"a\\udc00b\\udedec\\udfffd\"", json);
}
TEST(JsonEncoder, EscapesFFFF) {
  // Edge case: the UTF-16 code unit 0xffff is the largest value that fits in
  // a single \u escape, and the encoder must write it as \uffff.
  std::string json;
  Status status;
  std::unique_ptr<ParserHandler> encoder = NewJSONEncoder(&json, &status);
  std::vector<uint16_t> utf16 = {'a', 'b', 'c', 0xffff, 'd'};
  encoder->HandleString16(span<uint16_t>(utf16.data(), utf16.size()));
  EXPECT_EQ("\"abc\\uffffd\"", json);
}
TEST(JsonEncoder, IncompleteUtf8Sequence) {
std::string out;
Status status;
......@@ -248,6 +285,7 @@ class Log : public ParserHandler {
}
// Records one UTF-16 string event in both the raw and the textual log.
void HandleString16(span<uint16_t> chars) override {
  // Keep a copy of the unmodified UTF-16 code units.
  raw_log_string16_.push_back(
      std::vector<uint16_t>(chars.begin(), chars.end()));
  // Append a line with the UTF-8 conversion of the same string.
  log_ << "string16: " << UTF16ToUTF8(chars) << "\n";
}
......@@ -271,10 +309,15 @@ class Log : public ParserHandler {
// Returns the accumulated log text, or an empty string when status_ is not ok.
std::string str() const {
  if (!status_.ok()) return "";
  return log_.str();
}
// Returns a copy of the UTF-16 strings recorded by HandleString16, one inner
// vector per call, in the order the calls were made.
std::vector<std::vector<uint16_t>> raw_log_string16() const {
return raw_log_string16_;
}
// Returns a copy of the stored status; str() yields "" unless it is ok.
Status status() const { return status_; }
private:
// Textual log of handler events; str() returns its contents when status_ is ok.
std::ostringstream log_;
// Unconverted UTF-16 code units of every string passed to HandleString16.
std::vector<std::vector<uint16_t>> raw_log_string16_;
// Status consulted by str() and returned by status().
Status status_;
};
......@@ -395,6 +438,31 @@ TEST_F(JsonParserTest, Unicode_ParseUtf16) {
log_.str());
}
TEST_F(JsonParserTest, Unicode_ParseUtf16_SingleEscapeUpToFFFF) {
  // Code point 0xFFFF is the largest one that fits into a single \u escape.
  // A JSON string may spell it either as the escape \uffff or as the 3 byte
  // UTF-8 sequence 0xef 0xbf 0xbf; the two spellings are equivalent.
  // The input below contains one occurrence of each spelling.
  std::string input = "{\"escape\": \"\xef\xbf\xbf or \\uffff\"}";
  ParseJSON(SpanFrom(input), &log_);
  EXPECT_TRUE(log_.status().ok());

  // Once converted to UTF-8 for the log, both spellings yield the same
  // output.
  EXPECT_EQ(
      "map begin\n"
      "string16: escape\n"
      "string16: \xEF\xBF\xBF or \xEF\xBF\xBF\n"
      "map end\n",
      log_.str());

  // Stronger check on the raw UTF-16 output: the parser must represent
  // \uffff (in either spelling) as a single UTF-16 code unit.
  std::vector<std::vector<uint16_t>> raw_strings = log_.raw_log_string16();
  ASSERT_EQ(2u, raw_strings.size());
  std::vector<uint16_t> want = {0xffff, ' ', 'o', 'r', ' ', 0xffff};
  EXPECT_EQ(want, raw_strings[1]);
}
TEST_F(JsonParserTest, Unicode_ParseUtf8) {
// Used below:
// гласность - example for 2 byte utf8, Russian word "glasnost"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment