Commit 3d5b2f80, authored by jochen and committed by Commit bot

Update UTF-8 decoder to detect more special cases.

The Blink version is stricter, and for parsing it's important that both
decoders behave the same.

BUG=chromium:489944
R=vogelheim@chromium.org
LOG=n

Review URL: https://codereview.chromium.org/1148653007

Cr-Commit-Position: refs/heads/master@{#28601}
parent c52bb1f0
...@@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table, ...@@ -190,71 +190,118 @@ static int LookupMapping(const int32_t* table,
} }
uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { static inline size_t NonASCIISequenceLength(byte first) {
// We only get called for non-ASCII characters. // clang-format off
if (length == 1) { static const uint8_t lengths[256] = {
*cursor += 1; // The first 128 entries correspond to ASCII characters.
return kBadChar; 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
byte first = str[0]; 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
byte second = str[1] ^ 0x80; 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
if (second & 0xC0) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// The following 64 entries correspond to continuation bytes.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// The next are two invalid overlong encodings and 30 two-byte sequences.
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// 16 three-byte sequences.
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// 5 four-byte sequences, followed by sequences that could only encode
// code points outside of the unicode range.
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// clang-format on
return lengths[first];
}
// Reports whether |chr| falls inside the inclusive range 0x80..0xBF, the
// byte values that may follow the lead byte of a multi-byte UTF-8 sequence.
static inline bool IsContinuationCharacter(byte chr) {
  // Expressed as "not outside the range"; equivalent to the direct
  // in-range test for every possible value of |chr|.
  return !(chr < 0x80 || chr > 0xBF);
}
// This method decodes an UTF-8 value according to RFC 3629.
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
size_t length = NonASCIISequenceLength(str[0]);
if (length == 0 || max_length < length) {
*cursor += 1; *cursor += 1;
return kBadChar; return kBadChar;
} }
if (first < 0xE0) { if (length == 2) {
if (first < 0xC0) { if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
if (code_point <= kMaxOneByteChar) {
*cursor += 1; *cursor += 1;
return kBadChar; return kBadChar;
} }
*cursor += 2; *cursor += 2;
return code_point; return ((str[0] << 6) + str[1]) - 0x00003080;
} }
if (length == 2) { if (length == 3) {
*cursor += 1; switch (str[0]) {
return kBadChar; case 0xE0:
} // Overlong three-byte sequence.
byte third = str[2] ^ 0x80; if (str[1] < 0xA0 || str[1] > 0xBF) {
if (third & 0xC0) { *cursor += 1;
*cursor += 1; return kBadChar;
return kBadChar; }
} break;
if (first < 0xF0) { case 0xED:
uchar code_point = ((((first << 6) | second) << 6) | third) // High and low surrogate halves.
& kMaxThreeByteChar; if (str[1] < 0x80 || str[1] > 0x9F) {
if (code_point <= kMaxTwoByteChar) { *cursor += 1;
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
}
if (!IsContinuationCharacter(str[2])) {
*cursor += 1; *cursor += 1;
return kBadChar; return kBadChar;
} }
*cursor += 3; *cursor += 3;
return code_point; return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
} }
if (length == 3) { DCHECK(length == 4);
switch (str[0]) {
case 0xF0:
// Overlong four-byte sequence.
if (str[1] < 0x90 || str[1] > 0xBF) {
*cursor += 1;
return kBadChar;
}
break;
case 0xF4:
// Code points outside of the unicode range.
if (str[1] < 0x80 || str[1] > 0x8F) {
*cursor += 1;
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
}
if (!IsContinuationCharacter(str[2])) {
*cursor += 1; *cursor += 1;
return kBadChar; return kBadChar;
} }
byte fourth = str[3] ^ 0x80; if (!IsContinuationCharacter(str[3])) {
if (fourth & 0xC0) {
*cursor += 1; *cursor += 1;
return kBadChar; return kBadChar;
} }
if (first < 0xF8) { *cursor += 4;
uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
& kMaxFourByteChar; 0x03C82080;
if (code_point <= kMaxThreeByteChar) {
*cursor += 1;
return kBadChar;
}
*cursor += 4;
return code_point;
}
*cursor += 1;
return kBadChar;
} }
......
...@@ -7267,84 +7267,6 @@ static void Utf16Helper( ...@@ -7267,84 +7267,6 @@ static void Utf16Helper(
} }
// Returns the 16-bit value that the internal string behind |str| reports for
// position |index|.
static uint16_t StringGet(Handle<String> str, int index) {
  // Unwrap the API handle to reach the internal string's Get() accessor.
  i::Handle<i::String> internal_string =
      v8::Utils::OpenHandle(String::Cast(*str));
  return internal_string->Get(index);
}
// Exercises String::WriteUtf8 on the first |len| strings of the global array
// called |name|. The global array called |lengths_name| supplies, per string,
// the expected UTF-8 byte length (excluding any terminating NUL).
//
// For each string every buffer capacity from utf8_length + 1 down to 0 is
// tried twice: once with NO_OPTIONS (into |buffer|) and once with
// NO_NULL_TERMINATION (into |buffer2|). Both buffers are pre-filled with the
// byte 42 so that untouched bytes can be recognised afterwards.
static void WriteUtf8Helper(
    LocalContext& context,  // NOLINT
    const char* name,
    const char* lengths_name,
    int len) {
  Local<v8::Array> b =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
  Local<v8::Array> alens =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
  char buffer[1000];
  char buffer2[1000];
  for (int i = 0; i < len; i++) {
    Local<v8::String> string =
        Local<v8::String>::Cast(b->Get(i));
    Local<v8::Number> expected_len =
        Local<v8::Number>::Cast(alens->Get(i));
    int utf8_length = static_cast<int>(expected_len->Value());
    // Shrink the capacity one byte at a time, starting with exactly enough
    // room for the whole string plus one extra byte.
    for (int j = utf8_length + 1; j >= 0; j--) {
      // Reset both buffers to the filler byte before every pair of writes.
      memset(buffer, 42, sizeof(buffer));
      memset(buffer2, 42, sizeof(buffer2));
      // Note: both calls store into |nchars|; the checks below therefore see
      // the value reported by the NO_NULL_TERMINATION call.
      int nchars;
      int utf8_written =
          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
      int utf8_written2 =
          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
      CHECK_GE(utf8_length + 1, utf8_written);
      CHECK_GE(utf8_length, utf8_written2);
      // Everything the unterminated write produced must match the start of
      // the NO_OPTIONS write.
      for (int k = 0; k < utf8_written2; k++) {
        CHECK_EQ(buffer[k], buffer2[k]);
      }
      CHECK(nchars * 3 >= utf8_written - 1);
      CHECK(nchars <= utf8_written);
      if (j == utf8_length + 1) {
        // Full capacity: the whole string fits, and the NO_OPTIONS write is
        // exactly one byte longer than the unterminated one.
        CHECK_EQ(utf8_written2, utf8_length);
        CHECK_EQ(utf8_written2 + 1, utf8_written);
      }
      // The byte just past the reported length must still be filler.
      CHECK_EQ(buffer[utf8_written], 42);
      if (j > utf8_length) {
        // Enough room for the terminator: the last written byte is NUL and
        // the buffer round-trips to an equal string.
        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
        Handle<String> roundtrip = v8_str(buffer);
        CHECK(roundtrip->Equals(string));
      } else {
        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
      }
      // The unterminated write must also end on a written byte rather than on
      // filler. The guard and the index both refer to that write (buffer2 /
      // utf8_written2); previously the index used buffer / utf8_written,
      // which only repeated the check already made above.
      if (utf8_written2 != 0) CHECK_NE(buffer2[utf8_written2 - 1], 42);
      if (nchars >= 2) {
        uint16_t trail = StringGet(string, nchars - 1);
        uint16_t lead = StringGet(string, nchars - 2);
        // If the last two code units written form a lead/trail surrogate
        // pair, the final four output bytes must be a single 4-byte UTF-8
        // sequence encoding the combined code point.
        if (((lead & 0xfc00) == 0xd800) &&
            ((trail & 0xfc00) == 0xdc00)) {
          unsigned u1 = buffer2[utf8_written2 - 4];
          unsigned u2 = buffer2[utf8_written2 - 3];
          unsigned u3 = buffer2[utf8_written2 - 2];
          unsigned u4 = buffer2[utf8_written2 - 1];
          // Lead byte 11110xxx followed by three 10xxxxxx bytes.
          CHECK_EQ((u1 & 0xf8), 0xf0u);
          CHECK_EQ((u2 & 0xc0), 0x80u);
          CHECK_EQ((u3 & 0xc0), 0x80u);
          CHECK_EQ((u4 & 0xc0), 0x80u);
          // Code point reconstructed from the surrogate pair; its bits must
          // match the payload bits of the four bytes.
          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
          CHECK_EQ((u1 & 0x3), c >> 18);
        }
      }
    }
  }
}
THREADED_TEST(Utf16) { THREADED_TEST(Utf16) {
LocalContext context; LocalContext context;
v8::HandleScope scope(context->GetIsolate()); v8::HandleScope scope(context->GetIsolate());
...@@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) { ...@@ -7391,9 +7313,6 @@ THREADED_TEST(Utf16) {
"}"); "}");
Utf16Helper(context, "a", "alens", 9); Utf16Helper(context, "a", "alens", 9);
Utf16Helper(context, "a2", "a2lens", 81); Utf16Helper(context, "a2", "a2lens", 81);
WriteUtf8Helper(context, "b", "alens", 9);
WriteUtf8Helper(context, "b2", "a2lens", 81);
WriteUtf8Helper(context, "c2", "a2lens", 81);
} }
...@@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle<String> s1, Handle<String> s2) { ...@@ -7403,15 +7322,6 @@ static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
return *is1 == *is2; return *is1 == *is2;
} }
// Creates internalized strings from the UTF-8 inputs |a| and |b| (in that
// order) and CHECKs that SameSymbol() returns true for the pair.
static void SameSymbolHelper(v8::Isolate* isolate, const char* a,
                             const char* b) {
  Handle<String> from_a =
      v8::String::NewFromUtf8(isolate, a, v8::String::kInternalizedString);
  Handle<String> from_b =
      v8::String::NewFromUtf8(isolate, b, v8::String::kInternalizedString);
  CHECK(SameSymbol(from_a, from_b));
}
THREADED_TEST(Utf16Symbol) { THREADED_TEST(Utf16Symbol) {
LocalContext context; LocalContext context;
...@@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) { ...@@ -7423,18 +7333,6 @@ THREADED_TEST(Utf16Symbol) {
context->GetIsolate(), "abc", v8::String::kInternalizedString); context->GetIsolate(), "abc", v8::String::kInternalizedString);
CHECK(SameSymbol(symbol1, symbol2)); CHECK(SameSymbol(symbol1, symbol2));
SameSymbolHelper(context->GetIsolate(),
"\360\220\220\205", // 4 byte encoding.
"\355\240\201\355\260\205"); // 2 3-byte surrogates.
SameSymbolHelper(context->GetIsolate(),
"\355\240\201\355\260\206", // 2 3-byte surrogates.
"\360\220\220\206"); // 4 byte encoding.
SameSymbolHelper(context->GetIsolate(),
"x\360\220\220\205", // 4 byte encoding.
"x\355\240\201\355\260\205"); // 2 3-byte surrogates.
SameSymbolHelper(context->GetIsolate(),
"x\355\240\201\355\260\206", // 2 3-byte surrogates.
"x\360\220\220\206"); // 4 byte encoding.
CompileRun( CompileRun(
"var sym0 = 'benedictus';" "var sym0 = 'benedictus';"
"var sym0b = 'S\303\270ren';" "var sym0b = 'S\303\270ren';"
......
...@@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) { ...@@ -699,18 +699,22 @@ TEST(Utf8CharacterStream) {
char buffer[kAllUtf8CharsSizeU]; char buffer[kAllUtf8CharsSizeU];
unsigned cursor = 0; unsigned cursor = 0;
for (int i = 0; i <= kMaxUC16Char; i++) { for (int i = 0; i <= kMaxUC16Char; i++) {
cursor += unibrow::Utf8::Encode(buffer + cursor, cursor += unibrow::Utf8::Encode(buffer + cursor, i,
i, unibrow::Utf16::kNoPreviousCharacter, true);
unibrow::Utf16::kNoPreviousCharacter);
} }
DCHECK(cursor == kAllUtf8CharsSizeU); DCHECK(cursor == kAllUtf8CharsSizeU);
i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer), i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
kAllUtf8CharsSizeU); kAllUtf8CharsSizeU);
int32_t bad = unibrow::Utf8::kBadChar;
for (int i = 0; i <= kMaxUC16Char; i++) { for (int i = 0; i <= kMaxUC16Char; i++) {
CHECK_EQU(i, stream.pos()); CHECK_EQU(i, stream.pos());
int32_t c = stream.Advance(); int32_t c = stream.Advance();
CHECK_EQ(i, c); if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else {
CHECK_EQ(i, c);
}
CHECK_EQU(i + 1, stream.pos()); CHECK_EQU(i + 1, stream.pos());
} }
for (int i = kMaxUC16Char; i >= 0; i--) { for (int i = kMaxUC16Char; i >= 0; i--) {
...@@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) { ...@@ -724,7 +728,9 @@ TEST(Utf8CharacterStream) {
int progress = static_cast<int>(stream.SeekForward(12)); int progress = static_cast<int>(stream.SeekForward(12));
i += progress; i += progress;
int32_t c = stream.Advance(); int32_t c = stream.Advance();
if (i <= kMaxUC16Char) { if (i >= 0xd800 && i <= 0xdfff) {
CHECK_EQ(bad, c);
} else if (i <= kMaxUC16Char) {
CHECK_EQ(i, c); CHECK_EQ(i, c);
} else { } else {
CHECK_EQ(-1, c); CHECK_EQ(-1, c);
...@@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) { ...@@ -913,6 +919,15 @@ static int Utf8LengthHelper(const char* s) {
// Record a single kBadChar for the first byte and continue. // Record a single kBadChar for the first byte and continue.
continue; continue;
} }
if (c == 0xed) {
unsigned char d = s[i + 1];
if ((d < 0x80) || (d > 0x9f)) {
// This 3 byte sequence is part of a surrogate pair which is not
// supported by UTF-8. Record a single kBadChar for the first byte
// and continue.
continue;
}
}
input_offset = 2; input_offset = 2;
// 3 bytes of UTF-8 turn into 1 UTF-16 code unit. // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
output_adjust = 2; output_adjust = 2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment