Commit f130bfd3 authored by Marja Hölttä, committed by Commit Bot

[unicode] Fix overlong / surrogate sequences detection some more.

Follow up to https://chromium-review.googlesource.com/671020

We still didn't return the correct number of invalid characters, according to
the Encoding spec ( https://encoding.spec.whatwg.org/#utf-8-decoder ), when we
saw a byte sequence which was the start of an overlong / invalid sequence, but
there weren't enough continuation bytes.

A more rigorous test will follow in
https://chromium-review.googlesource.com/c/v8/v8/+/681217

BUG=chromium:765608

Change-Id: I535670edc14d3bae144e5a9ca373f12eec78a934
Reviewed-on: https://chromium-review.googlesource.com/681674
Commit-Queue: Marja Hölttä <marja@chromium.org>
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org>
Reviewed-by: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#48165}
parent dc7b2b2b
......@@ -230,6 +230,8 @@ static inline bool IsContinuationCharacter(byte chr) {
// This method decodes an UTF-8 value according to RFC 3629 and
// https://encoding.spec.whatwg.org/#utf-8-decoder .
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
DCHECK_GT(str[0], kMaxOneByteChar);
size_t length = NonASCIISequenceLength(str[0]);
// Check continuation characters.
......@@ -238,39 +240,46 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
while (count < max_count && IsContinuationCharacter(str[count])) {
count++;
}
*cursor += count;
// There must be enough continuation characters.
if (count != length) return kBadChar;
if (length >= 3 && count < 2) {
// Not enough continuation bytes to check overlong sequences.
*cursor += 1;
return kBadChar;
}
// Check overly long sequences & other conditions.
if (length == 3) {
if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
// Overlong three-byte sequence? The first byte generates a kBadChar.
*cursor -= 2;
*cursor += 1;
return kBadChar;
} else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
// High and low surrogate halves? The first byte generates a kBadChar.
*cursor -= 2;
*cursor += 1;
return kBadChar;
}
} else if (length == 4) {
if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
// Overlong four-byte sequence. The first byte generates a kBadChar.
*cursor -= 3;
*cursor += 1;
return kBadChar;
} else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
// Code points outside of the unicode range. The first byte generates a
// kBadChar.
*cursor -= 3;
*cursor += 1;
return kBadChar;
}
}
*cursor += count;
if (count != length) {
// Not enough continuation characters.
return kBadChar;
}
// All errors have been handled, so we only have to assemble the result.
switch (length) {
case 1:
return str[0];
case 2:
return ((str[0] << 6) + str[1]) - 0x00003080;
case 3:
......
......@@ -8544,6 +8544,17 @@ THREADED_TEST(OverlongSequencesAndSurrogates) {
"X\xed\xa0\x80Y\0",
// Invalid 4-bytes sequence (value out of range).
"X\xf4\x90\x80\x80Y\0",
// Start of an overlong 3-byte sequence but not enough continuation bytes.
"X\xe0\x9fY\0",
// Start of an overlong 4-byte sequence but not enough continuation bytes.
"X\xf0\x89\xbfY\0",
// Start of an invalid 3-byte sequence (reserved for surrogates) but not
// enough continuation bytes.
"X\xed\xa0Y\0",
// Start of an invalid 4-bytes sequence (value out of range) but not
// enough continuation bytes.
"X\xf4\x90\x80Y\0",
};
const std::vector<std::vector<uint16_t>> unicode_expected = {
{0x58, 0xfffd, 0xfffd, 0x59},
......@@ -8552,6 +8563,10 @@ THREADED_TEST(OverlongSequencesAndSurrogates) {
{0x58, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0x59},
{0x58, 0xfffd, 0xfffd, 0xfffd, 0x59},
};
CHECK_EQ(unicode_expected.size(), arraysize(cases));
TestUtf8DecodingAgainstReference(cases, unicode_expected);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment