Commit fd40ebb1 authored by vogelheim, committed by Commit bot

Return kBadChar for longest subpart of incomplete utf-8 character.

This brings the two UTF-8 decoders (bulk + incremental) in line with
each other. Technically, either behaviour was correct, since the UTF-8
spec demands that incomplete UTF-8 be handled, but does not specify how.
Unicode recommends that "the maximal subpart at that offset
should be replaced by a single U+FFFD," and with this change we
consistently do that. More details + spec references are in the bug.

BUG=chromium:662822

Review-Url: https://codereview.chromium.org/2493143003
Cr-Commit-Position: refs/heads/master@{#41025}
parent 0188c3fb
...@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) { ...@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) {
// This method decodes an UTF-8 value according to RFC 3629. // This method decodes an UTF-8 value according to RFC 3629.
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
size_t length = NonASCIISequenceLength(str[0]); size_t length = NonASCIISequenceLength(str[0]);
if (length == 0 || max_length < length) {
*cursor += 1; // Check continuation characters.
return kBadChar; size_t max_count = std::min(length, max_length);
} size_t count = 1;
if (length == 2) { while (count < max_count && IsContinuationCharacter(str[count])) {
if (!IsContinuationCharacter(str[1])) { count++;
*cursor += 1;
return kBadChar;
}
*cursor += 2;
return ((str[0] << 6) + str[1]) - 0x00003080;
} }
// Check overly long sequences & other conditions. Use length as error
// indicator.
if (length == 3) { if (length == 3) {
switch (str[0]) { if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
case 0xE0: // Overlong three-byte sequence?
// Overlong three-byte sequence. length = 0;
if (str[1] < 0xA0 || str[1] > 0xBF) { } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
*cursor += 1; // High and low surrogate halves?
return kBadChar; length = 0;
}
break;
case 0xED:
// High and low surrogate halves.
if (str[1] < 0x80 || str[1] > 0x9F) {
*cursor += 1;
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
}
if (!IsContinuationCharacter(str[2])) {
*cursor += 1;
return kBadChar;
} }
*cursor += 3; } else if (length == 4) {
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
}
DCHECK(length == 4);
switch (str[0]) {
case 0xF0:
// Overlong four-byte sequence. // Overlong four-byte sequence.
if (str[1] < 0x90 || str[1] > 0xBF) { length = 0;
*cursor += 1; } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
return kBadChar;
}
break;
case 0xF4:
// Code points outside of the unicode range. // Code points outside of the unicode range.
if (str[1] < 0x80 || str[1] > 0x8F) { length = 0;
*cursor += 1; }
return kBadChar;
}
break;
default:
if (!IsContinuationCharacter(str[1])) {
*cursor += 1;
return kBadChar;
}
} }
if (!IsContinuationCharacter(str[2])) {
*cursor += 1; if (count != length) {
// All invalid encodings should land here.
*cursor += count;
return kBadChar; return kBadChar;
} }
if (!IsContinuationCharacter(str[3])) {
*cursor += 1; // All errors have been handled, so we only have to assemble the result.
return kBadChar; *cursor += length;
switch (length) {
case 1:
return str[0];
case 2:
return ((str[0] << 6) + str[1]) - 0x00003080;
case 3:
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
case 4:
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
0x03C82080;
} }
*cursor += 4;
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - UNREACHABLE();
0x03C82080; return kBadChar;
} }
uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
...@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
// with one shift. // with one shift.
uint8_t mask = 0x7f >> kind; uint8_t mask = 0x7f >> kind;
// Store the kind - 1 (i.e., remaining bytes) in the top byte, value // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
// in the bottom three. // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
*buffer = (kind - 1) << 24 | (next & mask); // intended as a counter about how many bytes are still needed.
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
return kIncomplete; return kIncomplete;
} else { } else {
// No buffer, and not the start of a 1-byte char (handled at the // No buffer, and not the start of a 1-byte char (handled at the
...@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
// We're inside of a character, as described by buffer. // We're inside of a character, as described by buffer.
// How many bytes (excluding this one) do we still expect? // How many bytes (excluding this one) do we still expect?
uint8_t count = (*buffer >> 24) - 1; uint8_t bytes_expected = *buffer >> 28;
uint8_t bytes_left = (*buffer >> 24) & 0x0f;
bytes_left--;
// Update the value. // Update the value.
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
if (count) { if (bytes_left) {
*buffer = count << 24 | value; *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
return kIncomplete; return kIncomplete;
} else { } else {
*buffer = 0; *buffer = 0;
return value; bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
(bytes_expected == 3 && value < 0x800);
return sequence_was_too_long ? kBadChar : value;
} }
} else { } else {
// Within a character, but not a continuation character? Then the // Within a character, but not a continuation character? Then the
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment