Commit fcb89f55, authored by Marja Hölttä, committed by Commit Bot

[unicode] Add tests for UTF-8 decoders + minor cleanups.

Verify that both UTF-8 decoders (the incremental and the non-incremental one)
match the expectations.

Also clean up / harden the UTF-8 handling code, as suggested in
https://chromium-review.googlesource.com/c/v8/v8/+/671020/.


BUG=chromium:765608

Change-Id: I6344d62ca15b75ac8e333421c94c4aa35ab8190d
Reviewed-on: https://chromium-review.googlesource.com/681217
Commit-Queue: Marja Hölttä <marja@chromium.org>
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Cr-Commit-Position: refs/heads/master@{#48229}
parent 8b749bf9
...@@ -197,27 +197,27 @@ static inline uint8_t NonASCIISequenceLength(byte first) { ...@@ -197,27 +197,27 @@ static inline uint8_t NonASCIISequenceLength(byte first) {
// clang-format off // clang-format off
static const uint8_t lengths[256] = { static const uint8_t lengths[256] = {
// The first 128 entries correspond to ASCII characters. // The first 128 entries correspond to ASCII characters.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00 - 0f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10 - 1f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30 - 3f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50 - 5f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70 - 7f */
// The following 64 entries correspond to continuation bytes. // The following 64 entries correspond to continuation bytes.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 - 8f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 - 9f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 - af */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 - bf */
// The next are two invalid overlong encodings and 30 two-byte sequences. // The next are two invalid overlong encodings and 30 two-byte sequences.
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0-c1 + c2-cf */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0-df */
// 16 three-byte sequences. // 16 three-byte sequences.
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* e0-ef */
// 5 four-byte sequences, followed by sequences that could only encode // 5 four-byte sequences, followed by sequences that could only encode
// code points outside of the unicode range. // code points outside of the unicode range.
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */
// clang-format on // clang-format on
return lengths[first]; return lengths[first];
} }
...@@ -322,7 +322,8 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -322,7 +322,8 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
if (*buffer == 0) { if (*buffer == 0) {
// We're at the start of a new character. // We're at the start of a new character.
uint32_t kind = NonASCIISequenceLength(next); uint32_t kind = NonASCIISequenceLength(next);
if (kind >= 2 && kind <= 4) { CHECK_LE(kind, 4);
if (kind >= 2) {
// Start of 2..4 byte character, and no buffer. // Start of 2..4 byte character, and no buffer.
// The mask for the lower bits depends on the kind, and is // The mask for the lower bits depends on the kind, and is
...@@ -333,7 +334,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { ...@@ -333,7 +334,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
// Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes) // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
// in 2nd nibble, and the value in the bottom three. The 2nd nibble is // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
// intended as a counter about how many bytes are still needed. // intended as a counter about how many bytes are still needed.
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask); uint32_t character_info = kind << 28 | (kind - 1) << 24;
DCHECK_EQ(character_info & mask, 0);
*buffer = character_info | (next & mask);
return kIncomplete; return kIncomplete;
} else { } else {
// No buffer, and not the start of a 1-byte char (handled at the // No buffer, and not the start of a 1-byte char (handled at the
......
...@@ -127,8 +127,7 @@ class Utf16 { ...@@ -127,8 +127,7 @@ class Utf16 {
} }
}; };
class V8_EXPORT_PRIVATE Utf8 {
class Utf8 {
public: public:
static inline uchar Length(uchar chr, int previous); static inline uchar Length(uchar chr, int previous);
static inline unsigned EncodeOneByte(char* out, uint8_t c); static inline unsigned EncodeOneByte(char* out, uint8_t c);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment