Commit fcb89f55, authored by Marja Hölttä, committed by Commit Bot

[unicode] Add tests for UTF-8 decoders + minor cleanups.

Verify that both UTF-8 decoders (the incremental and the non-incremental one)
produce the expected output.

Also clean up / harden the UTF-8 handling code, as suggested in
https://chromium-review.googlesource.com/c/v8/v8/+/671020/ .


BUG=chromium:765608

Change-Id: I6344d62ca15b75ac8e333421c94c4aa35ab8190d
Reviewed-on: https://chromium-review.googlesource.com/681217
Commit-Queue: Marja Hölttä <marja@chromium.org>
Reviewed-by: Camillo Bruni <cbruni@chromium.org>
Cr-Commit-Position: refs/heads/master@{#48229}
parent 8b749bf9
......@@ -197,27 +197,27 @@ static inline uint8_t NonASCIISequenceLength(byte first) {
// clang-format off
static const uint8_t lengths[256] = {
// The first 128 entries correspond to ASCII characters.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00 - 0f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10 - 1f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30 - 3f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50 - 5f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70 - 7f */
// The following 64 entries correspond to continuation bytes.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 - 8f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 - 9f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 - af */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 - bf */
// The next are two invalid overlong encodings and 30 two-byte sequences.
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0-c1 + c2-cf */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0-df */
// 16 three-byte sequences.
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* e0-ef */
// 5 four-byte sequences, followed by sequences that could only encode
// code points outside of the unicode range.
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */
// clang-format on
return lengths[first];
}
......@@ -322,7 +322,8 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
if (*buffer == 0) {
// We're at the start of a new character.
uint32_t kind = NonASCIISequenceLength(next);
if (kind >= 2 && kind <= 4) {
CHECK_LE(kind, 4);
if (kind >= 2) {
// Start of 2..4 byte character, and no buffer.
// The mask for the lower bits depends on the kind, and is
......@@ -333,7 +334,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
// Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
// in 2nd nibble, and the value in the bottom three. The 2nd nibble is
// intended as a counter about how many bytes are still needed.
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
uint32_t character_info = kind << 28 | (kind - 1) << 24;
DCHECK_EQ(character_info & mask, 0);
*buffer = character_info | (next & mask);
return kIncomplete;
} else {
// No buffer, and not the start of a 1-byte char (handled at the
......
......@@ -127,8 +127,7 @@ class Utf16 {
}
};
class Utf8 {
class V8_EXPORT_PRIVATE Utf8 {
public:
static inline uchar Length(uchar chr, int previous);
static inline unsigned EncodeOneByte(char* out, uint8_t c);
......
......@@ -4,8 +4,10 @@
#include <memory>
#include <string>
#include <vector>
#include "src/unicode-decoder.h"
#include "src/unicode-inl.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace v8 {
......@@ -23,6 +25,30 @@ void Decode(Utf8Decoder* decoder, const std::string& str) {
decoder->Reset(buffer.get(), str.length());
}
// Decodes |bytes| with the non-incremental decoder (unibrow::Utf8::ValueOf)
// and appends every value it returns to |output|.
//
// |bytes|  - the raw input to decode.
// |output| - receives one entry per ValueOf call; must not be null.
//
// ValueOf is handed a pointer that has already been advanced by |cursor|, so
// the length passed alongside it must be the number of bytes remaining from
// that pointer, not the total size of the input. Passing the total size would
// let the decoder believe more bytes are available than the vector holds.
// NOTE(review): assumes ValueOf interprets its length argument relative to the
// pointer it receives and advances |cursor| by the bytes it consumed - confirm
// against the declaration in src/unicode.h.
void DecodeNormally(const std::vector<byte>& bytes,
                    std::vector<unibrow::uchar>* output) {
  size_t cursor = 0;
  while (cursor < bytes.size()) {
    const size_t remaining = bytes.size() - cursor;
    output->push_back(
        unibrow::Utf8::ValueOf(bytes.data() + cursor, remaining, &cursor));
  }
}
void DecodeIncrementally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) {
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
for (auto b : bytes) {
unibrow::uchar result = unibrow::Utf8::ValueOfIncremental(b, &buffer);
if (result != unibrow::Utf8::kIncomplete) {
output->push_back(result);
}
}
unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&buffer);
if (result != unibrow::Utf8::kBufferEmpty) {
output->push_back(result);
}
}
} // namespace
TEST(UnicodeTest, ReadOffEndOfUtf8String) {
......@@ -35,5 +61,382 @@ TEST(UnicodeTest, ReadOffEndOfUtf8String) {
Decode(&decoder, "\xF4");
}
TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
// Unfortunately, V8 has two UTF-8 decoders. This test checks that they
// produce the same result. This test was inspired by
// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt .
typedef struct {
std::vector<byte> bytes;
std::vector<unibrow::uchar> unicode_expected;
} TestCase;
TestCase data[] = {
// Correct UTF-8 text.
{{0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5},
{0x3ba, 0x1f79, 0x3c3, 0x3bc, 0x3b5}},
// First possible sequence of a certain length:
// 1 byte
{{0x00}, {0x0}},
// 2 bytes
{{0xc2, 0x80}, {0x80}},
// 3 bytes
{{0xe0, 0xa0, 0x80}, {0x800}},
// 4 bytes
{{0xf0, 0x90, 0x80, 0x80}, {0x10000}},
// 5 bytes (not supported)
{{0xf8, 0x88, 0x80, 0x80, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6 bytes (not supported)
{{0xfc, 0x84, 0x80, 0x80, 0x80, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Last possible sequence of certain length:
// 1 byte
{{0x7f}, {0x7f}},
// 2 bytes
{{0xdf, 0xbf}, {0x7ff}},
// 3 bytes
{{0xef, 0xbf, 0xbf}, {0xffff}},
// 4 bytes (this sequence is not a valid code point)
{{0xf7, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 5 bytes (not supported)
{{0xfb, 0xbf, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6 bytes (not supported)
{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Other boundary conditions:
{{0xed, 0x9f, 0xbf}, {0xd7ff}},
{{0xee, 0x80, 0x80}, {0xe000}},
// U+fffd (the replacement character itself; it decodes normally)
{{0xef, 0xbf, 0xbd}, {0xfffd}},
// U+10ffff (last valid code point)
{{0xf4, 0x8f, 0xbf, 0xbf}, {0x10ffff}},
// First invalid (too large) code point
{{0xf4, 0x90, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Malformed sequences:
// Unexpected continuation bytes:
// First continuation byte
{{0x80}, {0xfffd}},
// Last continuation byte
{{0xbf}, {0xfffd}},
// 2 continuation bytes
{{0x80, 0xbf}, {0xfffd, 0xfffd}},
// 3 continuation bytes
{{0x80, 0xbf, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
// 4 continuation bytes
{{0x80, 0xbf, 0x80, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 5 continuation bytes
{{0x80, 0xbf, 0x80, 0xbf, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6 continuation bytes
{{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 7 continuation bytes
{{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Sequence of all 64 possible continuation bytes
{{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0,
0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Using each possible continuation byte in a two-byte sequence:
{{0xd0, 0x80, 0xd0, 0x81, 0xd0, 0x82, 0xd0, 0x83, 0xd0, 0x84, 0xd0, 0x85,
0xd0, 0x86, 0xd0, 0x87, 0xd0, 0x88, 0xd0, 0x89, 0xd0, 0x8a, 0xd0, 0x8b,
0xd0, 0x8c, 0xd0, 0x8d, 0xd0, 0x8e, 0xd0, 0x8f, 0xd0, 0x90, 0xd0, 0x91,
0xd0, 0x92, 0xd0, 0x93, 0xd0, 0x94, 0xd0, 0x95, 0xd0, 0x96, 0xd0, 0x97,
0xd0, 0x98, 0xd0, 0x99, 0xd0, 0x9a, 0xd0, 0x9b, 0xd0, 0x9c, 0xd0, 0x9d,
0xd0, 0x9e, 0xd0, 0x9f, 0xd0, 0xa0, 0xd0, 0xa1, 0xd0, 0xa2, 0xd0, 0xa3,
0xd0, 0xa4, 0xd0, 0xa5, 0xd0, 0xa6, 0xd0, 0xa7, 0xd0, 0xa8, 0xd0, 0xa9,
0xd0, 0xaa, 0xd0, 0xab, 0xd0, 0xac, 0xd0, 0xad, 0xd0, 0xae, 0xd0, 0xaf,
0xd0, 0xb0, 0xd0, 0xb1, 0xd0, 0xb2, 0xd0, 0xb3, 0xd0, 0xb4, 0xd0, 0xb5,
0xd0, 0xb6, 0xd0, 0xb7, 0xd0, 0xb8, 0xd0, 0xb9, 0xd0, 0xba, 0xd0, 0xbb,
0xd0, 0xbc, 0xd0, 0xbd, 0xd0, 0xbe, 0xd0, 0xbf},
{0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409,
0x40a, 0x40b, 0x40c, 0x40d, 0x40e, 0x40f, 0x410, 0x411, 0x412, 0x413,
0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d,
0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431,
0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43a, 0x43b,
0x43c, 0x43d, 0x43e, 0x43f}},
// Lonely first bytes:
// All 32 first bytes of 2-byte sequences, each followed by a space
// (generates 32 invalid char + space sequences):
{{0xc0, 0x20, 0xc1, 0x20, 0xc2, 0x20, 0xc3, 0x20, 0xc4, 0x20, 0xc5,
0x20, 0xc6, 0x20, 0xc7, 0x20, 0xc8, 0x20, 0xc9, 0x20, 0xca, 0x20,
0xcb, 0x20, 0xcc, 0x20, 0xcd, 0x20, 0xce, 0x20, 0xcf, 0x20, 0xd0,
0x20, 0xd1, 0x20, 0xd2, 0x20, 0xd3, 0x20, 0xd4, 0x20, 0xd5, 0x20,
0xd6, 0x20, 0xd7, 0x20, 0xd8, 0x20, 0xd9, 0x20, 0xda, 0x20, 0xdb,
0x20, 0xdc, 0x20, 0xdd, 0x20, 0xde, 0x20, 0xdf, 0x20},
{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20}},
// All 16 first bytes of 3-byte sequences, each followed by a space
// (generates 16 invalid char + space sequences):
{{0xe0, 0x20, 0xe1, 0x20, 0xe2, 0x20, 0xe3, 0x20, 0xe4, 0x20, 0xe5,
0x20, 0xe6, 0x20, 0xe7, 0x20, 0xe8, 0x20, 0xe9, 0x20, 0xea, 0x20,
0xeb, 0x20, 0xec, 0x20, 0xed, 0x20, 0xee, 0x20, 0xef, 0x20},
{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
// All 8 first bytes of 4-byte sequences, each followed by a space
// (generates 8 invalid char + space sequences):
{{0xf0, 0x20, 0xf1, 0x20, 0xf2, 0x20, 0xf3, 0x20, 0xf4, 0x20, 0xf5, 0x20,
0xf6, 0x20, 0xf7, 0x20},
{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
// All 4 first bytes of 5-byte sequences (not supported), each followed by
// a space (generates 4 invalid char + space sequences):
{{0xf8, 0x20, 0xf9, 0x20, 0xfa, 0x20, 0xfb, 0x20},
{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
// All 2 first bytes of 6-byte sequences (not supported), each followed by
// a space (generates 2 invalid char + space sequences):
{{0xfc, 0x20, 0xfd, 0x20}, {0xfffd, 0x20, 0xfffd, 0x20}},
// Sequences with last continuation byte missing. Normally the whole
// incomplete sequence generates a single invalid character (exceptions
// explained below).
// 2-byte sequences with last byte missing
{{0xc0}, {0xfffd}},
{{0xdf}, {0xfffd}},
// 3-byte sequences with last byte missing.
{{0xe8, 0x80}, {0xfffd}},
{{0xe0, 0xbf}, {0xfffd}},
{{0xef, 0xbf}, {0xfffd}},
// Start of an overlong sequence. The first "maximal subpart" is the first
// byte; it creates an invalid character. Each following byte generates an
// invalid character too.
{{0xe0, 0x80}, {0xfffd, 0xfffd}},
// 4-byte sequences with last byte missing
{{0xf1, 0x80, 0x80}, {0xfffd}},
{{0xf4, 0x8f, 0xbf}, {0xfffd}},
// Start of an overlong sequence. The first "maximal subpart" is the first
// byte; it creates an invalid character. Each following byte generates an
// invalid character too.
{{0xf0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
// 5-byte sequences (not supported) with last byte missing
{{0xf8, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfb, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6-byte sequences (not supported) with last byte missing
{{0xfc, 0x80, 0x80, 0x80, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Concatenation of incomplete sequences: above incomplete sequences
// concatenated.
{{0xc0, 0xdf, 0xe8, 0x80, 0xe0, 0xbf, 0xef, 0xbf, 0xe0, 0x80,
0xf1, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xf0, 0x80, 0x80, 0xf8,
0x80, 0x80, 0x80, 0xfb, 0xbf, 0xbf, 0xbf, 0xfc, 0x80, 0x80,
0x80, 0x80, 0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Incomplete sequence tests repeated with a space after the incomplete
// sequence.
// 2-byte sequences with last byte missing
{{0xc0, 0x20}, {0xfffd, 0x20}},
{{0xdf, 0x20}, {0xfffd, 0x20}},
// 3-byte sequences with last byte missing
{{0xe8, 0x80, 0x20}, {0xfffd, 0x20}},
{{0xe0, 0xbf, 0x20}, {0xfffd, 0x20}},
{{0xef, 0xbf, 0x20}, {0xfffd, 0x20}},
// Start of overlong 3-byte sequence with last byte missing
{{0xe0, 0x80, 0x20}, {0xfffd, 0xfffd, 0x20}},
// 4-byte sequences with last byte missing
{{0xf1, 0x80, 0x80, 0x20}, {0xfffd, 0x20}},
{{0xf4, 0x8f, 0xbf, 0x20}, {0xfffd, 0x20}},
// Start of overlong 4-byte sequence with last byte missing
{{0xf0, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0x20}},
// 5-byte sequences (not supported) with last byte missing
{{0xf8, 0x80, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
{{0xfb, 0xbf, 0xbf, 0xbf, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
// 6-byte sequences (not supported) with last byte missing
{{0xfc, 0x80, 0x80, 0x80, 0x80, 0x20},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0x20},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
// Impossible bytes
{{0xfe}, {0xfffd}},
{{0xff}, {0xfffd}},
{{0xfe, 0xfe, 0xff, 0xff}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Lead-byte-like bytes which aren't valid lead bytes.
{{0xc0}, {0xfffd}},
{{0xc0, 0xaa}, {0xfffd, 0xfffd}},
{{0xc1}, {0xfffd}},
{{0xc1, 0xaa}, {0xfffd, 0xfffd}},
{{0xf5}, {0xfffd}},
{{0xf5, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xf6}, {0xfffd}},
{{0xf6, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xf7}, {0xfffd}},
{{0xf7, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xf8}, {0xfffd}},
{{0xf8, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xf9}, {0xfffd}},
{{0xf9, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfa}, {0xfffd}},
{{0xfa, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfb}, {0xfffd}},
{{0xfb, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfc}, {0xfffd}},
{{0xfc, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfd}, {0xfffd}},
{{0xfd, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xfe}, {0xfffd}},
{{0xfe, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xff}, {0xfffd}},
{{0xff, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Overlong sequences:
// Overlong encodings for "/"
{{0xc0, 0xaf}, {0xfffd, 0xfffd}},
{{0xe0, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd}},
{{0xf0, 0x80, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 5-byte sequence (not supported anyway)
{{0xf8, 0x80, 0x80, 0x80, 0xaf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6-byte sequence (not supported anyway)
{{0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Maximum overlong sequences
{{0xc1, 0xbf}, {0xfffd, 0xfffd}},
{{0xe0, 0x9f, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
{{0xf0, 0x8f, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 5-byte sequence (not supported anyway)
{{0xf8, 0x87, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6-byte sequence (not supported anyway)
{{0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Overlong encodings for 0
{{0xc0, 0x80}, {0xfffd, 0xfffd}},
{{0xe0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xf0, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 5-byte sequence (not supported anyway)
{{0xf8, 0x80, 0x80, 0x80, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// 6-byte sequence (not supported anyway)
{{0xfc, 0x80, 0x80, 0x80, 0x80, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Illegal code positions:
// Single UTF-16 surrogates
{{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xad, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xae, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xaf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xb0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xbe, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
// Paired surrogates
{{0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
{{0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf},
{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
// Surrogates with the last byte missing.
{{0xed, 0xa0}, {0xfffd, 0xfffd}},
{{0xed, 0xa0}, {0xfffd, 0xfffd}},
{{0xed, 0xad}, {0xfffd, 0xfffd}},
{{0xed, 0xae}, {0xfffd, 0xfffd}},
{{0xed, 0xaf}, {0xfffd, 0xfffd}},
{{0xed, 0xb0}, {0xfffd, 0xfffd}},
{{0xed, 0xbe}, {0xfffd, 0xfffd}},
{{0xed, 0xbf}, {0xfffd, 0xfffd}},
// Other non-characters
{{0xef, 0xbf, 0xbe}, {0xfffe}},
{{0xef, 0xbf, 0xbf}, {0xffff}},
{{0xef, 0xb7, 0x90, 0xef, 0xb7, 0x91, 0xef, 0xb7, 0x92, 0xef, 0xb7, 0x93,
0xef, 0xb7, 0x94, 0xef, 0xb7, 0x95, 0xef, 0xb7, 0x96, 0xef, 0xb7, 0x97,
0xef, 0xb7, 0x98, 0xef, 0xb7, 0x99, 0xef, 0xb7, 0x9a, 0xef, 0xb7, 0x9b,
0xef, 0xb7, 0x9c, 0xef, 0xb7, 0x9d, 0xef, 0xb7, 0x9e, 0xef, 0xb7, 0x9f,
0xef, 0xb7, 0xa0, 0xef, 0xb7, 0xa1, 0xef, 0xb7, 0xa2, 0xef, 0xb7, 0xa3,
0xef, 0xb7, 0xa4, 0xef, 0xb7, 0xa5, 0xef, 0xb7, 0xa6, 0xef, 0xb7, 0xa7,
0xef, 0xb7, 0xa8, 0xef, 0xb7, 0xa9, 0xef, 0xb7, 0xaa, 0xef, 0xb7, 0xab,
0xef, 0xb7, 0xac, 0xef, 0xb7, 0xad, 0xef, 0xb7, 0xae, 0xef, 0xb7, 0xaf},
{0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7,
0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf,
0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7,
0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef}},
{{0xf0, 0x9f, 0xbf, 0xbe, 0xf0, 0x9f, 0xbf, 0xbf, 0xf0, 0xaf, 0xbf,
0xbe, 0xf0, 0xaf, 0xbf, 0xbf, 0xf0, 0xbf, 0xbf, 0xbe, 0xf0, 0xbf,
0xbf, 0xbf, 0xf1, 0x8f, 0xbf, 0xbe, 0xf1, 0x8f, 0xbf, 0xbf, 0xf1,
0x9f, 0xbf, 0xbe, 0xf1, 0x9f, 0xbf, 0xbf, 0xf1, 0xaf, 0xbf, 0xbe,
0xf1, 0xaf, 0xbf, 0xbf, 0xf1, 0xbf, 0xbf, 0xbe, 0xf1, 0xbf, 0xbf,
0xbf, 0xf2, 0x8f, 0xbf, 0xbe, 0xf2, 0x8f, 0xbf, 0xbf},
{0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, 0x4fffe, 0x4ffff,
0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, 0x8fffe,
0x8ffff}},
};
// Run every test vector through both decoders and check each result against
// the expected code points. Each entry is bound by const reference so that its
// two vectors are not copied on every iteration.
for (const auto& test : data) {
  // For figuring out which test fails:
  fprintf(stderr, "test: ");
  for (auto b : test.bytes) {
    // %x expects an unsigned int, so make the conversion explicit.
    fprintf(stderr, "%x ", static_cast<unsigned>(b));
  }
  fprintf(stderr, "\n");

  // Non-incremental decoder. The size is checked first so that the
  // element-wise comparison below never indexes past the expected values.
  std::vector<unibrow::uchar> output_normal;
  DecodeNormally(test.bytes, &output_normal);
  CHECK_EQ(output_normal.size(), test.unicode_expected.size());
  for (size_t i = 0; i < output_normal.size(); ++i) {
    CHECK_EQ(output_normal[i], test.unicode_expected[i]);
  }

  // Incremental decoder: must yield exactly the same sequence.
  std::vector<unibrow::uchar> output_incremental;
  DecodeIncrementally(test.bytes, &output_incremental);
  CHECK_EQ(output_incremental.size(), test.unicode_expected.size());
  for (size_t i = 0; i < output_incremental.size(); ++i) {
    CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
  }
}
}
} // namespace internal
} // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment