Commit b80a03bc authored by Andy Wingo, committed by V8 LUCI CQ

[string] Refactor UTF-8 and WTF-8 decoders to share code

This will allow us to more easily add a strict UTF-8 decoder, for use in
stringrefs.

Bug: v8:12868
Change-Id: I6835dca619417f4d2994d8283728cf8ebe599bd7
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3714660
Reviewed-by: Marja Hölttä <marja@chromium.org>
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#81333}
parent 5b12e62b
...@@ -14,95 +14,42 @@ ...@@ -14,95 +14,42 @@
namespace v8 { namespace v8 {
namespace internal { namespace internal {
Utf8Decoder::Utf8Decoder(const base::Vector<const uint8_t>& chars) template <class Decoder>
Utf8DecoderBase<Decoder>::Utf8DecoderBase(
const base::Vector<const uint8_t>& data)
: encoding_(Encoding::kAscii), : encoding_(Encoding::kAscii),
non_ascii_start_(NonAsciiStart(chars.begin(), chars.length())), non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
utf16_length_(non_ascii_start_) { utf16_length_(non_ascii_start_) {
if (non_ascii_start_ == chars.length()) return; if (non_ascii_start_ == data.length()) return;
const uint8_t* cursor = chars.begin() + non_ascii_start_;
const uint8_t* end = chars.begin() + chars.length();
bool is_one_byte = true; bool is_one_byte = true;
uint32_t incomplete_char = 0; auto state = Decoder::DfaDecoder::kAccept;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; uint32_t current = 0;
uint32_t previous = 0;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
is_one_byte = is_one_byte && t <= unibrow::Latin1::kMaxChar;
utf16_length_++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) utf16_length_++;
}
}
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
is_one_byte = false;
utf16_length_++;
}
encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
}
template <typename Char>
void Utf8Decoder::Decode(Char* out, const base::Vector<const uint8_t>& data) {
CopyChars(out, data.begin(), non_ascii_start_);
out += non_ascii_start_;
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
const uint8_t* cursor = data.begin() + non_ascii_start_; const uint8_t* cursor = data.begin() + non_ascii_start_;
const uint8_t* end = data.begin() + data.length(); const uint8_t* end = data.begin() + data.length();
while (cursor < end) { while (cursor < end) {
unibrow::uchar t = auto previous_state = state;
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); Decoder::DfaDecoder::Decode(*cursor, &state, &current);
if (t != unibrow::Utf8::kIncomplete) { if (state < Decoder::DfaDecoder::kAccept) {
if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { DCHECK_EQ(state, Decoder::DfaDecoder::kReject);
*(out++) = static_cast<Char>(t); if (Decoder::kAllowIncompleteSequences) {
state = Decoder::DfaDecoder::kAccept;
static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
is_one_byte = false;
utf16_length_++;
previous = unibrow::Utf8::kBadChar;
current = 0;
// If we were trying to continue a multibyte sequence, try this byte
// again.
if (previous_state != Decoder::DfaDecoder::kAccept) continue;
} else { } else {
*(out++) = unibrow::Utf16::LeadSurrogate(t); encoding_ = Encoding::kInvalid;
*(out++) = unibrow::Utf16::TrailSurrogate(t); return;
} }
} } else if (state == Decoder::DfaDecoder::kAccept) {
} if (Decoder::InvalidCodePointSequence(current, previous)) {
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) *out = static_cast<Char>(t);
}
template V8_EXPORT_PRIVATE void Utf8Decoder::Decode(
uint8_t* out, const base::Vector<const uint8_t>& data);
template V8_EXPORT_PRIVATE void Utf8Decoder::Decode(
uint16_t* out, const base::Vector<const uint8_t>& data);
#if V8_ENABLE_WEBASSEMBLY
Wtf8Decoder::Wtf8Decoder(const base::Vector<const uint8_t>& data)
: encoding_(Encoding::kAscii),
non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
utf16_length_(non_ascii_start_) {
if (non_ascii_start_ == data.length()) return;
bool is_one_byte = true;
auto state = GeneralizedUtf8DfaDecoder::kAccept;
uint32_t current = 0;
uint32_t previous = 0;
for (size_t i = non_ascii_start_; i < data.size(); i++) {
GeneralizedUtf8DfaDecoder::Decode(data[i], &state, &current);
if (state < GeneralizedUtf8DfaDecoder::kAccept) {
DCHECK_EQ(state, GeneralizedUtf8DfaDecoder::kReject);
encoding_ = Encoding::kInvalid;
return;
}
if (state == GeneralizedUtf8DfaDecoder::kAccept) {
if (unibrow::Utf16::IsTrailSurrogate(current) &&
unibrow::Utf16::IsLeadSurrogate(previous)) {
encoding_ = Encoding::kInvalid; encoding_ = Encoding::kInvalid;
return; return;
} }
...@@ -112,44 +59,84 @@ Wtf8Decoder::Wtf8Decoder(const base::Vector<const uint8_t>& data) ...@@ -112,44 +59,84 @@ Wtf8Decoder::Wtf8Decoder(const base::Vector<const uint8_t>& data)
previous = current; previous = current;
current = 0; current = 0;
} }
cursor++;
} }
if (state == GeneralizedUtf8DfaDecoder::kAccept) { if (state == Decoder::DfaDecoder::kAccept) {
encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16; encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
} else if (Decoder::kAllowIncompleteSequences) {
static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
encoding_ = Encoding::kUtf16;
utf16_length_++;
} else { } else {
encoding_ = Encoding::kInvalid; encoding_ = Encoding::kInvalid;
} }
} }
template <class Decoder>
template <typename Char> template <typename Char>
void Wtf8Decoder::Decode(Char* out, const base::Vector<const uint8_t>& data) { void Utf8DecoderBase<Decoder>::Decode(Char* out,
const base::Vector<const uint8_t>& data) {
DCHECK(!is_invalid()); DCHECK(!is_invalid());
CopyChars(out, data.begin(), non_ascii_start_); CopyChars(out, data.begin(), non_ascii_start_);
out += non_ascii_start_; out += non_ascii_start_;
auto state = GeneralizedUtf8DfaDecoder::kAccept; auto state = Decoder::DfaDecoder::kAccept;
uint32_t t = 0; uint32_t current = 0;
for (size_t i = non_ascii_start_; i < data.size(); i++) { const uint8_t* cursor = data.begin() + non_ascii_start_;
GeneralizedUtf8DfaDecoder::Decode(data[i], &state, &t); const uint8_t* end = data.begin() + data.length();
if (state == GeneralizedUtf8DfaDecoder::kAccept) {
if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { while (cursor < end) {
*(out++) = static_cast<Char>(t); auto previous_state = state;
Decoder::DfaDecoder::Decode(*cursor, &state, &current);
if (Decoder::kAllowIncompleteSequences &&
state < Decoder::DfaDecoder::kAccept) {
state = Decoder::DfaDecoder::kAccept;
*(out++) = static_cast<Char>(unibrow::Utf8::kBadChar);
current = 0;
// If we were trying to continue a multibyte sequence, try this byte
// again.
if (previous_state != Decoder::DfaDecoder::kAccept) continue;
} else if (state == Decoder::DfaDecoder::kAccept) {
if (sizeof(Char) == 1 ||
current <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(out++) = static_cast<Char>(current);
} else { } else {
*(out++) = unibrow::Utf16::LeadSurrogate(t); *(out++) = unibrow::Utf16::LeadSurrogate(current);
*(out++) = unibrow::Utf16::TrailSurrogate(t); *(out++) = unibrow::Utf16::TrailSurrogate(current);
} }
t = 0; current = 0;
} }
cursor++;
}
if (Decoder::kAllowIncompleteSequences &&
state != Decoder::DfaDecoder::kAccept) {
*out = static_cast<Char>(unibrow::Utf8::kBadChar);
} else {
DCHECK_EQ(state, Decoder::DfaDecoder::kAccept);
} }
DCHECK_EQ(state, GeneralizedUtf8DfaDecoder::kAccept);
} }
template void Wtf8Decoder::Decode(uint8_t* out, template V8_EXPORT_PRIVATE Utf8DecoderBase<Utf8Decoder>::Utf8DecoderBase(
const base::Vector<const uint8_t>& data); const base::Vector<const uint8_t>& data);
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
uint8_t* out, const base::Vector<const uint8_t>& data);
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
uint16_t* out, const base::Vector<const uint8_t>& data);
#if V8_ENABLE_WEBASSEMBLY
template Utf8DecoderBase<Wtf8Decoder>::Utf8DecoderBase(
const base::Vector<const uint8_t>& data);
template void Utf8DecoderBase<Wtf8Decoder>::Decode(
uint8_t* out, const base::Vector<const uint8_t>& data);
template void Wtf8Decoder::Decode(uint16_t* out, template void Utf8DecoderBase<Wtf8Decoder>::Decode(
const base::Vector<const uint8_t>& data); uint16_t* out, const base::Vector<const uint8_t>& data);
#endif // V8_ENABLE_WEBASSEMBLY #endif // V8_ENABLE_WEBASSEMBLY
} // namespace internal } // namespace internal
......
...@@ -8,6 +8,10 @@ ...@@ -8,6 +8,10 @@
#include "src/base/vector.h" #include "src/base/vector.h"
#include "src/strings/unicode.h" #include "src/strings/unicode.h"
#if V8_ENABLE_WEBASSEMBLY
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
#endif
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -48,40 +52,14 @@ inline int NonAsciiStart(const uint8_t* chars, int length) { ...@@ -48,40 +52,14 @@ inline int NonAsciiStart(const uint8_t* chars, int length) {
return static_cast<int>(chars - start); return static_cast<int>(chars - start);
} }
class V8_EXPORT_PRIVATE Utf8Decoder final { template <class Decoder>
public: class Utf8DecoderBase {
enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16 };
explicit Utf8Decoder(const base::Vector<const uint8_t>& chars);
// This decoder never fails; an invalid byte sequence decodes to U+FFFD and
// then the decode continues.
bool is_invalid() const { return false; }
bool is_ascii() const { return encoding_ == Encoding::kAscii; }
bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; }
int utf16_length() const { return utf16_length_; }
int non_ascii_start() const { return non_ascii_start_; }
template <typename Char>
V8_EXPORT_PRIVATE void Decode(Char* out,
const base::Vector<const uint8_t>& data);
private:
Encoding encoding_;
int non_ascii_start_;
int utf16_length_;
};
#if V8_ENABLE_WEBASSEMBLY
// Like Utf8Decoder above, except that instead of replacing invalid sequences
// with U+FFFD, we have a separate Encoding::kInvalid state.
class Wtf8Decoder {
public: public:
enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16, kInvalid }; enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16, kInvalid };
explicit Wtf8Decoder(const base::Vector<const uint8_t>& data); bool is_invalid() const {
return static_cast<const Decoder&>(*this).is_invalid();
bool is_invalid() const { return encoding_ == Encoding::kInvalid; } }
bool is_ascii() const { return encoding_ == Encoding::kAscii; } bool is_ascii() const { return encoding_ == Encoding::kAscii; }
bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; } bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; }
int utf16_length() const { int utf16_length() const {
...@@ -94,14 +72,55 @@ class Wtf8Decoder { ...@@ -94,14 +72,55 @@ class Wtf8Decoder {
} }
template <typename Char> template <typename Char>
V8_EXPORT_PRIVATE void Decode(Char* out, void Decode(Char* out, const base::Vector<const uint8_t>& data);
const base::Vector<const uint8_t>& data);
private: protected:
explicit Utf8DecoderBase(const base::Vector<const uint8_t>& data);
Encoding encoding_; Encoding encoding_;
int non_ascii_start_; int non_ascii_start_;
int utf16_length_; int utf16_length_;
}; };
class V8_EXPORT_PRIVATE Utf8Decoder final
: public Utf8DecoderBase<Utf8Decoder> {
public:
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
// The DfaDecoder will only ever decode Unicode scalar values, and all
// sequences of USVs are valid.
DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
return false;
}
static const bool kAllowIncompleteSequences = true;
using DfaDecoder = Utf8DfaDecoder;
explicit Utf8Decoder(const base::Vector<const uint8_t>& data)
: Utf8DecoderBase(data) {}
// This decoder never fails; an invalid byte sequence decodes to U+FFFD and
// then the decode continues.
bool is_invalid() const {
DCHECK_NE(encoding_, Encoding::kInvalid);
return false;
}
};
#if V8_ENABLE_WEBASSEMBLY
// Like Utf8Decoder above, except that instead of replacing invalid sequences
// with U+FFFD, we have a separate Encoding::kInvalid state.
class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
public:
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
return unibrow::Utf16::IsSurrogatePair(current, previous);
}
static const bool kAllowIncompleteSequences = false;
using DfaDecoder = GeneralizedUtf8DfaDecoder;
explicit Wtf8Decoder(const base::Vector<const uint8_t>& data)
: Utf8DecoderBase(data) {}
bool is_invalid() const { return encoding_ == Encoding::kInvalid; }
};
#endif // V8_ENABLE_WEBASSEMBLY #endif // V8_ENABLE_WEBASSEMBLY
} // namespace internal } // namespace internal
......
...@@ -103,10 +103,10 @@ class Utf16 { ...@@ -103,10 +103,10 @@ class Utf16 {
return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
} }
static inline bool IsLeadSurrogate(int code) { static inline bool IsLeadSurrogate(int code) {
return (code & 0xfc00) == 0xd800; return (code & 0x1ffc00) == 0xd800;
} }
static inline bool IsTrailSurrogate(int code) { static inline bool IsTrailSurrogate(int code) {
return (code & 0xfc00) == 0xdc00; return (code & 0x1ffc00) == 0xdc00;
} }
static inline int CombineSurrogatePair(uchar lead, uchar trail) { static inline int CombineSurrogatePair(uchar lead, uchar trail) {
......
...@@ -9,97 +9,96 @@ ...@@ -9,97 +9,96 @@
#ifndef __GENERALIZED_UTF8_DFA_DECODER_H #ifndef __GENERALIZED_UTF8_DFA_DECODER_H
#define __GENERALIZED_UTF8_DFA_DECODER_H #define __GENERALIZED_UTF8_DFA_DECODER_H
namespace GeneralizedUtf8DfaDecoder { struct GeneralizedUtf8DfaDecoder {
enum State : uint8_t {
enum State : uint8_t { kReject = 0,
kReject = 0, kAccept = 11,
kAccept = 11, kTwoByte = 22,
kTwoByte = 22, kThreeByte = 33,
kThreeByte = 33, kFourByte = 44,
kFourByte = 44, kFourByteLow = 55,
kFourByteLow = 55, kThreeByteHigh = 66,
kThreeByteHigh = 66, kFourByteMidHigh = 77,
kFourByteMidHigh = 77,
};
static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
// This first table maps bytes to character to a transition.
//
// The transition value takes a state to a new state, but it also determines
// the set of bits from the current byte that contribute to the decoded
// codepoint:
//
// Transition | Bits from current byte that contribute to decoded codepoint
// ------------------------------------------------------------------------
// 0, 1 | 0b01111111
// 2, 3 | 0b00111111
// 4, 5 | 0b00011111
// 6, 7 | 0b00001111
// 8, 9 | 0b00000111
// 10 | 0b00000011
//
// Given the WTF-8 encoding, we therefore have the following constraints:
// 1. The transition value for 1-byte encodings should have the value 0 or 1
// so that we preserve all of the low 7 bits.
// 2. Continuation bytes (0x80 to 0xBF) are of the form 0b10xxxxxx, and
// therefore should have transition value between 0 and 3.
// 3. Leading bytes for 2-byte encodings are of the form 0b110yyyyy, and
// therefore the transition value can be between 2 and 5.
// 4. Leading bytes for 3-byte encodings (0b1110zzzz) need transition value
// between 4 and 7.
// 5. Leading bytes for 4-byte encodings (0b11110uuu) need transition value
// between 6 and 9.
// 6. We need more states to impose irregular constraints. Sometimes we can
// use the knowledge that e.g. some high significant bits of the xxxx in
// 0b1110xxxx are 0, then we can use a higher transition value.
// 7. Transitions to invalid states can use any transition value.
static constexpr uint8_t transitions[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // E0-EF
10, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // F0-FF
}; };
// This second table maps a state to a new state when adding a transition. static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
// 00-7F // This first table maps bytes to character to a transition.
// | 80-8F //
// | | 90-9F // The transition value takes a state to a new state, but it also determines
// | | | A0-BF // the set of bits from the current byte that contribute to the decoded
// | | | | C2-DF // codepoint:
// | | | | | E1-EF //
// | | | | | | F1-F3 // Transition | Current byte bits that contribute to decoded codepoint
// | | | | | | | F4 // -------------------------------------------------------------------
// | | | | | | | | C0, C1, F5-FF // 0, 1 | 0b01111111
// | | | | | | | | | E0 // 2, 3 | 0b00111111
// | | | | | | | | | | F0 // 4, 5 | 0b00011111
static constexpr uint8_t states[] = { // 6, 7 | 0b00001111
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0 // 8, 9 | 0b00000111
11, 0, 0, 0, 22, 33, 44, 55, 0, 66, 77, // ACCEPT = 11 // 10 | 0b00000011
0, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 22 //
0, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 33 // Given the WTF-8 encoding, we therefore have the following constraints:
0, 33, 33, 33, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 44
0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 55 // 1. The transition value for 1-byte encodings should have the value 0 or
0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 66 // 1 so that we preserve all of the low 7 bits.
0, 0, 33, 33, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 77 // 2. Continuation bytes (0x80 to 0xBF) are of the form 0b10xxxxxx, and
}; // therefore should have transition value between 0 and 3.
// 3. Leading bytes for 2-byte encodings are of the form 0b110yyyyy, and
// therefore the transition value can be between 2 and 5.
// 4. Leading bytes for 3-byte encodings (0b1110zzzz) need transition
// value between 4 and 7.
// 5. Leading bytes for 4-byte encodings (0b11110uuu) need transition
// value between 6 and 9.
// 6. We need more states to impose irregular constraints. Sometimes we
// can use the knowledge that e.g. some high significant bits of the
// xxxx in 0b1110xxxx are 0, then we can use a higher transition value.
// 7. Transitions to invalid states can use any transition value.
static constexpr uint8_t transitions[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // E0-EF
10, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // F0-FF
};
uint8_t type = transitions[byte]; // This second table maps a state to a new state when adding a transition.
*state = static_cast<State>(states[*state + type]); // 00-7F
*buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1))); // | 80-8F
} // | | 90-9F
// | | | A0-BF
// | | | | C2-DF
// | | | | | E1-EF
// | | | | | | F1-F3
// | | | | | | | F4
// | | | | | | | | C0, C1, F5-FF
// | | | | | | | | | E0
// | | | | | | | | | | F0
static constexpr uint8_t states[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0
11, 0, 0, 0, 22, 33, 44, 55, 0, 66, 77, // ACCEPT = 11
0, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 22
0, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 33
0, 33, 33, 33, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 44
0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 55
0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 66
0, 0, 33, 33, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 77
};
} // namespace GeneralizedUtf8DfaDecoder uint8_t type = transitions[byte];
*state = static_cast<State>(states[*state + type]);
*buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
}
};
#endif // __GENERALIZED_UTF8_DFA_DECODER_H #endif // __GENERALIZED_UTF8_DFA_DECODER_H
...@@ -7,71 +7,69 @@ ...@@ -7,71 +7,69 @@
#ifndef __UTF8_DFA_DECODER_H #ifndef __UTF8_DFA_DECODER_H
#define __UTF8_DFA_DECODER_H #define __UTF8_DFA_DECODER_H
namespace Utf8DfaDecoder { struct Utf8DfaDecoder {
enum State : uint8_t {
enum State : uint8_t { kReject = 0,
kReject = 0, kAccept = 12,
kAccept = 12, kTwoByte = 24,
kTwoByte = 24, kThreeByte = 36,
kThreeByte = 36, kThreeByteLowMid = 48,
kThreeByteLowMid = 48, kFourByte = 60,
kFourByte = 60, kFourByteLow = 72,
kFourByteLow = 72, kThreeByteHigh = 84,
kThreeByteHigh = 84, kFourByteMidHigh = 96,
kFourByteMidHigh = 96,
};
static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
// This first table maps bytes to character to a transition.
static constexpr uint8_t transitions[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF
11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF
}; };
// This second table maps a state to a new state when adding a transition. static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
// 00-7F // This first table maps bytes to character to a transition.
// | 80-8F static constexpr uint8_t transitions[] = {
// | | 90-9F 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
// | | | A0-BF 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
// | | | | C2-DF 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
// | | | | | E1-EC, EE, EF 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
// | | | | | | ED 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
// | | | | | | | F1-F3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
// | | | | | | | | F4 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
// | | | | | | | | | C0, C1, F5-FF 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
// | | | | | | | | | | E0 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
// | | | | | | | | | | | F0 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
static constexpr uint8_t states[] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // ACCEPT = 12 9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 24 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 36 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF
0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte low/mid = 48 11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF
0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 60 };
0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 72
0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 84
0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 96
};
uint8_t type = transitions[byte]; // This second table maps a state to a new state when adding a transition.
*state = static_cast<State>(states[*state + type]); // 00-7F
*buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1))); // | 80-8F
} // | | 90-9F
// | | | A0-BF
// | | | | C2-DF
// | | | | | E1-EC, EE, EF
// | | | | | | ED
// | | | | | | | F1-F3
// | | | | | | | | F4
// | | | | | | | | | C0, C1, F5-FF
// | | | | | | | | | | E0
// | | | | | | | | | | | F0
static constexpr uint8_t states[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0
12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // ACCEPT = 12
0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 24
0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 36
0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte low/mid = 48
0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 60
0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 72
0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 84
0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 96
};
} // namespace Utf8DfaDecoder uint8_t type = transitions[byte];
*state = static_cast<State>(states[*state + type]);
*buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
}
};
#endif /* __UTF8_DFA_DECODER_H */ #endif /* __UTF8_DFA_DECODER_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment