Commit f6b6f71b authored by Justin Ridgewell, committed by Commit Bot

Consolidate UTF-8 Vector<char> to uc16 decoding into Iterator

Too many files know how to deal with decoding, counting, and splitting UTF-8
into uc16 chars. This consolidates several callers who deal with full
(Vector<char>, not streaming) bytes by using a UTF-8 Iterator to decode bytes
into individual uc16 chars.

R=marja@chromium.org

Bug: 
Change-Id: Ia36df3e8c1abd0398415ad23a474557c71c19a01
Reviewed-on: https://chromium-review.googlesource.com/831093
Reviewed-by: Marja Hölttä <marja@chromium.org>
Commit-Queue: Justin Ridgewell <jridgewell@google.com>
Cr-Commit-Position: refs/heads/master@{#51405}
parent 0f617ada
......@@ -390,9 +390,9 @@ MaybeHandle<String> Factory::NewStringFromOneByte(Vector<const uint8_t> string,
MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
PretenureFlag pretenure) {
// Check for ASCII first since this is the common case.
const char* start = string.start();
const char* ascii_data = string.start();
int length = string.length();
int non_ascii_start = String::NonAsciiStart(start, length);
int non_ascii_start = String::NonAsciiStart(ascii_data, length);
if (non_ascii_start >= length) {
// If the string is ASCII, we do not need to convert the characters
// since UTF8 is backwards compatible with ASCII.
......@@ -400,35 +400,38 @@ MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
}
// Non-ASCII and we need to decode.
auto non_ascii = string.SubVector(non_ascii_start, length);
Access<UnicodeCache::Utf8Decoder>
decoder(isolate()->unicode_cache()->utf8_decoder());
decoder->Reset(string.start() + non_ascii_start,
length - non_ascii_start);
decoder->Reset(non_ascii);
int utf16_length = static_cast<int>(decoder->Utf16Length());
DCHECK_GT(utf16_length, 0);
// Allocate string.
Handle<SeqTwoByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure),
String);
// Copy ASCII portion.
uint16_t* data = result->GetChars();
const char* ascii_data = string.start();
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length);
decoder->WriteUtf16(data, utf16_length, non_ascii);
return result;
}
MaybeHandle<String> Factory::NewStringFromUtf8SubString(
Handle<SeqOneByteString> str, int begin, int length,
PretenureFlag pretenure) {
// Check for ASCII first since this is the common case.
const char* start = reinterpret_cast<const char*>(str->GetChars() + begin);
int non_ascii_start = String::NonAsciiStart(start, length);
const char* ascii_data =
reinterpret_cast<const char*>(str->GetChars() + begin);
int non_ascii_start = String::NonAsciiStart(ascii_data, length);
if (non_ascii_start >= length) {
// If the string is ASCII, we can just make a substring.
// TODO(v8): the pretenure flag is ignored in this case.
......@@ -436,28 +439,35 @@ MaybeHandle<String> Factory::NewStringFromUtf8SubString(
}
// Non-ASCII and we need to decode.
auto non_ascii = Vector<const char>(ascii_data + non_ascii_start,
length - non_ascii_start);
Access<UnicodeCache::Utf8Decoder> decoder(
isolate()->unicode_cache()->utf8_decoder());
decoder->Reset(start + non_ascii_start, length - non_ascii_start);
decoder->Reset(non_ascii);
int utf16_length = static_cast<int>(decoder->Utf16Length());
DCHECK_GT(utf16_length, 0);
// Allocate string.
Handle<SeqTwoByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, pretenure), String);
// Reset the decoder, because the original {str} may have moved.
const char* ascii_data =
reinterpret_cast<const char*>(str->GetChars() + begin);
decoder->Reset(ascii_data + non_ascii_start, length - non_ascii_start);
// Update pointer references, since the original string may have moved after
// allocation.
ascii_data = reinterpret_cast<const char*>(str->GetChars() + begin);
non_ascii = Vector<const char>(ascii_data + non_ascii_start,
length - non_ascii_start);
// Copy ASCII portion.
uint16_t* data = result->GetChars();
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length);
decoder->WriteUtf16(data, utf16_length, non_ascii);
return result;
}
......
......@@ -55,6 +55,7 @@
#include "src/snapshot/snapshot.h"
#include "src/tracing/trace-event.h"
#include "src/trap-handler/trap-handler.h"
#include "src/unicode-decoder.h"
#include "src/unicode-inl.h"
#include "src/utils-inl.h"
#include "src/utils.h"
......@@ -3575,28 +3576,17 @@ static inline void WriteOneByteData(Vector<const char> vector, uint8_t* chars,
static inline void WriteTwoByteData(Vector<const char> vector, uint16_t* chars,
int len) {
const uint8_t* stream = reinterpret_cast<const uint8_t*>(vector.start());
size_t stream_length = vector.length();
while (stream_length != 0) {
size_t consumed = 0;
uint32_t c = unibrow::Utf8::ValueOf(stream, stream_length, &consumed);
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(vector);
while (!it.Done()) {
DCHECK_GT(len, 0);
len -= 1;
uint16_t c = *it;
++it;
DCHECK_NE(unibrow::Utf8::kBadChar, c);
DCHECK(consumed <= stream_length);
stream_length -= consumed;
stream += consumed;
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
len -= 2;
if (len < 0) break;
*chars++ = unibrow::Utf16::LeadSurrogate(c);
*chars++ = unibrow::Utf16::TrailSurrogate(c);
} else {
len -= 1;
if (len < 0) break;
*chars++ = c;
}
*chars++ = c;
}
DCHECK_EQ(0, stream_length);
DCHECK_EQ(0, len);
DCHECK_EQ(len, 0);
}
......
......@@ -72,6 +72,7 @@
#include "src/string-stream.h"
#include "src/trap-handler/trap-handler.h"
#include "src/unicode-cache-inl.h"
#include "src/unicode-decoder.h"
#include "src/utils-inl.h"
#include "src/wasm/wasm-engine.h"
#include "src/wasm/wasm-objects.h"
......@@ -11980,24 +11981,15 @@ bool String::IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match) {
str_len > slen*static_cast<int>(unibrow::Utf8::kMaxEncodedSize))) {
return false;
}
int i;
size_t remaining_in_str = static_cast<size_t>(str_len);
const uint8_t* utf8_data = reinterpret_cast<const uint8_t*>(str.start());
for (i = 0; i < slen && remaining_in_str > 0; i++) {
size_t cursor = 0;
uint32_t r = unibrow::Utf8::ValueOf(utf8_data, remaining_in_str, &cursor);
DCHECK(cursor > 0 && cursor <= remaining_in_str);
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
if (i > slen - 1) return false;
if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
} else {
if (Get(i) != r) return false;
}
utf8_data += cursor;
remaining_in_str -= cursor;
int i = 0;
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(str);
while (i < slen && !it.Done()) {
if (Get(i++) != *it) return false;
++it;
}
return (allow_prefix_match || i == slen) && remaining_in_str == 0;
return (allow_prefix_match || i == slen) && it.Done();
}
template <>
......@@ -12166,37 +12158,31 @@ uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars,
*utf16_length_out = vector_length;
return HashSequentialString(chars.start(), vector_length, seed);
}
// Start with a fake length which won't affect computation.
// It will be updated later.
StringHasher hasher(String::kMaxArrayIndexSize, seed);
size_t remaining = static_cast<size_t>(vector_length);
const uint8_t* stream = reinterpret_cast<const uint8_t*>(chars.start());
DCHECK(hasher.is_array_index_);
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(chars);
int utf16_length = 0;
bool is_index = true;
DCHECK(hasher.is_array_index_);
while (remaining > 0) {
size_t consumed = 0;
uint32_t c = unibrow::Utf8::ValueOf(stream, remaining, &consumed);
DCHECK(consumed > 0 && consumed <= remaining);
stream += consumed;
remaining -= consumed;
bool is_two_characters = c > unibrow::Utf16::kMaxNonSurrogateCharCode;
utf16_length += is_two_characters ? 2 : 1;
// No need to keep hashing. But we do need to calculate utf16_length.
if (utf16_length > String::kMaxHashCalcLength) continue;
if (is_two_characters) {
uint16_t c1 = unibrow::Utf16::LeadSurrogate(c);
uint16_t c2 = unibrow::Utf16::TrailSurrogate(c);
hasher.AddCharacter(c1);
hasher.AddCharacter(c2);
if (is_index) is_index = hasher.UpdateIndex(c1);
if (is_index) is_index = hasher.UpdateIndex(c2);
} else {
hasher.AddCharacter(c);
if (is_index) is_index = hasher.UpdateIndex(c);
}
while (utf16_length < String::kMaxHashCalcLength && !it.Done()) {
utf16_length++;
uint16_t c = *it;
++it;
hasher.AddCharacter(c);
if (is_index) is_index = hasher.UpdateIndex(c);
}
*utf16_length_out = static_cast<int>(utf16_length);
// Now that hashing is done, we just need to calculate utf16_length
while (!it.Done()) {
++it;
utf16_length++;
}
*utf16_length_out = utf16_length;
// Must set length here so that hash computation is correct.
hasher.length_ = utf16_length;
return hasher.GetHashField();
......
......@@ -10,74 +10,78 @@
namespace unibrow {
// Returns the current UTF-16 code unit. Code points beyond the BMP are
// surfaced as a surrogate pair: the lead surrogate first, then (after one
// increment) the trail surrogate of the same code point.
uint16_t Utf8Iterator::operator*() {
  if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode)) {
    if (trailing_) return Utf16::TrailSurrogate(char_);
    return Utf16::LeadSurrogate(char_);
  }
  // BMP characters are never in the "trailing" state.
  DCHECK_EQ(trailing_, false);
  return static_cast<uint16_t>(char_);
}
// Advances to the next UTF-16 code unit. For a supplementary-plane code
// point this is a two-step process: the first increment flips from the lead
// to the trail surrogate without consuming input; only the following
// increment decodes the next octet sequence from the stream.
Utf8Iterator& Utf8Iterator::operator++() {
  if (V8_UNLIKELY(this->Done())) {
    // Incrementing past the end parks the iterator on the empty-buffer
    // sentinel.
    char_ = Utf8::kBufferEmpty;
    return *this;
  }
  if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode && !trailing_)) {
    // Currently yielding the lead surrogate: switch to the trail surrogate
    // of the same code point; offset_ keeps marking the start of this code
    // point's octet sequence.
    trailing_ = true;
    return *this;
  }
  trailing_ = false;
  // Record where the next code point's octet sequence starts, then decode it.
  offset_ = cursor_;
  char_ =
      Utf8::ValueOf(reinterpret_cast<const uint8_t*>(stream_.begin()) + cursor_,
                    stream_.length() - cursor_, &cursor_);
  return *this;
}
// Post-increment: advances the iterator but returns its previous state.
Utf8Iterator Utf8Iterator::operator++(int) {
  Utf8Iterator snapshot(*this);
  this->operator++();
  return snapshot;
}
bool Utf8Iterator::Done() {
return offset_ == static_cast<size_t>(stream_.length());
}
void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
const uint8_t* stream, size_t stream_length) {
// Assume everything will fit in the buffer and stream won't be needed.
last_byte_of_buffer_unused_ = false;
unbuffered_start_ = nullptr;
unbuffered_length_ = 0;
bool writing_to_buffer = true;
// Loop until stream is read, writing to buffer as long as buffer has space.
const v8::internal::Vector<const char>& stream) {
size_t utf16_length = 0;
while (stream_length != 0) {
size_t cursor = 0;
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
DCHECK(cursor > 0 && cursor <= stream_length);
stream += cursor;
stream_length -= cursor;
bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
utf16_length += is_two_characters ? 2 : 1;
// Don't need to write to the buffer, but still need utf16_length.
if (!writing_to_buffer) continue;
// Write out the characters to the buffer.
// Must check for equality with buffer_length as we've already updated it.
if (utf16_length <= buffer_length) {
if (is_two_characters) {
*buffer++ = Utf16::LeadSurrogate(character);
*buffer++ = Utf16::TrailSurrogate(character);
} else {
*buffer++ = character;
}
if (utf16_length == buffer_length) {
// Just wrote last character of buffer
writing_to_buffer = false;
unbuffered_start_ = stream;
unbuffered_length_ = stream_length;
}
continue;
}
// Have gone over buffer.
// Last char of buffer is unused, set cursor back.
DCHECK(is_two_characters);
writing_to_buffer = false;
last_byte_of_buffer_unused_ = true;
unbuffered_start_ = stream - cursor;
unbuffered_length_ = stream_length + cursor;
Utf8Iterator it = Utf8Iterator(stream);
// Loop until stream is read, writing to buffer as long as buffer has space.
while (utf16_length < buffer_length && !it.Done()) {
*buffer++ = *it;
++it;
utf16_length++;
}
bytes_read_ = it.Offset();
trailing_ = it.Trailing();
chars_written_ = utf16_length;
// Now that writing to buffer is done, we just need to calculate utf16_length
while (!it.Done()) {
++it;
utf16_length++;
}
utf16_length_ = utf16_length;
}
void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
size_t stream_length, uint16_t* data,
size_t data_length) {
while (data_length != 0) {
size_t cursor = 0;
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
// There's a total lack of bounds checking for stream
// as it was already done in Reset.
stream += cursor;
DCHECK(stream_length >= cursor);
stream_length -= cursor;
if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
*data++ = Utf16::LeadSurrogate(character);
*data++ = Utf16::TrailSurrogate(character);
DCHECK_GT(data_length, 1);
data_length -= 2;
} else {
*data++ = character;
data_length -= 1;
}
void Utf8DecoderBase::WriteUtf16Slow(
uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream, size_t offset,
bool trailing) {
Utf8Iterator it = Utf8Iterator(stream, offset, trailing);
while (!it.Done()) {
DCHECK_GT(length--, 0);
*data++ = *it;
++it;
}
}
......
......@@ -6,30 +6,73 @@
#define V8_UNICODE_DECODER_H_
#include <sys/types.h>
#include <algorithm>
#include "src/globals.h"
#include "src/unicode.h"
#include "src/utils.h"
#include "src/vector.h"
namespace unibrow {
// Iterates over a UTF-8 byte stream, yielding one UTF-16 code unit at a
// time. Code points above the BMP are yielded as a lead surrogate followed,
// on the next increment, by the matching trail surrogate.
class Utf8Iterator {
 public:
  explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
      : Utf8Iterator(stream, 0, false) {}
  // Resumes iteration at byte |offset|. |trailing| indicates that the lead
  // surrogate of the code point whose octet sequence starts at |offset| has
  // already been consumed, so the trail surrogate is yielded first.
  Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
               bool trailing)
      : stream_(stream),
        cursor_(offset),
        offset_(0),
        char_(0),
        trailing_(false) {
    DCHECK_LE(offset, stream.length());
    // Read the first char, setting offset_ to offset in the process.
    ++*this;

    // This must be set after reading the first char, since the offset marks
    // the start of the octet sequence that the trailing char is part of.
    trailing_ = trailing;
    if (trailing) {
      DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
    }
  }

  uint16_t operator*();
  Utf8Iterator& operator++();
  Utf8Iterator operator++(int);
  bool Done();
  // True while the trail surrogate of the current code point is pending.
  bool Trailing() { return trailing_; }
  // Byte offset of the start of the current code point's octet sequence.
  size_t Offset() { return offset_; }

 private:
  // NOTE(review): stream_ is a stored reference; the iterator dangles if it
  // outlives the Vector it was constructed from — confirm callers keep the
  // vector alive for the iterator's lifetime.
  const v8::internal::Vector<const char>& stream_;
  size_t cursor_;   // Byte position of the next octet sequence to decode.
  size_t offset_;   // Byte position where the current code point started.
  uint32_t char_;   // Code point currently being yielded.
  bool trailing_;   // True when yielding the trail surrogate of char_.
};
class V8_EXPORT_PRIVATE Utf8DecoderBase {
public:
// Initialization done in subclass.
inline Utf8DecoderBase();
inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
const uint8_t* stream, size_t stream_length);
const v8::internal::Vector<const char>& stream);
inline size_t Utf16Length() const { return utf16_length_; }
protected:
// This reads all characters and sets the utf16_length_.
// The first buffer_length utf16 chars are cached in the buffer.
void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
size_t stream_length);
static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
uint16_t* data, size_t length);
const uint8_t* unbuffered_start_;
size_t unbuffered_length_;
void Reset(uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& vector);
static void WriteUtf16Slow(uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream,
size_t offset, bool trailing);
size_t bytes_read_;
size_t chars_written_;
size_t utf16_length_;
bool last_byte_of_buffer_unused_;
bool trailing_;
private:
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
......@@ -39,55 +82,53 @@ template <size_t kBufferSize>
class Utf8Decoder : public Utf8DecoderBase {
public:
inline Utf8Decoder() {}
inline Utf8Decoder(const char* stream, size_t length);
inline void Reset(const char* stream, size_t length);
inline size_t WriteUtf16(uint16_t* data, size_t length) const;
explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
inline void Reset(const v8::internal::Vector<const char>& stream);
inline size_t WriteUtf16(
uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream) const;
private:
uint16_t buffer_[kBufferSize];
};
Utf8DecoderBase::Utf8DecoderBase()
: unbuffered_start_(nullptr),
unbuffered_length_(0),
utf16_length_(0),
last_byte_of_buffer_unused_(false) {}
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
const uint8_t* stream, size_t stream_length) {
Reset(buffer, buffer_length, stream, stream_length);
}
: bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
Utf8DecoderBase::Utf8DecoderBase(
uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& stream) {
Reset(buffer, buffer_length, stream);
}
template <size_t kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
: Utf8DecoderBase(buffer_, kBufferSize,
reinterpret_cast<const uint8_t*>(stream), length) {}
Utf8Decoder<kBufferSize>::Utf8Decoder(
const v8::internal::Vector<const char>& stream)
: Utf8DecoderBase(buffer_, kBufferSize, stream) {}
template <size_t kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
Utf8DecoderBase::Reset(buffer_, kBufferSize,
reinterpret_cast<const uint8_t*>(stream), length);
void Utf8Decoder<kBufferSize>::Reset(
const v8::internal::Vector<const char>& stream) {
Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
}
template <size_t kBufferSize>
size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
size_t length) const {
DCHECK_GT(length, 0);
if (length > utf16_length_) length = utf16_length_;
size_t Utf8Decoder<kBufferSize>::WriteUtf16(
uint16_t* data, size_t data_length,
const v8::internal::Vector<const char>& stream) const {
DCHECK_GT(data_length, 0);
data_length = std::min(data_length, utf16_length_);
// memcpy everything in buffer.
size_t buffer_length =
last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
size_t memcpy_length = length <= buffer_length ? length : buffer_length;
size_t memcpy_length = std::min(data_length, chars_written_);
v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
if (length <= buffer_length) return length;
DCHECK_NOT_NULL(unbuffered_start_);
if (data_length <= chars_written_) return data_length;
// Copy the rest the slow way.
WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
length - buffer_length);
return length;
WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
bytes_read_, trailing_);
return data_length;
}
class Latin1 {
......
......@@ -1582,6 +1582,63 @@ TEST(ExternalStringIndexOf) {
.FromJust());
}
#define GC_INSIDE_NEW_STRING_FROM_UTF8_SUB_STRING(NAME, STRING) \
TEST(GCInsideNewStringFromUtf8SubStringWith##NAME) { \
CcTest::InitializeVM(); \
LocalContext context; \
v8::HandleScope scope(CcTest::isolate()); \
Factory* factory = CcTest::i_isolate()->factory(); \
Heap* heap = CcTest::i_isolate()->heap(); \
/* Length must be bigger than the buffer size of the Utf8Decoder. */ \
const char* buf = STRING; \
size_t len = strlen(buf); \
Handle<String> main_string = \
factory \
->NewStringFromOneByte(Vector<const uint8_t>( \
reinterpret_cast<const uint8_t*>(buf), len)) \
.ToHandleChecked(); \
CHECK(heap->InNewSpace(*main_string)); \
/* Next allocation will cause GC. */ \
heap::SimulateFullSpace(CcTest::i_isolate()->heap()->new_space()); \
/* Offset by two to check substring-ing. */ \
Handle<String> s = factory \
->NewStringFromUtf8SubString( \
Handle<SeqOneByteString>::cast(main_string), 2, \
static_cast<int>(len - 2)) \
.ToHandleChecked(); \
Handle<String> expected_string = \
factory->NewStringFromUtf8(Vector<const char>(buf + 2, len - 2)) \
.ToHandleChecked(); \
CHECK(s->Equals(*expected_string)); \
}
GC_INSIDE_NEW_STRING_FROM_UTF8_SUB_STRING(
OneByte,
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ")
GC_INSIDE_NEW_STRING_FROM_UTF8_SUB_STRING(
TwoByte,
"QQ\xF0\x9F\x98\x8D\xF0\x9F\x98\x8D"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ"
"QQ\xF0\x9F\x98\x8D\xF0\x9F\x98\x8D")
#undef GC_INSIDE_NEW_STRING_FROM_UTF8_SUB_STRING
} // namespace test_strings
} // namespace internal
} // namespace v8
......@@ -8,6 +8,7 @@
#include "src/unicode-decoder.h"
#include "src/unicode-inl.h"
#include "src/vector.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace v8 {
......@@ -15,16 +16,6 @@ namespace internal {
namespace {
using Utf8Decoder = unibrow::Utf8Decoder<512>;
void Decode(Utf8Decoder* decoder, const std::string& str) {
// Put the string in its own buffer on the heap to make sure that
// AddressSanitizer's heap-buffer-overflow logic can see what's going on.
std::unique_ptr<char[]> buffer(new char[str.length()]);
memcpy(buffer.get(), str.data(), str.length());
decoder->Reset(buffer.get(), str.length());
}
void DecodeNormally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) {
size_t cursor = 0;
......@@ -34,6 +25,28 @@ void DecodeNormally(const std::vector<byte>& bytes,
}
}
// Decodes |bytes| with |decoder| into UTF-16 code units, then recombines
// surrogate pairs back into code points and appends them to |output|.
template <size_t kBufferSize>
void DecodeUtf16(unibrow::Utf8Decoder<kBufferSize>* decoder,
                 const std::vector<byte>& bytes,
                 std::vector<unibrow::uchar>* output) {
  // Use data() rather than &(*begin()): dereferencing begin() of an empty
  // vector is undefined behavior, while data() is well-defined for any size.
  auto vector = Vector<const char>(reinterpret_cast<const char*>(bytes.data()),
                                   bytes.size());
  decoder->Reset(vector);

  std::vector<uint16_t> utf16(decoder->Utf16Length());
  decoder->WriteUtf16(utf16.data(), decoder->Utf16Length(), vector);

  // Decode back into code points, recombining surrogate pairs.
  for (size_t i = 0; i < utf16.size(); i++) {
    uint16_t b = utf16[i];
    if (unibrow::Utf16::IsLeadSurrogate(b)) {
      output->push_back(unibrow::Utf16::CombineSurrogatePair(b, utf16[++i]));
    } else {
      output->push_back(b);
    }
  }
}
void DecodeIncrementally(const std::vector<byte>& bytes,
std::vector<unibrow::uchar>* output) {
unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
......@@ -53,14 +66,52 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
} // namespace
TEST(UnicodeTest, ReadOffEndOfUtf8String) {
Utf8Decoder decoder;
// Reusing one decoder across inputs of different lengths must not leak state
// from a previous (longer) decode into a later (shorter) one.
TEST(UnicodeTest, Utf16BufferReuse) {
  unibrow::Utf8Decoder<4> utf16_decoder;

  typedef struct {
    std::vector<byte> bytes;
    std::vector<unibrow::uchar> unicode_expected;
  } TestCase;

  // One code point per case; encoded lengths ramp up and back down so the
  // same decoder sees shrinking inputs after growing ones.
  TestCase data[] = {
      {{0x00}, {0x0}},
      {{0xC2, 0x80}, {0x80}},
      {{0xE0, 0xA0, 0x80}, {0x800}},
      {{0xF0, 0x90, 0x80, 0x80}, {0x10000}},
      {{0xE0, 0xA0, 0x80}, {0x800}},
      {{0xC2, 0x80}, {0x80}},
      {{0x00}, {0x0}},
  };

  for (auto test : data) {
    // For figuring out which test fails:
    fprintf(stderr, "test: ");
    for (auto b : test.bytes) {
      fprintf(stderr, "%x ", b);
    }
    fprintf(stderr, "\n");

    std::vector<unibrow::uchar> output_utf16;
    DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
    CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
    for (size_t i = 0; i < output_utf16.size(); ++i) {
      CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
    }
  }
}
// A surrogate pair that would straddle the decoder's tiny internal buffer
// (size 2) must still be written out correctly via the unbuffered slow path.
// Input decodes to three code points: 0x00, U+10000, 0x00.
TEST(UnicodeTest, SurrogateOverrunsBuffer) {
  unibrow::Utf8Decoder<2> utf16_decoder;
  std::vector<unibrow::uchar> output_utf16;
  DecodeUtf16(&utf16_decoder, {0x00, 0xF0, 0x90, 0x80, 0x80, 0x00},
              &output_utf16);
  // The original asserted output_utf16[0] twice and never checked the final
  // element; verify the full expected sequence instead.
  CHECK_EQ(output_utf16.size(), 3);
  CHECK_EQ(output_utf16[0], 0x00);
  CHECK_EQ(output_utf16[1], 0x10000);
  CHECK_EQ(output_utf16[2], 0x00);
}
TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
......@@ -414,6 +465,8 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
0x8FFFF}},
};
unibrow::Utf8Decoder<50> utf16_decoder;
for (auto test : data) {
// For figuring out which test fails:
fprintf(stderr, "test: ");
......@@ -437,6 +490,14 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
for (size_t i = 0; i < output_incremental.size(); ++i) {
CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
}
std::vector<unibrow::uchar> output_utf16;
DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
for (size_t i = 0; i < output_utf16.size(); ++i) {
CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment