Commit b7ed86ec, authored by Toon Verwaest, committed by Commit Bot

[runtime] Simplify/unify utf8 handling

- Removes Utf8Iterator
- Replaces Utf8Decoder with something based on ValueOfIncremental +
  NonAsciiStart and moves it into v8/internal.
- Internalizes utf8 strings by first converting them to one- or two-byte strings
- Removes IsUtf8EqualsTo and replaces current uses with IsOneByteEqualsTo

Tbr: jgruber@chromium.org
Change-Id: I16e08d910a745e78d6fd465718fc69ad731fd217
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1585840
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Igor Sheludko <ishell@chromium.org>
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61049}
parent 7a70c55d
......@@ -2785,7 +2785,6 @@ v8_source_set("v8_base_without_compiler") {
"src/type-hints.cc",
"src/type-hints.h",
"src/type-traits.h",
"src/unicode-cache.h",
"src/unicode-decoder.cc",
"src/unicode-decoder.h",
"src/unicode-inl.h",
......
......@@ -70,7 +70,7 @@ bool SourceCodeCache::Lookup(Isolate* isolate, Vector<const char> name,
Handle<SharedFunctionInfo>* handle) {
for (int i = 0; i < cache_->length(); i += 2) {
SeqOneByteString str = SeqOneByteString::cast(cache_->get(i));
if (str->IsUtf8EqualTo(name)) {
if (str->IsOneByteEqualTo(Vector<const uint8_t>::cast(name))) {
*handle = Handle<SharedFunctionInfo>(
SharedFunctionInfo::cast(cache_->get(i + 1)), isolate);
return true;
......
......@@ -49,7 +49,6 @@
#include "src/objects/struct-inl.h"
#include "src/objects/template-objects-inl.h"
#include "src/transitions-inl.h"
#include "src/unicode-cache.h"
#include "src/unicode-inl.h"
namespace v8 {
......@@ -632,8 +631,19 @@ Handle<AccessorPair> Factory::NewAccessorPair() {
// Internalized strings are created in the old generation (data space).
Handle<String> Factory::InternalizeUtf8String(Vector<const char> string) {
Utf8StringKey key(string, HashSeed(isolate()));
return InternalizeStringWithKey(&key);
Vector<const uint8_t> utf8_data = Vector<const uint8_t>::cast(string);
Utf8Decoder decoder(utf8_data);
if (decoder.is_ascii()) return InternalizeOneByteString(utf8_data);
if (decoder.is_one_byte()) {
std::unique_ptr<uint8_t[]> buffer(new uint8_t[decoder.utf16_length()]);
decoder.Decode(buffer.get(), utf8_data);
return InternalizeOneByteString(
Vector<const uint8_t>(buffer.get(), decoder.utf16_length()));
}
std::unique_ptr<uint16_t[]> buffer(new uint16_t[decoder.utf16_length()]);
decoder.Decode(buffer.get(), utf8_data);
return InternalizeTwoByteString(
Vector<const uc16>(buffer.get(), decoder.utf16_length()));
}
Handle<String> Factory::InternalizeOneByteString(Vector<const uint8_t> string) {
......@@ -675,122 +685,86 @@ MaybeHandle<String> Factory::NewStringFromOneByte(Vector<const uint8_t> string,
return result;
}
MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> data,
AllocationType allocation) {
DCHECK_NE(allocation, AllocationType::kReadOnly);
// Check for ASCII first since this is the common case.
const char* ascii_data = string.start();
int length = string.length();
int non_ascii_start = String::NonAsciiStart(ascii_data, length);
if (non_ascii_start >= length) {
// If the string is ASCII, we do not need to convert the characters
// since UTF8 is backwards compatible with ASCII.
return NewStringFromOneByte(Vector<const uint8_t>::cast(string),
allocation);
}
std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
const uint8_t* cursor =
reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
Vector<const uint8_t> utf8_data = Vector<const uint8_t>::cast(data);
Utf8Decoder decoder(utf8_data);
uint16_t* output_cursor = buffer.get();
if (decoder.utf16_length() == 0) return empty_string();
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
*(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
} else if (t == unibrow::Utf8::kIncomplete) {
continue;
} else {
*(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
*(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
}
}
if (decoder.is_one_byte()) {
// Allocate string.
Handle<SeqOneByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawOneByteString(decoder.utf16_length(), allocation), String);
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
*(output_cursor++) = static_cast<uc16>(t);
DisallowHeapAllocation no_gc;
decoder.Decode(result->GetChars(no_gc), utf8_data);
return result;
}
DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
int utf16_length = static_cast<int>(output_cursor - buffer.get());
DCHECK_GT(utf16_length, 0);
// Allocate string.
Handle<SeqTwoByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, allocation), String);
DCHECK_LE(non_ascii_start + utf16_length, length);
NewRawTwoByteString(decoder.utf16_length(), allocation), String);
DisallowHeapAllocation no_gc;
uint16_t* data = result->GetChars(no_gc);
CopyChars(data, ascii_data, non_ascii_start);
CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
decoder.Decode(result->GetChars(no_gc), utf8_data);
return result;
}
MaybeHandle<String> Factory::NewStringFromUtf8SubString(
Handle<SeqOneByteString> str, int begin, int length,
AllocationType allocation) {
Access<UnicodeCache::Utf8Decoder> decoder(
isolate()->unicode_cache()->utf8_decoder());
int non_ascii_start;
int utf16_length = 0;
Vector<const uint8_t> utf8_data;
{
DisallowHeapAllocation no_gc;
const char* ascii_data =
reinterpret_cast<const char*>(str->GetChars(no_gc) + begin);
non_ascii_start = String::NonAsciiStart(ascii_data, length);
if (non_ascii_start < length) {
// Non-ASCII and we need to decode.
auto non_ascii = Vector<const char>(ascii_data + non_ascii_start,
length - non_ascii_start);
decoder->Reset(non_ascii);
utf16_length = static_cast<int>(decoder->Utf16Length());
utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
}
Utf8Decoder decoder(utf8_data);
if (length == 1) {
uint16_t t;
// Decode even in the case of length 1 since it can be a bad character.
decoder.Decode(&t, utf8_data);
return LookupSingleCharacterStringFromCode(t);
}
if (non_ascii_start >= length) {
if (decoder.is_ascii()) {
// If the string is ASCII, we can just make a substring.
// TODO(v8): the allocation flag is ignored in this case.
return NewSubString(str, begin, begin + length);
}
DCHECK_GT(utf16_length, 0);
DCHECK_GT(decoder.utf16_length(), 0);
if (decoder.is_one_byte()) {
// Allocate string.
Handle<SeqTwoByteString> result;
Handle<SeqOneByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawTwoByteString(non_ascii_start + utf16_length, allocation), String);
NewRawOneByteString(decoder.utf16_length(), allocation), String);
DisallowHeapAllocation no_gc;
// Update pointer references, since the original string may have moved after
// allocation.
DisallowHeapAllocation no_gc;
const char* ascii_data =
reinterpret_cast<const char*>(str->GetChars(no_gc) + begin);
auto non_ascii = Vector<const char>(ascii_data + non_ascii_start,
length - non_ascii_start);
// Copy ASCII portion.
uint16_t* data = result->GetChars(no_gc);
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
decoder.Decode(result->GetChars(no_gc), utf8_data);
return result;
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length, non_ascii);
// Allocate string.
Handle<SeqTwoByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(
isolate(), result,
NewRawTwoByteString(decoder.utf16_length(), allocation), String);
DisallowHeapAllocation no_gc;
// Update pointer references, since the original string may have moved after
// allocation.
utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
decoder.Decode(result->GetChars(no_gc), utf8_data);
return result;
}
......@@ -830,37 +804,10 @@ MaybeHandle<String> Factory::NewStringFromTwoByte(
namespace {
bool inline IsOneByte(Vector<const char> str, int chars) {
// TODO(dcarney): incorporate Latin-1 check when Latin-1 is supported?
return chars == str.length();
}
bool inline IsOneByte(Handle<String> str) {
return str->IsOneByteRepresentation();
}
inline void WriteOneByteData(Vector<const char> vector, uint8_t* chars,
int len) {
// Only works for one byte strings.
DCHECK(vector.length() == len);
MemCopy(chars, vector.start(), len);
}
inline void WriteTwoByteData(Vector<const char> vector, uint16_t* chars,
int len) {
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(vector);
while (!it.Done()) {
DCHECK_GT(len, 0);
len -= 1;
uint16_t c = *it;
++it;
DCHECK_NE(unibrow::Utf8::kBadChar, c);
*chars++ = c;
}
DCHECK_EQ(len, 0);
}
inline void WriteOneByteData(Handle<String> s, uint8_t* chars, int len) {
DCHECK(s->length() == len);
String::WriteToFlat(*s, chars, 0, len);
......@@ -956,19 +903,6 @@ Handle<String> Factory::AllocateInternalizedStringImpl(T t, int chars,
return answer;
}
// Builds an internalized string from the UTF-8 bytes in |str|. |chars| is the
// character count supplied by the caller and |hash_field| is stored with the
// new string.
Handle<String> Factory::NewInternalizedStringFromUtf8(Vector<const char> str,
                                                      int chars,
                                                      uint32_t hash_field) {
  // Anything that is not one byte per character is delegated to the generic
  // implementation.
  if (!IsOneByte(str, chars)) {
    return AllocateInternalizedStringImpl<false>(str, chars, hash_field);
  }
  // One byte per character: the input bytes are copied over unchanged.
  const int byte_length = str.length();
  Handle<SeqOneByteString> one_byte =
      AllocateRawOneByteInternalizedString(byte_length, hash_field);
  DisallowHeapAllocation no_gc;
  MemCopy(one_byte->GetChars(no_gc), str.start(), byte_length);
  return one_byte;
}
Handle<String> Factory::NewOneByteInternalizedString(Vector<const uint8_t> str,
uint32_t hash_field) {
Handle<SeqOneByteString> result =
......
......@@ -314,11 +314,6 @@ class V8_EXPORT_PRIVATE Factory {
Handle<JSStringIterator> NewJSStringIterator(Handle<String> string);
// Allocates an internalized string in old space based on the character
// stream.
Handle<String> NewInternalizedStringFromUtf8(Vector<const char> str,
int chars, uint32_t hash_field);
Handle<String> NewOneByteInternalizedString(Vector<const uint8_t> str,
uint32_t hash_field);
......
......@@ -71,7 +71,6 @@
#include "src/string-stream.h"
#include "src/tracing/tracing-category-observer.h"
#include "src/trap-handler/trap-handler.h"
#include "src/unicode-cache.h"
#include "src/v8.h"
#include "src/v8threads.h"
#include "src/version.h"
......@@ -3056,9 +3055,6 @@ Isolate::~Isolate() {
delete entry_stack_;
entry_stack_ = nullptr;
delete unicode_cache_;
unicode_cache_ = nullptr;
delete date_cache_;
date_cache_ = nullptr;
......@@ -3330,7 +3326,6 @@ bool Isolate::Init(ReadOnlyDeserializer* read_only_deserializer,
compilation_cache_ = new CompilationCache(this);
descriptor_lookup_cache_ = new DescriptorLookupCache();
unicode_cache_ = new UnicodeCache();
inner_pointer_to_code_cache_ = new InnerPointerToCodeCache(this);
global_handles_ = new GlobalHandles(this);
eternal_handles_ = new EternalHandles();
......
......@@ -397,17 +397,18 @@ Handle<Object> JSStackFrame::GetMethodName() {
}
Handle<String> name(function_->shared()->Name(), isolate_);
name = String::Flatten(isolate_, name);
// The static initializer function is not a method, so don't add a
// class name, just return the function name.
if (name->IsUtf8EqualTo(CStrVector("<static_fields_initializer>"), true)) {
if (name->HasOneBytePrefix(CStrVector("<static_fields_initializer>"))) {
return name;
}
// ES2015 gives getters and setters name prefixes which must
// be stripped to find the property name.
if (name->IsUtf8EqualTo(CStrVector("get "), true) ||
name->IsUtf8EqualTo(CStrVector("set "), true)) {
if (name->HasOneBytePrefix(CStrVector("get ")) ||
name->HasOneBytePrefix(CStrVector("set "))) {
name = isolate_->factory()->NewProperSubString(name, 4, name->length());
}
if (CheckMethodName(isolate_, receiver, name, function_,
......
......@@ -4558,47 +4558,6 @@ uint32_t StringHasher::GetHashField() {
}
}
// Computes the hash field for the UTF-8 encoded |chars| using |seed|, and
// stores the number of UTF-16 code units the input decodes to in
// |*utf16_length_out|. The whole input is always iterated so the length is
// exact, but only the first String::kMaxHashCalcLength code units are fed to
// the hasher.
uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars, uint64_t seed,
int* utf16_length_out) {
int vector_length = chars.length();
// Handle some edge cases
// Inputs of at most one byte are hashed directly as a sequential string; the
// DCHECK requires that a single byte is within the one-byte UTF-8 range.
if (vector_length <= 1) {
DCHECK(vector_length == 0 ||
static_cast<uint8_t>(chars.start()[0]) <=
unibrow::Utf8::kMaxOneByteChar);
*utf16_length_out = vector_length;
return HashSequentialString(chars.start(), vector_length, seed);
}
// Start with a fake length which won't affect computation.
// It will be updated later.
StringHasher hasher(String::kMaxArrayIndexSize, seed);
DCHECK(hasher.is_array_index_);
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(chars);
int utf16_length = 0;
bool is_index = true;
// Hash phase: feed decoded code units to the hasher. Once UpdateIndex returns
// false it is not called again.
while (utf16_length < String::kMaxHashCalcLength && !it.Done()) {
utf16_length++;
uint16_t c = *it;
++it;
hasher.AddCharacter(c);
if (is_index) is_index = hasher.UpdateIndex(c);
}
// Now that hashing is done, we just need to calculate utf16_length
while (!it.Done()) {
++it;
utf16_length++;
}
*utf16_length_out = utf16_length;
// Must set length here so that hash computation is correct.
hasher.length_ = utf16_length;
return hasher.GetHashField();
}
void IteratingStringHasher::VisitConsString(ConsString cons_string) {
// Run small ConsStrings through ConsStringIterator.
if (cons_string->length() < 64) {
......
......@@ -265,27 +265,6 @@ class TwoByteStringKey : public SequentialStringKey<uc16> {
Handle<String> AsHandle(Isolate* isolate) override;
};
// Utf8StringKey is a string-table key backed by a vector of UTF-8 encoded
// chars. The vector is stored by value (it is a view; the key does not copy
// the underlying bytes).
class Utf8StringKey : public StringTableKey {
public:
// Computes the hash of |string| with |seed| and passes it to the base class.
// ComputeUtf8Hash writes the decoded length into chars_ through the
// out-pointer while the base-class initializer runs.
explicit Utf8StringKey(Vector<const char> string, uint64_t seed)
: StringTableKey(StringHasher::ComputeUtf8Hash(string, seed, &chars_)),
string_(string) {}
// Returns whether |string| (cast to String) equals the UTF-8 key contents.
bool IsMatch(Object string) override {
return String::cast(string)->IsUtf8EqualTo(string_);
}
// Creates the internalized string for this key via the isolate's factory,
// reusing the cached character count and hash field.
Handle<String> AsHandle(Isolate* isolate) override {
return isolate->factory()->NewInternalizedStringFromUtf8(string_, chars_,
HashField());
}
private:
Vector<const char> string_;
int chars_; // Caches the number of characters when computing the hash code.
};
bool String::Equals(String other) {
if (other == *this) return true;
if (this->IsInternalizedString() && other->IsInternalizedString()) {
......
......@@ -1180,26 +1180,6 @@ Object String::LastIndexOf(Isolate* isolate, Handle<Object> receiver,
return Smi::FromInt(last_index);
}
// Compares this string against the UTF-8 encoded |str| by decoding |str| with
// a Utf8Iterator and comparing each decoded value against Get(i).
// With |allow_prefix_match| false, returns true only if both sequences are
// fully consumed and every position matched. With |allow_prefix_match| true,
// returns true if all of |str| was consumed and matched, i.e. |str| is a
// prefix of this string.
bool String::IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match) {
int slen = length();
// Can't check exact length equality, but we can check bounds.
int str_len = str.length();
// For an exact match the byte length must be at least slen and at most
// slen * kMaxEncodedSize; anything outside that range is rejected early.
if (!allow_prefix_match &&
(str_len < slen ||
str_len > slen * static_cast<int>(unibrow::Utf8::kMaxEncodedSize))) {
return false;
}
int i = 0;
unibrow::Utf8Iterator it = unibrow::Utf8Iterator(str);
while (i < slen && !it.Done()) {
if (Get(i++) != *it) return false;
++it;
}
// |str| must always be exhausted; this string must be exhausted too unless a
// prefix match was requested.
return (allow_prefix_match || i == slen) && it.Done();
}
template <>
bool String::IsEqualTo(Vector<const uint8_t> str) {
return IsOneByteEqualTo(str);
......@@ -1210,6 +1190,18 @@ bool String::IsEqualTo(Vector<const uc16> str) {
return IsTwoByteEqualTo(str);
}
// Returns true if the first str.length() characters of this string compare
// equal to the characters of |str|. Returns false when |str| is longer than
// this string.
// NOTE(review): reads the string through GetFlatContent, so presumably the
// receiver must already be flat - confirm against callers.
bool String::HasOneBytePrefix(Vector<const char> str) {
  const int prefix_length = str.length();
  // A prefix longer than the whole string can never match.
  if (length() < prefix_length) return false;
  DisallowHeapAllocation no_gc;
  FlatContent flat = GetFlatContent(no_gc);
  // Pick the comparison that matches the width of the flat content.
  const int difference =
      flat.IsOneByte()
          ? CompareChars(flat.ToOneByteVector().start(), str.start(),
                         prefix_length)
          : CompareChars(flat.ToUC16Vector().start(), str.start(),
                         prefix_length);
  return difference == 0;
}
bool String::IsOneByteEqualTo(Vector<const uint8_t> str) {
int slen = length();
if (str.length() != slen) return false;
......
......@@ -268,14 +268,16 @@ class String : public Name {
inline bool Equals(String other);
inline static bool Equals(Isolate* isolate, Handle<String> one,
Handle<String> two);
V8_EXPORT_PRIVATE bool IsUtf8EqualTo(Vector<const char> str,
bool allow_prefix_match = false);
// Dispatches to Is{One,Two}ByteEqualTo.
template <typename Char>
bool IsEqualTo(Vector<const Char> str);
V8_EXPORT_PRIVATE bool HasOneBytePrefix(Vector<const char> str);
V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const uint8_t> str);
V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const char> str) {
return IsOneByteEqualTo(Vector<const uint8_t>::cast(str));
}
bool IsTwoByteEqualTo(Vector<const uc16> str);
// Return a UTF8 representation of the string. The string is null
......@@ -366,50 +368,12 @@ class String : public Name {
EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
static void WriteToFlat(String source, sinkchar* sink, int from, int to);
// The return value may point to the first aligned word containing the first
// non-one-byte character, rather than directly to the non-one-byte character.
// If the return value is >= the passed length, the entire string was
// one-byte.
static inline int NonAsciiStart(const char* chars, int length) {
const char* start = chars;
const char* limit = chars + length;
if (length >= kIntptrSize) {
// Check unaligned bytes.
while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
return static_cast<int>(chars - start);
}
++chars;
}
// Check aligned words.
DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
while (chars + sizeof(uintptr_t) <= limit) {
if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
return static_cast<int>(chars - start);
}
chars += sizeof(uintptr_t);
}
}
// Check remaining unaligned bytes.
while (chars < limit) {
if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
return static_cast<int>(chars - start);
}
++chars;
}
return static_cast<int>(chars - start);
}
static inline bool IsAscii(const char* chars, int length) {
return NonAsciiStart(chars, length) >= length;
return IsAscii(reinterpret_cast<const uint8_t*>(chars), length);
}
static inline bool IsAscii(const uint8_t* chars, int length) {
return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
length;
return NonAsciiStart(chars, length) >= length;
}
static inline int NonOneByteStart(const uc16* chars, int length) {
......
......@@ -16,7 +16,6 @@
#include "src/message-template.h"
#include "src/parsing/token.h"
#include "src/pointer-with-payload.h"
#include "src/unicode-decoder.h"
#include "src/unicode.h"
namespace v8 {
......
......@@ -5,6 +5,7 @@
#include "src/regexp/regexp-stack.h"
#include "src/isolate.h"
#include "src/memcopy.h"
namespace v8 {
namespace internal {
......
......@@ -8,6 +8,7 @@
#include "src/base/platform/platform.h"
#include "src/counters.h"
#include "src/memcopy.h"
#include "src/snapshot/partial-deserializer.h"
#include "src/snapshot/read-only-deserializer.h"
#include "src/snapshot/startup-deserializer.h"
......
......@@ -24,10 +24,6 @@ class V8_EXPORT_PRIVATE StringHasher {
static inline uint32_t HashSequentialString(const schar* chars, int length,
uint64_t seed);
// Reads all the data, even for long strings and computes the utf16 length.
static uint32_t ComputeUtf8Hash(Vector<const char> chars, uint64_t seed,
int* utf16_length_out);
// Calculated hash value for a string consisting of 1 to
// String::kMaxArrayIndexSize digits with no leading zeros (except "0").
// value is represented decimal value.
......
// Copyright 2015 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_UNICODE_CACHE_H_
#define V8_UNICODE_CACHE_H_
#include "src/base/macros.h"
#include "src/unicode-decoder.h"
#include "src/unicode.h"
#include "src/utils.h"
namespace v8 {
namespace internal {
// Caching predicates used by scanners.
class UnicodeCache {
public:
UnicodeCache() = default;
typedef unibrow::Utf8Decoder<512> Utf8Decoder;
StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
private:
StaticResource<Utf8Decoder> utf8_decoder_;
DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
};
} // namespace internal
} // namespace v8
#endif // V8_UNICODE_CACHE_H_
......@@ -2,87 +2,80 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/unicode-inl.h"
#include "src/unicode-decoder.h"
#include <stdio.h>
#include <stdlib.h>
namespace unibrow {
#include "src/memcopy.h"
#include "src/unicode-inl.h"
uint16_t Utf8Iterator::operator*() {
if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode)) {
return trailing_ ? Utf16::TrailSurrogate(char_)
: Utf16::LeadSurrogate(char_);
namespace v8 {
namespace internal {
Utf8Decoder::Utf8Decoder(const Vector<const uint8_t>& chars)
: encoding_(Encoding::kAscii),
non_ascii_start_(NonAsciiStart(chars.start(), chars.length())),
utf16_length_(non_ascii_start_) {
if (non_ascii_start_ == chars.length()) return;
const uint8_t* cursor = chars.start() + non_ascii_start_;
const uint8_t* end = chars.start() + chars.length();
bool is_one_byte = true;
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
is_one_byte = is_one_byte && t <= unibrow::Latin1::kMaxChar;
utf16_length_++;
if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) utf16_length_++;
}
DCHECK_EQ(trailing_, false);
return char_;
}
Utf8Iterator& Utf8Iterator::operator++() {
if (V8_UNLIKELY(this->Done())) {
char_ = Utf8::kBufferEmpty;
return *this;
}
if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode && !trailing_)) {
trailing_ = true;
return *this;
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) {
is_one_byte = false;
utf16_length_++;
}
trailing_ = false;
offset_ = cursor_;
char_ =
Utf8::ValueOf(reinterpret_cast<const uint8_t*>(stream_.begin()) + cursor_,
stream_.length() - cursor_, &cursor_);
return *this;
encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
}
Utf8Iterator Utf8Iterator::operator++(int) {
Utf8Iterator old(*this);
++*this;
return old;
}
template <typename Char>
void Utf8Decoder::Decode(Char* out, const Vector<const uint8_t>& data) {
CopyChars(out, data.start(), non_ascii_start_);
bool Utf8Iterator::Done() {
return offset_ == static_cast<size_t>(stream_.length());
}
out += non_ascii_start_;
uint32_t incomplete_char = 0;
unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& stream) {
size_t utf16_length = 0;
const uint8_t* cursor = data.start() + non_ascii_start_;
const uint8_t* end = data.start() + data.length();
Utf8Iterator it = Utf8Iterator(stream);
// Loop until stream is read, writing to buffer as long as buffer has space.
while (utf16_length < buffer_length && !it.Done()) {
*buffer++ = *it;
++it;
utf16_length++;
while (cursor < end) {
unibrow::uchar t =
unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
if (t != unibrow::Utf8::kIncomplete) {
if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
*(out++) = static_cast<Char>(t);
} else {
*(out++) = unibrow::Utf16::LeadSurrogate(t);
*(out++) = unibrow::Utf16::TrailSurrogate(t);
}
bytes_read_ = it.Offset();
trailing_ = it.Trailing();
chars_written_ = utf16_length;
// Now that writing to buffer is done, we just need to calculate utf16_length
while (!it.Done()) {
++it;
utf16_length++;
}
utf16_length_ = utf16_length;
}
void Utf8DecoderBase::WriteUtf16Slow(
uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream, size_t offset,
bool trailing) {
Utf8Iterator it = Utf8Iterator(stream, offset, trailing);
while (!it.Done()) {
DCHECK_GT(length--, 0);
*data++ = *it;
++it;
}
unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
if (t != unibrow::Utf8::kBufferEmpty) *out = static_cast<Char>(t);
}
} // namespace unibrow
template void Utf8Decoder::Decode(uint8_t* out,
const Vector<const uint8_t>& data);
template void Utf8Decoder::Decode(uint16_t* out,
const Vector<const uint8_t>& data);
} // namespace internal
} // namespace v8
......@@ -5,154 +5,70 @@
#ifndef V8_UNICODE_DECODER_H_
#define V8_UNICODE_DECODER_H_
#include <sys/types.h>
#include <algorithm>
#include "src/globals.h"
#include "src/memcopy.h"
#include "src/unicode.h"
#include "src/vector.h"
namespace unibrow {
class Utf8Iterator {
public:
explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
: Utf8Iterator(stream, 0, false) {}
Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
bool trailing)
: stream_(stream),
cursor_(offset),
offset_(0),
char_(0),
trailing_(false) {
DCHECK_LE(offset, stream.length());
// Read the first char, setting offset_ to offset in the process.
++*this;
// This must be set after reading the first char, since the offset marks
// the start of the octet sequence that the trailing char is part of.
trailing_ = trailing;
if (trailing) {
DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
namespace v8 {
namespace internal {
// The return value may point to the first aligned word containing the first
// non-one-byte character, rather than directly to the non-one-byte character.
// If the return value is >= the passed length, the entire string was
// one-byte.
inline int NonAsciiStart(const uint8_t* chars, int length) {
const uint8_t* start = chars;
const uint8_t* limit = chars + length;
if (static_cast<size_t>(length) >= kIntptrSize) {
// Check unaligned bytes.
while (!IsAligned(reinterpret_cast<intptr_t>(chars), kIntptrSize)) {
if (*chars > unibrow::Utf8::kMaxOneByteChar) {
return static_cast<int>(chars - start);
}
++chars;
}
// Check aligned words.
DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
while (chars + sizeof(uintptr_t) <= limit) {
if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
return static_cast<int>(chars - start);
}
chars += sizeof(uintptr_t);
}
}
// Check remaining unaligned bytes.
while (chars < limit) {
if (*chars > unibrow::Utf8::kMaxOneByteChar) {
return static_cast<int>(chars - start);
}
++chars;
}
uint16_t operator*();
Utf8Iterator& operator++();
Utf8Iterator operator++(int);
bool Done();
bool Trailing() { return trailing_; }
size_t Offset() { return offset_; }
private:
const v8::internal::Vector<const char>& stream_;
size_t cursor_;
size_t offset_;
uint32_t char_;
bool trailing_;
};
return static_cast<int>(chars - start);
}
class V8_EXPORT_PRIVATE Utf8DecoderBase {
class V8_EXPORT_PRIVATE Utf8Decoder final {
public:
// Initialization done in subclass.
inline Utf8DecoderBase();
inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& stream);
inline size_t Utf16Length() const { return utf16_length_; }
protected:
// This reads all characters and sets the utf16_length_.
// The first buffer_length utf16 chars are cached in the buffer.
void Reset(uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& vector);
static void WriteUtf16Slow(uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream,
size_t offset, bool trailing);
enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16 };
size_t bytes_read_;
size_t chars_written_;
size_t utf16_length_;
bool trailing_;
explicit Utf8Decoder(const Vector<const uint8_t>& chars);
private:
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
};
bool is_ascii() const { return encoding_ == Encoding::kAscii; }
// True when every decoded character fits in a single byte, i.e. the input was
// classified as either kAscii or kLatin1. The comparison relies on the
// declaration order of Encoding (kAscii < kLatin1 < kUtf16) and must be
// inclusive of kLatin1; a strict '<' would make this equivalent to is_ascii().
bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; }
int utf16_length() const { return utf16_length_; }
int non_ascii_start() const { return non_ascii_start_; }
template <size_t kBufferSize>
class Utf8Decoder : public Utf8DecoderBase {
public:
inline Utf8Decoder() = default;
explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
inline void Reset(const v8::internal::Vector<const char>& stream);
inline size_t WriteUtf16(
uint16_t* data, size_t length,
const v8::internal::Vector<const char>& stream) const;
template <typename Char>
V8_EXPORT_PRIVATE void Decode(Char* out, const Vector<const uint8_t>& data);
private:
uint16_t buffer_[kBufferSize];
};
Utf8DecoderBase::Utf8DecoderBase()
: bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
Utf8DecoderBase::Utf8DecoderBase(
uint16_t* buffer, size_t buffer_length,
const v8::internal::Vector<const char>& stream) {
Reset(buffer, buffer_length, stream);
}
template <size_t kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(
const v8::internal::Vector<const char>& stream)
: Utf8DecoderBase(buffer_, kBufferSize, stream) {}
template <size_t kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(
const v8::internal::Vector<const char>& stream) {
Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
}
template <size_t kBufferSize>
size_t Utf8Decoder<kBufferSize>::WriteUtf16(
uint16_t* data, size_t data_length,
const v8::internal::Vector<const char>& stream) const {
DCHECK_GT(data_length, 0);
data_length = std::min(data_length, utf16_length_);
// memcpy everything in buffer.
size_t memcpy_length = std::min(data_length, chars_written_);
v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
if (data_length <= chars_written_) return data_length;
// Copy the rest the slow way.
WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
bytes_read_, trailing_);
return data_length;
}
class Latin1 {
public:
static const unsigned kMaxChar = 0xff;
// Convert the character to Latin-1 case equivalent if possible.
static inline uint16_t TryConvertToLatin1(uint16_t);
Encoding encoding_;
int non_ascii_start_;
int utf16_length_;
};
uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
switch (c) {
// This are equivalent characters in unicode.
case 0x39c:
case 0x3bc:
return 0xb5;
// This is an uppercase of a Latin-1 character
// outside of Latin-1.
case 0x178:
return 0xff;
}
return c;
}
} // namespace unibrow
} // namespace internal
} // namespace v8
#endif // V8_UNICODE_DECODER_H_
......@@ -131,6 +131,25 @@ class Utf16 {
}
};
// Constants and helpers for the Latin-1 (one-byte) character range.
class Latin1 {
 public:
  // Largest character value that is still Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Maps |c| to its Latin-1 case equivalent when one exists; any other value
  // is returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t c) {
    // 0x39c and 0x3bc are treated as equivalent to the Latin-1 character 0xb5.
    if (c == 0x39c || c == 0x3bc) return 0xb5;
    // 0x178 lies outside Latin-1 but is the uppercase of the Latin-1
    // character 0xff.
    if (c == 0x178) return 0xff;
    return c;
  }
};
class V8_EXPORT_PRIVATE Utf8 {
public:
using State = Utf8DfaDecoder::State;
......
......@@ -153,13 +153,13 @@ static void CheckOddball(Isolate* isolate, Object obj, const char* string) {
CHECK(obj->IsOddball());
Handle<Object> handle(obj, isolate);
Object print_string = *Object::ToString(isolate, handle).ToHandleChecked();
CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
CHECK(String::cast(print_string)->IsOneByteEqualTo(CStrVector(string)));
}
static void CheckSmi(Isolate* isolate, int value, const char* string) {
Handle<Object> handle(Smi::FromInt(value), isolate);
Object print_string = *Object::ToString(isolate, handle).ToHandleChecked();
CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
CHECK(String::cast(print_string)->IsOneByteEqualTo(CStrVector(string)));
}
......@@ -168,7 +168,7 @@ static void CheckNumber(Isolate* isolate, double value, const char* string) {
CHECK(number->IsNumber());
Handle<Object> print_string =
Object::ToString(isolate, number).ToHandleChecked();
CHECK(String::cast(*print_string)->IsUtf8EqualTo(CStrVector(string)));
CHECK(String::cast(*print_string)->IsOneByteEqualTo(CStrVector(string)));
}
void CheckEmbeddedObjectsAreEqual(Handle<Code> lhs, Handle<Code> rhs) {
......@@ -764,10 +764,10 @@ static void CheckInternalizedStrings(const char** strings) {
CHECK(a->IsInternalizedString());
Handle<String> b = factory->InternalizeUtf8String(string);
CHECK_EQ(*b, *a);
CHECK(b->IsUtf8EqualTo(CStrVector(string)));
CHECK(b->IsOneByteEqualTo(CStrVector(string)));
b = isolate->factory()->InternalizeUtf8String(CStrVector(string));
CHECK_EQ(*b, *a);
CHECK(b->IsUtf8EqualTo(CStrVector(string)));
CHECK(b->IsOneByteEqualTo(CStrVector(string)));
}
}
......
......@@ -17,8 +17,10 @@ namespace internal {
static void CheckObject(Isolate* isolate, Handle<Object> obj,
const char* string) {
Object print_string = *Object::NoSideEffectsToString(isolate, obj);
CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
Handle<String> print_string = String::Flatten(
isolate,
Handle<String>::cast(Object::NoSideEffectsToString(isolate, obj)));
CHECK(print_string->IsOneByteEqualTo(CStrVector(string)));
}
static void CheckSmi(Isolate* isolate, int value, const char* string) {
......
......@@ -25,15 +25,13 @@ void DecodeNormally(const std::vector<byte>& bytes,
}
}
template <size_t kBufferSize>
void DecodeUtf16(unibrow::Utf8Decoder<kBufferSize>* decoder,
const std::vector<byte>& bytes,
void DecodeUtf16(const std::vector<uint8_t>& bytes,
std::vector<unibrow::uchar>* output) {
auto vector = Vector<const char>::cast(VectorOf(bytes));
decoder->Reset(vector);
auto utf8_data = Vector<const uint8_t>::cast(VectorOf(bytes));
Utf8Decoder decoder(utf8_data);
std::vector<uint16_t> utf16(decoder->Utf16Length());
decoder->WriteUtf16(&(*utf16.begin()), decoder->Utf16Length(), vector);
std::vector<uint16_t> utf16(decoder.utf16_length());
decoder.Decode(&utf16[0], utf8_data);
// Decode back into code points
for (size_t i = 0; i < utf16.size(); i++) {
......@@ -68,8 +66,6 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
} // namespace
TEST(UnicodeTest, Utf16BufferReuse) {
unibrow::Utf8Decoder<4> utf16_decoder;
// Not enough continuation bytes before string ends.
typedef struct {
std::vector<byte> bytes;
......@@ -94,7 +90,7 @@ TEST(UnicodeTest, Utf16BufferReuse) {
fprintf(stderr, "\n");
std::vector<unibrow::uchar> output_utf16;
DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
DecodeUtf16(test.bytes, &output_utf16);
CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
for (size_t i = 0; i < output_utf16.size(); ++i) {
......@@ -104,12 +100,9 @@ TEST(UnicodeTest, Utf16BufferReuse) {
}
TEST(UnicodeTest, SurrogateOverrunsBuffer) {
unibrow::Utf8Decoder<2> utf16_decoder;
std::vector<unibrow::uchar> output_utf16;
// Not enough continuation bytes before string ends.
DecodeUtf16(&utf16_decoder, {0x00, 0xF0, 0x90, 0x80, 0x80, 0x00},
&output_utf16);
DecodeUtf16({0x00, 0xF0, 0x90, 0x80, 0x80, 0x00}, &output_utf16);
CHECK_EQ(output_utf16[0], 0x00);
CHECK_EQ(output_utf16[1], 0x10000);
CHECK_EQ(output_utf16[0], 0x00);
......@@ -466,8 +459,6 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
0x8FFFF}},
};
unibrow::Utf8Decoder<50> utf16_decoder;
for (auto test : data) {
// For figuring out which test fails:
fprintf(stderr, "test: ");
......@@ -493,7 +484,7 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
}
std::vector<unibrow::uchar> output_utf16;
DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
DecodeUtf16(test.bytes, &output_utf16);
CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
for (size_t i = 0; i < output_utf16.size(); ++i) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment