Commit c3da9290 authored by Yang Guo's avatar Yang Guo Committed by Commit Bot

Simplify Utf8Length calculation.

Flattening the string upfront has performance benefits and we can
also simplify the implementation quite a bit.

Bug: v8:6780
Cq-Include-Trybots: luci.chromium.try:linux_chromium_rel_ng
Change-Id: Ic75bdfdf900d30e51e95cdf1cb8d09aab06332c6
Reviewed-on: https://chromium-review.googlesource.com/1014102Reviewed-by: 's avatarMarja Hölttä <marja@chromium.org>
Reviewed-by: 's avatarMathias Bynens <mathias@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#52643}
parent fbf9b5e4
......@@ -5405,202 +5405,28 @@ bool String::ContainsOnlyOneByte() const {
}
class Utf8LengthHelper : public i::AllStatic {
public:
enum State {
kEndsWithLeadingSurrogate = 1 << 0,
kStartsWithTrailingSurrogate = 1 << 1,
kLeftmostEdgeIsCalculated = 1 << 2,
kRightmostEdgeIsCalculated = 1 << 3,
kLeftmostEdgeIsSurrogate = 1 << 4,
kRightmostEdgeIsSurrogate = 1 << 5
};
static const uint8_t kInitialState = 0;
static inline bool EndsWithSurrogate(uint8_t state) {
return state & kEndsWithLeadingSurrogate;
}
static inline bool StartsWithSurrogate(uint8_t state) {
return state & kStartsWithTrailingSurrogate;
}
class Visitor {
public:
Visitor() : utf8_length_(0), state_(kInitialState) {}
void VisitOneByteString(const uint8_t* chars, int length) {
int utf8_length = 0;
// Add in length 1 for each non-Latin1 character.
for (int i = 0; i < length; i++) {
utf8_length += *chars++ >> 7;
}
// Add in length 1 for each character.
utf8_length_ = utf8_length + length;
state_ = kInitialState;
}
void VisitTwoByteString(const uint16_t* chars, int length) {
int utf8_length = 0;
int last_character = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < length; i++) {
uint16_t c = chars[i];
utf8_length += unibrow::Utf8::Length(c, last_character);
last_character = c;
}
utf8_length_ = utf8_length;
uint8_t state = 0;
if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
state |= kStartsWithTrailingSurrogate;
}
if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
state |= kEndsWithLeadingSurrogate;
}
state_ = state;
}
static i::ConsString* VisitFlat(i::String* string,
int* length,
uint8_t* state) {
Visitor visitor;
i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
*length = visitor.utf8_length_;
*state = visitor.state_;
return cons_string;
}
private:
int utf8_length_;
uint8_t state_;
DISALLOW_COPY_AND_ASSIGN(Visitor);
};
static inline void MergeLeafLeft(int* length,
uint8_t* state,
uint8_t leaf_state) {
bool edge_surrogate = StartsWithSurrogate(leaf_state);
if (!(*state & kLeftmostEdgeIsCalculated)) {
DCHECK(!(*state & kLeftmostEdgeIsSurrogate));
*state |= kLeftmostEdgeIsCalculated
| (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
} else if (EndsWithSurrogate(*state) && edge_surrogate) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
if (EndsWithSurrogate(leaf_state)) {
*state |= kEndsWithLeadingSurrogate;
} else {
*state &= ~kEndsWithLeadingSurrogate;
}
}
static inline void MergeLeafRight(int* length,
uint8_t* state,
uint8_t leaf_state) {
bool edge_surrogate = EndsWithSurrogate(leaf_state);
if (!(*state & kRightmostEdgeIsCalculated)) {
DCHECK(!(*state & kRightmostEdgeIsSurrogate));
*state |= (kRightmostEdgeIsCalculated
| (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
} else if (edge_surrogate && StartsWithSurrogate(*state)) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
if (StartsWithSurrogate(leaf_state)) {
*state |= kStartsWithTrailingSurrogate;
} else {
*state &= ~kStartsWithTrailingSurrogate;
}
}
static inline void MergeTerminal(int* length,
uint8_t state,
uint8_t* state_out) {
DCHECK((state & kLeftmostEdgeIsCalculated) &&
(state & kRightmostEdgeIsCalculated));
if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
*length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
}
*state_out = kInitialState |
(state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate : 0) |
(state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate : 0);
}
static int Calculate(i::ConsString* current, uint8_t* state_out) {
using internal::ConsString;
int total_length = 0;
uint8_t state = kInitialState;
while (true) {
i::String* left = current->first();
i::String* right = current->second();
uint8_t right_leaf_state;
uint8_t left_leaf_state;
int leaf_length;
ConsString* left_as_cons =
Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
if (left_as_cons == nullptr) {
total_length += leaf_length;
MergeLeafLeft(&total_length, &state, left_leaf_state);
}
ConsString* right_as_cons =
Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
if (right_as_cons == nullptr) {
total_length += leaf_length;
MergeLeafRight(&total_length, &state, right_leaf_state);
if (left_as_cons != nullptr) {
// 1 Leaf node. Descend in place.
current = left_as_cons;
continue;
} else {
// Terminal node.
MergeTerminal(&total_length, state, state_out);
return total_length;
}
} else if (left_as_cons == nullptr) {
// 1 Leaf node. Descend in place.
current = right_as_cons;
continue;
}
// Both strings are ConsStrings.
// Recurse on smallest.
if (left->length() < right->length()) {
total_length += Calculate(left_as_cons, &left_leaf_state);
MergeLeafLeft(&total_length, &state, left_leaf_state);
current = right_as_cons;
} else {
total_length += Calculate(right_as_cons, &right_leaf_state);
MergeLeafRight(&total_length, &state, right_leaf_state);
current = left_as_cons;
}
}
UNREACHABLE();
}
static inline int Calculate(i::ConsString* current) {
uint8_t state = kInitialState;
return Calculate(current, &state);
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
};
static int Utf8Length(i::String* str, i::Isolate* isolate) {
int length = str->length();
if (length == 0) return 0;
uint8_t state;
i::ConsString* cons_string =
Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
if (cons_string == nullptr) return length;
return Utf8LengthHelper::Calculate(cons_string);
}
int String::Utf8Length() const {
i::Handle<i::String> str = Utils::OpenHandle(this);
str = i::String::Flatten(str);
i::Isolate* isolate = str->GetIsolate();
return v8::Utf8Length(*str, isolate);
int length = str->length();
if (length == 0) return 0;
i::DisallowHeapAllocation no_gc;
i::String::FlatContent flat = str->GetFlatContent();
DCHECK(flat.IsFlat());
int utf8_length = 0;
if (flat.IsOneByte()) {
for (uint8_t c : flat.ToOneByteVector()) {
utf8_length += c >> 7;
}
utf8_length += length;
} else {
int last_character = unibrow::Utf16::kNoPreviousCharacter;
for (uint16_t c : flat.ToUC16Vector()) {
utf8_length += unibrow::Utf8::Length(c, last_character);
last_character = c;
}
}
return utf8_length;
}
......@@ -5825,7 +5651,7 @@ int String::WriteUtf8(char* buffer,
if (success) return writer.CompleteWrite(write_null, nchars_ref);
} else if (capacity >= string_length) {
// First check that the buffer is large enough.
int utf8_bytes = v8::Utf8Length(*str, isolate);
int utf8_bytes = Utf8Length();
if (utf8_bytes <= capacity) {
// one-byte fast path.
if (utf8_bytes == string_length) {
......@@ -5845,8 +5671,6 @@ int String::WriteUtf8(char* buffer,
return WriteUtf8(buffer, -1, nchars_ref, options);
}
}
// Recursive slow path can potentially be unreasonable slow. Flatten.
str = i::String::Flatten(str);
Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref);
......@@ -9055,8 +8879,7 @@ String::Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Local<v8::Value> obj)
TryCatch try_catch(isolate);
Local<String> str;
if (!obj->ToString(context).ToLocal(&str)) return;
i::Handle<i::String> i_str = Utils::OpenHandle(*str);
length_ = v8::Utf8Length(*i_str, i_isolate);
length_ = str->Utf8Length();
str_ = i::NewArray<char>(length_ + 1);
str->WriteUtf8(str_);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment