[runtime] Simplify/unify utf8 handling

- Removes Utf8Iterator
- Replaces Utf8Decoder with something based on ValueOfIncremental + NonAsciiStart and moves it into v8/internal.
- Internalizes utf8 strings by first converting them to one-byte or two-byte strings
- Removes IsUtf8EqualTo and replaces current uses with IsOneByteEqualTo

Tbr: jgruber@chromium.org
Change-Id: I16e08d910a745e78d6fd465718fc69ad731fd217
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1585840
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Igor Sheludko <ishell@chromium.org>
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61049}

[runtime] Simplify/unify utf8 handling
- Removes Utf8Iterator
- Replaces Utf8Decoder with something based on ValueOfIncremental + NonAsciiStart and moves it into v8/internal.
- Internalizes utf8 strings by first converting them to one-byte or two-byte strings
- Removes IsUtf8EqualTo and replaces current uses with IsOneByteEqualTo

Tbr: jgruber@chromium.org
Change-Id: I16e08d910a745e78d6fd465718fc69ad731fd217
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1585840
Commit-Queue: Toon Verwaest <verwaest@chromium.org>
Reviewed-by: Igor Sheludko <ishell@chromium.org>
Reviewed-by: Ulan Degenbaev <ulan@chromium.org>
Cr-Commit-Position: refs/heads/master@{#61049}
b7ed86ec · Toon Verwaest · Commit Bot · 7a70c55d · b7ed86ec · b7ed86ec
Commit b7ed86ec authored Apr 26, 2019 by Toon Verwaest Committed by Commit Bot Apr 26, 2019
21 changed files
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -2785,7 +2785,6 @@ v8_source_set("v8_base_without_compiler") {
    "src/type-hints.cc",
    "src/type-hints.h",
    "src/type-traits.h",
-    "src/unicode-cache.h",
    "src/unicode-decoder.cc",
    "src/unicode-decoder.h",
    "src/unicode-inl.h",

--- a/src/bootstrapper.cc
+++ b/src/bootstrapper.cc
@@ -70,7 +70,7 @@ bool SourceCodeCache::Lookup(Isolate* isolate, Vector<const char> name,
                             Handle<SharedFunctionInfo>* handle) {
  for (int i = 0; i < cache_->length(); i += 2) {
    SeqOneByteString str = SeqOneByteString::cast(cache_->get(i));
-    if (str->IsUtf8EqualTo(name)) {
+    if (str->IsOneByteEqualTo(Vector<const uint8_t>::cast(name))) {
      *handle = Handle<SharedFunctionInfo>(
          SharedFunctionInfo::cast(cache_->get(i + 1)), isolate);
      return true;

--- a/src/heap/factory.cc
+++ b/src/heap/factory.cc
@@ -49,7 +49,6 @@
 #include "src/objects/struct-inl.h"
 #include "src/objects/template-objects-inl.h"
 #include "src/transitions-inl.h"
-#include "src/unicode-cache.h"
 #include "src/unicode-inl.h"

 namespace v8 {
@@ -632,8 +631,19 @@ Handle<AccessorPair> Factory::NewAccessorPair() {

 // Internalized strings are created in the old generation (data space).
 Handle<String> Factory::InternalizeUtf8String(Vector<const char> string) {
-  Utf8StringKey key(string, HashSeed(isolate()));
-  return InternalizeStringWithKey(&key);
+  Vector<const uint8_t> utf8_data = Vector<const uint8_t>::cast(string);
+  Utf8Decoder decoder(utf8_data);
+  if (decoder.is_ascii()) return InternalizeOneByteString(utf8_data);
+  if (decoder.is_one_byte()) {
+    std::unique_ptr<uint8_t[]> buffer(new uint8_t[decoder.utf16_length()]);
+    decoder.Decode(buffer.get(), utf8_data);
+    return InternalizeOneByteString(
+        Vector<const uint8_t>(buffer.get(), decoder.utf16_length()));
+  }
+  std::unique_ptr<uint16_t[]> buffer(new uint16_t[decoder.utf16_length()]);
+  decoder.Decode(buffer.get(), utf8_data);
+  return InternalizeTwoByteString(
+      Vector<const uc16>(buffer.get(), decoder.utf16_length()));
 }

 Handle<String> Factory::InternalizeOneByteString(Vector<const uint8_t> string) {
@@ -675,122 +685,86 @@ MaybeHandle<String> Factory::NewStringFromOneByte(Vector<const uint8_t> string,
  return result;
 }

-MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> string,
+MaybeHandle<String> Factory::NewStringFromUtf8(Vector<const char> data,
                                               AllocationType allocation) {
-  DCHECK_NE(allocation, AllocationType::kReadOnly);
-  // Check for ASCII first since this is the common case.
-  const char* ascii_data = string.start();
-  int length = string.length();
-  int non_ascii_start = String::NonAsciiStart(ascii_data, length);
-  if (non_ascii_start >= length) {
-    // If the string is ASCII, we do not need to convert the characters
-    // since UTF8 is backwards compatible with ASCII.
-    return NewStringFromOneByte(Vector<const uint8_t>::cast(string),
-                                allocation);
-  }
-
-  std::unique_ptr<uint16_t[]> buffer(new uint16_t[length - non_ascii_start]);
-
-  const uint8_t* cursor =
-      reinterpret_cast<const uint8_t*>(&string[non_ascii_start]);
-  const uint8_t* end = reinterpret_cast<const uint8_t*>(string.end());
+  Vector<const uint8_t> utf8_data = Vector<const uint8_t>::cast(data);
+  Utf8Decoder decoder(utf8_data);

-  uint16_t* output_cursor = buffer.get();
+  if (decoder.utf16_length() == 0) return empty_string();

-  uint32_t incomplete_char = 0;
-  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
-
-  while (cursor < end) {
-    unibrow::uchar t =
-        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
-
-    if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
-      *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
-    } else if (t == unibrow::Utf8::kIncomplete) {
-      continue;
-    } else {
-      *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
-      *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
-    }
-  }
+  if (decoder.is_one_byte()) {
+    // Allocate string.
+    Handle<SeqOneByteString> result;
+    ASSIGN_RETURN_ON_EXCEPTION(
+        isolate(), result,
+        NewRawOneByteString(decoder.utf16_length(), allocation), String);

-  unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
-  if (t != unibrow::Utf8::kBufferEmpty) {
-    *(output_cursor++) = static_cast<uc16>(t);
+    DisallowHeapAllocation no_gc;
+    decoder.Decode(result->GetChars(no_gc), utf8_data);
+    return result;
  }

-  DCHECK_LE(output_cursor, buffer.get() + length - non_ascii_start);
-  int utf16_length = static_cast<int>(output_cursor - buffer.get());
-  DCHECK_GT(utf16_length, 0);
-
  // Allocate string.
  Handle<SeqTwoByteString> result;
  ASSIGN_RETURN_ON_EXCEPTION(
      isolate(), result,
-      NewRawTwoByteString(non_ascii_start + utf16_length, allocation), String);
-
-  DCHECK_LE(non_ascii_start + utf16_length, length);
+      NewRawTwoByteString(decoder.utf16_length(), allocation), String);

  DisallowHeapAllocation no_gc;
-  uint16_t* data = result->GetChars(no_gc);
-  CopyChars(data, ascii_data, non_ascii_start);
-  CopyChars(data + non_ascii_start, buffer.get(), utf16_length);
-
+  decoder.Decode(result->GetChars(no_gc), utf8_data);
  return result;
 }

 MaybeHandle<String> Factory::NewStringFromUtf8SubString(
    Handle<SeqOneByteString> str, int begin, int length,
    AllocationType allocation) {
-  Access<UnicodeCache::Utf8Decoder> decoder(
-      isolate()->unicode_cache()->utf8_decoder());
-  int non_ascii_start;
-  int utf16_length = 0;
+  Vector<const uint8_t> utf8_data;
  {
    DisallowHeapAllocation no_gc;
-    const char* ascii_data =
-        reinterpret_cast<const char*>(str->GetChars(no_gc) + begin);
-    non_ascii_start = String::NonAsciiStart(ascii_data, length);
-    if (non_ascii_start < length) {
-      // Non-ASCII and we need to decode.
-      auto non_ascii = Vector<const char>(ascii_data + non_ascii_start,
-                                          length - non_ascii_start);
-      decoder->Reset(non_ascii);
-
-      utf16_length = static_cast<int>(decoder->Utf16Length());
+    utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
  }
+  Utf8Decoder decoder(utf8_data);
+
+  if (length == 1) {
+    uint16_t t;
+    // Decode even in the case of length 1 since it can be a bad character.
+    decoder.Decode(&t, utf8_data);
+    return LookupSingleCharacterStringFromCode(t);
  }

-  if (non_ascii_start >= length) {
+  if (decoder.is_ascii()) {
    // If the string is ASCII, we can just make a substring.
    // TODO(v8): the allocation flag is ignored in this case.
    return NewSubString(str, begin, begin + length);
  }

-  DCHECK_GT(utf16_length, 0);
+  DCHECK_GT(decoder.utf16_length(), 0);

+  if (decoder.is_one_byte()) {
    // Allocate string.
-  Handle<SeqTwoByteString> result;
+    Handle<SeqOneByteString> result;
    ASSIGN_RETURN_ON_EXCEPTION(
        isolate(), result,
-      NewRawTwoByteString(non_ascii_start + utf16_length, allocation), String);
-
+        NewRawOneByteString(decoder.utf16_length(), allocation), String);
+    DisallowHeapAllocation no_gc;
    // Update pointer references, since the original string may have moved after
    // allocation.
-  DisallowHeapAllocation no_gc;
-  const char* ascii_data =
-      reinterpret_cast<const char*>(str->GetChars(no_gc) + begin);
-  auto non_ascii = Vector<const char>(ascii_data + non_ascii_start,
-                                      length - non_ascii_start);
-
-  // Copy ASCII portion.
-  uint16_t* data = result->GetChars(no_gc);
-  for (int i = 0; i < non_ascii_start; i++) {
-    *data++ = *ascii_data++;
+    utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
+    decoder.Decode(result->GetChars(no_gc), utf8_data);
+    return result;
  }

-  // Now write the remainder.
-  decoder->WriteUtf16(data, utf16_length, non_ascii);
+  // Allocate string.
+  Handle<SeqTwoByteString> result;
+  ASSIGN_RETURN_ON_EXCEPTION(
+      isolate(), result,
+      NewRawTwoByteString(decoder.utf16_length(), allocation), String);
+
+  DisallowHeapAllocation no_gc;
+  // Update pointer references, since the original string may have moved after
+  // allocation.
+  utf8_data = Vector<const uint8_t>(str->GetChars(no_gc) + begin, length);
+  decoder.Decode(result->GetChars(no_gc), utf8_data);
  return result;
 }

@@ -830,37 +804,10 @@ MaybeHandle<String> Factory::NewStringFromTwoByte(

 namespace {

-bool inline IsOneByte(Vector<const char> str, int chars) {
-  // TODO(dcarney): incorporate Latin-1 check when Latin-1 is supported?
-  return chars == str.length();
-}
-
 bool inline IsOneByte(Handle<String> str) {
  return str->IsOneByteRepresentation();
 }

-inline void WriteOneByteData(Vector<const char> vector, uint8_t* chars,
-                             int len) {
-  // Only works for one byte strings.
-  DCHECK(vector.length() == len);
-  MemCopy(chars, vector.start(), len);
-}
-
-inline void WriteTwoByteData(Vector<const char> vector, uint16_t* chars,
-                             int len) {
-  unibrow::Utf8Iterator it = unibrow::Utf8Iterator(vector);
-  while (!it.Done()) {
-    DCHECK_GT(len, 0);
-    len -= 1;
-
-    uint16_t c = *it;
-    ++it;
-    DCHECK_NE(unibrow::Utf8::kBadChar, c);
-    *chars++ = c;
-  }
-  DCHECK_EQ(len, 0);
-}
-
 inline void WriteOneByteData(Handle<String> s, uint8_t* chars, int len) {
  DCHECK(s->length() == len);
  String::WriteToFlat(*s, chars, 0, len);
@@ -956,19 +903,6 @@ Handle<String> Factory::AllocateInternalizedStringImpl(T t, int chars,
  return answer;
 }

-Handle<String> Factory::NewInternalizedStringFromUtf8(Vector<const char> str,
-                                                      int chars,
-                                                      uint32_t hash_field) {
-  if (IsOneByte(str, chars)) {
-    Handle<SeqOneByteString> result =
-        AllocateRawOneByteInternalizedString(str.length(), hash_field);
-    DisallowHeapAllocation no_allocation;
-    MemCopy(result->GetChars(no_allocation), str.start(), str.length());
-    return result;
-  }
-  return AllocateInternalizedStringImpl<false>(str, chars, hash_field);
-}
-
 Handle<String> Factory::NewOneByteInternalizedString(Vector<const uint8_t> str,
                                                     uint32_t hash_field) {
  Handle<SeqOneByteString> result =

--- a/src/heap/factory.h
+++ b/src/heap/factory.h
@@ -314,11 +314,6 @@ class V8_EXPORT_PRIVATE Factory {

  Handle<JSStringIterator> NewJSStringIterator(Handle<String> string);

-  // Allocates an internalized string in old space based on the character
-  // stream.
-  Handle<String> NewInternalizedStringFromUtf8(Vector<const char> str,
-                                               int chars, uint32_t hash_field);
-
  Handle<String> NewOneByteInternalizedString(Vector<const uint8_t> str,
                                              uint32_t hash_field);


--- a/src/isolate.cc
+++ b/src/isolate.cc
@@ -71,7 +71,6 @@
 #include "src/string-stream.h"
 #include "src/tracing/tracing-category-observer.h"
 #include "src/trap-handler/trap-handler.h"
-#include "src/unicode-cache.h"
 #include "src/v8.h"
 #include "src/v8threads.h"
 #include "src/version.h"
@@ -3056,9 +3055,6 @@ Isolate::~Isolate() {
  delete entry_stack_;
  entry_stack_ = nullptr;

-  delete unicode_cache_;
-  unicode_cache_ = nullptr;
-
  delete date_cache_;
  date_cache_ = nullptr;

@@ -3330,7 +3326,6 @@ bool Isolate::Init(ReadOnlyDeserializer* read_only_deserializer,

  compilation_cache_ = new CompilationCache(this);
  descriptor_lookup_cache_ = new DescriptorLookupCache();
-  unicode_cache_ = new UnicodeCache();
  inner_pointer_to_code_cache_ = new InnerPointerToCodeCache(this);
  global_handles_ = new GlobalHandles(this);
  eternal_handles_ = new EternalHandles();

--- a/src/messages.cc
+++ b/src/messages.cc
@@ -397,17 +397,18 @@ Handle<Object> JSStackFrame::GetMethodName() {
  }

  Handle<String> name(function_->shared()->Name(), isolate_);
+  name = String::Flatten(isolate_, name);

  // The static initializer function is not a method, so don't add a
  // class name, just return the function name.
-  if (name->IsUtf8EqualTo(CStrVector("<static_fields_initializer>"), true)) {
+  if (name->HasOneBytePrefix(CStrVector("<static_fields_initializer>"))) {
    return name;
  }

  // ES2015 gives getters and setters name prefixes which must
  // be stripped to find the property name.
-  if (name->IsUtf8EqualTo(CStrVector("get "), true) ||
-      name->IsUtf8EqualTo(CStrVector("set "), true)) {
+  if (name->HasOneBytePrefix(CStrVector("get ")) ||
+      name->HasOneBytePrefix(CStrVector("set "))) {
    name = isolate_->factory()->NewProperSubString(name, 4, name->length());
  }
  if (CheckMethodName(isolate_, receiver, name, function_,

--- a/src/objects.cc
+++ b/src/objects.cc
@@ -4558,47 +4558,6 @@ uint32_t StringHasher::GetHashField() {
  }
 }

-uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars, uint64_t seed,
-                                       int* utf16_length_out) {
-  int vector_length = chars.length();
-  // Handle some edge cases
-  if (vector_length <= 1) {
-    DCHECK(vector_length == 0 ||
-           static_cast<uint8_t>(chars.start()[0]) <=
-               unibrow::Utf8::kMaxOneByteChar);
-    *utf16_length_out = vector_length;
-    return HashSequentialString(chars.start(), vector_length, seed);
-  }
-
-  // Start with a fake length which won't affect computation.
-  // It will be updated later.
-  StringHasher hasher(String::kMaxArrayIndexSize, seed);
-  DCHECK(hasher.is_array_index_);
-
-  unibrow::Utf8Iterator it = unibrow::Utf8Iterator(chars);
-  int utf16_length = 0;
-  bool is_index = true;
-
-  while (utf16_length < String::kMaxHashCalcLength && !it.Done()) {
-    utf16_length++;
-    uint16_t c = *it;
-    ++it;
-    hasher.AddCharacter(c);
-    if (is_index) is_index = hasher.UpdateIndex(c);
-  }
-
-  // Now that hashing is done, we just need to calculate utf16_length
-  while (!it.Done()) {
-    ++it;
-    utf16_length++;
-  }
-
-  *utf16_length_out = utf16_length;
-  // Must set length here so that hash computation is correct.
-  hasher.length_ = utf16_length;
-  return hasher.GetHashField();
-}
-
 void IteratingStringHasher::VisitConsString(ConsString cons_string) {
  // Run small ConsStrings through ConsStringIterator.
  if (cons_string->length() < 64) {

--- a/src/objects/string-inl.h
+++ b/src/objects/string-inl.h
@@ -265,27 +265,6 @@ class TwoByteStringKey : public SequentialStringKey<uc16> {
  Handle<String> AsHandle(Isolate* isolate) override;
 };

-// Utf8StringKey carries a vector of chars as key.
-class Utf8StringKey : public StringTableKey {
- public:
-  explicit Utf8StringKey(Vector<const char> string, uint64_t seed)
-      : StringTableKey(StringHasher::ComputeUtf8Hash(string, seed, &chars_)),
-        string_(string) {}
-
-  bool IsMatch(Object string) override {
-    return String::cast(string)->IsUtf8EqualTo(string_);
-  }
-
-  Handle<String> AsHandle(Isolate* isolate) override {
-    return isolate->factory()->NewInternalizedStringFromUtf8(string_, chars_,
-                                                             HashField());
-  }
-
- private:
-  Vector<const char> string_;
-  int chars_;  // Caches the number of characters when computing the hash code.
-};
-
 bool String::Equals(String other) {
  if (other == *this) return true;
  if (this->IsInternalizedString() && other->IsInternalizedString()) {

--- a/src/objects/string.cc
+++ b/src/objects/string.cc
@@ -1180,26 +1180,6 @@ Object String::LastIndexOf(Isolate* isolate, Handle<Object> receiver,
  return Smi::FromInt(last_index);
 }

-bool String::IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match) {
-  int slen = length();
-  // Can't check exact length equality, but we can check bounds.
-  int str_len = str.length();
-  if (!allow_prefix_match &&
-      (str_len < slen ||
-       str_len > slen * static_cast<int>(unibrow::Utf8::kMaxEncodedSize))) {
-    return false;
-  }
-
-  int i = 0;
-  unibrow::Utf8Iterator it = unibrow::Utf8Iterator(str);
-  while (i < slen && !it.Done()) {
-    if (Get(i++) != *it) return false;
-    ++it;
-  }
-
-  return (allow_prefix_match || i == slen) && it.Done();
-}
-
 template <>
 bool String::IsEqualTo(Vector<const uint8_t> str) {
  return IsOneByteEqualTo(str);
@@ -1210,6 +1190,18 @@ bool String::IsEqualTo(Vector<const uc16> str) {
  return IsTwoByteEqualTo(str);
 }

+bool String::HasOneBytePrefix(Vector<const char> str) {
+  int slen = str.length();
+  if (slen > length()) return false;
+  DisallowHeapAllocation no_gc;
+  FlatContent content = GetFlatContent(no_gc);
+  if (content.IsOneByte()) {
+    return CompareChars(content.ToOneByteVector().start(), str.start(), slen) ==
+           0;
+  }
+  return CompareChars(content.ToUC16Vector().start(), str.start(), slen) == 0;
+}
+
 bool String::IsOneByteEqualTo(Vector<const uint8_t> str) {
  int slen = length();
  if (str.length() != slen) return false;

--- a/src/objects/string.h
+++ b/src/objects/string.h
@@ -268,14 +268,16 @@ class String : public Name {
  inline bool Equals(String other);
  inline static bool Equals(Isolate* isolate, Handle<String> one,
                            Handle<String> two);
-  V8_EXPORT_PRIVATE bool IsUtf8EqualTo(Vector<const char> str,
-                                       bool allow_prefix_match = false);

  // Dispatches to Is{One,Two}ByteEqualTo.
  template <typename Char>
  bool IsEqualTo(Vector<const Char> str);

+  V8_EXPORT_PRIVATE bool HasOneBytePrefix(Vector<const char> str);
  V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const uint8_t> str);
+  V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const char> str) {
+    return IsOneByteEqualTo(Vector<const uint8_t>::cast(str));
+  }
  bool IsTwoByteEqualTo(Vector<const uc16> str);

  // Return a UTF8 representation of the string.  The string is null
@@ -366,50 +368,12 @@ class String : public Name {
  EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
  static void WriteToFlat(String source, sinkchar* sink, int from, int to);

-  // The return value may point to the first aligned word containing the first
-  // non-one-byte character, rather than directly to the non-one-byte character.
-  // If the return value is >= the passed length, the entire string was
-  // one-byte.
-  static inline int NonAsciiStart(const char* chars, int length) {
-    const char* start = chars;
-    const char* limit = chars + length;
-
-    if (length >= kIntptrSize) {
-      // Check unaligned bytes.
-      while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
-        if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
-          return static_cast<int>(chars - start);
-        }
-        ++chars;
-      }
-      // Check aligned words.
-      DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
-      const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
-      while (chars + sizeof(uintptr_t) <= limit) {
-        if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
-          return static_cast<int>(chars - start);
-        }
-        chars += sizeof(uintptr_t);
-      }
-    }
-    // Check remaining unaligned bytes.
-    while (chars < limit) {
-      if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
-        return static_cast<int>(chars - start);
-      }
-      ++chars;
-    }
-
-    return static_cast<int>(chars - start);
-  }
-
  static inline bool IsAscii(const char* chars, int length) {
-    return NonAsciiStart(chars, length) >= length;
+    return IsAscii(reinterpret_cast<const uint8_t*>(chars), length);
  }

  static inline bool IsAscii(const uint8_t* chars, int length) {
-    return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
-           length;
+    return NonAsciiStart(chars, length) >= length;
  }

  static inline int NonOneByteStart(const uc16* chars, int length) {

--- a/src/parsing/scanner.h
+++ b/src/parsing/scanner.h
@@ -16,7 +16,6 @@
 #include "src/message-template.h"
 #include "src/parsing/token.h"
 #include "src/pointer-with-payload.h"
-#include "src/unicode-decoder.h"
 #include "src/unicode.h"

 namespace v8 {

--- a/src/regexp/regexp-stack.cc
+++ b/src/regexp/regexp-stack.cc
@@ -5,6 +5,7 @@
 #include "src/regexp/regexp-stack.h"

 #include "src/isolate.h"
+#include "src/memcopy.h"

 namespace v8 {
 namespace internal {

--- a/src/snapshot/snapshot-common.cc
+++ b/src/snapshot/snapshot-common.cc
@@ -8,6 +8,7 @@

 #include "src/base/platform/platform.h"
 #include "src/counters.h"
+#include "src/memcopy.h"
 #include "src/snapshot/partial-deserializer.h"
 #include "src/snapshot/read-only-deserializer.h"
 #include "src/snapshot/startup-deserializer.h"

--- a/src/string-hasher.h
+++ b/src/string-hasher.h
@@ -24,10 +24,6 @@ class V8_EXPORT_PRIVATE StringHasher {
  static inline uint32_t HashSequentialString(const schar* chars, int length,
                                              uint64_t seed);

-  // Reads all the data, even for long strings and computes the utf16 length.
-  static uint32_t ComputeUtf8Hash(Vector<const char> chars, uint64_t seed,
-                                  int* utf16_length_out);
-
  // Calculated hash value for a string consisting of 1 to
  // String::kMaxArrayIndexSize digits with no leading zeros (except "0").
  // value is represented decimal value.

--- a/src/unicode-cache.h
+++ b/src/unicode-cache.h
-// Copyright 2015 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef V8_UNICODE_CACHE_H_
-#define V8_UNICODE_CACHE_H_
-
-#include "src/base/macros.h"
-#include "src/unicode-decoder.h"
-#include "src/unicode.h"
-#include "src/utils.h"
-
-namespace v8 {
-namespace internal {
-
-// Caching predicates used by scanners.
-class UnicodeCache {
- public:
-  UnicodeCache() = default;
-  typedef unibrow::Utf8Decoder<512> Utf8Decoder;
-
-  StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
-
- private:
-  StaticResource<Utf8Decoder> utf8_decoder_;
-
-  DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
-};
-
-}  // namespace internal
-}  // namespace v8
-
-#endif  // V8_UNICODE_CACHE_H_
--- a/src/unicode-decoder.cc
+++ b/src/unicode-decoder.cc
@@ -2,87 +2,80 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

-
-#include "src/unicode-inl.h"
 #include "src/unicode-decoder.h"
-#include <stdio.h>
-#include <stdlib.h>

-namespace unibrow {
+#include "src/memcopy.h"
+#include "src/unicode-inl.h"

-uint16_t Utf8Iterator::operator*() {
-  if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode)) {
-    return trailing_ ? Utf16::TrailSurrogate(char_)
-                     : Utf16::LeadSurrogate(char_);
+namespace v8 {
+namespace internal {
+
+Utf8Decoder::Utf8Decoder(const Vector<const uint8_t>& chars)
+    : encoding_(Encoding::kAscii),
+      non_ascii_start_(NonAsciiStart(chars.start(), chars.length())),
+      utf16_length_(non_ascii_start_) {
+  if (non_ascii_start_ == chars.length()) return;
+
+  const uint8_t* cursor = chars.start() + non_ascii_start_;
+  const uint8_t* end = chars.start() + chars.length();
+
+  bool is_one_byte = true;
+  uint32_t incomplete_char = 0;
+  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
+
+  while (cursor < end) {
+    unibrow::uchar t =
+        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
+    if (t != unibrow::Utf8::kIncomplete) {
+      is_one_byte = is_one_byte && t <= unibrow::Latin1::kMaxChar;
+      utf16_length_++;
+      if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) utf16_length_++;
    }
-
-  DCHECK_EQ(trailing_, false);
-  return char_;
-}
-
-Utf8Iterator& Utf8Iterator::operator++() {
-  if (V8_UNLIKELY(this->Done())) {
-    char_ = Utf8::kBufferEmpty;
-    return *this;
  }

-  if (V8_UNLIKELY(char_ > Utf16::kMaxNonSurrogateCharCode && !trailing_)) {
-    trailing_ = true;
-    return *this;
+  unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
+  if (t != unibrow::Utf8::kBufferEmpty) {
+    is_one_byte = false;
+    utf16_length_++;
  }

-  trailing_ = false;
-  offset_ = cursor_;
-
-  char_ =
-      Utf8::ValueOf(reinterpret_cast<const uint8_t*>(stream_.begin()) + cursor_,
-                    stream_.length() - cursor_, &cursor_);
-  return *this;
+  encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
 }

-Utf8Iterator Utf8Iterator::operator++(int) {
-  Utf8Iterator old(*this);
-  ++*this;
-  return old;
-}
+template <typename Char>
+void Utf8Decoder::Decode(Char* out, const Vector<const uint8_t>& data) {
+  CopyChars(out, data.start(), non_ascii_start_);

-bool Utf8Iterator::Done() {
-  return offset_ == static_cast<size_t>(stream_.length());
-}
+  out += non_ascii_start_;
+
+  uint32_t incomplete_char = 0;
+  unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;

-void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
-                            const v8::internal::Vector<const char>& stream) {
-  size_t utf16_length = 0;
+  const uint8_t* cursor = data.start() + non_ascii_start_;
+  const uint8_t* end = data.start() + data.length();

-  Utf8Iterator it = Utf8Iterator(stream);
-  // Loop until stream is read, writing to buffer as long as buffer has space.
-  while (utf16_length < buffer_length && !it.Done()) {
-    *buffer++ = *it;
-    ++it;
-    utf16_length++;
+  while (cursor < end) {
+    unibrow::uchar t =
+        unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
+    if (t != unibrow::Utf8::kIncomplete) {
+      if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+        *(out++) = static_cast<Char>(t);
+      } else {
+        *(out++) = unibrow::Utf16::LeadSurrogate(t);
+        *(out++) = unibrow::Utf16::TrailSurrogate(t);
      }
-  bytes_read_ = it.Offset();
-  trailing_ = it.Trailing();
-  chars_written_ = utf16_length;
-
-  // Now that writing to buffer is done, we just need to calculate utf16_length
-  while (!it.Done()) {
-    ++it;
-    utf16_length++;
    }
-  utf16_length_ = utf16_length;
-}
-
-void Utf8DecoderBase::WriteUtf16Slow(
-    uint16_t* data, size_t length,
-    const v8::internal::Vector<const char>& stream, size_t offset,
-    bool trailing) {
-  Utf8Iterator it = Utf8Iterator(stream, offset, trailing);
-  while (!it.Done()) {
-    DCHECK_GT(length--, 0);
-    *data++ = *it;
-    ++it;
  }
+
+  unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
+  if (t != unibrow::Utf8::kBufferEmpty) *out = static_cast<Char>(t);
 }

-}  // namespace unibrow
+template void Utf8Decoder::Decode(uint8_t* out,
+                                  const Vector<const uint8_t>& data);
+
+template void Utf8Decoder::Decode(uint16_t* out,
+                                  const Vector<const uint8_t>& data);
+
+}  // namespace internal
+}  // namespace v8
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@@ -5,154 +5,70 @@
 #ifndef V8_UNICODE_DECODER_H_
 #define V8_UNICODE_DECODER_H_

-#include <sys/types.h>
-#include <algorithm>
-#include "src/globals.h"
-#include "src/memcopy.h"
 #include "src/unicode.h"
 #include "src/vector.h"

-namespace unibrow {
-
-class Utf8Iterator {
- public:
-  explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
-      : Utf8Iterator(stream, 0, false) {}
-  Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
-               bool trailing)
-      : stream_(stream),
-        cursor_(offset),
-        offset_(0),
-        char_(0),
-        trailing_(false) {
-    DCHECK_LE(offset, stream.length());
-    // Read the first char, setting offset_ to offset in the process.
-    ++*this;
-
-    // This must be set after reading the first char, since the offset marks
-    // the start of the octet sequence that the trailing char is part of.
-    trailing_ = trailing;
-    if (trailing) {
-      DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
+namespace v8 {
+namespace internal {
+
+// The return value may point to the first aligned word containing the first
+// non-one-byte character, rather than directly to the non-one-byte character.
+// If the return value is >= the passed length, the entire string was
+// one-byte.
+inline int NonAsciiStart(const uint8_t* chars, int length) {
+  const uint8_t* start = chars;
+  const uint8_t* limit = chars + length;
+
+  if (static_cast<size_t>(length) >= kIntptrSize) {
+    // Check unaligned bytes.
+    while (!IsAligned(reinterpret_cast<intptr_t>(chars), kIntptrSize)) {
+      if (*chars > unibrow::Utf8::kMaxOneByteChar) {
+        return static_cast<int>(chars - start);
      }
+      ++chars;
+    }
+    // Check aligned words.
+    DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
+    const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
+    while (chars + sizeof(uintptr_t) <= limit) {
+      if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
+        return static_cast<int>(chars - start);
+      }
+      chars += sizeof(uintptr_t);
+    }
+  }
+  // Check remaining unaligned bytes.
+  while (chars < limit) {
+    if (*chars > unibrow::Utf8::kMaxOneByteChar) {
+      return static_cast<int>(chars - start);
+    }
+    ++chars;
  }

-  uint16_t operator*();
-  Utf8Iterator& operator++();
-  Utf8Iterator operator++(int);
-  bool Done();
-  bool Trailing() { return trailing_; }
-  size_t Offset() { return offset_; }
-
- private:
-  const v8::internal::Vector<const char>& stream_;
-  size_t cursor_;
-  size_t offset_;
-  uint32_t char_;
-  bool trailing_;
-};
+  return static_cast<int>(chars - start);
+}

-class V8_EXPORT_PRIVATE Utf8DecoderBase {
+class V8_EXPORT_PRIVATE Utf8Decoder final {
 public:
-  // Initialization done in subclass.
-  inline Utf8DecoderBase();
-  inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
-                         const v8::internal::Vector<const char>& stream);
-  inline size_t Utf16Length() const { return utf16_length_; }
-
- protected:
-  // This reads all characters and sets the utf16_length_.
-  // The first buffer_length utf16 chars are cached in the buffer.
-  void Reset(uint16_t* buffer, size_t buffer_length,
-             const v8::internal::Vector<const char>& vector);
-  static void WriteUtf16Slow(uint16_t* data, size_t length,
-                             const v8::internal::Vector<const char>& stream,
-                             size_t offset, bool trailing);
+  enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16 };

-  size_t bytes_read_;
-  size_t chars_written_;
-  size_t utf16_length_;
-  bool trailing_;
+  explicit Utf8Decoder(const Vector<const uint8_t>& chars);

- private:
-  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
-};
+  bool is_ascii() const { return encoding_ == Encoding::kAscii; }
+  // Latin-1 is a one-byte encoding too, so both kAscii and kLatin1 qualify.
+  bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; }
+  int utf16_length() const { return utf16_length_; }
+  int non_ascii_start() const { return non_ascii_start_; }

-template <size_t kBufferSize>
-class Utf8Decoder : public Utf8DecoderBase {
- public:
-  inline Utf8Decoder() = default;
-  explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
-  inline void Reset(const v8::internal::Vector<const char>& stream);
-  inline size_t WriteUtf16(
-      uint16_t* data, size_t length,
-      const v8::internal::Vector<const char>& stream) const;
+  template <typename Char>
+  V8_EXPORT_PRIVATE void Decode(Char* out, const Vector<const uint8_t>& data);

 private:
-  uint16_t buffer_[kBufferSize];
-};
-
-Utf8DecoderBase::Utf8DecoderBase()
-    : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
-
-Utf8DecoderBase::Utf8DecoderBase(
-    uint16_t* buffer, size_t buffer_length,
-    const v8::internal::Vector<const char>& stream) {
-  Reset(buffer, buffer_length, stream);
-}
-
-template <size_t kBufferSize>
-Utf8Decoder<kBufferSize>::Utf8Decoder(
-    const v8::internal::Vector<const char>& stream)
-    : Utf8DecoderBase(buffer_, kBufferSize, stream) {}
-
-template <size_t kBufferSize>
-void Utf8Decoder<kBufferSize>::Reset(
-    const v8::internal::Vector<const char>& stream) {
-  Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
-}
-
-template <size_t kBufferSize>
-size_t Utf8Decoder<kBufferSize>::WriteUtf16(
-    uint16_t* data, size_t data_length,
-    const v8::internal::Vector<const char>& stream) const {
-  DCHECK_GT(data_length, 0);
-  data_length = std::min(data_length, utf16_length_);
-
-  // memcpy everything in buffer.
-  size_t memcpy_length = std::min(data_length, chars_written_);
-  v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
-
-  if (data_length <= chars_written_) return data_length;
-
-  // Copy the rest the slow way.
-  WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
-                 bytes_read_, trailing_);
-  return data_length;
-}
-
-class Latin1 {
- public:
-  static const unsigned kMaxChar = 0xff;
-  // Convert the character to Latin-1 case equivalent if possible.
-  static inline uint16_t TryConvertToLatin1(uint16_t);
+  Encoding encoding_;
+  int non_ascii_start_;
+  int utf16_length_;
 };

-uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
-  switch (c) {
-    // This are equivalent characters in unicode.
-    case 0x39c:
-    case 0x3bc:
-      return 0xb5;
-    // This is an uppercase of a Latin-1 character
-    // outside of Latin-1.
-    case 0x178:
-      return 0xff;
-  }
-  return c;
-}
-
-
-}  // namespace unibrow
+}  // namespace internal
+}  // namespace v8

 #endif  // V8_UNICODE_DECODER_H_
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -131,6 +131,25 @@ class Utf16 {
  }
 };

+class Latin1 {
+ public:
+  static const unsigned kMaxChar = 0xff;
+  // Convert the character to Latin-1 case equivalent if possible.
+  static inline uint16_t TryConvertToLatin1(uint16_t c) {
+    switch (c) {
+      // This are equivalent characters in unicode.
+      case 0x39c:
+      case 0x3bc:
+        return 0xb5;
+      // This is an uppercase of a Latin-1 character
+      // outside of Latin-1.
+      case 0x178:
+        return 0xff;
+    }
+    return c;
+  }
+};
+
 class V8_EXPORT_PRIVATE Utf8 {
 public:
  using State = Utf8DfaDecoder::State;

--- a/test/cctest/heap/test-heap.cc
+++ b/test/cctest/heap/test-heap.cc
@@ -153,13 +153,13 @@ static void CheckOddball(Isolate* isolate, Object obj, const char* string) {
  CHECK(obj->IsOddball());
  Handle<Object> handle(obj, isolate);
  Object print_string = *Object::ToString(isolate, handle).ToHandleChecked();
-  CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
+  CHECK(String::cast(print_string)->IsOneByteEqualTo(CStrVector(string)));
 }

 static void CheckSmi(Isolate* isolate, int value, const char* string) {
  Handle<Object> handle(Smi::FromInt(value), isolate);
  Object print_string = *Object::ToString(isolate, handle).ToHandleChecked();
-  CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
+  CHECK(String::cast(print_string)->IsOneByteEqualTo(CStrVector(string)));
 }


@@ -168,7 +168,7 @@ static void CheckNumber(Isolate* isolate, double value, const char* string) {
  CHECK(number->IsNumber());
  Handle<Object> print_string =
      Object::ToString(isolate, number).ToHandleChecked();
-  CHECK(String::cast(*print_string)->IsUtf8EqualTo(CStrVector(string)));
+  CHECK(String::cast(*print_string)->IsOneByteEqualTo(CStrVector(string)));
 }

 void CheckEmbeddedObjectsAreEqual(Handle<Code> lhs, Handle<Code> rhs) {
@@ -764,10 +764,10 @@ static void CheckInternalizedStrings(const char** strings) {
    CHECK(a->IsInternalizedString());
    Handle<String> b = factory->InternalizeUtf8String(string);
    CHECK_EQ(*b, *a);
-    CHECK(b->IsUtf8EqualTo(CStrVector(string)));
+    CHECK(b->IsOneByteEqualTo(CStrVector(string)));
    b = isolate->factory()->InternalizeUtf8String(CStrVector(string));
    CHECK_EQ(*b, *a);
-    CHECK(b->IsUtf8EqualTo(CStrVector(string)));
+    CHECK(b->IsOneByteEqualTo(CStrVector(string)));
  }
 }


--- a/test/cctest/test-object.cc
+++ b/test/cctest/test-object.cc
@@ -17,8 +17,10 @@ namespace internal {

 static void CheckObject(Isolate* isolate, Handle<Object> obj,
                        const char* string) {
-  Object print_string = *Object::NoSideEffectsToString(isolate, obj);
-  CHECK(String::cast(print_string)->IsUtf8EqualTo(CStrVector(string)));
+  Handle<String> print_string = String::Flatten(
+      isolate,
+      Handle<String>::cast(Object::NoSideEffectsToString(isolate, obj)));
+  CHECK(print_string->IsOneByteEqualTo(CStrVector(string)));
 }

 static void CheckSmi(Isolate* isolate, int value, const char* string) {

--- a/test/unittests/unicode-unittest.cc
+++ b/test/unittests/unicode-unittest.cc
@@ -25,15 +25,13 @@ void DecodeNormally(const std::vector<byte>& bytes,
  }
 }

-template <size_t kBufferSize>
-void DecodeUtf16(unibrow::Utf8Decoder<kBufferSize>* decoder,
-                 const std::vector<byte>& bytes,
+void DecodeUtf16(const std::vector<uint8_t>& bytes,
                 std::vector<unibrow::uchar>* output) {
-  auto vector = Vector<const char>::cast(VectorOf(bytes));
-  decoder->Reset(vector);
+  auto utf8_data = Vector<const uint8_t>::cast(VectorOf(bytes));
+  Utf8Decoder decoder(utf8_data);

-  std::vector<uint16_t> utf16(decoder->Utf16Length());
-  decoder->WriteUtf16(&(*utf16.begin()), decoder->Utf16Length(), vector);
+  std::vector<uint16_t> utf16(decoder.utf16_length());
+  decoder.Decode(&utf16[0], utf8_data);

  // Decode back into code points
  for (size_t i = 0; i < utf16.size(); i++) {
@@ -68,8 +66,6 @@ void DecodeIncrementally(const std::vector<byte>& bytes,
 }  // namespace

 TEST(UnicodeTest, Utf16BufferReuse) {
-  unibrow::Utf8Decoder<4> utf16_decoder;
-
  // Not enough continuation bytes before string ends.
  typedef struct {
    std::vector<byte> bytes;
@@ -94,7 +90,7 @@ TEST(UnicodeTest, Utf16BufferReuse) {
    fprintf(stderr, "\n");

    std::vector<unibrow::uchar> output_utf16;
-    DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
+    DecodeUtf16(test.bytes, &output_utf16);

    CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
    for (size_t i = 0; i < output_utf16.size(); ++i) {
@@ -104,12 +100,9 @@ TEST(UnicodeTest, Utf16BufferReuse) {
 }

 TEST(UnicodeTest, SurrogateOverrunsBuffer) {
-  unibrow::Utf8Decoder<2> utf16_decoder;
-
  std::vector<unibrow::uchar> output_utf16;
  // Not enough continuation bytes before string ends.
-  DecodeUtf16(&utf16_decoder, {0x00, 0xF0, 0x90, 0x80, 0x80, 0x00},
-              &output_utf16);
+  DecodeUtf16({0x00, 0xF0, 0x90, 0x80, 0x80, 0x00}, &output_utf16);
  CHECK_EQ(output_utf16[0], 0x00);
  CHECK_EQ(output_utf16[1], 0x10000);
  CHECK_EQ(output_utf16[0], 0x00);
@@ -466,8 +459,6 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
        0x8FFFF}},
  };

-  unibrow::Utf8Decoder<50> utf16_decoder;
-
  for (auto test : data) {
    // For figuring out which test fails:
    fprintf(stderr, "test: ");
@@ -493,7 +484,7 @@ TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
    }

    std::vector<unibrow::uchar> output_utf16;
-    DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
+    DecodeUtf16(test.bytes, &output_utf16);

    CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
    for (size_t i = 0; i < output_utf16.size(); ++i) {