Speed up CharacterRange::AddCaseEquivalents

Using lexCss("color:") to measure performance, this change makes lexCss("color:") 21x - 40x faster than trunk and 2.3x - 4.6x faster than m74. Design Doc: http://shorturl.at/adfO5 Measured with out/x64.release/d8 reg977003.js (see reg977003.js attached to chromium:977003). Also see the separate benchmark CL at https://chromium-review.googlesource.com/c/v8/v8/+/1679651/ Bug: chromium:977003 Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851 Commit-Queue: Frank Tang <ftang@chromium.org> Reviewed-by: Yang Guo <yangguo@chromium.org> Cr-Commit-Position: refs/heads/master@{#62471}

Speed up CharacterRange::AddCaseEquivalents
Using lexCss("color:") to measure performance, this change makes lexCss("color:") 21x - 40x faster than trunk and 2.3x - 4.6x faster than m74. Design Doc: http://shorturl.at/adfO5 Measured with out/x64.release/d8 reg977003.js (see reg977003.js attached to chromium:977003). Also see the separate benchmark CL at https://chromium-review.googlesource.com/c/v8/v8/+/1679651/ Bug: chromium:977003 Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851 Commit-Queue: Frank Tang <ftang@chromium.org> Reviewed-by: Yang Guo <yangguo@chromium.org> Cr-Commit-Position: refs/heads/master@{#62471}
f23f644f · Frank Tang · Commit Bot · a420d20c · f23f644f · f23f644f
Commit f23f644f authored Jun 29, 2019 by Frank Tang Committed by Commit Bot Jul 01, 2019
4 changed files
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") {
    "src/regexp/regexp-nodes.h",
    "src/regexp/regexp-parser.cc",
    "src/regexp/regexp-parser.h",
+    "src/regexp/special-case.h",
    "src/regexp/regexp-stack.cc",
    "src/regexp/regexp-stack.h",
    "src/regexp/regexp-utils.cc",
@@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") {
  ]

  if (v8_enable_i18n_support) {
+    deps += [ ":run_gen-regexp-special-case" ]
+    sources += [ "$target_gen_dir/src/regexp/special-case.cc" ]
    if (is_win) {
      deps += [ "//third_party/icu:icudata" ]
    }
@@ -3907,6 +3910,48 @@ v8_executable("torque-language-server") {
  }
 }

+# Build-time generator compiled from src/regexp/gen-regexp-special-case.cc.
+# It is run by the "run_gen-regexp-special-case" action in this file.
+v8_executable("gen-regexp-special-case") {
+  visibility = [ ":*" ]  # Only targets in this file can depend on this.
+
+  sources = [
+    "src/regexp/gen-regexp-special-case.cc",
+  ]
+
+  # The generator source includes src/base/logging.h and ICU's unicode/*.h
+  # headers.
+  deps = [
+    ":v8_libbase",
+    "//build/win:default_exe_manifest",
+    "//third_party/icu",
+  ]
+
+  configs = [ ":internal_config" ]
+}
+
+# Runs the gen-regexp-special-case executable (via tools/run.py) to produce
+# $target_gen_dir/src/regexp/special-case.cc.
+action("run_gen-regexp-special-case") {
+  visibility = [ ":*" ]  # Only targets in this file can depend on this.
+
+  script = "tools/run.py"
+
+  # NOTE(review): the generator's main() only takes the output filename, so it
+  # is unclear why these files are listed as inputs -- confirm.
+  sources = v8_extra_library_files
+
+  deps = [
+    ":gen-regexp-special-case",
+  ]
+
+  output_file = "$target_gen_dir/src/regexp/special-case.cc"
+
+  outputs = [
+    output_file,
+  ]
+
+  # First argument: path of the generator binary relative to the build dir.
+  # Second argument: the output file, which the generator expects as its only
+  # command line argument.
+  args = [
+    "./" + rebase_path(
+            get_label_info(":gen-regexp-special-case($v8_generator_toolchain)",
+                           "root_out_dir") + "/gen-regexp-special-case",
+            root_build_dir),
+    rebase_path(output_file, root_build_dir),
+  ]
+}
+
 ###############################################################################
 # Public targets
 #

--- a/src/regexp/gen-regexp-special-case.cc
+++ b/src/regexp/gen-regexp-special-case.cc
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "src/base/logging.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+
+namespace v8 {
+namespace internal {
+
+// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
+// functions into "src/regexp/special-case.cc".
+// See more details in http://shorturl.at/adfO5
+// Writes to |out| the definition of a function called |func_name| that takes
+// no arguments, rebuilds |set| one range at a time, freezes it and returns it.
+//
+// Code points are emitted as zero-padded, four-digit (or wider) hexadecimal
+// literals. The formatting is configured here instead of relying on the
+// caller's stream state: std::hex and the fill character are sticky, but
+// std::setw is reset by every insertion and must be re-applied per value.
+// The stream's previous flags and fill character are restored before
+// returning.
+void PrintSet(std::ofstream& out, const char* func_name,
+              const icu::UnicodeSet& set) {
+  const std::ios_base::fmtflags saved_flags = out.flags();
+  const char saved_fill = out.fill('0');
+  out << std::hex;
+
+  out << "icu::UnicodeSet " << func_name << "() {\n"
+      << "  icu::UnicodeSet set;\n";
+  const int32_t range_count = set.getRangeCount();
+  for (int32_t i = 0; i < range_count; i++) {
+    const UChar32 start = set.getRangeStart(i);
+    const UChar32 end = set.getRangeEnd(i);
+    out << "  set.add(0x" << std::setw(4) << start;
+    // A single code point uses the one-argument add(); a real range uses the
+    // two-argument form.
+    if (start != end) {
+      out << ", 0x" << std::setw(4) << end;
+    }
+    out << ");\n";
+  }
+  out << "  set.freeze();\n"
+      << "  return set;\n"
+      << "}\n";
+
+  out.flags(saved_flags);
+  out.fill(saved_fill);
+}
+
+// Computes the "ignore" and "special add" sets and prints them to |out| as
+// BuildIgnoreSet() and BuildSpecialAddSet() via PrintSet().
+//
+// Every BMP code point from 0x80 upwards is expanded with
+// closeOver(USET_CASE_INSENSITIVE). A closure holding more than one \p{Lu}
+// character is "special": its non-uppercase members plus the u_toupper() of
+// its first remaining member go to the special add set, and the remaining
+// uppercase members go to the ignore set.
+void PrintSpecial(std::ofstream& out) {
+  UErrorCode status = U_ZERO_ERROR;
+  icu::UnicodeSet upper("[\\p{Lu}]", status);
+  CHECK(U_SUCCESS(status));
+
+  icu::UnicodeSet ignore;
+  icu::UnicodeSet special_add;
+  // 0xd800-0xdbff starts out marked as processed, so the loop below never
+  // expands those code points.
+  icu::UnicodeSet processed(0xd800, 0xdbff);
+  icu::UnicodeSet current;
+
+  // Walk the BMP, skipping ASCII.
+  for (UChar32 ch = 0x80; ch < 0x010000; ch++) {
+    // Skip anything that was part of an earlier closure.
+    if (processed.contains(ch)) continue;
+
+    current.set(ch, ch);
+    current.closeOver(USET_CASE_INSENSITIVE);
+
+    // Mark the whole closure as handled.
+    processed.addAll(current);
+
+    // The uppercase members of the closure.
+    icu::UnicodeSet keep_upper(current);
+    keep_upper.retainAll(upper);
+
+    // Count uppercase members, looking at no more than the first two ranges;
+    // that is enough to decide whether there is more than one.
+    int32_t number_of_upper = 0;
+    for (int32_t r = 0; r <= 1 && r < keep_upper.getRangeCount(); r++) {
+      number_of_upper +=
+          keep_upper.getRangeEnd(r) - keep_upper.getRangeStart(r) + 1;
+    }
+    // Zero or one uppercase member: not a special closure.
+    if (number_of_upper <= 1) continue;
+
+    // The non-uppercase members (could be Ll or Mn) go to the special add
+    // set.
+    current.removeAll(upper);
+    special_add.addAll(current);
+
+    // So does the character that the first non-uppercase member uppercases
+    // to.
+    CHECK_GT(current.getRangeCount(), 0);
+    UChar32 main_upper = u_toupper(current.getRangeStart(0));
+    special_add.add(main_upper);
+
+    // Every other uppercase member goes to the ignore set.
+    keep_upper.remove(main_upper);
+    ignore.addAll(keep_upper);
+  }
+
+  // ASCII never belongs in the special add set.
+  special_add.remove(0x0000, 0x007f);
+  PrintSet(out, "BuildIgnoreSet", ignore);
+  PrintSet(out, "BuildSpecialAddSet", special_add);
+}
+
+// Writes the generated source to |header_filename|: a fixed preamble, the two
+// set-building functions emitted by PrintSpecial(), and the closing namespace
+// and #endif lines.
+//
+// Aborts through CHECK if the file cannot be opened or if writing fails, so
+// that a broken generation step is not silently reported as success.
+void WriteHeader(const char* header_filename) {
+  std::ofstream out(header_filename);
+  CHECK(out.is_open());
+
+  // std::hex and the fill character stay in effect for all later insertions,
+  // so the code points printed after each "0x" prefix come out in hexadecimal.
+  // No std::setw here: a width only applies to the next insertion and would be
+  // consumed by the banner string below.
+  out << std::hex << std::setfill('0');
+
+  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
+      << "// The following functions are used to build icu::UnicodeSet\n"
+      << "// for specical cases different between Unicode and ECMA262.\n"
+      << "#ifdef V8_INTL_SUPPORT\n"
+      << "#include \"src/regexp/special-case.h\"\n\n"
+      << "#include \"unicode/uniset.h\"\n"
+      << "namespace v8 {\n"
+      << "namespace internal {\n\n";
+
+  PrintSpecial(out);
+
+  out << "\n"
+      << "}  // namespace internal\n"
+      << "}  // namespace v8\n"
+      << "#endif  // V8_INTL_SUPPORT\n";
+
+  // Flush explicitly so that a write error is detected here rather than being
+  // swallowed by the destructor.
+  out.flush();
+  CHECK(out.good());
+}
+
+}  // namespace internal
+}  // namespace v8
+
+// Entry point of the generator. Expects exactly one command line argument,
+// the path of the file to write; otherwise prints a usage line to stderr and
+// exits with status 1.
+int main(int argc, const char** argv) {
+  const bool has_output_path = (argc == 2);
+  if (!has_output_path) {
+    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
+    std::exit(1);
+  }
+
+  const char* output_path = argv[1];
+  v8::internal::WriteHeader(output_path);
+  return 0;
+}
--- a/src/regexp/regexp-compiler-tonode.cc
+++ b/src/regexp/regexp-compiler-tonode.cc
@@ -6,6 +6,9 @@

 #include "src/execution/isolate.h"
 #include "src/regexp/regexp.h"
+#ifdef V8_INTL_SUPPORT
+#include "src/regexp/special-case.h"
+#endif  // V8_INTL_SUPPORT
 #include "src/strings/unicode-inl.h"
 #include "src/zone/zone-list-inl.h"

@@ -1137,6 +1140,39 @@ Vector<const int> CharacterRange::GetWordBounds() {
  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
 }

+#ifdef V8_INTL_SUPPORT
+// Holder whose constructor fills |set| with the result of BuildIgnoreSet().
+// The member is const, so it cannot be modified after construction.
+struct IgnoreSet {
+  const icu::UnicodeSet set;
+
+  IgnoreSet() : set(BuildIgnoreSet()) {}
+};
+
+// Holder whose constructor fills |set| with the result of
+// BuildSpecialAddSet(). The member is const, so it cannot be modified after
+// construction.
+struct SpecialAddSet {
+  const icu::UnicodeSet set;
+
+  SpecialAddSet() : set(BuildSpecialAddSet()) {}
+};
+
+// Returns a frozen set holding exactly the 52 ASCII letters [A-Za-z].
+icu::UnicodeSet BuildAsciiAToZSet() {
+  icu::UnicodeSet letters('A', 'Z');
+  letters.add('a', 'z');
+  letters.freeze();
+  return letters;
+}
+
+// Holder whose constructor fills |set| with the result of
+// BuildAsciiAToZSet(). The member is const, so it cannot be modified after
+// construction.
+struct AsciiAToZSet {
+  const icu::UnicodeSet set;
+
+  AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
+};
+
+// File-local shared instances of the three holder structs. Users reach the
+// wrapped icu::UnicodeSet through Pointer()->set.
+
+// Wraps the set returned by BuildIgnoreSet().
+static base::LazyInstance<IgnoreSet>::type ignore_set =
+    LAZY_INSTANCE_INITIALIZER;
+
+// Wraps the set returned by BuildSpecialAddSet().
+static base::LazyInstance<SpecialAddSet>::type special_add_set =
+    LAZY_INSTANCE_INITIALIZER;
+
+// Wraps the set returned by BuildAsciiAToZSet().
+static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
+    LAZY_INSTANCE_INITIALIZER;
+#endif  // V8_INTL_SUPPORT
+
 // static
 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                        ZoneList<CharacterRange>* ranges,
@@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
  CharacterRange::Canonicalize(ranges);
  int range_count = ranges->length();
 #ifdef V8_INTL_SUPPORT
-  icu::UnicodeSet already_added;
  icu::UnicodeSet others;
  for (int i = 0; i < range_count; i++) {
    CharacterRange range = ranges->at(i);
-    uc32 bottom = range.from();
-    if (bottom > String::kMaxUtf16CodeUnit) continue;
-    uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
+    uc32 from = range.from();
+    if (from > String::kMaxUtf16CodeUnit) continue;
+    uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
    // Nothing to be done for surrogates.
-    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
+    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
-      if (bottom > String::kMaxOneByteCharCode) continue;
-      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
+      if (from > String::kMaxOneByteCharCode) continue;
+      if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
+    }
+    others.add(from, to);
  }
-    already_added.add(bottom, top);
-    icu::Locale locale = icu::Locale::getRoot();
-    while (bottom <= top) {
-      icu::UnicodeString upper(bottom);
-      upper.toUpper(locale);
-      icu::UnicodeSet expanded(bottom, bottom);
-      expanded.closeOver(USET_CASE_INSENSITIVE);
-      for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
-        UChar32 start = expanded.getRangeStart(i);
-        UChar32 end = expanded.getRangeEnd(i);
-        while (start <= end) {
-          icu::UnicodeString upper2(start);
-          upper2.toUpper(locale);
-          // Only add if the upper case are the same.
-          if (upper[0] == upper2[0]) {
-            // #sec-runtime-semantics-canonicalize-ch
-            // 3.g. If the numeric value of ch ≥ 128 and the numeric value of
-            // cu < 128, return ch.
-            if (bottom >= 128 && start < 128) {
-              others.add(bottom);
+
+  // Set of characters already added to ranges that do not need to be added
+  // again.
+  icu::UnicodeSet already_added(others);
+
+  // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
+  icu::UnicodeSet in_ascii_a_to_z(others);
+  in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
+
+  // Remove all chars in [a-zA-Z] from others.
+  others.removeAll(in_ascii_a_to_z);
+
+  // Set of characters in ranges that are overlapping with special add set.
+  icu::UnicodeSet in_special_add(others);
+  in_special_add.retainAll(special_add_set.Pointer()->set);
+
+  others.removeAll(in_special_add);
+
+  // Ignore all chars in ignore set.
+  others.removeAll(ignore_set.Pointer()->set);
+
+  // For most of the chars in ranges that are still in others, find the case
+  // equivalent set by calling closeOver(USET_CASE_INSENSITIVE).
+  others.closeOver(USET_CASE_INSENSITIVE);
+
+  // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
+  // but ECMA262 "i" mode won't consider that, remove them from others.
+  // Ex: U+017F add 'S' and 's' to others.
+  others.removeAll(ascii_a_to_z_set.Pointer()->set);
+
+  // Special handling for in_ascii_a_to_z.
+  for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
+    UChar32 start = in_ascii_a_to_z.getRangeStart(i);
+    UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
+    // Check if it is lowercase a-z by testing bit 6 (0x20).
+    if (start & 0x0020) {
+      // Add the uppercases (clear bit 0x20).
+      others.add(start & 0x005F, end & 0x005F);
    } else {
-              // 3.h. 3.h. 3.h. Return cu.
-              others.add(start);
+      // Add the lowercases (set bit 0x20).
+      others.add(start | 0x0020, end | 0x0020);
    }
  }
-          start++;
+
+  // Special handling for chars in "Special Add" set.
+  for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
+    UChar32 end = in_special_add.getRangeEnd(i);
+    for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
+      // Add the uppercase of this character if itself is not an uppercase
+      // character.
+      // Note: The if condition cannot be u_islower(ch) because ch could be
+      // neither uppercase nor lowercase but Mn.
+      if (!u_isupper(ch)) {
+        others.add(u_toupper(ch));
      }
+      icu::UnicodeSet candidates(ch, ch);
+      candidates.closeOver(USET_CASE_INSENSITIVE);
+      for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
+        UChar32 end2 = candidates.getRangeEnd(j);
+        for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
+          // Add character that is not uppercase to others.
+          if (!u_isupper(ch2)) {
+            others.add(ch2);
          }
-      bottom++;
        }
      }
+    }
+  }
+
+  // Remove all characters which already in the ranges.
  others.removeAll(already_added);
+
+  // Add others to the ranges
  for (int32_t i = 0; i < others.getRangeCount(); i++) {
-    UChar32 start = others.getRangeStart(i);
-    UChar32 end = others.getRangeEnd(i);
-    if (start == end) {
-      ranges->Add(CharacterRange::Singleton(start), zone);
+    UChar32 from = others.getRangeStart(i);
+    UChar32 to = others.getRangeEnd(i);
+    if (from == to) {
+      ranges->Add(CharacterRange::Singleton(from), zone);
    } else {
-      ranges->Add(CharacterRange::Range(start, end), zone);
+      ranges->Add(CharacterRange::Range(from, to), zone);
    }
  }
 #else

--- a/src/regexp/special-case.h
+++ b/src/regexp/special-case.h
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_SPECIAL_CASE_H_
+#define V8_REGEXP_SPECIAL_CASE_H_
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uversion.h"
+namespace U_ICU_NAMESPACE {
+class UnicodeSet;
+}  //  namespace U_ICU_NAMESPACE
+
+namespace v8 {
+namespace internal {
+
+// Functions to build special sets of Unicode characters that need special
+// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+//
+// For the characters in the "ignore set", the process should not treat other
+// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
+// equivalent under the ECMA262 RegExp "i" mode, because these characters are
+// themselves uppercase and no other character in the set uppercases to them.
+//
+// For the characters in the "special add set", the process should add only
+// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
+// not uppercase characters as case equivalent under the ECMA262 RegExp "i"
+// mode, and also the ONE uppercase character that the other non-uppercase
+// characters uppercase into. Other uppercase characters in the result of
+// closeOver(USET_CASE_INSENSITIVE) should not be considered, because the
+// ECMA262 RegExp "i" mode considers two characters "case equivalent" if both
+// characters uppercase to the same character.
+//
+// For example, consider the following case equivalent set defined by Unicode
+// standard. Notice there are more than one uppercase characters in this set:
+//  U+212B Å Angstrom Sign - an uppercase character.
+//  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
+//  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
+//    uppercase to U+00C5.
+// This case equivalent set is a special set and needs special handling while
+// considering "case equivalent" under the ECMA262 RegExp "i" mode, which is
+// different from the Unicode Standard:
+//  * U+212B should be included into the "ignore" set because no other
+//    characters are considered "case equivalent" to it under the ECMA262 "i"
+//    mode: U+212B is itself uppercase, but neither U+00C5 nor U+00E5
+//    uppercases to U+212B.
+//  * U+00C5 and U+00E5 will both be included into the "special add" set. While
+//    calculating the "equivalent set" under ECMA262 "i" mode, the process will
+//    add U+00E5, because it is not an uppercase character in the set. The
+//    process will also add U+00C5, because it is the uppercase character which
+//    the other non-uppercase character, U+00E5, uppercases into.
+//
+// For characters not included in "ignore set" and "special add set", the
+// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
+// much faster.
+//
+// Under Unicode 12.0, there are only 7 characters in the "special add set" and
+// 4 characters in "ignore set" so even the special add process is slower, it is
+// limited to a small set of cases only.
+//
+// The implementation of these two functions is generated at build time, by
+// calling ICU icu::UnicodeSet, into gen/src/regexp/special-case.cc by the code
+// in src/regexp/gen-regexp-special-case.cc.
+//
+// These two functions are used with the LazyInstance<> template to create
+// globally shared sets, reducing memory usage and improving performance.
+
+// Builds and returns the "ignore set". The definition is generated at build
+// time by src/regexp/gen-regexp-special-case.cc, which emits code that
+// freezes the set before returning it.
+icu::UnicodeSet BuildIgnoreSet();
+
+// Builds and returns the "special add set". The definition is generated at
+// build time by src/regexp/gen-regexp-special-case.cc, which emits code that
+// freezes the set before returning it.
+icu::UnicodeSet BuildSpecialAddSet();
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_INTL_SUPPORT
+
+#endif  // V8_REGEXP_SPECIAL_CASE_H_