Commit f23f644f authored by Frank Tang's avatar Frank Tang Committed by Commit Bot

Speed up CharacterRange::AddCaseEquivalents

Performance was measured using lexCss("color:").
This change makes lexCss("color:")
  21x - 40x faster than trunk.
  2.3x - 4.6x faster than m74.

Design Doc: http://shorturl.at/adfO5

Measured by out/x64.release/d8 reg977003.js
see reg977003.js attached to chromium:977003

Also see another cl of benchmark in
https://chromium-review.googlesource.com/c/v8/v8/+/1679651/


Bug: chromium:977003
Change-Id: Ie8518493d2c33df1594be1b4576bda715087b421
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1674851
Commit-Queue: Frank Tang <ftang@chromium.org>
Reviewed-by: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62471}
parent a420d20c
......@@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") {
"src/regexp/regexp-nodes.h",
"src/regexp/regexp-parser.cc",
"src/regexp/regexp-parser.h",
"src/regexp/regexp-special-case.h",
"src/regexp/regexp-stack.cc",
"src/regexp/regexp-stack.h",
"src/regexp/regexp-utils.cc",
......@@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") {
]
if (v8_enable_i18n_support) {
deps += [ ":run_gen-regexp-special-case" ]
sources += [ "$target_gen_dir/src/regexp/special-case.cc" ]
if (is_win) {
deps += [ "//third_party/icu:icudata" ]
}
......@@ -3907,6 +3910,48 @@ v8_executable("torque-language-server") {
}
}
# Executable built from src/regexp/gen-regexp-special-case.cc. It is run by the
# "run_gen-regexp-special-case" action to generate special-case.cc.
v8_executable("gen-regexp-special-case") {
  # Only targets in this file can depend on this.
  visibility = [ ":*" ]

  configs = [ ":internal_config" ]

  sources = [ "src/regexp/gen-regexp-special-case.cc" ]

  deps = [
    ":v8_libbase",
    "//build/win:default_exe_manifest",
    "//third_party/icu",
  ]
}
# Runs the gen-regexp-special-case executable through tools/run.py to produce
# $target_gen_dir/src/regexp/special-case.cc.
action("run_gen-regexp-special-case") {
  visibility = [ ":*" ]  # Only targets in this file can depend on this.

  script = "tools/run.py"

  sources = v8_extra_library_files

  # The binary passed to the script in args is looked up in the root_out_dir of
  # $v8_generator_toolchain, so depend on the executable built with that same
  # toolchain. An unqualified label would build it with the current toolchain
  # instead, which is a different binary whenever the two toolchains differ.
  deps = [
    ":gen-regexp-special-case($v8_generator_toolchain)",
  ]

  output_file = "$target_gen_dir/src/regexp/special-case.cc"

  outputs = [
    output_file,
  ]

  args = [
    # First argument: the generator binary, relative to the build directory.
    "./" + rebase_path(
            get_label_info(":gen-regexp-special-case($v8_generator_toolchain)",
                           "root_out_dir") + "/gen-regexp-special-case",
            root_build_dir),

    # Second argument: the file the generator writes.
    rebase_path(output_file, root_build_dir),
  ]
}
###############################################################################
# Public targets
#
......
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "src/base/logging.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
namespace v8 {
namespace internal {
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
// functions into "src/regexp/special-case.cc".
// See more details in http://shorturl.at/adfO5
// Writes to |out| the C++ source of a function named |func_name| that takes no
// arguments, rebuilds |set| one range at a time with icu::UnicodeSet::add(),
// freezes the result and returns it.
//
// Single-element ranges are emitted as add(c); all other ranges as
// add(start, end). Only the code point ranges of |set| are emitted.
//
// The code points follow a literal "0x" prefix, so they must be printed in
// hexadecimal. The function selects std::hex itself instead of depending on
// the caller having configured the stream, and restores the caller's format
// flags before returning.
void PrintSet(std::ofstream& out, const char* func_name,
              const icu::UnicodeSet& set) {
  const std::ios_base::fmtflags saved_flags = out.flags();
  out << std::hex;

  out << "icu::UnicodeSet " << func_name << "() {\n"
      << " icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t i = 0; i < range_count; i++) {
    const UChar32 start = set.getRangeStart(i);
    const UChar32 end = set.getRangeEnd(i);
    if (start == end) {
      out << " set.add(0x" << start << ");\n";
    } else {
      out << " set.add(0x" << start << ", 0x" << end << ");\n";
    }
  }
  out << " set.freeze();\n"
      << " return set;\n"
      << "}\n";

  out.flags(saved_flags);
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
// Iterate through all chars in BMP except ASCII and Surrogate.
for (UChar32 i = 0x80; i < 0x010000; i++) {
// Ignore those characters which is already processed.
if (!processed.contains(i)) {
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Remember we already processed current.
processed.addAll(current);
// All uppercase characters in current.
icu::UnicodeSet keep_upper(current);
keep_upper.retainAll(upper);
// Check if we have more than one uppercase character in current.
// If there are more than one uppercase character, then it is a special
// set which need to be added into either "Special Add" set or "Ignore"
// set.
int32_t number_of_upper = 0;
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
number_of_upper +=
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
}
if (number_of_upper > 1) {
// Add all non uppercase characters (could be Ll or Mn) to special add
// set.
current.removeAll(upper);
special_add.addAll(current);
// Add the uppercase characters of non uppercase character to
// special add set.
CHECK_GT(current.getRangeCount(), 0);
UChar32 main_upper = u_toupper(current.getRangeStart(0));
special_add.add(main_upper);
// Add all uppercase except the main upper to ignore set.
keep_upper.remove(main_upper);
ignore.addAll(keep_upper);
}
}
}
// Remove any ASCII
special_add.remove(0x0000, 0x007f);
PrintSet(out, "BuildIgnoreSet", ignore);
PrintSet(out, "BuildSpecialAddSet", special_add);
}
// Generates the file |header_filename|: a fixed preamble, the two set-building
// functions written by PrintSpecial(), and the closing namespace / #endif
// lines.
//
// If the file cannot be opened, or writing to it fails, an error is printed to
// std::cerr and the process exits with status 1. Without this check a failure
// would go unnoticed and the tool would still report success.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Cannot open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // std::hex is sticky, so integers written to |out| afterwards are printed in
  // hexadecimal.
  out << std::hex << std::setfill('0') << std::setw(4);
  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
      << "// The following functions are used to build icu::UnicodeSet\n"
      << "// for specical cases different between Unicode and ECMA262.\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";
  PrintSpecial(out);
  out << "\n"
      << "} // namespace internal\n"
      << "} // namespace v8\n"
      << "#endif // V8_INTL_SUPPORT\n";

  // Close explicitly so that a failure while flushing the buffered output is
  // reported through the stream state and can be checked.
  out.close();
  if (out.fail()) {
    std::cerr << "Failed to write " << header_filename << "\n";
    std::exit(1);
  }
}
} // namespace internal
} // namespace v8
// Entry point. Expects exactly one argument: the name of the file to generate.
// With any other argument count a usage message is printed to std::cerr and the
// process exits with status 1.
int main(int argc, const char** argv) {
  const bool has_output_filename = (argc == 2);
  if (has_output_filename) {
    v8::internal::WriteHeader(argv[1]);
    return 0;
  }
  std::cerr << "Usage: " << argv[0] << " <output filename>\n";
  std::exit(1);
}
......@@ -6,6 +6,9 @@
#include "src/execution/isolate.h"
#include "src/regexp/regexp.h"
#ifdef V8_INTL_SUPPORT
#include "src/regexp/special-case.h"
#endif // V8_INTL_SUPPORT
#include "src/strings/unicode-inl.h"
#include "src/zone/zone-list-inl.h"
......@@ -1137,6 +1140,39 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
#ifdef V8_INTL_SUPPORT
// Default-constructible holder for the set returned by BuildIgnoreSet(), used
// as the payload of a base::LazyInstance<>.
struct IgnoreSet {
  const icu::UnicodeSet set;

  IgnoreSet() : set{BuildIgnoreSet()} {}
};
// Default-constructible holder for the set returned by BuildSpecialAddSet(),
// used as the payload of a base::LazyInstance<>.
struct SpecialAddSet {
  const icu::UnicodeSet set;

  SpecialAddSet() : set{BuildSpecialAddSet()} {}
};
// Returns a frozen set holding exactly the 52 ASCII letters [A-Za-z].
icu::UnicodeSet BuildAsciiAToZSet() {
  icu::UnicodeSet letters;
  letters.add('A', 'Z');
  letters.add('a', 'z');
  letters.freeze();
  return letters;
}
// Default-constructible holder for the set returned by BuildAsciiAToZSet(),
// used as the payload of a base::LazyInstance<>.
struct AsciiAToZSet {
  const icu::UnicodeSet set;

  AsciiAToZSet() : set{BuildAsciiAToZSet()} {}
};
// File-local LazyInstance wrappers around the three helper sets; the sets are
// reached through Pointer()->set.
namespace {

base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
    LAZY_INSTANCE_INITIALIZER;
base::LazyInstance<SpecialAddSet>::type special_add_set =
    LAZY_INSTANCE_INITIALIZER;
base::LazyInstance<IgnoreSet>::type ignore_set = LAZY_INSTANCE_INITIALIZER;

}  // namespace
#endif // V8_INTL_SUPPORT
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
......@@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
CharacterRange::Canonicalize(ranges);
int range_count = ranges->length();
#ifdef V8_INTL_SUPPORT
icu::UnicodeSet already_added;
icu::UnicodeSet others;
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 bottom = range.from();
if (bottom > String::kMaxUtf16CodeUnit) continue;
uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
uc32 from = range.from();
if (from > String::kMaxUtf16CodeUnit) continue;
uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
// Nothing to be done for surrogates.
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
if (bottom > String::kMaxOneByteCharCode) continue;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
if (from > String::kMaxOneByteCharCode) continue;
if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
}
others.add(from, to);
}
already_added.add(bottom, top);
icu::Locale locale = icu::Locale::getRoot();
while (bottom <= top) {
icu::UnicodeString upper(bottom);
upper.toUpper(locale);
icu::UnicodeSet expanded(bottom, bottom);
expanded.closeOver(USET_CASE_INSENSITIVE);
for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
UChar32 start = expanded.getRangeStart(i);
UChar32 end = expanded.getRangeEnd(i);
while (start <= end) {
icu::UnicodeString upper2(start);
upper2.toUpper(locale);
// Only add if the upper case are the same.
if (upper[0] == upper2[0]) {
// #sec-runtime-semantics-canonicalize-ch
// 3.g. If the numeric value of ch ≥ 128 and the numeric value of
// cu < 128, return ch.
if (bottom >= 128 && start < 128) {
others.add(bottom);
// Set of characters already added to ranges that do not need to be added
// again.
icu::UnicodeSet already_added(others);
// Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
icu::UnicodeSet in_ascii_a_to_z(others);
in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
// Remove all chars in [a-zA-Z] from others.
others.removeAll(in_ascii_a_to_z);
// Set of characters in ranges that are overlapping with special add set.
icu::UnicodeSet in_special_add(others);
in_special_add.retainAll(special_add_set.Pointer()->set);
others.removeAll(in_special_add);
// Ignore all chars in ignore set.
others.removeAll(ignore_set.Pointer()->set);
// For most of the chars in ranges that is still in others, find the case
// equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
others.closeOver(USET_CASE_INSENSITIVE);
// Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
// but ECMA262 "i" mode won't consider that, remove them from others.
// Ex: U+017F add 'S' and 's' to others.
others.removeAll(ascii_a_to_z_set.Pointer()->set);
// Special handling for in_ascii_a_to_z.
for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
UChar32 start = in_ascii_a_to_z.getRangeStart(i);
UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
// Check if it is uppercase A-Z by checking bit 6.
if (start & 0x0020) {
// Add the lowercases
others.add(start & 0x005F, end & 0x005F);
} else {
// 3.h. 3.h. 3.h. Return cu.
others.add(start);
// Add the uppercases
others.add(start | 0x0020, end | 0x0020);
}
}
start++;
// Special handling for chars in "Special Add" set.
for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
UChar32 end = in_special_add.getRangeEnd(i);
for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
// Add the uppercase of this character if itself is not an uppercase
// character.
// Note: The if condiction cannot be u_islower(ch) because ch could be
// neither uppercase nor lowercase but Mn.
if (!u_isupper(ch)) {
others.add(u_toupper(ch));
}
icu::UnicodeSet candidates(ch, ch);
candidates.closeOver(USET_CASE_INSENSITIVE);
for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
UChar32 end2 = candidates.getRangeEnd(j);
for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
// Add character that is not uppercase to others.
if (!u_isupper(ch2)) {
others.add(ch2);
}
bottom++;
}
}
}
}
// Remove all characters which already in the ranges.
others.removeAll(already_added);
// Add others to the ranges
for (int32_t i = 0; i < others.getRangeCount(); i++) {
UChar32 start = others.getRangeStart(i);
UChar32 end = others.getRangeEnd(i);
if (start == end) {
ranges->Add(CharacterRange::Singleton(start), zone);
UChar32 from = others.getRangeStart(i);
UChar32 to = others.getRangeEnd(i);
if (from == to) {
ranges->Add(CharacterRange::Singleton(from), zone);
} else {
ranges->Add(CharacterRange::Range(start, end), zone);
ranges->Add(CharacterRange::Range(from, to), zone);
}
}
#else
......
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_SPECIAL_CASE_H_
#define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT
#include "unicode/uversion.h"
namespace U_ICU_NAMESPACE {
class UnicodeSet;
} // namespace U_ICU_NAMESPACE
namespace v8 {
namespace internal {
// Functions to build special sets of Unicode characters that need special
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
//
// For the characters in the "ignore set", the process should not treat other
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
// equivalent under the ECMA262 RegExp "i" mode, because these characters are
// themselves uppercase and no other character in the set uppercases to them.
//
// For the characters in the "special add set", the process should add only
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
// not uppercase characters as case equivalent under the ECMA262 RegExp "i"
// mode, and also the ONE uppercase character that the other non-uppercase
// characters uppercase into. Other uppercase characters in the result of
// closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
// RegExp "i" mode considers two characters "case equivalent" only if both
// characters uppercase to the same character.
//
// For example, consider the following case equivalent set defined by Unicode
// standard. Notice there are more than one uppercase characters in this set:
// U+212B Å Angstrom Sign - an uppercase character.
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
// uppercase to U+00C5.
// This case equivalent set is a special set and needs special handling when
// considering "case equivalent" under the ECMA262 RegExp "i" mode, which is
// different from the Unicode Standard:
// * U+212B should be included into the "ignore" set because, under the ECMA262
// "i" mode, no other character is considered "case equivalent" to it:
// U+212B is itself an uppercase character, but neither U+00C5 nor U+00E5
// uppercases to U+212B.
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
// calculating the "equivalent set" under ECMA262 "i" mode, the process will
// add U+00E5, because it is not an uppercase character in the set. The
// process will also add U+00C5, because it is the uppercase character which
// the other non-uppercase character, U+00E5, uppercases into.
//
// For characters not included in the "ignore set" or the "special add set",
// the process will just use closeOver(USET_CASE_INSENSITIVE) to calculate the
// result, which is much faster.
//
// Under Unicode 12.0, there are only 7 characters in the "special add set" and
// 4 characters in the "ignore set", so even though the special add process is
// slower, it is limited to a small set of cases only.
//
// The implementation of these two functions is generated at build time, by
// calling ICU icu::UnicodeSet, into gen/src/regexp/special-case.cc by the code
// in src/regexp/gen-regexp-special-case.cc.
//
// These two functions are used with the LazyInstance<> template to create
// globally shared sets, which reduces memory usage and improves performance.
// Builds and returns the "ignore set" described above. The definition is not
// hand-written: it is emitted at build time by
// src/regexp/gen-regexp-special-case.cc.
icu::UnicodeSet BuildIgnoreSet();
// Builds and returns the "special add set" described above. The definition is
// likewise emitted at build time by src/regexp/gen-regexp-special-case.cc.
icu::UnicodeSet BuildSpecialAddSet();
} // namespace internal
} // namespace v8
#endif // V8_INTL_SUPPORT
#endif // V8_REGEXP_SPECIAL_CASE_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment