Commit 3fab9d05, authored by Iain Ireland, committed by Commit Bot

[regexp] Fix and unify non-unicode case-folding algorithms

Non-unicode, case-insensitive regexps (e.g. /foo/i, not /foo/iu) use a
case-folding algorithm that doesn't quite match the Unicode
definition. There are two places in irregexp that need to do
case-folding. Prior to this patch, neither of them quite matched the
spec (https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch).

This patch implements the "Canonicalize" algorithm in
src/regexp/special-case.h, and uses it in the relevant places. It
replaces special-case logic around upper-casing / ASCII characters
with the following approach:

1. For most characters, calling UnicodeSet::closeOver on a set
   containing that character will produce the correct set of
   case-insensitive matches.

2. For a small handful of characters (like the sharp S that prompted
   this change), UnicodeSet::closeOver will include some characters
   that should be omitted. For example, although closeOver('ß') =
   "ßẞ", uppercase('ß') is "SS", so step 3.e means that 'ß'
   canonicalizes to itself, and should not match 'ẞ'. In these cases,
   we can skip the closeOver entirely, because it will never add an
   equivalent character. These characters are in the IgnoreSet.

3. For an even smaller handful of characters, UnicodeSet::closeOver
   will produce some characters that should be omitted, but also some
   characters that should be included. For example, closeOver('k') =
   "kKK" (lowercase k, uppercase K, U+212A KELVIN SIGN), but KELVIN
   SIGN should not match either of the other two (step 3.g). To handle
   this, we put such characters in the SpecialAddSet. In these cases,
   we closeOver the original character, but filter out the results
   that do not have the same canonical value.

The computation of IgnoreSet and SpecialAddSet happens at build time,
using the pre-existing gen-regexp-special-case.cc step.

R=jgruber@chromium.org

Bug: v8:10248
Change-Id: I00d48b180c83bb8e645cc59eda57b01eab134f0b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2072858
Reviewed-by: Frank Tang <ftang@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66641}
parent 61d496a6
...@@ -97,6 +97,7 @@ Henrique Ferreiro <henrique.ferreiro@gmail.com> ...@@ -97,6 +97,7 @@ Henrique Ferreiro <henrique.ferreiro@gmail.com>
Hirofumi Mako <mkhrfm@gmail.com> Hirofumi Mako <mkhrfm@gmail.com>
Honggyu Kim <honggyu.kp@gmail.com> Honggyu Kim <honggyu.kp@gmail.com>
Huáng Jùnliàng <jlhwung@gmail.com> Huáng Jùnliàng <jlhwung@gmail.com>
Iain Ireland <iireland@mozilla.com>
Ingvar Stepanyan <me@rreverser.com> Ingvar Stepanyan <me@rreverser.com>
Ioseb Dzmanashvili <ioseb.dzmanashvili@gmail.com> Ioseb Dzmanashvili <ioseb.dzmanashvili@gmail.com>
Isiah Meadows <impinball@gmail.com> Isiah Meadows <impinball@gmail.com>
......
// Copyright 2019 the V8 project authors. All rights reserved. // Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
...@@ -7,19 +7,19 @@ ...@@ -7,19 +7,19 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "src/base/logging.h" #include "src/regexp/special-case.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() static const uc32 kSurrogateStart = 0xd800;
// functions into "src/regexp/special-case.cc". static const uc32 kSurrogateEnd = 0xdfff;
// See more details in http://shorturl.at/adfO5 static const uc32 kNonBmpStart = 0x10000;
void PrintSet(std::ofstream& out, const char* func_name,
// The following code generates "src/regexp/special-case.cc".
void PrintSet(std::ofstream& out, const char* name,
const icu::UnicodeSet& set) { const icu::UnicodeSet& set) {
out << "icu::UnicodeSet " << func_name << "() {\n" out << "icu::UnicodeSet Build" << name << "() {\n"
<< " icu::UnicodeSet set;\n"; << " icu::UnicodeSet set;\n";
for (int32_t i = 0; i < set.getRangeCount(); i++) { for (int32_t i = 0; i < set.getRangeCount(); i++) {
if (set.getRangeStart(i) == set.getRangeEnd(i)) { if (set.getRangeStart(i) == set.getRangeEnd(i)) {
...@@ -31,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name, ...@@ -31,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
} }
out << " set.freeze();\n" out << " set.freeze();\n"
<< " return set;\n" << " return set;\n"
<< "}\n"; << "}\n\n";
out << "struct " << name << "Data {\n"
<< " " << name << "Data() : set(Build" << name << "()) {}\n"
<< " const icu::UnicodeSet set;\n"
<< "};\n\n";
out << "//static\n"
<< "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
<< " static base::LazyInstance<" << name << "Data>::type set =\n"
<< " LAZY_INSTANCE_INITIALIZER;\n"
<< " return set.Pointer()->set;\n"
<< "}\n\n";
} }
void PrintSpecial(std::ofstream& out) { void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current; icu::UnicodeSet current;
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add; icu::UnicodeSet special_add;
icu::UnicodeSet ignore; icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status); icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status)); CHECK(U_SUCCESS(status));
// Iterate through all chars in BMP except ASCII and Surrogate.
for (UChar32 i = 0x80; i < 0x010000; i++) {
// Ignore those characters which is already processed.
if (!processed.contains(i)) {
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Remember we already processed current. // Iterate through all chars in BMP except surrogates.
processed.addAll(current); for (UChar32 i = 0; i < kNonBmpStart; i++) {
if (i >= kSurrogateStart && i <= kSurrogateEnd) {
// All uppercase characters in current. continue; // Ignore surrogate range
icu::UnicodeSet keep_upper(current); }
keep_upper.retainAll(upper); current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Check if we have more than one uppercase character in current.
// If there are more than one uppercase character, then it is a special // Check to see if all characters in the case-folding equivalence
// set which need to be added into either "Special Add" set or "Ignore" // class as defined by UnicodeSet::closeOver all map to the same
// set. // canonical value.
int32_t number_of_upper = 0; UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { bool class_has_matching_canonical_char = false;
number_of_upper += bool class_has_non_matching_canonical_char = false;
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
c++) {
if (c == i) {
continue;
}
UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
if (canonical == other_canonical) {
class_has_matching_canonical_char = true;
} else {
class_has_non_matching_canonical_char = true;
}
}
}
// If any other character in i's equivalence class has a
// different canonical value, then i needs special handling. If
// no other character shares a canonical value with i, we can
// ignore i when adding alternatives for case-independent
// comparison. If at least one other character shares a
// canonical value, then i needs special handling.
if (class_has_non_matching_canonical_char) {
if (class_has_matching_canonical_char) {
special_add.add(i);
} else {
ignore.add(i);
} }
if (number_of_upper > 1) { }
// Add all non uppercase characters (could be Ll or Mn) to special add }
// set.
current.removeAll(upper); // Verify that no Unicode equivalence class contains two non-trivial
special_add.addAll(current); // JS equivalence classes. Every character in SpecialAddSet has the
// same canonical value as every other non-IgnoreSet character in
// Add the uppercase characters of non uppercase character to // its Unicode equivalence class. Therefore, if we call closeOver on
// special add set. // a set containing no IgnoreSet characters, the only characters
CHECK_GT(current.getRangeCount(), 0); // that must be removed from the result are in IgnoreSet. This fact
UChar32 main_upper = u_toupper(current.getRangeStart(0)); // is used in CharacterRange::AddCaseEquivalents.
special_add.add(main_upper); for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
for (UChar32 c = special_add.getRangeStart(i);
// Add all uppercase except the main upper to ignore set. c <= special_add.getRangeEnd(i); c++) {
keep_upper.remove(main_upper); UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
ignore.addAll(keep_upper); current.set(c, c);
current.closeOver(USET_CASE_INSENSITIVE);
current.removeAll(ignore);
for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c2 = current.getRangeStart(j);
c2 <= current.getRangeEnd(j); c2++) {
CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
}
} }
} }
} }
// Remove any ASCII PrintSet(out, "IgnoreSet", ignore);
special_add.remove(0x0000, 0x007f); PrintSet(out, "SpecialAddSet", special_add);
PrintSet(out, "BuildIgnoreSet", ignore);
PrintSet(out, "BuildSpecialAddSet", special_add);
} }
void WriteHeader(const char* header_filename) { void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename); std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4); out << std::hex << std::setfill('0') << std::setw(4);
out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" << "// Use of this source code is governed by a BSD-style license that\n"
<< "// The following functions are used to build icu::UnicodeSet\n" << "// can be found in the LICENSE file.\n\n"
<< "// for specical cases different between Unicode and ECMA262.\n" << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
<< "// The following functions are used to build UnicodeSets\n"
<< "// for special cases where the case-folding algorithm used by\n"
<< "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
<< "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
<< "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n" << "#ifdef V8_INTL_SUPPORT\n"
<< "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n" << "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n" << "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n" << "namespace v8 {\n"
......
...@@ -1140,39 +1140,6 @@ Vector<const int> CharacterRange::GetWordBounds() { ...@@ -1140,39 +1140,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1); return Vector<const int>(kWordRanges, kWordRangeCount - 1);
} }
#ifdef V8_INTL_SUPPORT
struct IgnoreSet {
IgnoreSet() : set(BuildIgnoreSet()) {}
const icu::UnicodeSet set;
};
struct SpecialAddSet {
SpecialAddSet() : set(BuildSpecialAddSet()) {}
const icu::UnicodeSet set;
};
icu::UnicodeSet BuildAsciiAToZSet() {
icu::UnicodeSet set('a', 'z');
set.add('A', 'Z');
set.freeze();
return set;
}
struct AsciiAToZSet {
AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
const icu::UnicodeSet set;
};
static base::LazyInstance<IgnoreSet>::type ignore_set =
LAZY_INSTANCE_INITIALIZER;
static base::LazyInstance<SpecialAddSet>::type special_add_set =
LAZY_INSTANCE_INITIALIZER;
static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
LAZY_INSTANCE_INITIALIZER;
#endif // V8_INTL_SUPPORT
// static // static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, ZoneList<CharacterRange>* ranges,
...@@ -1195,75 +1162,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ...@@ -1195,75 +1162,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
others.add(from, to); others.add(from, to);
} }
// Set of characters already added to ranges that do not need to be added // Compute the set of additional characters that should be added,
// again. // using UnicodeSet::closeOver. ECMA 262 defines slightly different
// case-folding rules than Unicode, so some characters that are
// added by closeOver do not match anything other than themselves in
// JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
// same case-insensitive character as 's' or 'S' according to
// Unicode, but does not match any other character in JS. To handle
// this case, we add such characters to the IgnoreSet and filter
// them out. We filter twice: once before calling closeOver (to
// prevent 'ſ' from adding 's'), and once after calling closeOver
// (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
// more information.
icu::UnicodeSet already_added(others); icu::UnicodeSet already_added(others);
others.removeAll(RegExpCaseFolding::IgnoreSet());
// Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
icu::UnicodeSet in_ascii_a_to_z(others);
in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
// Remove all chars in [a-zA-Z] from others.
others.removeAll(in_ascii_a_to_z);
// Set of characters in ranges that are overlapping with special add set.
icu::UnicodeSet in_special_add(others);
in_special_add.retainAll(special_add_set.Pointer()->set);
others.removeAll(in_special_add);
// Ignore all chars in ignore set.
others.removeAll(ignore_set.Pointer()->set);
// For most of the chars in ranges that is still in others, find the case
// equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
others.closeOver(USET_CASE_INSENSITIVE); others.closeOver(USET_CASE_INSENSITIVE);
others.removeAll(RegExpCaseFolding::IgnoreSet());
// Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
// but ECMA262 "i" mode won't consider that, remove them from others.
// Ex: U+017F add 'S' and 's' to others.
others.removeAll(ascii_a_to_z_set.Pointer()->set);
// Special handling for in_ascii_a_to_z.
for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
UChar32 start = in_ascii_a_to_z.getRangeStart(i);
UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
// Check if it is uppercase A-Z by checking bit 6.
if (start & 0x0020) {
// Add the lowercases
others.add(start & 0x005F, end & 0x005F);
} else {
// Add the uppercases
others.add(start | 0x0020, end | 0x0020);
}
}
// Special handling for chars in "Special Add" set.
for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
UChar32 end = in_special_add.getRangeEnd(i);
for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
// Add the uppercase of this character if itself is not an uppercase
// character.
// Note: The if condiction cannot be u_islower(ch) because ch could be
// neither uppercase nor lowercase but Mn.
if (!u_isupper(ch)) {
others.add(u_toupper(ch));
}
icu::UnicodeSet candidates(ch, ch);
candidates.closeOver(USET_CASE_INSENSITIVE);
for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
UChar32 end2 = candidates.getRangeEnd(j);
for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
// Add character that is not uppercase to others.
if (!u_isupper(ch2)) {
others.add(ch2);
}
}
}
}
}
// Remove all characters which already in the ranges.
others.removeAll(already_added); others.removeAll(already_added);
// Add others to the ranges // Add others to the ranges
......
...@@ -9,6 +9,9 @@ ...@@ -9,6 +9,9 @@
#include "src/objects/objects-inl.h" #include "src/objects/objects-inl.h"
#include "src/regexp/regexp-macro-assembler-arch.h" #include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-macro-assembler-tracer.h" #include "src/regexp/regexp-macro-assembler-tracer.h"
#ifdef V8_INTL_SUPPORT
#include "src/regexp/special-case.h"
#endif // V8_INTL_SUPPORT
#include "src/strings/unicode-inl.h" #include "src/strings/unicode-inl.h"
#include "src/zone/zone-list-inl.h" #include "src/zone/zone-list-inl.h"
...@@ -725,32 +728,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, ...@@ -725,32 +728,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
unibrow::uchar* letters, unibrow::uchar* letters,
int letter_length) { int letter_length) {
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
// Special case for U+017F which has upper case in ASCII range. if (RegExpCaseFolding::IgnoreSet().contains(character)) {
if (character == 0x017f) {
letters[0] = character; letters[0] = character;
return 1; return 1;
} }
bool in_special_add_set =
RegExpCaseFolding::SpecialAddSet().contains(character);
icu::UnicodeSet set; icu::UnicodeSet set;
set.add(character); set.add(character);
set = set.closeOver(USET_CASE_INSENSITIVE); set = set.closeOver(USET_CASE_INSENSITIVE);
UChar32 canon = 0;
if (in_special_add_set) {
canon = RegExpCaseFolding::Canonicalize(character);
}
int32_t range_count = set.getRangeCount(); int32_t range_count = set.getRangeCount();
int items = 0; int items = 0;
for (int32_t i = 0; i < range_count; i++) { for (int32_t i = 0; i < range_count; i++) {
UChar32 start = set.getRangeStart(i); UChar32 start = set.getRangeStart(i);
UChar32 end = set.getRangeEnd(i); UChar32 end = set.getRangeEnd(i);
CHECK(end - start + items <= letter_length); CHECK(end - start + items <= letter_length);
// Only add to the output if character is not in ASCII range for (UChar32 cu = start; cu <= end; cu++) {
// or the case equivalent character is in ASCII range. if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
// #sec-runtime-semantics-canonicalize-ch if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
// 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128, continue;
// return ch.
if (!((start >= 128) && (character < 128))) {
// No range have start and end span across code point 128.
DCHECK((start >= 128) == (end >= 128));
for (UChar32 cu = start; cu <= end; cu++) {
if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
letters[items++] = (unibrow::uchar)(cu);
} }
letters[items++] = (unibrow::uchar)(cu);
} }
} }
return items; return items;
......
...@@ -6,70 +6,109 @@ ...@@ -6,70 +6,109 @@
#define V8_REGEXP_SPECIAL_CASE_H_ #define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
#include "unicode/uversion.h" #include "src/base/logging.h"
namespace U_ICU_NAMESPACE { #include "src/common/globals.h"
class UnicodeSet;
} // namespace U_ICU_NAMESPACE #include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
// Functions to build special sets of Unicode characters that need special // Sets of Unicode characters that need special handling under "i" mode
// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
// defines slightly different case-folding rules than Unicode. An
// input character should match a pattern character if the result of
// the Canonicalize algorithm is the same for both characters.
// //
// For the characters in the "ignore set", the process should not treat other // Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case // c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
// equivlant under the ECMA262 RegExp "i" mode because these characters are // string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
// uppercase themselves that no other characters in the set uppercase to. // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
// the precise definition.
// //
// For the characters in the "special add set", the proecess should add only // While compiling such regular expressions, we need to compute the
// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is // set of characters that should match a given input character. (See
// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode // GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
// and also that ONE uppercase character that other non uppercase character // For almost all characters, this can be efficiently computed using
// uppercase into to the set. Other uppercase characters in the result of // UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 // the remaining special cases.
// RegExp "i" mode consider two characters as "case equivlant" if both
// characters uppercase to the same character.
// //
// For example, consider the following case equivalent set defined by Unicode // For a character c, the rules are as follows:
// standard. Notice there are more than one uppercase characters in this set:
// U+212B Å Angstrom Sign - an uppercase character.
// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
// uppercase to U+00C5.
// In this case equivlant set is a special set and need special handling while
// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
// different than Unicode Standard:
// * U+212B should be included into the "ignore" set because there are no other
// characters, under the ECMA262 "i" mode, are considered as "case equivlant"
// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
// uppercase to U+212B.
// * U+00C5 and U+00E5 will both be included into the "special add" set. While
// calculate the "equivlant set" under ECMA262 "i" mode, the process will
// add U+00E5, because it is not an uppercase character in the set. The
// process will also add U+00C5, because it is the uppercase character which
// other non uppercase character, U+00C5, uppercase into.
// //
// For characters not included in "ignore set" and "special add set", the // 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is // UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
// much faster. // containing c will produce the set of characters that should
// match /c/i (or /[c]/i), and only those characters.
// //
// Under Unicode 12.0, there are only 7 characters in the "special add set" and // 2. If c is in IgnoreSet, then the only character it should match is
// 4 characters in "ignore set" so even the special add process is slower, it is // itself. However, closeOver will add additional incorrect
// limited to a small set of cases only. // matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
// itself, and should not match 'ẞ'. In these cases, we can skip
// the closeOver entirely, because it will never add an equivalent
// character.
// //
// The implementation of these two function will be generated by calling ICU // 3. If c is in SpecialAddSet, then it should match at least one
// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by // character other than itself. However, closeOver will add at
// the code in src/regexp/gen-regexp-special-case.cc. // least one additional incorrect match. For example, consider the
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
// SIGN should not match either of the other two characters. As a
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
// IgnoreSet). To find the correct matches for characters in
// SpecialAddSet, we closeOver the original character, but filter
// out the results that do not have the same canonical value.
// //
// These two function will be used with LazyInstance<> template to generate // The contents of these sets are calculated at build time by
// global sharable set to reduce memory usage and speed up performance. // src/regexp/gen-regexp-special-case.cc, which generates
// gen/src/regexp/special-case.cc. This is done by iterating over the
// result of closeOver for each BMP character, and finding sets for
// which at least one character has a different canonical value than
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.
class RegExpCaseFolding final : public AllStatic {
public:
static const icu::UnicodeSet& IgnoreSet();
static const icu::UnicodeSet& SpecialAddSet();
// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
// Canonicalize) step 3, which is used to determine whether
// characters match when ignoreCase is true and unicode is false.
static UChar32 Canonicalize(UChar32 ch) {
// a. Assert: ch is a UTF-16 code unit.
CHECK_LE(ch, 0xffff);
// b. Let s be the String value consisting of the single code unit ch.
icu::UnicodeString s(ch);
// c. Let u be the same result produced as if by performing the algorithm
// for String.prototype.toUpperCase using s as the this value.
// d. Assert: Type(u) is String.
icu::UnicodeString& u = s.toUpper();
// e. If u does not consist of a single code unit, return ch.
if (u.length() != 1) {
return ch;
}
// f. Let cu be u's single code unit element.
UChar32 cu = u.char32At(0);
// Function to build and return the Ignore set. // g. If the value of ch >= 128 and the value of cu < 128, return ch.
icu::UnicodeSet BuildIgnoreSet(); if (ch >= 128 && cu < 128) {
return ch;
}
// Function to build and return the Special Add set. // h. Return cu.
icu::UnicodeSet BuildSpecialAddSet(); return cu;
}
};
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Reference implementation of step 3 of the Canonicalize operation used for
// case-insensitive, non-unicode regexps. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
//
// ch: a string holding the single UTF-16 code unit to canonicalize.
// Returns the numeric value of the canonical code unit. The result is a
// number on every path, so two results can be compared directly with ===.
function Canonicalize(ch) {
  var code = ch.charCodeAt(0);
  var u = ch.toUpperCase();
  // Step 3.e: if upper-casing does not yield exactly one code unit (e.g. a
  // multi-character expansion), ch canonicalizes to itself.
  if (u.length !== 1) return code;
  var cu = u.charCodeAt(0);
  // Step 3.g: a non-ASCII code unit never canonicalizes to an ASCII one.
  if (code >= 128 && cu < 128) return code;
  return cu;
}
// Checks every ordered pair of distinct positions in eclass: a
// case-insensitive regexp built from the first member (both as a bare
// pattern and inside a character class) must match the second member
// exactly when both members have the same Canonicalize() result.
function TestEquivalenceClass(eclass) {
  const memberCount = eclass.length;
  for (let patternIdx = 0; patternIdx < memberCount; patternIdx++) {
    for (let subjectIdx = 0; subjectIdx < memberCount; subjectIdx++) {
      if (patternIdx !== subjectIdx) {
        const patternChar = eclass[patternIdx];
        const subjectChar = eclass[subjectIdx];
        const shouldMatch =
            Canonicalize(patternChar) === Canonicalize(subjectChar);
        // Exercise both the single-atom path and the character-class path.
        const atomRegExp = new RegExp(patternChar, 'i');
        const classRegExp = new RegExp('[' + patternChar + ']', 'i');
        assertEquals(atomRegExp.test(subjectChar), shouldMatch);
        assertEquals(classRegExp.test(subjectChar), shouldMatch);
      }
    }
  }
}
// Runs TestEquivalenceClass over each entry of equivalence_classes, in order.
function TestAll() {
  for (let idx = 0; idx < equivalence_classes.length; idx++) {
    TestEquivalenceClass(equivalence_classes[idx]);
  }
}
// Interesting case-folding equivalence classes (as determined by
// ICU's UnicodeSet::closeOver). A class is interesting if it contains
// more than two characters, or if it contains any characters in
// IgnoreSet or SpecialAddSet as defined in src/regexp/special-case.h.
// Each entry is a string whose code units are the members of one class.
var equivalence_classes = [
'\u0041\u0061', // A, a (sanity check)
'\u004b\u006b\u212a', // K, k, U+212A KELVIN SIGN
'\u0053\u0073\u017f', // S, s, U+017F LATIN SMALL LETTER LONG S
'\u00b5\u039c\u03bc', // µΜμ
'\u00c5\u00e5\u212b', // U+00C5, U+00E5 (A with ring above), U+212B ANGSTROM SIGN
'\u00df\u1e9e', // U+00DF, U+1E9E (small and capital sharp s)
'\u03a9\u03c9\u2126', // Greek capital/small omega, U+2126 OHM SIGN
'\u0390\u1fd3', // U+0390, U+1FD3
'\u0398\u03b8\u03d1\u03f4', // Θθϑϴ
'\u03b0\u1fe3', // U+03B0, U+1FE3
'\u1f80\u1f88', // ᾀᾈ
'\u1fb3\u1fbc', // ᾳᾼ
'\u1fc3\u1fcc', // ῃῌ
'\u1ff3\u1ffc', // ῳῼ
'\ufb05\ufb06', // U+FB05, U+FB06 (the two "st" ligatures)
// Everything below this line is a well-behaved case-folding
// equivalence class with more than two characters but only one
// canonical case-folded character
'\u01c4\u01c5\u01c6', '\u01c7\u01c8\u01c9', '\u01ca\u01cb\u01cc',
'\u01f1\u01f2\u01f3', '\u0345\u0399\u03b9\u1fbe', '\u0392\u03b2\u03d0',
'\u0395\u03b5\u03f5', '\u039a\u03ba\u03f0', '\u03a0\u03c0\u03d6',
'\u03a1\u03c1\u03f1', '\u03a3\u03c2\u03c3', '\u03a6\u03c6\u03d5',
'\u0412\u0432\u1c80', '\u0414\u0434\u1c81', '\u041e\u043e\u1c82',
'\u0421\u0441\u1c83', '\u0422\u0442\u1c84\u1c85', '\u042a\u044a\u1c86',
'\u0462\u0463\u1c87', '\u1c88\ua64a\ua64b', '\u1e60\u1e61\u1e9b'
];
// Run the checks as soon as the file is loaded.
TestAll();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment