Commit abbb54ed authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Extend case-insensitive handling in RationalizeConsecutiveAtoms

Apply case-insensitive comparisons not only for the initial character,
but for the entire prefix. This avoids degenerate behavior for patterns
like /aaaa|AAAA|AAAA/i (i.e. generate a single 4-char prefix instead of
four 1-char prefixes).

Bug: v8:12472
Change-Id: Ib2b49fe73ca846a1b7ec90056cc64bdf5cf33026
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3398114
Reviewed-by: Patrick Thier <pthier@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#78668}
parent cbddd61d
...@@ -519,12 +519,29 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { ...@@ -519,12 +519,29 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
// Case Insensitve comparesion int CompareCaseInsensitive(const icu::UnicodeString& a,
int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) { const icu::UnicodeString& b) {
return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
}
int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom(); RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom(); RegExpAtom* atom2 = (*b)->AsAtom();
icu::UnicodeString character1(atom1->data().at(0)); return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)},
return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT); icu::UnicodeString{atom2->data().at(0)});
}
bool Equals(bool ignore_case, const icu::UnicodeString& a,
const icu::UnicodeString& b) {
if (a == b) return true;
if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
return false; // Case-sensitive equality already checked above.
}
bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a,
const RegExpAtom* b) {
return Equals(ignore_case, a->data().at(index), b->data().at(index));
} }
#else #else
...@@ -540,20 +557,43 @@ unibrow::uchar Canonical( ...@@ -540,20 +557,43 @@ unibrow::uchar Canonical(
return canonical; return canonical;
} }
int CompareFirstCharCaseIndependent( int CompareCaseInsensitive(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
unibrow::uchar a, unibrow::uchar b) {
if (a == b) return 0;
if (a >= 'a' || b >= 'a') {
a = Canonical(canonicalize, a);
b = Canonical(canonicalize, b);
}
return static_cast<int>(a) - static_cast<int>(b);
}
int CompareFirstCharCaseInsensitive(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
RegExpTree* const* a, RegExpTree* const* b) { RegExpTree* const* a, RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom(); RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom(); RegExpAtom* atom2 = (*b)->AsAtom();
unibrow::uchar character1 = atom1->data().at(0); return CompareCaseInsensitive(canonicalize, atom1->data().at(0),
unibrow::uchar character2 = atom2->data().at(0); atom2->data().at(0));
if (character1 == character2) return 0; }
if (character1 >= 'a' || character2 >= 'a') {
character1 = Canonical(canonicalize, character1); bool Equals(bool ignore_case,
character2 = Canonical(canonicalize, character2); unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
} unibrow::uchar a, unibrow::uchar b) {
return static_cast<int>(character1) - static_cast<int>(character2); if (a == b) return true;
if (ignore_case) {
return CompareCaseInsensitive(canonicalize, a, b) == 0;
}
return false; // Case-sensitive equality already checked above.
}
bool CharAtEquals(bool ignore_case,
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
int index, const RegExpAtom* a, const RegExpAtom* b) {
return Equals(ignore_case, canonicalize, a->data().at(index),
b->data().at(index));
} }
#endif // V8_INTL_SUPPORT #endif // V8_INTL_SUPPORT
} // namespace } // namespace
...@@ -591,14 +631,14 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -591,14 +631,14 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
DCHECK_LE(first_atom, i); DCHECK_LE(first_atom, i);
if (IsIgnoreCase(compiler->flags())) { if (IsIgnoreCase(compiler->flags())) {
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom, alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom,
i - first_atom); i - first_atom);
#else #else
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize(); compiler->isolate()->regexp_macro_assembler_canonicalize();
auto compare_closure = [canonicalize](RegExpTree* const* a, auto compare_closure = [canonicalize](RegExpTree* const* a,
RegExpTree* const* b) { RegExpTree* const* b) {
return CompareFirstCharCaseIndependent(canonicalize, a, b); return CompareFirstCharCaseInsensitive(canonicalize, a, b);
}; };
alternatives->StableSort(compare_closure, first_atom, i - first_atom); alternatives->StableSort(compare_closure, first_atom, i - first_atom);
#endif // V8_INTL_SUPPORT #endif // V8_INTL_SUPPORT
...@@ -615,6 +655,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -615,6 +655,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
Zone* zone = compiler->zone(); Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives(); ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length(); int length = alternatives->length();
const bool ignore_case = IsIgnoreCase(compiler->flags());
int write_posn = 0; int write_posn = 0;
int i = 0; int i = 0;
...@@ -629,7 +670,12 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -629,7 +670,12 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
icu::UnicodeString common_prefix(atom->data().at(0)); icu::UnicodeString common_prefix(atom->data().at(0));
#else #else
unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
unibrow::uchar common_prefix = atom->data().at(0); unibrow::uchar common_prefix = atom->data().at(0);
if (ignore_case) {
common_prefix = Canonical(canonicalize, common_prefix);
}
#endif // V8_INTL_SUPPORT #endif // V8_INTL_SUPPORT
int first_with_prefix = i; int first_with_prefix = i;
int prefix_length = atom->length(); int prefix_length = atom->length();
...@@ -640,21 +686,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -640,21 +686,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
RegExpAtom* const alt_atom = alternative->AsAtom(); RegExpAtom* const alt_atom = alternative->AsAtom();
#ifdef V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT
icu::UnicodeString new_prefix(alt_atom->data().at(0)); icu::UnicodeString new_prefix(alt_atom->data().at(0));
if (new_prefix != common_prefix) { if (!Equals(ignore_case, new_prefix, common_prefix)) break;
if (!IsIgnoreCase(compiler->flags())) break;
if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
break;
}
#else #else
unibrow::uchar new_prefix = alt_atom->data().at(0); unibrow::uchar new_prefix = alt_atom->data().at(0);
if (new_prefix != common_prefix) { if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break;
if (!IsIgnoreCase(compiler->flags())) break;
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
new_prefix = Canonical(canonicalize, new_prefix);
common_prefix = Canonical(canonicalize, common_prefix);
if (new_prefix != common_prefix) break;
}
#endif // V8_INTL_SUPPORT #endif // V8_INTL_SUPPORT
prefix_length = std::min(prefix_length, alt_atom->length()); prefix_length = std::min(prefix_length, alt_atom->length());
i++; i++;
...@@ -672,7 +707,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -672,7 +707,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
RegExpAtom* old_atom = RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom(); alternatives->at(j + first_with_prefix)->AsAtom();
for (int k = 1; k < prefix_length; k++) { for (int k = 1; k < prefix_length; k++) {
if (alt_atom->data().at(k) != old_atom->data().at(k)) { #ifdef V8_INTL_SUPPORT
if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) {
#else
if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) {
#endif // V8_INTL_SUPPORT
prefix_length = k; prefix_length = k;
break; break;
} }
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Two very long single-letter runs (0x10000 characters each) that differ only
// in case, so all three alternatives below are equal under the 'i' flag.
const manyAs = 'A'.repeat(0x10000);
const manyas = manyAs.toLowerCase();

// Anchored, case-insensitive disjunction: ^(?:aaa...|AAA...|AAA...)$
const re = new RegExp(`^(?:${manyas}|${manyAs}|${manyAs})$`, 'i');

// Shouldn't crash with a stack overflow.
assertThrows(() => {
  manyas.replace(re, manyAs);
});
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment