Commit daef0ec5, authored by erikcorry and committed by Commit bot

Reland "Extend big-disjunction optimization to case-independent regexps"

Previous code review https://codereview.chromium.org/1182783009/
R=yangguo@chromium.org
BUG=chromium:482998
LOG=n

Review URL: https://codereview.chromium.org/1204123003

Cr-Commit-Position: refs/heads/master@{#29290}
parent f461c7a6
...@@ -323,7 +323,8 @@ List<HeapEntry*>* HeapSnapshot::GetSortedEntriesList() { ...@@ -323,7 +323,8 @@ List<HeapEntry*>* HeapSnapshot::GetSortedEntriesList() {
for (int i = 0; i < entries_.length(); ++i) { for (int i = 0; i < entries_.length(); ++i) {
sorted_entries_[i] = &entries_[i]; sorted_entries_[i] = &entries_[i];
} }
sorted_entries_.Sort(SortByIds); sorted_entries_.Sort<int (*)(HeapEntry* const*, HeapEntry* const*)>(
SortByIds);
} }
return &sorted_entries_; return &sorted_entries_;
} }
......
...@@ -4837,6 +4837,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { ...@@ -4837,6 +4837,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
} }
// Looks up |c| in the supplied canonicalization mapping and returns the
// mapped character.  The mapping is expected to produce at most one output
// character (checked in debug builds); when it produces none, |c| is returned
// unchanged.
// NOTE(review): presumably this is the ECMA-262 case-insensitive canonical
// form of |c| -- confirm against unibrow::Ecma262Canonicalize.
static unibrow::uchar Canonical(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar c) {
  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
  int mapped_count = canonicalize->get(c, '\0', mapped);
  DCHECK_LE(mapped_count, 1);
  // Exactly one result: use it.  Otherwise fall back to the input character.
  return (mapped_count == 1) ? mapped[0] : c;
}
// Three-way comparison of two atoms by their first character, for use when
// sorting alternatives of a case-independent regexp.
//
// Both |a| and |b| must point at RegExpTree nodes that are atoms.  Returns 0
// when the first characters are identical, otherwise the signed difference of
// the two characters.  Unless both characters are below 'a', each one is first
// passed through Canonical() so that the difference is taken between the
// canonicalized forms.
int CompareFirstCharCaseIndependent(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* lhs_atom = (*a)->AsAtom();
  RegExpAtom* rhs_atom = (*b)->AsAtom();
  unibrow::uchar lhs = lhs_atom->data().at(0);
  unibrow::uchar rhs = rhs_atom->data().at(0);
  if (lhs == rhs) return 0;

  // When both characters are below 'a' the raw values are compared directly
  // and the canonicalization lookup is skipped.
  const bool both_below_lower_a = lhs < 'a' && rhs < 'a';
  if (!both_below_lower_a) {
    lhs = Canonical(canonicalize, lhs);
    rhs = Canonical(canonicalize, rhs);
  }
  return static_cast<int>(lhs) - static_cast<int>(rhs);
}
// We can stable sort runs of atoms, since the order does not matter if they // We can stable sort runs of atoms, since the order does not matter if they
// start with different characters. // start with different characters.
// Returns true if any consecutive atoms were found. // Returns true if any consecutive atoms were found.
...@@ -4860,15 +4888,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -4860,15 +4888,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
i++; i++;
} }
// Sort atoms to get ones with common prefixes together. // Sort atoms to get ones with common prefixes together.
// This step is not valid if we are in a case-independent regexp, // This step is more tricky if we are in a case-independent regexp,
// because it would change /is|I/ to /I|is/, and order matters when // because it would change /is|I/ to /I|is/, and order matters when
// the regexp parts don't match only disjoint starting points. To fix // the regexp parts don't match only disjoint starting points. To fix
// this would need a version of CompareFirstChar that uses case- // this we have a version of CompareFirstChar that uses case-
// independent character classes for comparison. // independent character classes for comparison.
if (!compiler->ignore_case()) { DCHECK_LT(first_atom, alternatives->length());
DCHECK_LT(first_atom, alternatives->length()); DCHECK_LE(i, alternatives->length());
DCHECK_LE(i, alternatives->length()); DCHECK_LE(first_atom, i);
DCHECK_LE(first_atom, i); if (compiler->ignore_case()) {
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
auto compare_closure =
[canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
return CompareFirstCharCaseIndependent(canonicalize, a, b);
};
alternatives->StableSort(compare_closure, first_atom, i - first_atom);
} else {
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
} }
if (i - first_atom > 1) found_consecutive_atoms = true; if (i - first_atom > 1) found_consecutive_atoms = true;
...@@ -4893,7 +4929,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -4893,7 +4929,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
continue; continue;
} }
RegExpAtom* atom = alternative->AsAtom(); RegExpAtom* atom = alternative->AsAtom();
uc16 common_prefix = atom->data().at(0); unibrow::uchar common_prefix = atom->data().at(0);
int first_with_prefix = i; int first_with_prefix = i;
int prefix_length = atom->length(); int prefix_length = atom->length();
i++; i++;
...@@ -4901,7 +4937,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -4901,7 +4937,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
alternative = alternatives->at(i); alternative = alternatives->at(i);
if (!alternative->IsAtom()) break; if (!alternative->IsAtom()) break;
atom = alternative->AsAtom(); atom = alternative->AsAtom();
if (atom->data().at(0) != common_prefix) break; unibrow::uchar new_prefix = atom->data().at(0);
if (new_prefix != common_prefix) {
if (!compiler->ignore_case()) break;
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
new_prefix = Canonical(canonicalize, new_prefix);
common_prefix = Canonical(canonicalize, common_prefix);
if (new_prefix != common_prefix) break;
}
prefix_length = Min(prefix_length, atom->length()); prefix_length = Min(prefix_length, atom->length());
i++; i++;
} }
...@@ -4917,7 +4961,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { ...@@ -4917,7 +4961,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
RegExpAtom* old_atom = RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom(); alternatives->at(j + first_with_prefix)->AsAtom();
for (int k = 1; k < prefix_length; k++) { for (int k = 1; k < prefix_length; k++) {
if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; if (atom->data().at(k) != old_atom->data().at(k)) {
prefix_length = k;
break;
}
} }
} }
RegExpAtom* prefix = RegExpAtom* prefix =
......
...@@ -193,14 +193,16 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const { ...@@ -193,14 +193,16 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const {
} }
template<typename T, class P> template <typename T, class P>
void List<T, P>::Sort(int (*cmp)(const T* x, const T* y)) { template <typename CompareFunction>
void List<T, P>::Sort(CompareFunction cmp) {
Sort(cmp, 0, length_); Sort(cmp, 0, length_);
} }
template <typename T, class P> template <typename T, class P>
void List<T, P>::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) { template <typename CompareFunction>
void List<T, P>::Sort(CompareFunction cmp, size_t s, size_t l) {
ToVector().Sort(cmp, s, l); ToVector().Sort(cmp, s, l);
#ifdef DEBUG #ifdef DEBUG
for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
...@@ -215,14 +217,15 @@ void List<T, P>::Sort() { ...@@ -215,14 +217,15 @@ void List<T, P>::Sort() {
template <typename T, class P> template <typename T, class P>
void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y)) { template <typename CompareFunction>
void List<T, P>::StableSort(CompareFunction cmp) {
StableSort(cmp, 0, length_); StableSort(cmp, 0, length_);
} }
template <typename T, class P> template <typename T, class P>
void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y), size_t s, template <typename CompareFunction>
size_t l) { void List<T, P>::StableSort(CompareFunction cmp, size_t s, size_t l) {
ToVector().StableSort(cmp, s, l); ToVector().StableSort(cmp, s, l);
#ifdef DEBUG #ifdef DEBUG
for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
......
...@@ -149,12 +149,15 @@ class List { ...@@ -149,12 +149,15 @@ class List {
void Iterate(Visitor* visitor); void Iterate(Visitor* visitor);
// Sort all list entries (using QuickSort) // Sort all list entries (using QuickSort)
void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length); template <typename CompareFunction>
void Sort(int (*cmp)(const T* x, const T* y)); void Sort(CompareFunction cmp, size_t start, size_t length);
template <typename CompareFunction>
void Sort(CompareFunction cmp);
void Sort(); void Sort();
void StableSort(int (*cmp)(const T* x, const T* y), size_t start, template <typename CompareFunction>
size_t length); void StableSort(CompareFunction cmp, size_t start, size_t length);
void StableSort(int (*cmp)(const T* x, const T* y)); template <typename CompareFunction>
void StableSort(CompareFunction cmp);
void StableSort(); void StableSort();
INLINE(void Initialize(int capacity, INLINE(void Initialize(int capacity,
......
...@@ -69,24 +69,30 @@ class Vector { ...@@ -69,24 +69,30 @@ class Vector {
return Vector<T>(result, length_); return Vector<T>(result, length_);
} }
void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) { template <typename CompareFunction>
std::sort(start() + s, start() + s + l, RawComparer(cmp)); void Sort(CompareFunction cmp, size_t s, size_t l) {
std::sort(start() + s, start() + s + l, RawComparer<CompareFunction>(cmp));
} }
void Sort(int (*cmp)(const T*, const T*)) { template <typename CompareFunction>
std::sort(start(), start() + length(), RawComparer(cmp)); void Sort(CompareFunction cmp) {
std::sort(start(), start() + length(), RawComparer<CompareFunction>(cmp));
} }
void Sort() { void Sort() {
std::sort(start(), start() + length()); std::sort(start(), start() + length());
} }
void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) { template <typename CompareFunction>
std::stable_sort(start() + s, start() + s + l, RawComparer(cmp)); void StableSort(CompareFunction cmp, size_t s, size_t l) {
std::stable_sort(start() + s, start() + s + l,
RawComparer<CompareFunction>(cmp));
} }
void StableSort(int (*cmp)(const T*, const T*)) { template <typename CompareFunction>
std::stable_sort(start(), start() + length(), RawComparer(cmp)); void StableSort(CompareFunction cmp) {
std::stable_sort(start(), start() + length(),
RawComparer<CompareFunction>(cmp));
} }
void StableSort() { std::stable_sort(start(), start() + length()); } void StableSort() { std::stable_sort(start(), start() + length()); }
...@@ -136,15 +142,16 @@ class Vector { ...@@ -136,15 +142,16 @@ class Vector {
T* start_; T* start_;
int length_; int length_;
template <typename CookedComparer>
class RawComparer { class RawComparer {
public: public:
explicit RawComparer(int (*cmp)(const T*, const T*)) : cmp_(cmp) {} explicit RawComparer(CookedComparer cmp) : cmp_(cmp) {}
bool operator()(const T& a, const T& b) { bool operator()(const T& a, const T& b) {
return cmp_(&a, &b) < 0; return cmp_(&a, &b) < 0;
} }
private: private:
int (*cmp_)(const T*, const T*); CookedComparer cmp_;
}; };
}; };
......
...@@ -181,8 +181,9 @@ ...@@ -181,8 +181,9 @@
'array-constructor': [PASS, TIMEOUT], 'array-constructor': [PASS, TIMEOUT],
# Very slow on ARM and MIPS, contains no architecture dependent code. # Very slow on ARM and MIPS, contains no architecture dependent code.
'unicode-case-overoptimization': [PASS, NO_VARIANTS, ['arch == arm or arch == android_arm or arch == android_arm64 or arch == mipsel or arch == mips64el or arch == mips', TIMEOUT]], 'unicode-case-overoptimization': [PASS, NO_VARIANTS, ['arch == arm or arch == arm64 or arch == android_arm or arch == android_arm64 or arch == mipsel or arch == mips64el or arch == mips', TIMEOUT]],
'regress/regress-3976': [PASS, NO_VARIANTS, ['arch == arm or arch == android_arm or arch == android_arm64 or arch == mipsel or arch == mips64el or arch == mips', SKIP]], 'regress/regress-3976': [PASS, NO_VARIANTS, ['arch == arm or arch == arm64 or arch == android_arm or arch == android_arm64 or arch == mipsel or arch == mips64el or arch == mips', SKIP]],
'regress/regress-crbug-482998': [PASS, NO_VARIANTS, ['arch == arm or arch == arm64 or arch == android_arm or arch == android_arm64 or arch == mipsel or arch == mips64el or arch == mips', SKIP]],
############################################################################## ##############################################################################
# This test expects to reach a certain recursion depth, which may not work # This test expects to reach a certain recursion depth, which may not work
......
// Copyright 2015 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Checks that |lower| and |upper| are treated as the same character by
// case-independent regexps.  Two disjunctions are built from the alternatives
// <upper>x, <lower> and <lower>cat, differing only in whether the longer
// "<upper>x" alternative or the shorter "<lower>" alternative comes first.
// Because alternatives are tried in source order, the expected match depends
// on that order, so this detects any reordering of alternatives that changes
// the result.
function Test(lower, upper) {
  var lowerSubject = lower + "x";
  var upperSubject = upper + "x";
  var tail = lower + "cat";
  var longFirst = upper + "x|" + lower + "|" + tail;
  var shortFirst = lower + "|" + upper + "x|" + tail;

  // Compiles a fresh case-independent regexp and stringifies its exec result.
  function match(source, subject) {
    return new RegExp(source, "i").exec(subject) + "";
  }

  // "<upper>x" is tried first and matches both subjects in full.
  assertEquals(lowerSubject, match(longFirst, lowerSubject));
  assertEquals(upperSubject, match(longFirst, upperSubject));
  // "<lower>" is tried first and matches just the leading character.
  assertEquals(lower, match(shortFirst, lowerSubject));
  assertEquals(upper, match(shortFirst, upperSubject));
}
// Checks that |lower| and |upper| are NOT treated as the same character by
// case-independent regexps.  The same two disjunctions as in Test() are used,
// but since "<upper>x" can only match a subject that really starts with
// |upper|, the expected results are the same whichever alternative is listed
// first: |lower| alone for the subject "<lower>x", and the whole subject for
// "<upper>x".
function TestFail(lower, upper) {
  var lowerSubject = lower + "x";
  var upperSubject = upper + "x";
  var tail = lower + "cat";
  var sources = [
    upper + "x|" + lower + "|" + tail,  // Longer alternative first.
    lower + "|" + upper + "x|" + tail   // Shorter alternative first.
  ];
  for (var s = 0; s < sources.length; s++) {
    var source = sources[s];
    assertEquals(lower, new RegExp(source, "i").exec(lowerSubject) + "");
    assertEquals(upperSubject,
                 new RegExp(source, "i").exec(upperSubject) + "");
  }
}
(function RunCaseEquivalenceChecks() {
  // Named characters used below; everything is kept local to this scope.
  var smallOUmlaut = String.fromCharCode(0xf6);
  var capitalOUmlaut = String.fromCharCode(0xd6);
  var smallKha = String.fromCharCode(0x445);
  var capitalKha = String.fromCharCode(0x425);
  var smallYUmlaut = String.fromCharCode(0xff);
  var capitalYUmlaut = String.fromCharCode(0x178);
  var smallMu = String.fromCharCode(0x3bc);
  var capitalMu = String.fromCharCode(0x39c);
  var microSign = String.fromCharCode(0xb5);
  var sharpS = String.fromCharCode(0xdf);
  var capitalDottedI = String.fromCharCode(0x130);
  var smallDotlessI = String.fromCharCode(0x131);

  // Plain ASCII: a letter pair, a character without case, and two different
  // letters.
  Test("a", "A");
  Test("0", "0");
  TestFail("a", "b");

  // Small and capital o-umlaut.
  Test(smallOUmlaut, capitalOUmlaut);
  // Small and capital Cyrillic kha.
  Test(smallKha, capitalKha);
  // Small and capital y-umlaut.
  Test(smallYUmlaut, capitalYUmlaut);

  // Greek mu and the micro sign (U+00B5) are all equivalent to each other.
  Test(smallMu, capitalMu);
  Test(microSign, capitalMu);
  Test(microSign, smallMu);

  // German sharp s and capital S.  These are not equivalent because the
  // upper-case form of sharp s is the two-character "SS".
  TestFail(sharpS, "S");
  // Small i and Turkish capital dotted I.  These are not equivalent due to
  // 21.2.2.8.2 section 3g.  One is below 128 and the other is above 127.
  TestFail("i", capitalDottedI);
  // Small dotless i and capital I.  These are not equivalent either.
  TestFail(smallDotlessI, "I");
})();
...@@ -3,13 +3,13 @@ ...@@ -3,13 +3,13 @@
// found in the LICENSE file. // found in the LICENSE file.
// Should not time out. Running time 0.5s vs. 120s before the change. // Should not time out. Running time 0.5s vs. 120s before the change.
function collapse() { function collapse(flags) {
var src = "(?:"; var src = "(?:";
for (var i = 128; i < 0x1000; i++) { for (var i = 128; i < 0x1000; i++) {
src += "a" + String.fromCharCode(i) + "|"; src += String.fromCharCode(96 + i % 26) + String.fromCharCode(i) + "|";
} }
src += "aa)"; src += "aa)";
var collapsible = new RegExp(src); var collapsible = new RegExp(src, flags);
var subject = "zzzzzzz" + String.fromCharCode(3000); var subject = "zzzzzzz" + String.fromCharCode(3000);
for (var i = 0; i < 1000; i++) { for (var i = 0; i < 1000; i++) {
subject += "xxxxxxx"; subject += "xxxxxxx";
...@@ -19,4 +19,5 @@ function collapse() { ...@@ -19,4 +19,5 @@ function collapse() {
} }
} }
collapse(); collapse("i");
collapse("");
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment