Commit 05507cc3 authored by erikcorry's avatar erikcorry Committed by Commit bot

Reland II of 'Optimize trivial regexp disjunctions' CL 1176453002

This change rewrites regexps like (ab|ac|z|ad|ae|af) into (a[b-f]|z).  We can only reorder disjunctions like this for case-dependent regexps.  For case-independent regexps, the disjunctions should be pre-sorted for best results.

R=yangguo@chromium.org
BUG=chromium:482998
LOG=n

Review URL: https://codereview.chromium.org/1180433003

Cr-Commit-Position: refs/heads/master@{#28902}
parent 1a511b0e
......@@ -2872,6 +2872,9 @@ class RegExpDisjunction final : public RegExpTree {
int max_match() override { return max_match_; }
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
private:
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
ZoneList<RegExpTree*>* alternatives_;
int min_match_;
int max_match_;
......
......@@ -4817,10 +4817,200 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
}
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom();
uc16 character1 = atom1->data().at(0);
uc16 character2 = atom2->data().at(0);
if (character1 < character2) return -1;
if (character1 > character2) return 1;
return 0;
}
// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
// Returns true if any consecutive atoms were found.
bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
bool found_consecutive_atoms = false;
for (int i = 0; i < length; i++) {
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (alternative->IsAtom()) break;
i++;
}
// i is length or it is the index of an atom.
if (i == length) break;
int first_atom = i;
i++;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
i++;
}
// Sort atoms to get ones with common prefixes together.
// This step is not valid if we are in a case-independent regexp,
// because it would change /is|I/ to /I|is/, and order matters when
// the regexp parts don't match only disjoint starting points. To fix
// this would need a version of CompareFirstChar that uses case-
// independent character classes for comparison.
if (!compiler->ignore_case()) {
DCHECK_LT(first_atom, alternatives->length());
DCHECK_LE(i, alternatives->length());
DCHECK_LE(first_atom, i);
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
}
if (i - first_atom > 1) found_consecutive_atoms = true;
}
return found_consecutive_atoms;
}
// Optimizes ab|ac|az to a(?:b|c|d).
void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
int write_posn = 0;
int i = 0;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
RegExpAtom* atom = alternative->AsAtom();
uc16 common_prefix = atom->data().at(0);
int first_with_prefix = i;
int prefix_length = atom->length();
i++;
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
if (atom->data().at(0) != common_prefix) break;
prefix_length = Min(prefix_length, atom->length());
i++;
}
if (i > first_with_prefix + 2) {
// Found worthwhile run of alternatives with common prefix of at least one
// character. The sorting function above did not sort on more than one
// character for reasons of correctness, but there may still be a longer
// common prefix if the terms were similar or presorted in the input.
// Find out how long the common prefix is.
int run_length = i - first_with_prefix;
atom = alternatives->at(first_with_prefix)->AsAtom();
for (int j = 1; j < run_length && prefix_length > 1; j++) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
for (int k = 1; k < prefix_length; k++) {
if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
}
}
RegExpAtom* prefix =
new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
pair->Add(prefix, zone);
ZoneList<RegExpTree*>* suffixes =
new (zone) ZoneList<RegExpTree*>(run_length, zone);
for (int j = 0; j < run_length; j++) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
int len = old_atom->length();
if (len == prefix_length) {
suffixes->Add(new (zone) RegExpEmpty(), zone);
} else {
RegExpTree* suffix = new (zone) RegExpAtom(
old_atom->data().SubVector(prefix_length, old_atom->length()));
suffixes->Add(suffix, zone);
}
}
pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
} else {
// Just copy any non-worthwhile alternatives.
for (int j = first_with_prefix; j < i; j++) {
alternatives->at(write_posn++) = alternatives->at(j);
}
}
}
alternatives->Rewind(write_posn); // Trim end of array.
}
// Optimizes b|c|z to [bcz].
void RegExpDisjunction::FixSingleCharacterDisjunctions(
RegExpCompiler* compiler) {
Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
int write_posn = 0;
int i = 0;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
RegExpAtom* atom = alternative->AsAtom();
if (atom->length() != 1) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
int first_in_run = i;
i++;
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
if (atom->length() != 1) break;
i++;
}
if (i > first_in_run + 1) {
// Found non-trivial run of single-character alternatives.
int run_length = i - first_in_run;
ZoneList<CharacterRange>* ranges =
new (zone) ZoneList<CharacterRange>(2, zone);
for (int j = 0; j < run_length; j++) {
RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
DCHECK_EQ(old_atom->length(), 1);
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
}
alternatives->at(write_posn++) =
new (zone) RegExpCharacterClass(ranges, false);
} else {
// Just copy any trivial alternatives.
for (int j = first_in_run; j < i; j++) {
alternatives->at(write_posn++) = alternatives->at(j);
}
}
}
alternatives->Rewind(write_posn); // Trim end of array.
}
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
if (alternatives->length() > 2) {
bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
FixSingleCharacterDisjunctions(compiler);
if (alternatives->length() == 1) {
return alternatives->at(0)->ToNode(compiler, on_success);
}
}
int length = alternatives->length();
ChoiceNode* result =
new(compiler->zone()) ChoiceNode(length, compiler->zone());
for (int i = 0; i < length; i++) {
......
......@@ -212,7 +212,7 @@ class RegExpImpl {
// and the total executable memory at any point.
static const int kRegExpExecutableMemoryLimit = 16 * MB;
static const int kRegExpCompiledLimit = 1 * MB;
static const int kRegExpTooLargeToOptimize = 10 * KB;
static const int kRegExpTooLargeToOptimize = 20 * KB;
private:
static bool CompileIrregexp(Handle<JSRegExp> re,
......
......@@ -195,10 +195,15 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const {
template<typename T, class P>
void List<T, P>::Sort(int (*cmp)(const T* x, const T* y)) {
ToVector().Sort(cmp);
Sort(cmp, 0, length_);
}
template <typename T, class P>
void List<T, P>::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) {
ToVector().Sort(cmp, s, l);
#ifdef DEBUG
for (int i = 1; i < length_; i++)
DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
#endif
}
......@@ -209,7 +214,29 @@ void List<T, P>::Sort() {
}
template<typename T, class P>
template <typename T, class P>
void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y)) {
StableSort(cmp, 0, length_);
}
template <typename T, class P>
void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y), size_t s,
size_t l) {
ToVector().StableSort(cmp, s, l);
#ifdef DEBUG
for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
#endif
}
template <typename T, class P>
void List<T, P>::StableSort() {
ToVector().StableSort();
}
template <typename T, class P>
void List<T, P>::Initialize(int capacity, P allocator) {
DCHECK(capacity >= 0);
data_ = (capacity > 0) ? NewData(capacity, allocator) : NULL;
......
......@@ -149,8 +149,13 @@ class List {
void Iterate(Visitor* visitor);
// Sort all list entries (using QuickSort)
void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length);
void Sort(int (*cmp)(const T* x, const T* y));
void Sort();
void StableSort(int (*cmp)(const T* x, const T* y), size_t start,
size_t length);
void StableSort(int (*cmp)(const T* x, const T* y));
void StableSort();
INLINE(void Initialize(int capacity,
AllocationPolicy allocator = AllocationPolicy()));
......
......@@ -69,6 +69,10 @@ class Vector {
return Vector<T>(result, length_);
}
void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
std::sort(start() + s, start() + s + l, RawComparer(cmp));
}
void Sort(int (*cmp)(const T*, const T*)) {
std::sort(start(), start() + length(), RawComparer(cmp));
}
......@@ -77,6 +81,16 @@ class Vector {
std::sort(start(), start() + length());
}
void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
std::stable_sort(start() + s, start() + s + l, RawComparer(cmp));
}
void StableSort(int (*cmp)(const T*, const T*)) {
std::stable_sort(start(), start() + length(), RawComparer(cmp));
}
void StableSort() { std::stable_sort(start(), start() + length()); }
void Truncate(int length) {
DCHECK(length <= length_);
length_ = length;
......
......@@ -1747,13 +1747,13 @@ TEST(TestSizeOfRegExpCode) {
// Adjust source below and this check to match
// RegExpImple::kRegExpTooLargeToOptimize.
DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 10 * KB);
DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 20 * KB);
// Compile a regexp that is much larger if we are using regexp optimizations.
CompileRun(
"var reg_exp_source = '(?:a|bc|def|ghij|klmno|pqrstu)';"
"var half_size_reg_exp;"
"while (reg_exp_source.length < 10 * 1024) {"
"while (reg_exp_source.length < 20 * 1024) {"
" half_size_reg_exp = reg_exp_source;"
" reg_exp_source = reg_exp_source + reg_exp_source;"
"}"
......@@ -1784,7 +1784,11 @@ TEST(TestSizeOfRegExpCode) {
int size_of_regexp_code = size_with_regexp - initial_size;
CHECK_LE(size_of_regexp_code, 1 * MB);
// On some platforms the debug-code flag causes huge amounts of regexp code
// to be emitted, breaking this test.
if (!FLAG_debug_code) {
CHECK_LE(size_of_regexp_code, 1 * MB);
}
// Small regexp is half the size, but compiles to more than twice the code
// due to the optimization steps.
......
// Copyright 2015 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Should not time out. Running time 0.5s vs. 120s before the change.
function collapse() {
var src = "(?:";
for (var i = 128; i < 0x1000; i++) {
src += "a" + String.fromCharCode(i) + "|";
}
src += "aa)";
var collapsible = new RegExp(src);
var subject = "zzzzzzz" + String.fromCharCode(3000);
for (var i = 0; i < 1000; i++) {
subject += "xxxxxxx";
}
for (var i = 0; i < 2000; i++) {
assertFalse(collapsible.test(subject));
}
}
collapse();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment