Commit c51e4f3c authored by Jakob Gruber, committed by Commit Bot

[regexp] Rewrite certain Assertion sequences

Sequences of RegExp assertions (e.g. '^', '$', '\b', ...) have certain
properties that this rewriter exploits:

1. They are zero-width and order-independent, thus one can remove all
duplicate assertions.
2. If a subsequence is guaranteed to fail, the entire sequence fails.
Any sequence always known to fail (e.g. containing both '\b' and '\B')
can be rewritten to a single node that triggers failure.

This CL generalizes the previous optimization for repeated assertions
to be order-independent, i.e. assertions only have to be in the same
sequence but not next to each other.

Bug: v8:6515, v8:6126
Change-Id: I3f92f081ce8a55ad8c34c269a09a6686e3b008f3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1657925
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62201}
parent 6b1b5105
......@@ -268,12 +268,13 @@ class RegExpAlternative final : public RegExpTree {
class RegExpAssertion final : public RegExpTree {
public:
enum AssertionType {
START_OF_LINE,
START_OF_INPUT,
END_OF_LINE,
END_OF_INPUT,
BOUNDARY,
NON_BOUNDARY
START_OF_LINE = 0,
START_OF_INPUT = 1,
END_OF_LINE = 2,
END_OF_INPUT = 3,
BOUNDARY = 4,
NON_BOUNDARY = 5,
LAST_TYPE = NON_BOUNDARY,
};
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
: assertion_type_(type), flags_(flags) {}
......@@ -285,7 +286,8 @@ class RegExpAssertion final : public RegExpTree {
bool IsAnchoredAtEnd() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
AssertionType assertion_type() { return assertion_type_; }
AssertionType assertion_type() const { return assertion_type_; }
JSRegExp::Flags flags() const { return flags_; }
private:
const AssertionType assertion_type_;
......
......@@ -916,9 +916,102 @@ RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
return ActionNode::StorePosition(start_reg, true, body_node);
}
namespace {
class AssertionSequenceRewriter final {
public:
// TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
// instead of sprinkling rewrites into the AST->Node conversion process.
static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
AssertionSequenceRewriter rewriter(terms, zone);
static constexpr int kNoIndex = -1;
int from = kNoIndex;
for (int i = 0; i < terms->length(); i++) {
RegExpTree* t = terms->at(i);
if (from == kNoIndex && t->IsAssertion()) {
from = i; // Start a sequence.
} else if (from != kNoIndex && !t->IsAssertion()) {
// Terminate and process the sequence.
if (i - from > 1) rewriter.Rewrite(from, i);
from = kNoIndex;
}
}
if (from != kNoIndex && terms->length() - from > 1) {
rewriter.Rewrite(from, terms->length());
}
}
// All assertions are zero width. A consecutive sequence of assertions is
// order-independent. There's two ways we can optimize here:
// 1. fold all identical assertions.
// 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
// sequence fails.
void Rewrite(int from, int to) {
DCHECK_GT(to, from + 1);
// Bitfield of all seen assertions.
uint32_t seen_assertions = 0;
STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
// Flags must match for folding.
JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
bool saw_mismatched_flags = false;
for (int i = from; i < to; i++) {
RegExpAssertion* t = terms_->at(i)->AsAssertion();
if (t->flags() != flags) saw_mismatched_flags = true;
const uint32_t bit = 1 << t->assertion_type();
if ((seen_assertions & bit) && !saw_mismatched_flags) {
// Fold duplicates.
terms_->Set(i, new (zone_) RegExpEmpty());
}
seen_assertions |= bit;
}
// Collapse failures.
const uint32_t always_fails_mask =
1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
if ((seen_assertions & always_fails_mask) == always_fails_mask) {
ReplaceSequenceWithFailure(from, to);
}
}
void ReplaceSequenceWithFailure(int from, int to) {
// Replace the entire sequence with a single node that always fails.
// TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
// negated '*' (everything) range serves the purpose.
ZoneList<CharacterRange>* ranges =
new (zone_) ZoneList<CharacterRange>(0, zone_);
RegExpCharacterClass* cc =
new (zone_) RegExpCharacterClass(zone_, ranges, JSRegExp::Flags());
terms_->Set(from, cc);
// Zero out the rest.
RegExpEmpty* empty = new (zone_) RegExpEmpty();
for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
}
private:
AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
: zone_(zone), terms_(terms) {}
Zone* zone_;
ZoneList<RegExpTree*>* terms_;
};
} // namespace
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* children = nodes();
AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
RegExpNode* current = on_success;
if (compiler->read_backward()) {
for (int i = 0; i < children->length(); i++) {
......
......@@ -1964,12 +1964,6 @@ void RegExpBuilder::AddTerm(RegExpTree* term) {
// Adds |assert| to the builder's term list after first calling FlushText().
// As rendered here, an assertion whose type equals that of the immediately
// preceding assertion term is dropped instead of being added.
// NOTE(review): this hunk is shown without +/- markers; its header
// (12 lines -> 6 lines) suggests the duplicate check below is what the commit
// deletes -- confirm against the actual diff.
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
FlushText();
if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
// Omit repeated assertions of the same type.
RegExpAssertion* last = terms_.last()->AsAssertion();
RegExpAssertion* next = assert->AsAssertion();
// Only the most recently added term is compared, so duplicates separated by
// another assertion type are not caught by this check.
if (last->assertion_type() == next->assertion_type()) return;
}
terms_.Add(assert, zone());
// NOTE(review): LAST is defined outside this view; presumably it records the
// kind of the last builder action -- confirm.
LAST(ADD_ASSERT);
}
......
......@@ -261,8 +261,9 @@ TEST(RegExpParser) {
CheckParseEq("\\u0034", "'\x34'");
CheckParseEq("\\u003z", "'u003z'");
CheckParseEq("foo[z]*", "(: 'foo' (# 0 - g [z]))");
CheckParseEq("^^^$$$\\b\\b\\b\\b", "(: @^i @$i @b)");
CheckParseEq("\\b\\b\\b\\b\\B\\B\\B\\B\\b\\b\\b\\b", "(: @b @B @b)");
CheckParseEq("^^^$$$\\b\\b\\b\\b", "(: @^i @^i @^i @$i @$i @$i @b @b @b @b)");
CheckParseEq("\\b\\b\\b\\b\\B\\B\\B\\B\\b\\b\\b\\b",
"(: @b @b @b @b @B @B @B @B @b @b @b @b)");
CheckParseEq("\\b\\B\\b", "(: @b @B @b)");
// Unicode regexps
......
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// These patterns shouldn't generate code of excessive size.
assertNull(/\b\B\b\B\b\B\b\B\b\B\b\B\b\B\b\B\b\B/.exec(" aa "));
assertNull(/\b\b\b\b\b\b\b\b\b\B\B\B\B\B\B\B\B\B/.exec(" aa "));
assertNull(/\b\B$\b\B$\b\B$\b\B$\b\B$\b\B$\b\B$/.exec(" aa "));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment