Commit f2a832ca authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support more quantifiers in experimental engine

Prior to this commit, only quantifiers of the form /<x>*/, i.e. greedy
repetition arbitrarily often, were implemented.  Now a much larger class
is supported, e.g. + and ? and their non-greedy variants.
Because it came up repeatedly during the implementation, the commit also
adds the Label and DeferredLabel classes to patch JMP and FORK target
addresses more easily.

Still not supported are the following quantifiers:
- Possessive quantifiers, where I'm not entirely sure whether they could
  be implemented in principle. Re2 doesn't support them.
- Quantifiers with large but finite numbers for min and max numbers of
  repetitions, as in e.g. /<x>{9000, 90000}/. These are currently
  limited to some small value. This is because the body of such
  repetitions is unrolled explicitly, so the size of the bytecode is
  linear in the number of repetitions.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: Id04d893252588abb0f80c3cb33cfc707f6601ea0
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2387575
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69759}
parent 9f9c4776
......@@ -97,19 +97,55 @@ class CanBeHandledVisitor final : private RegExpVisitor {
}
// Checks whether the quantifier `node` and, recursively, its body can be
// compiled for the experimental engine.  Sets `result_` to false if not.
// Always returns nullptr; the verdict is communicated through `result_`.
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
  // Finite but large values of `min()` and `max()` are bad for the
  // breadth-first engine because finite (optional) repetition is dealt with
  // by replicating the bytecode of the body of the quantifier.  The number
  // of replications grows exponentially in how deeply quantifiers are
  // nested.  `replication_factor_` keeps track of how often the current node
  // will have to be replicated in the generated bytecode, and we don't allow
  // this to exceed some small value.
  static constexpr int kMaxReplicationFactor = 16;

  // First we rule out values for min and max that are too big even before
  // taking into account the ambient replication_factor_.  This also guards
  // against overflows in `local_replication` or `replication_factor_`.
  if (node->min() > kMaxReplicationFactor ||
      (node->max() != RegExpTree::kInfinity &&
       node->max() > kMaxReplicationFactor)) {
    result_ = false;
    return nullptr;
  }

  // Save the current replication factor so that it can be restored if we
  // return with `result_ == true`.  On the failing paths below it is left
  // modified on purpose: `result_` is already false at that point.
  int before_replication_factor = replication_factor_;

  // Number of copies of the body emitted for this quantifier: `min()` copies
  // plus one looping copy if unbounded, otherwise `max()` copies.
  int local_replication;
  if (node->max() == RegExpTree::kInfinity) {
    local_replication = node->min() + 1;
  } else {
    local_replication = node->max();
  }

  replication_factor_ *= local_replication;
  if (replication_factor_ > kMaxReplicationFactor) {
    result_ = false;
    return nullptr;
  }

  switch (node->quantifier_type()) {
    case RegExpQuantifier::GREEDY:
    case RegExpQuantifier::NON_GREEDY:
      break;
    case RegExpQuantifier::POSSESSIVE:
      // TODO(mbid, v8:10765): It's not clear to me whether this can be
      // supported in breadth-first mode. Re2 doesn't support it.
      result_ = false;
      return nullptr;
  }

  node->body()->Accept(this, nullptr);
  replication_factor_ = before_replication_factor;
  return nullptr;
}
......@@ -141,6 +177,9 @@ class CanBeHandledVisitor final : private RegExpVisitor {
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
private:
// See comment in `VisitQuantifier`:
int replication_factor_ = 1;
bool result_ = true;
Zone* zone_;
};
......@@ -156,6 +195,90 @@ bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
namespace {
// A label in bytecode whose address (an instruction index) is already known.
class Label {
 public:
  // `index` is the index of the instruction the label refers to; it must be
  // non-negative.
  explicit Label(int index) : index_(index) { DCHECK_GE(index_, 0); }

  // Returns the instruction index this label refers to.
  int index() const { return index_; }

  // Friend functions because `label.AddForkTo(code, zone)` reads like we're
  // adding code to where `label` is defined, but we're adding a fork with
  // target `label` at the end of `code`.
  friend void AddForkTo(Label target, ZoneList<RegExpInstruction>& code,
                        Zone* zone) {
    code.Add(RegExpInstruction::Fork(target.index_), zone);
  }

  // Appends a JMP instruction with target `target` to the end of `code`.
  friend void AddJmpTo(Label target, ZoneList<RegExpInstruction>& code,
                       Zone* zone) {
    code.Add(RegExpInstruction::Jmp(target.index_), zone);
  }

 private:
  int index_;
};
// A bytecode label whose address has not been determined yet.  The label
// *must* be bound with `Bind` before the object is destroyed, and the object
// *must not* be used again once it has been bound.  (Use the `Label` object
// returned by `Bind` instead.)
struct DeferredLabel {
  // Pending references to the label form a singly linked list that is
  // threaded through the `payload.pc` fields of the FORK and JMP instructions
  // targeting it; `patch_list_begin_` is the index of the list head.
 public:
  DeferredLabel() = default;

  // Checks that the label was bound before going out of scope.
  ~DeferredLabel() { DCHECK_EQ(patch_list_begin_, kLabelWasDefined); }

  // Appends a FORK to `code` whose target is the not-yet-bound `target`.  The
  // new instruction temporarily stores the previous list head and becomes the
  // new head itself.
  friend void AddForkTo(DeferredLabel& target,
                        ZoneList<RegExpInstruction>& code, Zone* zone) {
    DCHECK_NE(target.patch_list_begin_, DeferredLabel::kLabelWasDefined);
    const int fork_index = code.length();
    DCHECK_GE(fork_index, 0);
    const int previous_head = target.patch_list_begin_;
    code.Add(RegExpInstruction::Fork(previous_head), zone);
    target.patch_list_begin_ = fork_index;
  }

  // Same as the FORK variant above, but appends a JMP instruction.
  friend void AddJmpTo(DeferredLabel& target, ZoneList<RegExpInstruction>& code,
                       Zone* zone) {
    DCHECK_NE(target.patch_list_begin_, DeferredLabel::kLabelWasDefined);
    const int jmp_index = code.length();
    DCHECK_GE(jmp_index, 0);
    const int previous_head = target.patch_list_begin_;
    code.Add(RegExpInstruction::Jmp(previous_head), zone);
    target.patch_list_begin_ = jmp_index;
  }

  // Binds the label to the next instruction that will be pushed to `code` and
  // patches every pending FORK/JMP accordingly.  Consumes the DeferredLabel
  // (hence the rvalue qualifier) and returns the equivalent `Label`.
  Label Bind(ZoneList<RegExpInstruction>& code) && {
    DCHECK_NE(patch_list_begin_, kLabelWasDefined);

    const int bound_index = code.length();

    // Walk the patch list; each entry's `payload.pc` holds the index of the
    // next entry until it is overwritten with the final address.
    for (int cursor = patch_list_begin_; cursor != kEmptyList;) {
      RegExpInstruction& pending = code[cursor];
      DCHECK(pending.opcode == RegExpInstruction::FORK ||
             pending.opcode == RegExpInstruction::JMP);
      cursor = pending.payload.pc;
      pending.payload.pc = bound_index;
    }

    patch_list_begin_ = kLabelWasDefined;
    return Label(bound_index);
  }

 private:
  // Sentinel: no instruction refers to the label yet.
  static constexpr int kEmptyList = -1;
  // Sentinel: `Bind` has been called; the object must not be used anymore.
  static constexpr int kLabelWasDefined = -2;

  int patch_list_begin_ = kEmptyList;

  // Don't copy, don't move.  Moving could be implemented, but it's not
  // needed anywhere.
  DISALLOW_COPY_AND_ASSIGN(DeferredLabel);
};
class CompileVisitor : private RegExpVisitor {
public:
static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
......@@ -179,62 +302,45 @@ class CompileVisitor : private RegExpVisitor {
// 1` and should push code corresponding to the ith alternative onto `code_`.
template <class F>
void CompileDisjunction(int alt_num, F gen_alt) {
  // An alternative a1 | ... | an is compiled into
  //
  //     FORK tail1
  //     <a1>
  //     JMP end
  //   tail1:
  //     FORK tail2
  //     <a2>
  //     JMP end
  //   tail2:
  //     ...
  //     ...
  //   tail{n -1}:
  //     <an>
  //   end:
  //
  // By the semantics of the FORK instruction (see above at definition and
  // semantics), a forked thread has lower priority than the thread that
  // spawned it.  This means that with the code we're generating here, the
  // thread matching the alternative a1 has indeed highest priority, followed
  // by the thread for a2 and so on.

  // Nothing is emitted for a disjunction without alternatives.
  if (alt_num == 0) {
    return;
  }

  // Target of the JMP after every alternative but the last one.
  DeferredLabel end;

  for (int i = 0; i != alt_num - 1; ++i) {
    // `tail` marks the start of the remaining alternatives; it is bound
    // right after the JMP that skips them.
    DeferredLabel tail;
    AddForkTo(tail, code_, zone_);
    gen_alt(i);
    AddJmpTo(end, code_, zone_);
    std::move(tail).Bind(code_);
  }

  // The last alternative needs neither a FORK nor a trailing JMP.
  gen_alt(alt_num - 1);

  std::move(end).Bind(code_);
}
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
......@@ -298,30 +404,118 @@ class CompileVisitor : private RegExpVisitor {
}
// Emits bytecode for the quantifier `node` onto `code_`: first `min()`
// mandatory copies of the body, then the optional part, whose shape depends
// on whether the quantifier is greedy and whether `max()` is finite.
// Possessive quantifiers must not reach this function (see
// `CanBeHandledVisitor::VisitQuantifier`).  Always returns nullptr.
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
  // First repeat the body `min()` times.
  for (int i = 0; i != node->min(); ++i) {
    node->body()->Accept(this, nullptr);
  }

  switch (node->quantifier_type()) {
    case RegExpQuantifier::POSSESSIVE:
      UNREACHABLE();
    case RegExpQuantifier::GREEDY: {
      if (node->max() == RegExpTree::kInfinity) {
        // This is compiled into
        //
        //   begin:
        //     FORK end
        //     <body>
        //     JMP begin
        //   end:
        //     ...
        //
        // This is greedy because a forked thread has lower priority than the
        // thread that spawned it.
        Label begin(code_.length());
        DeferredLabel end;

        AddForkTo(end, code_, zone_);
        node->body()->Accept(this, nullptr);
        AddJmpTo(begin, code_, zone_);

        std::move(end).Bind(code_);
      } else {
        DCHECK_NE(node->max(), RegExpTree::kInfinity);
        // This is compiled into
        //
        //     FORK end
        //     <body>
        //     FORK end
        //     <body>
        //     ...      ; max - min times in total
        //     ...
        //     FORK end
        //     <body>
        //   end:
        //     ...
        DeferredLabel end;
        for (int i = node->min(); i != node->max(); ++i) {
          AddForkTo(end, code_, zone_);
          node->body()->Accept(this, nullptr);
        }
        std::move(end).Bind(code_);
      }
      break;
    }
    case RegExpQuantifier::NON_GREEDY: {
      if (node->max() == RegExpTree::kInfinity) {
        // This is compiled into
        //
        //     FORK body
        //     JMP end
        //   body:
        //     <body>
        //     FORK body
        //   end:
        //     ...
        //
        // This is non-greedy because the thread that continues past the
        // quantifier has higher priority than the forked thread that enters
        // the body.

        // `body` lies two instructions ahead: past the FORK and the JMP
        // emitted next.
        Label body(code_.length() + 2);
        DeferredLabel end;

        AddForkTo(body, code_, zone_);
        AddJmpTo(end, code_, zone_);

        DCHECK_EQ(body.index(), code_.length());

        node->body()->Accept(this, nullptr);
        AddForkTo(body, code_, zone_);

        std::move(end).Bind(code_);
      } else {
        DCHECK_NE(node->max(), RegExpTree::kInfinity);
        // This is compiled into
        //
        //     FORK body0
        //     JMP end
        //   body0:
        //     <body>
        //     FORK body1
        //     JMP end
        //   body1:
        //     <body>
        //     ...
        //     ...
        //   body{max - min - 1}:
        //     <body>
        //   end:
        //     ...
        DeferredLabel end;
        for (int i = node->min(); i != node->max(); ++i) {
          // As above, the body starts right after the FORK/JMP pair.
          Label body(code_.length() + 2);
          AddForkTo(body, code_, zone_);
          AddJmpTo(end, code_, zone_);

          DCHECK_EQ(body.index(), code_.length());

          node->body()->Accept(this, nullptr);
        }
        std::move(end).Bind(code_);
      }
      break;
    }
  }

  return nullptr;
}
......
......@@ -443,11 +443,12 @@ class RegExpQuantifier final : public RegExpTree {
bool IsQuantifier() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
int min() { return min_; }
int max() { return max_; }
bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
int min() const { return min_; }
int max() const { return max_; }
QuantifierType quantifier_type() const { return quantifier_type_; }
bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
bool is_greedy() { return quantifier_type_ == GREEDY; }
bool is_greedy() const { return quantifier_type_ == GREEDY; }
RegExpTree* body() { return body_; }
private:
......
......@@ -37,10 +37,22 @@ Test(/\w\d/, "?a??a3!!!", ["a3"], 0);
// surrogate characters that make up 💩. The leading surrogate is 0xD83D.
Test(/[💩]/, "f💩", [String.fromCodePoint(0xD83D)], 0);
// Greedy and non-greedy quantifiers.
// Greedy `*`: takes as many repetitions as possible, including none.
Test(/x*/, "asdfxk", [""], 0);
Test(/xx*a/, "xxa", ["xxa"], 0);
Test(/asdf*/, "aasdfffk", ["asdfff"], 0);
Test(/x*[xa]/, "xxaa", ["xxa"], 0);
// Non-greedy `*?`: takes only as many repetitions as the rest of the pattern
// requires.
Test(/x*?[xa]/, "xxaa", ["x"], 0);
Test(/x*?a/, "xxaa", ["xxa"], 0);
// `+` and `+?`: at least one repetition.
Test(/x+a/, "axxa", ["xxa"], 0);
Test(/x+?[ax]/, "axxa", ["xx"], 0);
// `?` and `??`: at most one repetition.
Test(/xx?[xa]/, "xxaa", ["xxa"], 0);
Test(/xx??[xa]/, "xxaa", ["xx"], 0);
Test(/xx??a/, "xxaa", ["xxa"], 0);
// Counted repetitions `{n}`, `{n,}` and `{n,m}`, greedy and non-greedy.
Test(/x{4}/, "xxxxxxxxx", ["xxxx"], 0);
Test(/x{4,}/, "xxxxxxxxx", ["xxxxxxxxx"], 0);
Test(/x{4,}?/, "xxxxxxxxx", ["xxxx"], 0);
Test(/x{2,4}/, "xxxxxxxxx", ["xxxx"], 0);
Test(/x{2,4}?/, "xxxxxxxxx", ["xx"], 0);
// Non-capturing groups and nested operators.
Test(/(?:)/, "asdf", [""], 0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment