Commit 4b15b984 authored by Seth Brenith's avatar Seth Brenith Committed by Commit Bot

[regexp] Better quick checks on loop entry nodes

Like the predecessor change https://crrev.com/c/v8/v8/+/1702125 , this
change is inspired by attempting to exit earlier from generated RegExp
code, when no further matches are possible because any match would be
too long. The motivating example this time is the following expression,
which tests whether a string of Unicode playing cards has five of the
same suit in a row:

/([🂡-🂮]{5})|([🂱-🂾]{5})|([🃁-🃎]{5})|([🃑-🃞]{5})/u

A human reading this expression can readily see that any match requires
at least 10 characters (5 surrogate pairs), but the LoopChoiceNode for
each repeated option reports its minimum distance to the end of a match
as zero. This is correct, because the LoopChoiceNode's behavior depends
on additional state (the loop counter). However, the preceding node, a
SET_REGISTER action that initializes the loop counter, could confidently
state that it consumes at least 10 characters. Furthermore, when we try
to emit a quick check for that action, we could follow only paths from
the LoopChoiceNode that are possible based on the minimum iteration
count. This change implements both of those "could"s.

I expect this improvement to apply pretty broadly to expressions that
use minimum repetition counts and that don't meet the criteria for
unrolling. In this particular case, I get about 12% improvement on the
overall UniPoker test, due to reducing the execution time of this
expression by 85% and the execution time of another similar expression
that checks for n-of-a-kind by 20%.

Bug: v8:9305

Change-Id: I319e381743967bdf83324be75bae943fbb5dd496
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1704941
Commit-Queue: Seth Brenith <seth.brenith@microsoft.com>
Reviewed-by: 's avatarJakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62963}
parent 4189da7b
......@@ -1627,8 +1627,8 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
bool needs_counter = has_min || has_max;
int reg_ctr = needs_counter ? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center = new (zone)
LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
LoopChoiceNode* center = new (zone) LoopChoiceNode(
body->min_match() == 0, compiler->read_backward(), min, zone);
if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
RegExpNode* loop_return =
needs_counter ? static_cast<RegExpNode*>(
......@@ -1668,7 +1668,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
center->AddLoopAlternative(body_alt);
}
if (needs_counter) {
return ActionNode::SetRegister(reg_ctr, 0, center);
return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
} else {
return center;
}
......
This diff is collapsed.
......@@ -285,10 +285,11 @@ class Trace {
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
};
class DeferredSetRegister : public DeferredAction {
class DeferredSetRegisterForLoop : public DeferredAction {
public:
DeferredSetRegister(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) {}
DeferredSetRegisterForLoop(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
value_(value) {}
int value() { return value_; }
private:
......
......@@ -200,7 +200,7 @@ void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
void DotPrinterImpl::VisitAction(ActionNode* that) {
os_ << " n" << that << " [";
switch (that->action_type_) {
case ActionNode::SET_REGISTER:
case ActionNode::SET_REGISTER_FOR_LOOP:
os_ << "label=\"$" << that->data_.u_store_register.reg
<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
break;
......
......@@ -140,6 +140,15 @@ class RegExpNode : public ZoneObject {
// fail and can be ignored when determining how many characters are consumed
// on success. If this node has not been analyzed yet, EatsAtLeast returns 0.
int EatsAtLeast(bool not_at_start);
// Returns how many characters this node must consume in order to succeed,
// given that this is a LoopChoiceNode whose counter register is in a
// newly-initialized state at the current position in the generated code. For
// example, consider /a{6,8}/. Absent any extra information, the
// LoopChoiceNode for the repetition must report that it consumes at least
// zero characters, because it may have already looped several times. However,
// with a newly-initialized counter, it can report that it consumes at least
// six characters.
virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
// Emits some quick code that checks whether the preloaded characters match.
// Falls through on certain failure, jumps to the label on possible success.
// If the node cannot make a quick check it does nothing and returns false.
......@@ -156,6 +165,17 @@ class RegExpNode : public ZoneObject {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) = 0;
// Fills in quick check details for this node, given that this is a
// LoopChoiceNode whose counter register is in a newly-initialized state at
// the current position in the generated code. For example, consider /a{6,8}/.
// Absent any extra information, the LoopChoiceNode for the repetition cannot
// generate any useful quick check because a match might be the (empty)
// continuation node. However, with a newly-initialized counter, it can
// generate a quick check for several 'a' characters at once.
virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
// Only returns the successor for a text node of length 1 that matches any
......@@ -277,7 +297,7 @@ class SeqRegExpNode : public RegExpNode {
class ActionNode : public SeqRegExpNode {
public:
enum ActionType {
SET_REGISTER,
SET_REGISTER_FOR_LOOP,
INCREMENT_REGISTER,
STORE_POSITION,
BEGIN_SUBMATCH,
......@@ -285,7 +305,8 @@ class ActionNode : public SeqRegExpNode {
EMPTY_MATCH_CHECK,
CLEAR_CAPTURES
};
static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
static ActionNode* SetRegisterForLoop(int reg, int val,
RegExpNode* on_success);
static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
static ActionNode* StorePosition(int reg, bool is_capture,
RegExpNode* on_success);
......@@ -305,10 +326,7 @@ class ActionNode : public SeqRegExpNode {
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) override {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
ActionType action_type() { return action_type_; }
......@@ -659,23 +677,32 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
class LoopChoiceNode : public ChoiceNode {
public:
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
int min_loop_iterations, Zone* zone)
: ChoiceNode(2, zone),
loop_node_(nullptr),
continue_node_(nullptr),
body_can_be_zero_length_(body_can_be_zero_length),
read_backward_(read_backward) {}
read_backward_(read_backward),
traversed_loop_initialization_node_(false),
min_loop_iterations_(min_loop_iterations) {}
void AddLoopAlternative(GuardedAlternative alt);
void AddContinueAlternative(GuardedAlternative alt);
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
int min_loop_iterations() const { return min_loop_iterations_; }
bool read_backward() override { return read_backward_; }
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
......@@ -692,6 +719,22 @@ class LoopChoiceNode : public ChoiceNode {
RegExpNode* continue_node_;
bool body_can_be_zero_length_;
bool read_backward_;
// Temporary marker set only while generating quick check details. Represents
// whether GetQuickCheckDetails traversed the initialization node for this
// loop's counter. If so, we may be able to generate stricter quick checks
// because we know the loop node must match at least min_loop_iterations_
// times before the continuation node can match.
bool traversed_loop_initialization_node_;
// The minimum number of times the loop_node_ must match before the
// continue_node_ might be considered. This value can be temporarily decreased
// while generating quick check details, to represent the remaining iterations
// after the completed portion of the quick check details.
int min_loop_iterations_;
friend class IterationDecrementer;
friend class LoopInitializationMarker;
};
class NodeVisitor {
......
......@@ -27,6 +27,7 @@ assertEquals(["def"], "abcdef".match(/(?<=\w*)[^a|b|c]{3}/));
// Start of line matches.
assertEquals(["def"], "abcdef".match(/(?<=^abc)def/));
assertEquals(["def"], "abcdef".match(/(?<=^[a-c]{3})def/));
assertEquals(["def"], "abcabcdef".match(/(?<=^[a-c]{6})def/));
assertEquals(["def"], "xyz\nabcdef".match(/(?<=^[a-c]{3})def/m));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/(?<=^)\w+/gm));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/\w+(?<=$)/gm));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment