Commit 4b15b984 authored by Seth Brenith's avatar Seth Brenith Committed by Commit Bot

[regexp] Better quick checks on loop entry nodes

Like the predecessor change https://crrev.com/c/v8/v8/+/1702125 , this
change is inspired by attempting to exit earlier from generated RegExp
code, when no further matches are possible because any match would be
too long. The motivating example this time is the following expression,
which tests whether a string of Unicode playing cards has five of the
same suit in a row:

/([🂡-🂮]{5})|([🂱-🂾]{5})|([🃁-🃎]{5})|([🃑-🃞]{5})/u

A human reading this expression can readily see that any match requires
at least 10 characters (5 surrogate pairs), but the LoopChoiceNode for
each repeated option reports its minimum distance to the end of a match
as zero. This is correct, because the LoopChoiceNode's behavior depends
on additional state (the loop counter). However, the preceding node, a
SET_REGISTER action that initializes the loop counter, could confidently
state that it consumes at least 10 characters. Furthermore, when we try
to emit a quick check for that action, we could follow only paths from
the LoopChoiceNode that are possible based on the minimum iteration
count. This change implements both of those "could"s.

I expect this improvement to apply pretty broadly to expressions that
use minimum repetition counts and that don't meet the criteria for
unrolling. In this particular case, I get about 12% improvement on the
overall UniPoker test, due to reducing the execution time of this
expression by 85% and the execution time of another similar expression
that checks for n-of-a-kind by 20%.

Bug: v8:9305

Change-Id: I319e381743967bdf83324be75bae943fbb5dd496
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1704941
Commit-Queue: Seth Brenith <seth.brenith@microsoft.com>
Reviewed-by: 's avatarJakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62963}
parent 4189da7b
......@@ -1627,8 +1627,8 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
bool needs_counter = has_min || has_max;
int reg_ctr = needs_counter ? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center = new (zone)
LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
LoopChoiceNode* center = new (zone) LoopChoiceNode(
body->min_match() == 0, compiler->read_backward(), min, zone);
if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
RegExpNode* loop_return =
needs_counter ? static_cast<RegExpNode*>(
......@@ -1668,7 +1668,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
center->AddLoopAlternative(body_alt);
}
if (needs_counter) {
return ActionNode::SetRegister(reg_ctr, 0, center);
return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
} else {
return center;
}
......
......@@ -423,14 +423,14 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
action = action->next()) {
if (action->Mentions(reg)) {
switch (action->action_type()) {
case ActionNode::SET_REGISTER: {
Trace::DeferredSetRegister* psr =
static_cast<Trace::DeferredSetRegister*>(action);
case ActionNode::SET_REGISTER_FOR_LOOP: {
Trace::DeferredSetRegisterForLoop* psr =
static_cast<Trace::DeferredSetRegisterForLoop*>(action);
if (!absolute) {
value += psr->value();
absolute = true;
}
// SET_REGISTER is currently only used for newly introduced loop
// SET_REGISTER_FOR_LOOP is only used for newly introduced loop
// counters. They can have a significant previous value if they
// occur in a loop. TODO(lrn): Propagate this information, so
// we can set undo_action to IGNORE if we know there is no value to
......@@ -635,9 +635,10 @@ void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
guards_->Add(guard, zone);
}
ActionNode* ActionNode::SetRegister(int reg, int val, RegExpNode* on_success) {
ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
RegExpNode* on_success) {
ActionNode* result =
new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
new (on_success->zone()) ActionNode(SET_REGISTER_FOR_LOOP, on_success);
result->data_.u_store_register.reg = reg;
result->data_.u_store_register.value = val;
return result;
......@@ -1335,6 +1336,18 @@ void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
SaveBMInfo(bm, not_at_start, offset);
}
void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) {
if (action_type_ == SET_REGISTER_FOR_LOOP) {
on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
filled_in, not_at_start);
} else {
on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
}
void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) {
// Match the behaviour of EatsAtLeast on this node.
......@@ -1388,6 +1401,60 @@ int RegExpNode::EatsAtLeast(bool not_at_start) {
: eats_at_least_.eats_at_least_from_possibly_start;
}
EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
// SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
// implies that the following node must be a LoopChoiceNode. If we need to
// set registers to constant values for other reasons, we could introduce a
// new action type SET_REGISTER that doesn't imply anything about its
// successor.
UNREACHABLE();
}
void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) {
// See comment in RegExpNode::EatsAtLeastFromLoopEntry.
UNREACHABLE();
}
EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
if (read_backward()) {
// Can't do anything special for a backward loop, so return the basic values
// that we got during analysis.
return *eats_at_least_info();
}
DCHECK_GE(loop_node_->EatsAtLeast(true), continue_node_->EatsAtLeast(true));
// Figure out how much the loop body itself eats, not including anything in
// the continuation case.
uint8_t loop_body_from_not_start = static_cast<uint8_t>(
loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
EatsAtLeastInfo result;
result.eats_at_least_from_not_start = base::saturated_cast<uint8_t>(
min_loop_iterations() * loop_body_from_not_start +
continue_node_->EatsAtLeast(true));
if (min_loop_iterations() > 0 && loop_body_from_possibly_start > 0) {
// First loop iteration eats at least one, so all subsequent iterations
// and the after-loop chunk are guaranteed to not be at the start.
result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
loop_body_from_possibly_start +
(min_loop_iterations() - 1) * loop_body_from_not_start +
continue_node_->EatsAtLeast(true));
} else {
// Loop body might eat nothing, so only continue node contributes.
result.eats_at_least_from_possibly_start =
continue_node_->EatsAtLeast(false);
}
return result;
}
bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
Trace* bounds_check_trace, Trace* trace,
bool preload_has_checked_bounds,
......@@ -1514,7 +1581,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
// and the mask-compare will determine definitely whether we have
// a match at this character position.
pos->mask = char_mask;
pos->value = c;
pos->value = chars[0];
pos->determines_perfectly = true;
} else {
uint32_t common_bits = char_mask;
......@@ -1699,6 +1766,37 @@ class VisitMarker {
NodeInfo* info_;
};
// Temporarily sets traversed_loop_initialization_node_.
class LoopInitializationMarker {
public:
explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
DCHECK(!node_->traversed_loop_initialization_node_);
node_->traversed_loop_initialization_node_ = true;
}
~LoopInitializationMarker() {
DCHECK(node_->traversed_loop_initialization_node_);
node_->traversed_loop_initialization_node_ = false;
}
private:
LoopChoiceNode* node_;
DISALLOW_COPY_AND_ASSIGN(LoopInitializationMarker);
};
// Temporarily decrements min_loop_iterations_.
class IterationDecrementer {
public:
explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
DCHECK_GT(node_->min_loop_iterations_, 0);
--node_->min_loop_iterations_;
}
~IterationDecrementer() { ++node_->min_loop_iterations_; }
private:
LoopChoiceNode* node_;
DISALLOW_COPY_AND_ASSIGN(IterationDecrementer);
};
RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
......@@ -1870,9 +1968,48 @@ void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
int characters_filled_in,
bool not_at_start) {
if (body_can_be_zero_length_ || info()->visited) return;
not_at_start = not_at_start || this->not_at_start();
DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
loop_node_->EatsAtLeast(not_at_start) >
continue_node_->EatsAtLeast(true)) {
// Loop body is guaranteed to execute at least once, and consume characters
// when it does, meaning the only possible quick checks from this point
// begin with the loop body. We may recursively visit this LoopChoiceNode,
// but we temporarily decrease its minimum iteration counter so we know when
// to check the continue case.
IterationDecrementer next_iteration(this);
loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
not_at_start);
} else {
// Might not consume anything in the loop body, so treat it like a normal
// ChoiceNode (and don't recursively visit this node again).
VisitMarker marker(info());
return ChoiceNode::GetQuickCheckDetails(details, compiler,
characters_filled_in, not_at_start);
ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
not_at_start);
}
}
void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
QuickCheckDetails* details, RegExpCompiler* compiler,
int characters_filled_in, bool not_at_start) {
if (traversed_loop_initialization_node_) {
// We already entered this loop once, exited via its continuation node, and
// followed an outer loop's back-edge to before the loop entry point. We
// could try to reset the minimum iteration count to its starting value at
// this point, but that seems like more trouble than it's worth. It's safe
// to keep going with the current (possibly reduced) minimum iteration
// count.
GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
} else {
// We are entering a loop via its counter initialization action, meaning we
// are guaranteed to run the loop body at least some minimum number of times
// before running the continuation node. Set a flag so that this node knows
// (now and any times we visit it again recursively) that it was entered
// from the top.
LoopInitializationMarker marker(this);
GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
}
}
void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
......@@ -3153,8 +3290,8 @@ void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
on_success()->Emit(compiler, &new_trace);
break;
}
case SET_REGISTER: {
Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
case SET_REGISTER_FOR_LOOP: {
Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
data_.u_store_register.value);
Trace new_trace = *trace;
new_trace.add_action(&new_set);
......@@ -3362,10 +3499,20 @@ class EatsAtLeastPropagator : public AllStatic {
static void VisitAction(ActionNode* that) {
// POSITIVE_SUBMATCH_SUCCESS rewinds input, so we must not consider
// successor nodes for eats_at_least. Otherwise the current node eats at
// least as much as its successor.
if (that->action_type() != ActionNode::POSITIVE_SUBMATCH_SUCCESS) {
// successor nodes for eats_at_least. SET_REGISTER_FOR_LOOP indicates a loop
// entry point, which means the loop body will run at least the minimum
// number of times before the continuation case can run. Otherwise the
// current node eats at least as much as its successor.
switch (that->action_type()) {
case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
break; // Was already initialized to zero.
case ActionNode::SET_REGISTER_FOR_LOOP:
that->set_eats_at_least_info(
that->on_success()->EatsAtLeastFromLoopEntry());
break;
default:
that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
break;
}
}
......
......@@ -285,10 +285,11 @@ class Trace {
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
};
class DeferredSetRegister : public DeferredAction {
class DeferredSetRegisterForLoop : public DeferredAction {
public:
DeferredSetRegister(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) {}
DeferredSetRegisterForLoop(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
value_(value) {}
int value() { return value_; }
private:
......
......@@ -200,7 +200,7 @@ void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
void DotPrinterImpl::VisitAction(ActionNode* that) {
os_ << " n" << that << " [";
switch (that->action_type_) {
case ActionNode::SET_REGISTER:
case ActionNode::SET_REGISTER_FOR_LOOP:
os_ << "label=\"$" << that->data_.u_store_register.reg
<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
break;
......
......@@ -140,6 +140,15 @@ class RegExpNode : public ZoneObject {
// fail and can be ignored when determining how many characters are consumed
// on success. If this node has not been analyzed yet, EatsAtLeast returns 0.
int EatsAtLeast(bool not_at_start);
// Returns how many characters this node must consume in order to succeed,
// given that this is a LoopChoiceNode whose counter register is in a
// newly-initialized state at the current position in the generated code. For
// example, consider /a{6,8}/. Absent any extra information, the
// LoopChoiceNode for the repetition must report that it consumes at least
// zero characters, because it may have already looped several times. However,
// with a newly-initialized counter, it can report that it consumes at least
// six characters.
virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
// Emits some quick code that checks whether the preloaded characters match.
// Falls through on certain failure, jumps to the label on possible success.
// If the node cannot make a quick check it does nothing and returns false.
......@@ -156,6 +165,17 @@ class RegExpNode : public ZoneObject {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) = 0;
// Fills in quick check details for this node, given that this is a
// LoopChoiceNode whose counter register is in a newly-initialized state at
// the current position in the generated code. For example, consider /a{6,8}/.
// Absent any extra information, the LoopChoiceNode for the repetition cannot
// generate any useful quick check because a match might be the (empty)
// continuation node. However, with a newly-initialized counter, it can
// generate a quick check for several 'a' characters at once.
virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
// Only returns the successor for a text node of length 1 that matches any
......@@ -277,7 +297,7 @@ class SeqRegExpNode : public RegExpNode {
class ActionNode : public SeqRegExpNode {
public:
enum ActionType {
SET_REGISTER,
SET_REGISTER_FOR_LOOP,
INCREMENT_REGISTER,
STORE_POSITION,
BEGIN_SUBMATCH,
......@@ -285,7 +305,8 @@ class ActionNode : public SeqRegExpNode {
EMPTY_MATCH_CHECK,
CLEAR_CAPTURES
};
static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
static ActionNode* SetRegisterForLoop(int reg, int val,
RegExpNode* on_success);
static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
static ActionNode* StorePosition(int reg, bool is_capture,
RegExpNode* on_success);
......@@ -305,10 +326,7 @@ class ActionNode : public SeqRegExpNode {
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) override {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
ActionType action_type() { return action_type_; }
......@@ -659,23 +677,32 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
class LoopChoiceNode : public ChoiceNode {
public:
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
int min_loop_iterations, Zone* zone)
: ChoiceNode(2, zone),
loop_node_(nullptr),
continue_node_(nullptr),
body_can_be_zero_length_(body_can_be_zero_length),
read_backward_(read_backward) {}
read_backward_(read_backward),
traversed_loop_initialization_node_(false),
min_loop_iterations_(min_loop_iterations) {}
void AddLoopAlternative(GuardedAlternative alt);
void AddContinueAlternative(GuardedAlternative alt);
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
int min_loop_iterations() const { return min_loop_iterations_; }
bool read_backward() override { return read_backward_; }
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
......@@ -692,6 +719,22 @@ class LoopChoiceNode : public ChoiceNode {
RegExpNode* continue_node_;
bool body_can_be_zero_length_;
bool read_backward_;
// Temporary marker set only while generating quick check details. Represents
// whether GetQuickCheckDetails traversed the initialization node for this
// loop's counter. If so, we may be able to generate stricter quick checks
// because we know the loop node must match at least min_loop_iterations_
// times before the continuation node can match.
bool traversed_loop_initialization_node_;
// The minimum number of times the loop_node_ must match before the
// continue_node_ might be considered. This value can be temporarily decreased
// while generating quick check details, to represent the remaining iterations
// after the completed portion of the quick check details.
int min_loop_iterations_;
friend class IterationDecrementer;
friend class LoopInitializationMarker;
};
class NodeVisitor {
......
......@@ -27,6 +27,7 @@ assertEquals(["def"], "abcdef".match(/(?<=\w*)[^a|b|c]{3}/));
// Start of line matches.
assertEquals(["def"], "abcdef".match(/(?<=^abc)def/));
assertEquals(["def"], "abcdef".match(/(?<=^[a-c]{3})def/));
assertEquals(["def"], "abcabcdef".match(/(?<=^[a-c]{6})def/));
assertEquals(["def"], "xyz\nabcdef".match(/(?<=^[a-c]{3})def/m));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/(?<=^)\w+/gm));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/\w+(?<=$)/gm));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment