Commit bea0ffd0 authored by Seth Brenith's avatar Seth Brenith Committed by Commit Bot

Reland "[regexp] Better quick checks on loop entry nodes"

This is a reland of 4b15b984

Updates since original: fix an arithmetic overflow bug, remove an invalid
DCHECK, add a unit test that would trigger that DCHECK.

Original change's description:
> [regexp] Better quick checks on loop entry nodes
>
> Like the predecessor change https://crrev.com/c/v8/v8/+/1702125 , this
> change is inspired by attempting to exit earlier from generated RegExp
> code, when no further matches are possible because any match would be
> too long. The motivating example this time is the following expression,
> which tests whether a string of Unicode playing cards has five of the
> same suit in a row:
>
> /([🂡-🂮]{5})|([🂱-🂾]{5})|([🃁-🃎]{5})|([🃑-🃞]{5})/u
>
> A human reading this expression can readily see that any match requires
> at least 10 characters (5 surrogate pairs), but the LoopChoiceNode for
> each repeated option reports its minimum distance to the end of a match
> as zero. This is correct, because the LoopChoiceNode's behavior depends
> on additional state (the loop counter). However, the preceding node, a
> SET_REGISTER action that initializes the loop counter, could confidently
> state that it consumes at least 10 characters. Furthermore, when we try
> to emit a quick check for that action, we could follow only paths from
> the LoopChoiceNode that are possible based on the minimum iteration
> count. This change implements both of those "could"s.
>
> I expect this improvement to apply pretty broadly to expressions that
> use minimum repetition counts and that don't meet the criteria for
> unrolling. In this particular case, I get about 12% improvement on the
> overall UniPoker test, due to reducing the execution time of this
> expression by 85% and the execution time of another similar expression
> that checks for n-of-a-kind by 20%.
>
> Bug: v8:9305
>
> Change-Id: I319e381743967bdf83324be75bae943fbb5dd496
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1704941
> Commit-Queue: Seth Brenith <seth.brenith@microsoft.com>
> Reviewed-by: Jakob Gruber <jgruber@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#62963}

Bug: v8:9305
Change-Id: I992070d383009013881bf778242254c27134b650
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1726674Reviewed-by: 's avatarJakob Gruber <jgruber@chromium.org>
Commit-Queue: Seth Brenith <seth.brenith@microsoft.com>
Cr-Commit-Position: refs/heads/master@{#63009}
parent 0921e8f2
......@@ -1627,8 +1627,8 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
bool needs_counter = has_min || has_max;
int reg_ctr = needs_counter ? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center = new (zone)
LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
LoopChoiceNode* center = new (zone) LoopChoiceNode(
body->min_match() == 0, compiler->read_backward(), min, zone);
if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
RegExpNode* loop_return =
needs_counter ? static_cast<RegExpNode*>(
......@@ -1668,7 +1668,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
center->AddLoopAlternative(body_alt);
}
if (needs_counter) {
return ActionNode::SetRegister(reg_ctr, 0, center);
return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
} else {
return center;
}
......
......@@ -423,14 +423,14 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
action = action->next()) {
if (action->Mentions(reg)) {
switch (action->action_type()) {
case ActionNode::SET_REGISTER: {
Trace::DeferredSetRegister* psr =
static_cast<Trace::DeferredSetRegister*>(action);
case ActionNode::SET_REGISTER_FOR_LOOP: {
Trace::DeferredSetRegisterForLoop* psr =
static_cast<Trace::DeferredSetRegisterForLoop*>(action);
if (!absolute) {
value += psr->value();
absolute = true;
}
// SET_REGISTER is currently only used for newly introduced loop
// SET_REGISTER_FOR_LOOP is only used for newly introduced loop
// counters. They can have a significant previous value if they
// occur in a loop. TODO(lrn): Propagate this information, so
// we can set undo_action to IGNORE if we know there is no value to
......@@ -635,9 +635,10 @@ void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
guards_->Add(guard, zone);
}
ActionNode* ActionNode::SetRegister(int reg, int val, RegExpNode* on_success) {
ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
RegExpNode* on_success) {
ActionNode* result =
new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
new (on_success->zone()) ActionNode(SET_REGISTER_FOR_LOOP, on_success);
result->data_.u_store_register.reg = reg;
result->data_.u_store_register.value = val;
return result;
......@@ -1335,6 +1336,18 @@ void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
SaveBMInfo(bm, not_at_start, offset);
}
void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) {
if (action_type_ == SET_REGISTER_FOR_LOOP) {
on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
filled_in, not_at_start);
} else {
on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
}
void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) {
// Match the behaviour of EatsAtLeast on this node.
......@@ -1388,6 +1401,65 @@ int RegExpNode::EatsAtLeast(bool not_at_start) {
: eats_at_least_.eats_at_least_from_possibly_start;
}
EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
// SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
// implies that the following node must be a LoopChoiceNode. If we need to
// set registers to constant values for other reasons, we could introduce a
// new action type SET_REGISTER that doesn't imply anything about its
// successor.
UNREACHABLE();
}
void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) {
// See comment in RegExpNode::EatsAtLeastFromLoopEntry.
UNREACHABLE();
}
EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
if (read_backward()) {
// Can't do anything special for a backward loop, so return the basic values
// that we got during analysis.
return *eats_at_least_info();
}
// Figure out how much the loop body itself eats, not including anything in
// the continuation case. In general, the nodes in the loop body should report
// that they eat at least the number eaten by the continuation node, since any
// successful match in the loop body must also include the continuation node.
// However, in some cases involving positive lookaround, the loop body under-
// reports its appetite, so use saturated math here to avoid negative numbers.
uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
// Limit the number of loop iterations to avoid overflow in subsequent steps.
int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
EatsAtLeastInfo result;
result.eats_at_least_from_not_start =
base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
continue_node_->EatsAtLeast(true));
if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
// First loop iteration eats at least one, so all subsequent iterations
// and the after-loop chunk are guaranteed to not be at the start.
result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
loop_body_from_possibly_start +
(loop_iterations - 1) * loop_body_from_not_start +
continue_node_->EatsAtLeast(true));
} else {
// Loop body might eat nothing, so only continue node contributes.
result.eats_at_least_from_possibly_start =
continue_node_->EatsAtLeast(false);
}
return result;
}
bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
Trace* bounds_check_trace, Trace* trace,
bool preload_has_checked_bounds,
......@@ -1514,7 +1586,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
// and the mask-compare will determine definitely whether we have
// a match at this character position.
pos->mask = char_mask;
pos->value = c;
pos->value = chars[0];
pos->determines_perfectly = true;
} else {
uint32_t common_bits = char_mask;
......@@ -1699,6 +1771,37 @@ class VisitMarker {
NodeInfo* info_;
};
// Temporarily sets traversed_loop_initialization_node_.
class LoopInitializationMarker {
public:
explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
DCHECK(!node_->traversed_loop_initialization_node_);
node_->traversed_loop_initialization_node_ = true;
}
~LoopInitializationMarker() {
DCHECK(node_->traversed_loop_initialization_node_);
node_->traversed_loop_initialization_node_ = false;
}
private:
LoopChoiceNode* node_;
DISALLOW_COPY_AND_ASSIGN(LoopInitializationMarker);
};
// Temporarily decrements min_loop_iterations_.
class IterationDecrementer {
public:
explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
DCHECK_GT(node_->min_loop_iterations_, 0);
--node_->min_loop_iterations_;
}
~IterationDecrementer() { ++node_->min_loop_iterations_; }
private:
LoopChoiceNode* node_;
DISALLOW_COPY_AND_ASSIGN(IterationDecrementer);
};
RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
......@@ -1870,9 +1973,48 @@ void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
int characters_filled_in,
bool not_at_start) {
if (body_can_be_zero_length_ || info()->visited) return;
VisitMarker marker(info());
return ChoiceNode::GetQuickCheckDetails(details, compiler,
characters_filled_in, not_at_start);
not_at_start = not_at_start || this->not_at_start();
DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
loop_node_->EatsAtLeast(not_at_start) >
continue_node_->EatsAtLeast(true)) {
// Loop body is guaranteed to execute at least once, and consume characters
// when it does, meaning the only possible quick checks from this point
// begin with the loop body. We may recursively visit this LoopChoiceNode,
// but we temporarily decrease its minimum iteration counter so we know when
// to check the continue case.
IterationDecrementer next_iteration(this);
loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
not_at_start);
} else {
// Might not consume anything in the loop body, so treat it like a normal
// ChoiceNode (and don't recursively visit this node again).
VisitMarker marker(info());
ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
not_at_start);
}
}
void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
QuickCheckDetails* details, RegExpCompiler* compiler,
int characters_filled_in, bool not_at_start) {
if (traversed_loop_initialization_node_) {
// We already entered this loop once, exited via its continuation node, and
// followed an outer loop's back-edge to before the loop entry point. We
// could try to reset the minimum iteration count to its starting value at
// this point, but that seems like more trouble than it's worth. It's safe
// to keep going with the current (possibly reduced) minimum iteration
// count.
GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
} else {
// We are entering a loop via its counter initialization action, meaning we
// are guaranteed to run the loop body at least some minimum number of times
// before running the continuation node. Set a flag so that this node knows
// (now and any times we visit it again recursively) that it was entered
// from the top.
LoopInitializationMarker marker(this);
GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
}
}
void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
......@@ -3153,9 +3295,9 @@ void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
on_success()->Emit(compiler, &new_trace);
break;
}
case SET_REGISTER: {
Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
data_.u_store_register.value);
case SET_REGISTER_FOR_LOOP: {
Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
data_.u_store_register.value);
Trace new_trace = *trace;
new_trace.add_action(&new_set);
on_success()->Emit(compiler, &new_trace);
......@@ -3362,10 +3504,20 @@ class EatsAtLeastPropagator : public AllStatic {
static void VisitAction(ActionNode* that) {
// POSITIVE_SUBMATCH_SUCCESS rewinds input, so we must not consider
// successor nodes for eats_at_least. Otherwise the current node eats at
// least as much as its successor.
if (that->action_type() != ActionNode::POSITIVE_SUBMATCH_SUCCESS) {
that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
// successor nodes for eats_at_least. SET_REGISTER_FOR_LOOP indicates a loop
// entry point, which means the loop body will run at least the minimum
// number of times before the continuation case can run. Otherwise the
// current node eats at least as much as its successor.
switch (that->action_type()) {
case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
break; // Was already initialized to zero.
case ActionNode::SET_REGISTER_FOR_LOOP:
that->set_eats_at_least_info(
that->on_success()->EatsAtLeastFromLoopEntry());
break;
default:
that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
break;
}
}
......
......@@ -285,10 +285,11 @@ class Trace {
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
};
class DeferredSetRegister : public DeferredAction {
class DeferredSetRegisterForLoop : public DeferredAction {
public:
DeferredSetRegister(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) {}
DeferredSetRegisterForLoop(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
value_(value) {}
int value() { return value_; }
private:
......
......@@ -200,7 +200,7 @@ void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
void DotPrinterImpl::VisitAction(ActionNode* that) {
os_ << " n" << that << " [";
switch (that->action_type_) {
case ActionNode::SET_REGISTER:
case ActionNode::SET_REGISTER_FOR_LOOP:
os_ << "label=\"$" << that->data_.u_store_register.reg
<< ":=" << that->data_.u_store_register.value << "\", shape=octagon";
break;
......
......@@ -140,6 +140,15 @@ class RegExpNode : public ZoneObject {
// fail and can be ignored when determining how many characters are consumed
// on success. If this node has not been analyzed yet, EatsAtLeast returns 0.
int EatsAtLeast(bool not_at_start);
// Returns how many characters this node must consume in order to succeed,
// given that this is a LoopChoiceNode whose counter register is in a
// newly-initialized state at the current position in the generated code. For
// example, consider /a{6,8}/. Absent any extra information, the
// LoopChoiceNode for the repetition must report that it consumes at least
// zero characters, because it may have already looped several times. However,
// with a newly-initialized counter, it can report that it consumes at least
// six characters.
virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
// Emits some quick code that checks whether the preloaded characters match.
// Falls through on certain failure, jumps to the label on possible success.
// If the node cannot make a quick check it does nothing and returns false.
......@@ -156,6 +165,17 @@ class RegExpNode : public ZoneObject {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) = 0;
// Fills in quick check details for this node, given that this is a
// LoopChoiceNode whose counter register is in a newly-initialized state at
// the current position in the generated code. For example, consider /a{6,8}/.
// Absent any extra information, the LoopChoiceNode for the repetition cannot
// generate any useful quick check because a match might be the (empty)
// continuation node. However, with a newly-initialized counter, it can
// generate a quick check for several 'a' characters at once.
virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
// Only returns the successor for a text node of length 1 that matches any
......@@ -277,7 +297,7 @@ class SeqRegExpNode : public RegExpNode {
class ActionNode : public SeqRegExpNode {
public:
enum ActionType {
SET_REGISTER,
SET_REGISTER_FOR_LOOP,
INCREMENT_REGISTER,
STORE_POSITION,
BEGIN_SUBMATCH,
......@@ -285,7 +305,8 @@ class ActionNode : public SeqRegExpNode {
EMPTY_MATCH_CHECK,
CLEAR_CAPTURES
};
static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
static ActionNode* SetRegisterForLoop(int reg, int val,
RegExpNode* on_success);
static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
static ActionNode* StorePosition(int reg, bool is_capture,
RegExpNode* on_success);
......@@ -305,10 +326,7 @@ class ActionNode : public SeqRegExpNode {
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) override {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
not_at_start);
}
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
ActionType action_type() { return action_type_; }
......@@ -659,23 +677,32 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
class LoopChoiceNode : public ChoiceNode {
public:
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
int min_loop_iterations, Zone* zone)
: ChoiceNode(2, zone),
loop_node_(nullptr),
continue_node_(nullptr),
body_can_be_zero_length_(body_can_be_zero_length),
read_backward_(read_backward) {}
read_backward_(read_backward),
traversed_loop_initialization_node_(false),
min_loop_iterations_(min_loop_iterations) {}
void AddLoopAlternative(GuardedAlternative alt);
void AddContinueAlternative(GuardedAlternative alt);
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
int min_loop_iterations() const { return min_loop_iterations_; }
bool read_backward() override { return read_backward_; }
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
......@@ -692,6 +719,22 @@ class LoopChoiceNode : public ChoiceNode {
RegExpNode* continue_node_;
bool body_can_be_zero_length_;
bool read_backward_;
// Temporary marker set only while generating quick check details. Represents
// whether GetQuickCheckDetails traversed the initialization node for this
// loop's counter. If so, we may be able to generate stricter quick checks
// because we know the loop node must match at least min_loop_iterations_
// times before the continuation node can match.
bool traversed_loop_initialization_node_;
// The minimum number of times the loop_node_ must match before the
// continue_node_ might be considered. This value can be temporarily decreased
// while generating quick check details, to represent the remaining iterations
// after the completed portion of the quick check details.
int min_loop_iterations_;
friend class IterationDecrementer;
friend class LoopInitializationMarker;
};
class NodeVisitor {
......
......@@ -27,6 +27,7 @@ assertEquals(["def"], "abcdef".match(/(?<=\w*)[^a|b|c]{3}/));
// Start of line matches.
assertEquals(["def"], "abcdef".match(/(?<=^abc)def/));
assertEquals(["def"], "abcdef".match(/(?<=^[a-c]{3})def/));
assertEquals(["def"], "abcabcdef".match(/(?<=^[a-c]{6})def/));
assertEquals(["def"], "xyz\nabcdef".match(/(?<=^[a-c]{3})def/m));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/(?<=^)\w+/gm));
assertEquals(["ab", "cd", "efg"], "ab\ncd\nefg".match(/\w+(?<=$)/gm));
......
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
"".match(/(?:(?=a)b){5}abcde/);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment