Commit e6e9cbac authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support the msy flags in experimental engine

The m (multiline) and s (dotall) flags just needed to be marked as
allowed; the required logic was already in the regexp parser.

A regexp /<x>/ without the y (sticky) flag is equivalent to the sticky
regexp /.*?<x>/y.  The interpreter now assumes that every regexp is
sticky, and the compiler appends a preamble corresponding to /.*?/
before non-sticky regexps.  To reuse existing code for compiling this
preamble, the logic for each kind of quantifier is now in a separate
function and called from VisitQuantifier and for the preamble.

The commit also includes some improvements/fixes for character ranges:
- Empty character ranges/disjunctions should never match, but before
  this commit they would *always* match.
- The check of the range bounds in CanBeHandledVisitor was unnecessary;
  without the unicode flag this can't be a range that can't be specified
  in 2-byte codepoints, and once we support unicode we simply support
  all codepoints.
- The capacity of the list containing the complementary intervals of a
  character range is now calculated more accurately.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: I71a0e07279b4e1140c0ed1651b3714200c801de9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2404766
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70082}
parent 339c555b
...@@ -113,6 +113,16 @@ struct RegExpInstruction { ...@@ -113,6 +113,16 @@ struct RegExpInstruction {
return result; return result;
} }
// Returns a CONSUME_RANGE instruction whose range covers every 16-bit value
// (0x0000 through 0xFFFF), so it accepts any character.
static RegExpInstruction ConsumeAnyChar() {
  const Uc16Range every_code_unit{0x0000, 0xFFFF};
  return ConsumeRange(every_code_unit);
}
// Returns an instruction that never accepts any character.
static RegExpInstruction Fail() {
  // Failure is encoded as an empty CONSUME_RANGE: the bounds are inverted, so
  // no character c can satisfy 0xFFFF <= c <= 0x0000.
  const Uc16Range empty_range{0xFFFF, 0x0000};
  return ConsumeRange(empty_range);
}
static RegExpInstruction Fork(int32_t alt_index) { static RegExpInstruction Fork(int32_t alt_index) {
RegExpInstruction result; RegExpInstruction result;
result.opcode = FORK; result.opcode = FORK;
......
...@@ -19,21 +19,23 @@ constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu; ...@@ -19,21 +19,23 @@ constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
class CanBeHandledVisitor final : private RegExpVisitor { class CanBeHandledVisitor final : private RegExpVisitor {
// Visitor to implement `ExperimentalRegExp::CanBeHandled`. // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
public: public:
static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count, static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
Zone* zone) { int capture_count) {
if (!AreSuitableFlags(flags)) return false; if (!AreSuitableFlags(flags)) return false;
CanBeHandledVisitor visitor(zone); CanBeHandledVisitor visitor;
node->Accept(&visitor, nullptr); tree->Accept(&visitor, nullptr);
return visitor.result_; return visitor.result_;
} }
private: private:
explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {} CanBeHandledVisitor() = default;
static bool AreSuitableFlags(JSRegExp::Flags flags) { static bool AreSuitableFlags(JSRegExp::Flags flags) {
// TODO(mbid, v8:10765): We should be able to support all flags in the // TODO(mbid, v8:10765): We should be able to support all flags in the
// future. // future.
static constexpr JSRegExp::Flags kAllowedFlags = JSRegExp::kGlobal; static constexpr JSRegExp::Flags kAllowedFlags =
JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
JSRegExp::kDotAll;
// We support Unicode iff kUnicode is among the supported flags. // We support Unicode iff kUnicode is among the supported flags.
STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode == STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
((kAllowedFlags & JSRegExp::kUnicode) != 0)); ((kAllowedFlags & JSRegExp::kUnicode) != 0));
...@@ -62,24 +64,11 @@ class CanBeHandledVisitor final : private RegExpVisitor { ...@@ -62,24 +64,11 @@ class CanBeHandledVisitor final : private RegExpVisitor {
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
result_ = result_ && AreSuitableFlags(node->flags()); result_ = result_ && AreSuitableFlags(node->flags());
for (CharacterRange r : *node->ranges(zone_)) {
// TODO(mbid, v8:10765): We don't support full unicode yet, so we only
// allow character ranges that can be specified with two-byte characters.
if (r.to() > kMaxSupportedCodepoint) {
result_ = false;
return nullptr;
}
}
return nullptr; return nullptr;
} }
void* VisitAssertion(RegExpAssertion* node, void*) override { void* VisitAssertion(RegExpAssertion* node, void*) override {
// TODO(mbid,v8:10765): Once regexps that we shouldn't try to match at result_ = result_ && AreSuitableFlags(node->flags());
// every input position (e.g. sticky) are supported, we should also support
// START_OF_INPUT.
result_ = result_ &&
node->assertion_type() != RegExpAssertion::START_OF_INPUT &&
AreSuitableFlags(node->flags());
return nullptr; return nullptr;
} }
...@@ -181,16 +170,15 @@ class CanBeHandledVisitor final : private RegExpVisitor { ...@@ -181,16 +170,15 @@ class CanBeHandledVisitor final : private RegExpVisitor {
int replication_factor_ = 1; int replication_factor_ = 1;
bool result_ = true; bool result_ = true;
Zone* zone_;
}; };
} // namespace } // namespace
bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree, bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
JSRegExp::Flags flags, JSRegExp::Flags flags,
int capture_count, Zone* zone) { int capture_count) {
DCHECK(FLAG_enable_experimental_regexp_engine); DCHECK(FLAG_enable_experimental_regexp_engine);
return CanBeHandledVisitor::Check(tree, flags, capture_count, zone); return CanBeHandledVisitor::Check(tree, flags, capture_count);
} }
namespace { namespace {
...@@ -286,6 +274,15 @@ class CompileVisitor : private RegExpVisitor { ...@@ -286,6 +274,15 @@ class CompileVisitor : private RegExpVisitor {
Zone* zone) { Zone* zone) {
CompileVisitor compiler(zone); CompileVisitor compiler(zone);
if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
// The match is not anchored, i.e. may start at any input position, so we
// emit a preamble corresponding to /.*?/. This skips an arbitrary
// prefix in the input non-greedily.
compiler.CompileNonGreedyStar([&]() {
compiler.code_.Add(RegExpInstruction::ConsumeAnyChar(), zone);
});
}
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone); compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone);
tree->Accept(&compiler, nullptr); tree->Accept(&compiler, nullptr);
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone); compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone);
...@@ -303,7 +300,7 @@ class CompileVisitor : private RegExpVisitor { ...@@ -303,7 +300,7 @@ class CompileVisitor : private RegExpVisitor {
// `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num - // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
// 1` and should push code corresponding to the ith alternative onto `code_`. // 1` and should push code corresponding to the ith alternative onto `code_`.
template <class F> template <class F>
void CompileDisjunction(int alt_num, F gen_alt) { void CompileDisjunction(int alt_num, F&& gen_alt) {
// An alternative a1 | ... | an is compiled into // An alternative a1 | ... | an is compiled into
// //
// FORK tail1 // FORK tail1
...@@ -327,6 +324,8 @@ class CompileVisitor : private RegExpVisitor { ...@@ -327,6 +324,8 @@ class CompileVisitor : private RegExpVisitor {
// by the thread for a2 and so on. // by the thread for a2 and so on.
if (alt_num == 0) { if (alt_num == 0) {
// The empty disjunction. This can never match.
code_.Add(RegExpInstruction::Fail(), zone_);
return; return;
} }
...@@ -369,11 +368,12 @@ class CompileVisitor : private RegExpVisitor { ...@@ -369,11 +368,12 @@ class CompileVisitor : private RegExpVisitor {
ZoneList<CharacterRange>* ranges = node->ranges(zone_); ZoneList<CharacterRange>* ranges = node->ranges(zone_);
CharacterRange::Canonicalize(ranges); CharacterRange::Canonicalize(ranges);
if (node->is_negated()) { if (node->is_negated()) {
// Capacity 2 for the common case where we compute the complement of a // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
// single interval range that doesn't contain 0 and kMaxCodePoint. // union of k intervals is a union of at most k + 1 intervals.
ZoneList<CharacterRange>* negated = ZoneList<CharacterRange>* negated =
zone_->New<ZoneList<CharacterRange>>(2, zone_); zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
CharacterRange::Negate(ranges, negated, zone_); CharacterRange::Negate(ranges, negated, zone_);
DCHECK_LE(negated->length(), ranges->length() + 1);
ranges = negated; ranges = negated;
} }
...@@ -417,6 +417,114 @@ class CompileVisitor : private RegExpVisitor { ...@@ -417,6 +417,114 @@ class CompileVisitor : private RegExpVisitor {
} }
} }
// Emit bytecode corresponding to /<emit_body>*/.
template <class F>
void CompileGreedyStar(F&& emit_body) {
// This is compiled into
//
// begin:
// FORK end
// <body>
// JMP begin
// end:
// ...
//
// This is greedy because a forked thread has lower priority than the
// thread that spawned it.
Label begin(code_.length());
DeferredLabel end;
AddForkTo(end, code_, zone_);
emit_body();
AddJmpTo(begin, code_, zone_);
std::move(end).Bind(code_);
}
// Emit bytecode corresponding to /<emit_body>*?/.
template <class F>
void CompileNonGreedyStar(F&& emit_body) {
// This is compiled into
//
// FORK body
// JMP end
// body:
// <body>
// FORK body
// end:
// ...
Label body(code_.length() + 2);
DeferredLabel end;
AddForkTo(body, code_, zone_);
AddJmpTo(end, code_, zone_);
DCHECK_EQ(body.index(), code_.length());
emit_body();
AddForkTo(body, code_, zone_);
std::move(end).Bind(code_);
}
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
template <class F>
void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
// This is compiled into
//
// FORK end
// <body>
// FORK end
// <body>
// ...
// ...
// FORK end
// <body>
// end:
// ...
DeferredLabel end;
for (int i = 0; i != max_repetition_num; ++i) {
AddForkTo(end, code_, zone_);
emit_body();
}
std::move(end).Bind(code_);
}
// Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
template <class F>
void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
// This is compiled into
//
// FORK body0
// JMP end
// body0:
// <body>
// FORK body1
// JMP end
// body1:
// <body>
// ...
// ...
// body{max_repetition_num - 1}:
// <body>
// end:
// ...
DeferredLabel end;
for (int i = 0; i != max_repetition_num; ++i) {
Label body(code_.length() + 2);
AddForkTo(body, code_, zone_);
AddJmpTo(end, code_, zone_);
DCHECK_EQ(body.index(), code_.length());
emit_body();
}
std::move(end).Bind(code_);
}
void* VisitQuantifier(RegExpQuantifier* node, void*) override { void* VisitQuantifier(RegExpQuantifier* node, void*) override {
// Emit the body, but clear registers occuring in body first. // Emit the body, but clear registers occuring in body first.
// //
...@@ -440,105 +548,20 @@ class CompileVisitor : private RegExpVisitor { ...@@ -440,105 +548,20 @@ class CompileVisitor : private RegExpVisitor {
UNREACHABLE(); UNREACHABLE();
case RegExpQuantifier::GREEDY: { case RegExpQuantifier::GREEDY: {
if (node->max() == RegExpTree::kInfinity) { if (node->max() == RegExpTree::kInfinity) {
// This is compiled into CompileGreedyStar(emit_body);
//
// begin:
// FORK end
// <body>
// JMP begin
// end:
// ...
//
// This is greedy because a forked thread has lower priority than the
// thread that spawned it.
Label begin(code_.length());
DeferredLabel end;
AddForkTo(end, code_, zone_);
emit_body();
AddJmpTo(begin, code_, zone_);
std::move(end).Bind(code_);
} else { } else {
DCHECK_NE(node->max(), RegExpTree::kInfinity); DCHECK_NE(node->max(), RegExpTree::kInfinity);
// This is compiled into CompileGreedyRepetition(emit_body, node->max() - node->min());
//
// FORK end
// <body>
// FORK end
// <body>
// ... ; max - min times in total
// ...
// FORK end
// <body>
// end:
// ...
DeferredLabel end;
for (int i = node->min(); i != node->max(); ++i) {
AddForkTo(end, code_, zone_);
emit_body();
}
std::move(end).Bind(code_);
} }
break; break;
} }
case RegExpQuantifier::NON_GREEDY: { case RegExpQuantifier::NON_GREEDY: {
if (node->max() == RegExpTree::kInfinity) { if (node->max() == RegExpTree::kInfinity) {
// This is compiled into CompileNonGreedyStar(emit_body);
//
// FORK body
// JMP end
// body:
// <body>
// FORK body
// end:
// ...
Label body(code_.length() + 2);
DeferredLabel end;
AddForkTo(body, code_, zone_);
AddJmpTo(end, code_, zone_);
DCHECK_EQ(body.index(), code_.length());
emit_body();
AddForkTo(body, code_, zone_);
std::move(end).Bind(code_);
} else { } else {
DCHECK_NE(node->max(), RegExpTree::kInfinity); DCHECK_NE(node->max(), RegExpTree::kInfinity);
// This is compiled into CompileNonGreedyRepetition(emit_body, node->max() - node->min());
//
// FORK body0
// JMP end
// body0:
// <body>
// FORK body1
// JMP end
// body1:
// <body>
// ...
// ...
// body{max - min - 1}:
// <body>
// end:
// ...
DeferredLabel end;
for (int i = node->min(); i != node->max(); ++i) {
Label body(code_.length() + 2);
AddForkTo(body, code_, zone_);
AddJmpTo(end, code_, zone_);
DCHECK_EQ(body.index(), code_.length());
emit_body();
}
std::move(end).Bind(code_);
} }
break;
} }
} }
return nullptr; return nullptr;
......
...@@ -20,7 +20,7 @@ class ExperimentalRegExpCompiler final : public AllStatic { ...@@ -20,7 +20,7 @@ class ExperimentalRegExpCompiler final : public AllStatic {
// TODO(mbid,v8:10765): Currently more things are not handled, e.g. some // TODO(mbid,v8:10765): Currently more things are not handled, e.g. some
// quantifiers and unicode. // quantifiers and unicode.
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone); int capture_count);
// Compile regexp into a bytecode program. The regexp must be handlable by // Compile regexp into a bytecode program. The regexp must be handlable by
// the experimental engine; see`CanBeHandled`. The program is returned as a // the experimental engine; see`CanBeHandled`. The program is returned as a
// ZoneList backed by the same Zone that is used in the RegExpTree argument. // ZoneList backed by the same Zone that is used in the RegExpTree argument.
......
...@@ -240,14 +240,6 @@ class NfaInterpreter { ...@@ -240,14 +240,6 @@ class NfaInterpreter {
uc16 input_char = input_[input_index_]; uc16 input_char = input_[input_index_];
++input_index_; ++input_index_;
// If we haven't found a match yet, we add a thread with least priority
// that attempts a match starting after `input_char`.
if (!FoundMatch()) {
active_threads_.Add(
InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)},
zone_);
}
// We unblock all blocked_threads_ by feeding them the input char. // We unblock all blocked_threads_ by feeding them the input char.
FlushBlockedThreads(input_char); FlushBlockedThreads(input_char);
......
...@@ -14,9 +14,8 @@ namespace v8 { ...@@ -14,9 +14,8 @@ namespace v8 {
namespace internal { namespace internal {
bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone) { int capture_count) {
return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count, return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count);
zone);
} }
void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re, void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re,
......
...@@ -20,7 +20,7 @@ class ExperimentalRegExp final : public AllStatic { ...@@ -20,7 +20,7 @@ class ExperimentalRegExp final : public AllStatic {
// checked on the fly in the parser. Not done currently because walking the // checked on the fly in the parser. Not done currently because walking the
// AST again is more flexible and less error prone (but less performant). // AST again is more flexible and less error prone (but less performant).
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone); int capture_count);
static void Initialize(Isolate* isolate, Handle<JSRegExp> re, static void Initialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags, Handle<String> pattern, JSRegExp::Flags flags,
int capture_count); int capture_count);
......
...@@ -184,7 +184,7 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re, ...@@ -184,7 +184,7 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
if (FLAG_enable_experimental_regexp_engine && if (FLAG_enable_experimental_regexp_engine &&
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
parse_result.capture_count, &zone)) { parse_result.capture_count)) {
ExperimentalRegExp::Initialize(isolate, re, pattern, flags, ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count); parse_result.capture_count);
has_been_compiled = true; has_been_compiled = true;
......
...@@ -887,7 +887,7 @@ class MatchInfoBackedMatch : public String::Match { ...@@ -887,7 +887,7 @@ class MatchInfoBackedMatch : public String::Match {
: isolate_(isolate), match_info_(match_info) { : isolate_(isolate), match_info_(match_info) {
subject_ = String::Flatten(isolate, subject); subject_ = String::Flatten(isolate, subject);
if (regexp->TypeTag() == JSRegExp::IRREGEXP) { if (JSRegExp::TypeSupportsCaptures(regexp->TypeTag())) {
Object o = regexp->CaptureNameMap(); Object o = regexp->CaptureNameMap();
has_named_captures_ = o.IsFixedArray(); has_named_captures_ = o.IsFixedArray();
if (has_named_captures_) { if (has_named_captures_) {
......
...@@ -76,15 +76,23 @@ Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0); ...@@ -76,15 +76,23 @@ Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0); Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
// Assertions. // Assertions.
// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the
// multiline flag.
Test(/asdf\b/, "asdf---", ["asdf"], 0); Test(/asdf\b/, "asdf---", ["asdf"], 0);
Test(/asdf\b/, "asdfg", null, 0); Test(/asdf\b/, "asdfg", null, 0);
Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0); Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0);
// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
// regexps.
//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0); Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0);
// The global flag. // The global flag.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6); Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
// The sticky flag. A sticky regexp only attempts a match at `lastIndex`;
// index 2 is where the first "asdf" starts in the subject.
// NOTE(review): the last argument is presumably the expected `lastIndex`
// after the match (2 + "asdf".length == 6) -- confirm against `Test`.
var r = /asdf/y;
r.lastIndex = 2;
Test(r, "fjasdfkkasdf", ["asdf"], 6);
// The multiline flag. With /m, ^ also matches right after a line terminator
// and $ also matches right before one.
Test(/^a/m, "x\na", ["a"], 0);
Test(/x$/m, "x\na", ["x"], 0);
// The dotall flag. With /s, `.` also matches line terminators such as "\n".
Test(/asdf.xyz/s, "asdf\nxyz", ["asdf\nxyz"], 0);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment