Commit e6e9cbac authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support the msy flags in experimental engine

The m (multiline) and s (dotall) flags just needed to be marked as
allowed; the required logic was already in the regexp parser.

A regexp /<x>/ without the y (sticky) flag is equivalent to the sticky
regexp /.*?<x>/y.  The interpreter now assumes that every regexp is
sticky, and the compiler appends a preamble corresponding to /.*?/
before non-sticky regexps.  To reuse existing code for compiling this
preamble, the logic for each kind of quantifier is now in a separate
function and called from VisitQuantifier and for the preamble.

The commit also includes some improvements/fixes for character ranges:
- Empty character ranges/disjunctions should never match, but before
  this commit they would *always* match.
- The check of the range bounds in CanBeHandledVisitor was unnecessary;
  without the unicode flag this can't be a range that can't be specified
  in 2-byte codepoints, and once we support unicode we simply support
  all codepoints.
- The capacity of the list containing the complementary intervals of a
  character range is now calculated more accurately.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: I71a0e07279b4e1140c0ed1651b3714200c801de9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2404766
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70082}
parent 339c555b
......@@ -113,6 +113,16 @@ struct RegExpInstruction {
return result;
}
// Builds the instruction that consumes one arbitrary character by delegating
// to ConsumeRange with the bounds 0x0000 and 0xFFFF, i.e. the full span of
// 16-bit code units.
static RegExpInstruction ConsumeAnyChar() {
  Uc16Range every_code_unit{0x0000, 0xFFFF};
  return ConsumeRange(every_code_unit);
}
// Builds an instruction that never matches. It is expressed through
// ConsumeRange rather than a separate construct: the bounds are deliberately
// inverted (0xFFFF <= c <= 0x0000), so the CONSUME_RANGE is empty and no
// character c can satisfy it.
static RegExpInstruction Fail() {
  Uc16Range empty_range{0xFFFF, 0x0000};
  return ConsumeRange(empty_range);
}
static RegExpInstruction Fork(int32_t alt_index) {
RegExpInstruction result;
result.opcode = FORK;
......
......@@ -20,7 +20,7 @@ class ExperimentalRegExpCompiler final : public AllStatic {
// TODO(mbid,v8:10765): Currently more things are not handled, e.g. some
// quantifiers and unicode.
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone);
int capture_count);
// Compile regexp into a bytecode program. The regexp must be handlable by
// the experimental engine; see `CanBeHandled`. The program is returned as a
// ZoneList backed by the same Zone that is used in the RegExpTree argument.
......
......@@ -240,14 +240,6 @@ class NfaInterpreter {
uc16 input_char = input_[input_index_];
++input_index_;
// If we haven't found a match yet, we add a thread with least priority
// that attempts a match starting after `input_char`.
if (!FoundMatch()) {
active_threads_.Add(
InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)},
zone_);
}
// We unblock all blocked_threads_ by feeding them the input char.
FlushBlockedThreads(input_char);
......
......@@ -14,9 +14,8 @@ namespace v8 {
namespace internal {
bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone) {
return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count,
zone);
int capture_count) {
return ExperimentalRegExpCompiler::CanBeHandled(tree, flags, capture_count);
}
void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re,
......
......@@ -20,7 +20,7 @@ class ExperimentalRegExp final : public AllStatic {
// checked on the fly in the parser. Not done currently because walking the
// AST again is more flexible and less error prone (but less performant).
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
int capture_count, Zone* zone);
int capture_count);
static void Initialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
int capture_count);
......
......@@ -184,7 +184,7 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
if (FLAG_enable_experimental_regexp_engine &&
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
parse_result.capture_count, &zone)) {
parse_result.capture_count)) {
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count);
has_been_compiled = true;
......
......@@ -887,7 +887,7 @@ class MatchInfoBackedMatch : public String::Match {
: isolate_(isolate), match_info_(match_info) {
subject_ = String::Flatten(isolate, subject);
if (regexp->TypeTag() == JSRegExp::IRREGEXP) {
if (JSRegExp::TypeSupportsCaptures(regexp->TypeTag())) {
Object o = regexp->CaptureNameMap();
has_named_captures_ = o.IsFixedArray();
if (has_named_captures_) {
......
......@@ -76,15 +76,23 @@ Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
// Assertions.
// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the
// multiline flag.
Test(/asdf\b/, "asdf---", ["asdf"], 0);
Test(/asdf\b/, "asdfg", null, 0);
Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0);
// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored
// regexps.
//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0);
// The global flag.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
// The sticky flag.
var r = /asdf/y;
r.lastIndex = 2;
Test(r, "fjasdfkkasdf", ["asdf"], 6);
// The multiline flag.
Test(/^a/m, "x\na", ["a"], 0);
Test(/x$/m, "x\na", ["x"], 0);
// The dotall flag.
Test(/asdf.xyz/s, "asdf\nxyz", ["asdf\nxyz"], 0);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment