Commit 13667065 authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support some non-trivial EXPERIMENTAL patterns

This CL adds support for disjunctions and some quantification in
EXPERIMENTAL regexp patterns. It is implemented using a new bytecode
format and an NFA-based breadth-first interpreter.

R=jgruber@chromium.org

Bug: v8:10765
Change-Id: Idd49a3bbc9a9fcc2be80d822c9d84a638e53e777
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2370634
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Dominik Inführ <dinfuehr@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69621}
parent 97e79b25
......@@ -1223,23 +1223,23 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
Object latin1_code = arr.get(JSRegExp::kIrregexpLatin1CodeIndex);
Object uc16_code = arr.get(JSRegExp::kIrregexpUC16CodeIndex);
Object experimental_pattern =
arr.get(JSRegExp::kExperimentalPatternIndex);
if (latin1_code.IsCode()) {
// `this` should be a compiled regexp.
CHECK(latin1_code.IsCode());
CHECK_EQ(Code::cast(latin1_code).builtin_index(),
Builtins::kRegExpExperimentalTrampoline);
Object latin1_bytecode = arr.get(JSRegExp::kIrregexpLatin1BytecodeIndex);
Object uc16_bytecode = arr.get(JSRegExp::kIrregexpUC16BytecodeIndex);
CHECK(uc16_code.IsCode());
CHECK_EQ(Code::cast(uc16_code).builtin_index(),
bool is_compiled = latin1_code.IsCode();
if (is_compiled) {
CHECK_EQ(Code::cast(latin1_code).builtin_index(),
Builtins::kRegExpExperimentalTrampoline);
CHECK_EQ(uc16_code, latin1_code);
CHECK(experimental_pattern.IsString());
CHECK(latin1_bytecode.IsByteArray());
CHECK_EQ(uc16_bytecode, latin1_bytecode);
} else {
CHECK_EQ(latin1_code, uninitialized);
CHECK_EQ(uc16_code, uninitialized);
CHECK_EQ(experimental_pattern, uninitialized);
CHECK_EQ(latin1_bytecode, uninitialized);
CHECK_EQ(uc16_bytecode, uninitialized);
}
CHECK_EQ(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex),
......
......@@ -3347,7 +3347,6 @@ void Factory::SetRegExpExperimentalData(Handle<JSRegExp> regexp,
store->set(JSRegExp::kIrregexpCaptureNameMapIndex, uninitialized);
store->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, uninitialized);
store->set(JSRegExp::kIrregexpBacktrackLimit, uninitialized);
store->set(JSRegExp::kExperimentalPatternIndex, uninitialized);
regexp->set_data(*store);
}
......
......@@ -189,18 +189,14 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
static const int kIrregexpBacktrackLimit = kDataIndex + 8;
static const int kIrregexpDataSize = kDataIndex + 9;
// TODO(mbid,v8:10765): At the moment the EXPERIMENTAL data array is an
// extension of IRREGEXP data, with most fields set to some
// TODO(mbid,v8:10765): At the moment the EXPERIMENTAL data array conforms
// to the format of an IRREGEXP data array, with most fields set to some
// default/uninitialized value. This is because EXPERIMENTAL and IRREGEXP
// regexps take the same code path in
// `RegExpBuiltinsAssembler::RegExpExecInternal`, which reads off various
// fields from the `store` array. `RegExpExecInternal` should probably
// regexps take the same code path in `RegExpExecInternal`, which reads off
// various fields from the data array. `RegExpExecInternal` should probably
// distinguish between EXPERIMENTAL and IRREGEXP, and then we can get rid of
// all the IRREGEXP only fields.
// The same as kAtomPatternIndex for atom regexps.
static constexpr int kExperimentalPatternIndex = kIrregexpDataSize;
static constexpr int kExperimentalDataSize = kIrregexpDataSize + 1;
static constexpr int kExperimentalDataSize = kIrregexpDataSize;
// In-object fields.
static const int kLastIndexFieldIndex = 0;
......
This diff is collapsed.
......@@ -12,11 +12,18 @@ namespace internal {
class ExperimentalRegExp final : public AllStatic {
public:
// Initialization & Compilation:
// Initialization & Compilation
// -------------------------------------------------------------------------
// Check whether a parsed regexp pattern can be compiled and executed by the
// EXPERIMENTAL engine.
// TODO(mbid, v8:10765): This walks the RegExpTree, but it could also be
// checked on the fly in the parser. Not done currently because walking the
// AST again is more flexible and less error prone (but less performant).
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone);
static void Initialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
int capture_count);
static bool IsCompiled(Handle<JSRegExp> re);
static bool IsCompiled(Handle<JSRegExp> re, Isolate* isolate);
static void Compile(Isolate* isolate, Handle<JSRegExp> re);
// Execution:
......
......@@ -174,13 +174,10 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
bool has_been_compiled = false;
if (FLAG_enable_experimental_regexp_engine && parse_result.simple &&
!IgnoreCase(flags) && !IsSticky(flags)) {
// Parse-tree is a single atom that is equal to the pattern. For now we let
// the experimental regexp engine deal with this case instead of string
// search via ATOM (modulo some performance-related heuristic).
int capture_count = 0;
ExperimentalRegExp::Initialize(isolate, re, pattern, flags, capture_count);
if (FLAG_enable_experimental_regexp_engine &&
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, &zone)) {
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count);
has_been_compiled = true;
} else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
!HasFewDifferentCharacters(pattern)) {
......@@ -979,7 +976,7 @@ int32_t* RegExpGlobalCache::FetchNext() {
register_array_, register_array_size_);
break;
case JSRegExp::EXPERIMENTAL: {
if (!ExperimentalRegExp::IsCompiled(regexp_)) {
if (!ExperimentalRegExp::IsCompiled(regexp_, isolate_)) {
ExperimentalRegExp::Compile(isolate_, regexp_);
}
DisallowHeapAllocation no_gc;
......
......@@ -1254,6 +1254,28 @@ RUNTIME_FUNCTION(Runtime_RegexpHasNativeCode) {
return isolate->heap()->ToBoolean(result);
}
// Runtime function exposed to JavaScript as %RegexpTypeTag. Takes a single
// JSRegExp argument and returns the name of its current type tag as a string:
// "NOT_COMPILED", "ATOM", "IRREGEXP" or "EXPERIMENTAL".
RUNTIME_FUNCTION(Runtime_RegexpTypeTag) {
  HandleScope scope(isolate);
  DCHECK_EQ(1, args.length());
  CONVERT_ARG_CHECKED(JSRegExp, regexp, 0);

  // The switch deliberately has no default case; presumably this lets the
  // compiler flag any type tag that is added later without being handled here.
  const char* tag_name;
  switch (regexp.TypeTag()) {
    case JSRegExp::EXPERIMENTAL:
      tag_name = "EXPERIMENTAL";
      break;
    case JSRegExp::IRREGEXP:
      tag_name = "IRREGEXP";
      break;
    case JSRegExp::ATOM:
      tag_name = "ATOM";
      break;
    case JSRegExp::NOT_COMPILED:
      tag_name = "NOT_COMPILED";
      break;
  }

  auto* factory = isolate->factory();
  return *factory->NewStringFromAsciiChecked(tag_name);
}
#define ELEMENTS_KIND_CHECK_RUNTIME_FUNCTION(Name) \
RUNTIME_FUNCTION(Runtime_Has##Name) { \
CONVERT_ARG_CHECKED(JSObject, obj, 0); \
......
......@@ -516,6 +516,7 @@ namespace internal {
F(IsWasmTrapHandlerEnabled, 0, 1) \
F(RegexpHasBytecode, 2, 1) \
F(RegexpHasNativeCode, 2, 1) \
F(RegexpTypeTag, 1, 1) \
F(MapIteratorProtector, 0, 1) \
F(NeverOptimizeFunction, 1, 1) \
F(NotifyContextDisposed, 0, 1) \
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --allow-natives-syntax --enable-experimental-regexp-engine
// Runs `regexp.exec(subject)` and checks both the returned match array and the
// value of `regexp.lastIndex` afterwards.
//
//   regexp:            the RegExp object under test.
//   subject:           the string to match against.
//   expectedResult:    the expected match array returned by exec().
//   expectedLastIndex: the expected value of regexp.lastIndex after exec().
//
// The mjsunit assertion helpers take (expected, found) in that order, so the
// expected values are passed first; otherwise failure messages would label the
// two values the wrong way round.
function Test(regexp, subject, expectedResult, expectedLastIndex) {
  // NOTE(review): this engine check is disabled in the original; confirm
  // whether it can be re-enabled.
  //assertEquals("EXPERIMENTAL", %RegexpTypeTag(regexp));
  var result = regexp.exec(subject);
  assertArrayEquals(expectedResult, result);
  assertEquals(expectedLastIndex, regexp.lastIndex);
}
// Each call below is Test(regexp, subject, expectedResult, expectedLastIndex).
// Every case except the last uses a non-global regexp, for which exec() leaves
// lastIndex at 0.

// The empty regexp.
// It matches the empty string at the very start of the subject.
Test(new RegExp(""), "asdf", [""], 0);
// Plain patterns without special operators.
Test(/asdf1/, "123asdf1xyz", ["asdf1"], 0);
// Escaped operators, otherwise plain string:
Test(/\*\.\(\[\]\?/, "123*.([]?123", ["*.([]?"], 0);
// Some two byte values:
Test(/쁰d섊/, "123쁰d섊abc", ["쁰d섊"], 0);
// A pattern with surrogates but without unicode flag:
Test(/💩f/, "123💩f", ["💩f"], 0);
// Disjunctions.
// The leftmost match in the subject wins, regardless of the order in which the
// alternatives are listed: "123" occurs before "asdf", and "a" at index 1 is
// the first position where any alternative matches.
Test(/asdf|123/, "xyz123asdf", ["123"], 0);
Test(/asdf|123|fj|f|a/, "da123", ["a"], 0);
// An empty first alternative matches the empty string at index 0.
Test(/|123/, "123", [""], 0);
// Character ranges.
Test(/[abc]/, "123asdf", ["a"], 0);
Test(/[0-9]/, "asdf123xyz", ["1"], 0);
Test(/[^0-9]/, "123!xyz", ["!"], 0);
Test(/\w\d/, "?a??a3!!!", ["a3"], 0);
// [💩] without unicode flag is a character range matching one of the two
// surrogate characters that make up 💩. The leading surrogate is 0xD83D.
Test(/[💩]/, "f💩", [String.fromCodePoint(0xD83D)], 0);
// Greedy quantifier for 0 or more matches.
// Zero repetitions already match at index 0, so the "x" later in the subject
// is never reached.
Test(/x*/, "asdfxk", [""], 0);
// The quantifier applies only to the final "f".
Test(/asdf*/, "aasdfffk", ["asdfff"], 0);
// Non-capturing groups and nested operators.
Test(/(?:)/, "asdf", [""], 0);
Test(/(?:asdf)/, "123asdfxyz", ["asdf"], 0);
Test(/(?:asdf)|123/, "xyz123asdf", ["123"], 0);
Test(/asdf(?:[0-9]|(?:xy|x)*)*/, "kkkasdf5xyx8xyyky", ["asdf5xyx8xy"], 0);
// The global flag.
// With /g, exec() advances lastIndex to the end of the match: "asdf" is first
// found at index 2, so lastIndex becomes 2 + 4 = 6.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
......@@ -6,6 +6,7 @@
// RegExp.prototype.replace with a function as an argument.
// Flags: --regexp-tier-up --regexp-tier-up-ticks=5
// Flags: --allow-natives-syntax --no-force-slow-path --no-regexp-interpret-all
// Flags: --no-enable-experimental-regexp-engine
const kLatin1 = true;
const kUnicode = false;
......
......@@ -6,6 +6,7 @@
// RegExp.prototype.replace with a function as an argument.
// Flags: --regexp-tier-up --regexp-tier-up-ticks=1
// Flags: --allow-natives-syntax --no-force-slow-path --no-regexp-interpret-all
// Flags: --no-enable-experimental-regexp-engine
const kLatin1 = true;
const kUnicode = false;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment