Commit 13667065 authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support some non-trivial EXPERIMENTAL patterns

This CL adds support for disjunctions and some quantification in
EXPERIMENTAL regexp patterns. It is implemented using a new bytecode
format and an NFA-based breadth-first interpreter.

R=jgruber@chromium.org

Bug: v8:10765
Change-Id: Idd49a3bbc9a9fcc2be80d822c9d84a638e53e777
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2370634
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Dominik Inführ <dinfuehr@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69621}
parent 97e79b25
......@@ -1223,23 +1223,23 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
Object latin1_code = arr.get(JSRegExp::kIrregexpLatin1CodeIndex);
Object uc16_code = arr.get(JSRegExp::kIrregexpUC16CodeIndex);
Object experimental_pattern =
arr.get(JSRegExp::kExperimentalPatternIndex);
if (latin1_code.IsCode()) {
// `this` should be a compiled regexp.
CHECK(latin1_code.IsCode());
CHECK_EQ(Code::cast(latin1_code).builtin_index(),
Builtins::kRegExpExperimentalTrampoline);
Object latin1_bytecode = arr.get(JSRegExp::kIrregexpLatin1BytecodeIndex);
Object uc16_bytecode = arr.get(JSRegExp::kIrregexpUC16BytecodeIndex);
CHECK(uc16_code.IsCode());
CHECK_EQ(Code::cast(uc16_code).builtin_index(),
bool is_compiled = latin1_code.IsCode();
if (is_compiled) {
CHECK_EQ(Code::cast(latin1_code).builtin_index(),
Builtins::kRegExpExperimentalTrampoline);
CHECK_EQ(uc16_code, latin1_code);
CHECK(experimental_pattern.IsString());
CHECK(latin1_bytecode.IsByteArray());
CHECK_EQ(uc16_bytecode, latin1_bytecode);
} else {
CHECK_EQ(latin1_code, uninitialized);
CHECK_EQ(uc16_code, uninitialized);
CHECK_EQ(experimental_pattern, uninitialized);
CHECK_EQ(latin1_bytecode, uninitialized);
CHECK_EQ(uc16_bytecode, uninitialized);
}
CHECK_EQ(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex),
......
......@@ -3347,7 +3347,6 @@ void Factory::SetRegExpExperimentalData(Handle<JSRegExp> regexp,
store->set(JSRegExp::kIrregexpCaptureNameMapIndex, uninitialized);
store->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, uninitialized);
store->set(JSRegExp::kIrregexpBacktrackLimit, uninitialized);
store->set(JSRegExp::kExperimentalPatternIndex, uninitialized);
regexp->set_data(*store);
}
......
......@@ -189,18 +189,14 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
static const int kIrregexpBacktrackLimit = kDataIndex + 8;
static const int kIrregexpDataSize = kDataIndex + 9;
// TODO(mbid,v8:10765): At the moment the EXPERIMENTAL data array is an
// extension of IRREGEXP data, with most fields set to some
// TODO(mbid,v8:10765): At the moment the EXPERIMENTAL data array conforms
// to the format of an IRREGEXP data array, with most fields set to some
// default/uninitialized value. This is because EXPERIMENTAL and IRREGEXP
// regexps take the same code path in
// `RegExpBuiltinsAssembler::RegExpExecInternal`, which reads off various
// fields from the `store` array. `RegExpExecInternal` should probably
// regexps take the same code path in `RegExpExecInternal`, which reads off
// various fields from the data array. `RegExpExecInternal` should probably
// distinguish between EXPERIMENTAL and IRREGEXP, and then we can get rid of
// all the IRREGEXP only fields.
// The same as kAtomPatternIndex for atom regexps.
static constexpr int kExperimentalPatternIndex = kIrregexpDataSize;
static constexpr int kExperimentalDataSize = kIrregexpDataSize + 1;
static constexpr int kExperimentalDataSize = kIrregexpDataSize;
// In-object fields.
static const int kLastIndexFieldIndex = 0;
......
......@@ -4,123 +4,1000 @@
#include "src/regexp/experimental/experimental.h"
#include <vector>
#include <iomanip>
#include <ios>
#include "src/base/optional.h"
#include "src/base/small-vector.h"
#include "src/objects/js-regexp-inl.h"
#include "src/regexp/regexp-ast.h"
#include "src/regexp/regexp-parser.h"
#include "src/utils/ostreams.h"
namespace v8 {
namespace internal {
namespace {
// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;
// Visitor to implement `ExperimentalRegExp::CanBeHandled`.
//
// Walks a regexp syntax tree and decides whether the experimental engine can
// compile and execute it. Every `Visit*` method returns nullptr; the verdict
// is accumulated in `result_`, which starts out `true` and is only ever
// flipped to `false`. Traversal stops early once `result_` is `false`.
class CanBeHandledVisitor final : private RegExpVisitor {
 public:
  // Returns true iff the tree rooted at `node`, parsed with `flags`, consists
  // only of constructs supported by the experimental engine. `zone` is used
  // to access the ranges of character classes.
  static bool Check(RegExpTree* node, JSRegExp::Flags flags, Zone* zone) {
    if (!AreSuitableFlags(flags)) {
      return false;
    }
    CanBeHandledVisitor visitor(zone);
    node->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  explicit CanBeHandledVisitor(Zone* zone) : zone_(zone) {}

  // Returns true iff `flags` contains no flag other than the allowed ones.
  static bool AreSuitableFlags(JSRegExp::Flags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr JSRegExp::Flags allowed_flags = JSRegExp::kGlobal;
    return (flags & ~allowed_flags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());

    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    const bool is_negated = node->is_negated();

    // A non-negated class without any range (`[]`) can never match. The
    // compiler translates a class into a disjunction over its ranges and
    // emits no code at all for an empty disjunction, which would behave like
    // an empty match instead of a failure. Leave such patterns to Irregexp.
    if (!is_negated && ranges->length() == 0) {
      result_ = false;
      return nullptr;
    }

    for (CharacterRange r : *ranges) {
      // TODO(mbid, v8:10765): We don't support full unicode yet, so we only
      // allow character ranges that can be specified with two-byte characters.
      if (r.to() > kMaxSupportedCodepoint) {
        result_ = false;
        return nullptr;
      }
      // If a negated class contains kMaxSupportedCodepoint itself, its
      // complement contains a range that lies entirely above
      // kMaxSupportedCodepoint (or, within the supported codepoints, may even
      // be empty). The compiler requires that every complement range starts
      // at or below kMaxSupportedCodepoint, so conservatively reject these
      // classes as well.
      if (is_negated && r.to() == kMaxSupportedCodepoint) {
        result_ = false;
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    // TODO(mbid, v8:10765): We should be able to support at least some
    // assertions. re2 does, too.
    result_ = false;
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // TODO(mbid, v8:10765): Theoretically we can support arbitrary min() and
    // max(), but the size of the automaton grows linearly with finite max().
    // We probably want a cut-off value here, or maybe we can "virtualize" the
    // repetitions.
    // Non-greedy quantifiers are easy to implement, but not supported atm.
    // It's not clear to me how a possessive quantifier would be implemented,
    // we should check whether re2 supports this.
    result_ = result_ && node->min() == 0 &&
              node->max() == RegExpTree::kInfinity && node->is_greedy();
    if (!result_) {
      return nullptr;
    }
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    // TODO(mbid, v8:10765): This can be implemented with the NFA interpreter,
    // but not with the lazy DFA. See also re2.
    result_ = false;
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
    // think. See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  // The verdict so far: true as long as no unsupported construct was seen.
  bool result_ = true;
  // Zone passed to `RegExpCharacterClass::ranges`.
  Zone* zone_;
};
} // namespace
// Returns whether the experimental engine supports the parsed pattern `tree`
// with the given `flags`. Must only be called when the experimental engine is
// enabled. The actual decision is delegated to `CanBeHandledVisitor`.
bool ExperimentalRegExp::CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags,
                                      Zone* zone) {
  DCHECK(FLAG_enable_experimental_regexp_engine);
  const bool is_supported = CanBeHandledVisitor::Check(tree, flags, zone);
  return is_supported;
}
void ExperimentalRegExp::Initialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> source,
JSRegExp::Flags flags, int capture_count) {
DCHECK(FLAG_enable_experimental_regexp_engine);
if (FLAG_trace_experimental_regexp_engine) {
std::cout << "Using experimental regexp engine for: " << *source
<< std::endl;
StdoutStream{} << "Initializing experimental regexp " << *source
<< std::endl;
}
isolate->factory()->SetRegExpExperimentalData(re, source, flags,
capture_count);
}
bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re) {
return re->DataAt(JSRegExp::kExperimentalPatternIndex).IsString();
bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re, Isolate* isolate) {
DCHECK(FLAG_enable_experimental_regexp_engine);
DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL);
#ifdef VERIFY_HEAP
re->JSRegExpVerify(isolate);
#endif
return re->DataAt(JSRegExp::kIrregexpLatin1BytecodeIndex) !=
Smi::FromInt(JSRegExp::kUninitializedValue);
}
// ----------------------------------------------------------------------------
// Definition and semantics of the EXPERIMENTAL bytecode.
// Background:
// - Russ Cox's blog post series on regular expression matching, in particular
// https://swtch.com/~rsc/regexp/regexp2.html
// - The re2 regular expression library: https://github.com/google/re2
//
// This comment describes the bytecode used by the experimental regexp engine
// and its abstract semantics in terms of a VM. An implementation of the
// semantics that avoids exponential runtime can be found in `NfaInterpreter`.
//
// The experimental bytecode describes a non-deterministic finite automaton. It
// runs on a multithreaded virtual machine (VM), i.e. in several threads
// concurrently. (These "threads" don't need to be actual operating system
// threads.) Apart from a list of threads, the VM maintains an immutable
// shared input string which threads can read from. Each thread is given by a
// program counter (PC, index of the current instruction), a fixed number of
// registers of indices into the input string, and a monotonically increasing
// index which represents the current position within the input string.
//
// For the precise encoding of the instruction set, see the definition `struct
// RegExpInstruction` below. Currently we support the following instructions:
// - CONSUME_RANGE: Check whether the codepoint of the current character is
// contained in a non-empty closed interval [min, max] specified in the
// instruction payload. Abort this thread if false, otherwise advance the
// input position by 1 and continue with the next instruction.
// - ACCEPT: Stop this thread and signify the end of a match at the current
// input position.
// - FORK: If executed by a thread t, spawn a new thread t0 whose register
// values and input position agree with those of t, but whose PC value is set
// to the value specified in the instruction payload. The register values of
// t and t0 agree directly after the FORK, but they can diverge. Thread t
// continues with the instruction directly after the current FORK
// instruction.
// - JMP: Instead of incrementing the PC value after execution of this
// instruction by 1, set PC of this thread to the value specified in the
// instruction payload and continue there.
//
// Special care must be exercised with respect to thread priority. It is
// possible that more than one thread executes an ACCEPT statement. The output
// of the program is given by the contents of the matching thread's registers,
// so this is ambiguous in case of multiple matches. To resolve the ambiguity,
// every implementation of the VM must output the match that a backtracking
// implementation would output (i.e. behave the same as Irregexp).
//
// A backtracking implementation of the VM maintains a stack of postponed
// threads. Upon encountering a FORK statement, this VM will create a copy of
// the current thread, set the copy's PC value according to the instruction
// payload, and push it to the stack of postponed threads. The VM will then
// continue execution of the current thread.
//
// If at some point a thread t executes an ACCEPT statement, the VM stops and
// outputs the registers of t. Postponed threads are discarded. On the other
// hand, if a thread t is aborted because some input character didn't pass a
// check, then the VM pops the topmost postponed thread and continues execution
// with this thread. If there are no postponed threads, then the VM outputs
// failure, i.e. no matches.
//
// Equivalently, we can describe the behavior of the backtracking VM in terms
// of priority: Threads are linearly ordered by priority, and matches generated
// by threads with high priority must be preferred over matches generated by
// threads with low priority, regardless of the chronological order in which
// matches were found. If a thread t executes a FORK statement and spawns a
// thread t0, then the priority of t0 is such that the following holds:
// * t0 < t, i.e. t0 has lower priority than t.
// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u,
// i.e. t0 compares to other threads the same as t.
// For example, if there are currently 3 threads s, t, u such that s < t < u,
// then after t executes a fork, the thread priorities will be s < t0 < t < u.
namespace {
// A closed interval [min, max] of 16-bit code units.
struct Uc16Range {
  uc16 min;  // Inclusive.
  uc16 max;  // Inclusive.
};

// Bytecode format.
// Currently very simple fixed-size: The opcode is encoded in the first 4
// bytes, the payload takes another 4 bytes.
//
// Instructions are created through the static factory functions below. Every
// factory initializes all 8 bytes of the instruction, so that instruction
// sequences can be copied around bytewise without ever reading indeterminate
// memory.
struct RegExpInstruction {
  enum Opcode : int32_t {
    CONSUME_RANGE,
    FORK,
    JMP,
    ACCEPT,
  };

  // Creates a CONSUME_RANGE instruction for the interval `consume_range`.
  static RegExpInstruction ConsumeRange(Uc16Range consume_range) {
    RegExpInstruction result;
    result.opcode = CONSUME_RANGE;
    result.payload.consume_range = consume_range;
    return result;
  }

  // Creates a FORK instruction whose forked thread starts at `alt_index`.
  static RegExpInstruction Fork(int32_t alt_index) {
    RegExpInstruction result;
    result.opcode = FORK;
    result.payload.pc = alt_index;
    return result;
  }

  // Creates a JMP instruction that continues execution at `alt_index`.
  static RegExpInstruction Jmp(int32_t alt_index) {
    RegExpInstruction result;
    result.opcode = JMP;
    result.payload.pc = alt_index;
    return result;
  }

  // Creates an ACCEPT instruction.
  static RegExpInstruction Accept() {
    RegExpInstruction result;
    result.opcode = ACCEPT;
    // ACCEPT has no payload. Zero it anyway so that the instruction has a
    // fully determined byte representation.
    result.payload.pc = 0;
    return result;
  }

  Opcode opcode;
  union {
    // Payload of CONSUME_RANGE:
    Uc16Range consume_range;
    // Payload of FORK and JMP, the next/forked program counter (pc):
    int32_t pc;
  } payload;
  STATIC_ASSERT(sizeof(payload) == 4);
};
STATIC_ASSERT(sizeof(RegExpInstruction) == 8);
// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3
// bits, so the remaining 29/30 bits can be used as payload. Problem: The
// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so
// this wouldn't fit. We could encode the payload of a CONSUME_RANGE
// instruction by the start of the interval and its length instead, and then
// only allow lengths that fit into 14/13 bits. A longer range can then be
// encoded as a disjunction of smaller ranges.
//
// Another thought: CONSUME_RANGEs are only valid if the payloads are such that
// min <= max. Thus there are
//
// 2^16 + 2^16 - 1 + ... + 1
// = 2^16 * (2^16 + 1) / 2
// = 2^31 + 2^15
//
// valid payloads for a CONSUME_RANGE instruction. If we want to fit
// instructions into 4 bytes, we would still have almost 2^31 instructions left
// over if we encode everything as tight as possible. For example, we could
// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT,
// and then still have almost 2^30 instructions left over for something like
// zero-width assertions and captures.
// Writes `c` to `os`: printable ASCII characters are written verbatim, every
// other code unit as a hexadecimal number with "0x" prefix. Returns `os`. The
// formatting flags of `os` are left unchanged.
std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) {
  // The printable ASCII characters are exactly [0x20, 0x7E]. Checking the
  // range directly is independent of the current locale.
  constexpr uc16 kFirstPrintableAscii = 0x20;
  constexpr uc16 kLastPrintableAscii = 0x7E;
  if (c >= kFirstPrintableAscii && c <= kLastPrintableAscii) {
    os << static_cast<char>(c);
  } else {
    // `std::hex` is sticky. Restore the previous flags so that integers
    // printed to `os` later on (e.g. jump targets or line numbers) are not
    // silently switched to hexadecimal.
    const std::ios_base::fmtflags saved_flags = os.flags();
    os << "0x" << std::hex << static_cast<int>(c);
    os.flags(saved_flags);
  }
  return os;
}
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
switch (inst.opcode) {
case RegExpInstruction::CONSUME_RANGE: {
os << "CONSUME_RANGE [";
PrintAsciiOrHex(os, inst.payload.consume_range.min);
os << ", ";
PrintAsciiOrHex(os, inst.payload.consume_range.max);
os << "]";
break;
}
case RegExpInstruction::FORK:
os << "FORK " << inst.payload.pc;
break;
case RegExpInstruction::JMP:
os << "JMP " << inst.payload.pc;
break;
case RegExpInstruction::ACCEPT:
os << "ACCEPT";
break;
}
return os;
}
// The maximum number of digits required to display a non-negative number < n
// in base 10. Returns 1 for n <= 10, since at least one digit is always
// needed.
int DigitsRequiredBelow(int n) {
  DCHECK_GE(n, 0);

  int result = 1;
  // `bound` is 64 bits wide: for n > 10^9 the last multiplication by 10 would
  // overflow a 32-bit int, which is undefined behaviour and in practice makes
  // the loop run forever.
  for (int64_t bound = 10; bound < n; bound *= 10) {
    ++result;
  }
  return result;
}
// Prints a listing of `insts`, one instruction per line. Each line is
// prefixed with the index of the instruction, zero-padded such that all
// indices have the same width. Returns `os`. The fill character and the
// formatting flags of `os` are the same after the call as before.
std::ostream& operator<<(std::ostream& os,
                         Vector<const RegExpInstruction> insts) {
  const int inst_num = insts.length();
  const int line_digit_num = DigitsRequiredBelow(inst_num);

  // The fill character is sticky, so remember the caller's one and put it
  // back once we are done.
  const std::ios_base::fmtflags saved_flags = os.flags();
  const char saved_fill = os.fill('0');
  for (int i = 0; i != inst_num; ++i) {
    os << std::setw(line_digit_num) << i << ": " << insts[i] << std::endl;
    // Printing an instruction may have changed the formatting flags (e.g. the
    // numeric base). Reset them so that the next index is printed the same
    // way as the first one.
    os.flags(saved_flags);
  }
  os.fill(saved_fill);
  return os;
}
// Views the contents of `raw_bytes` as a sequence of instructions. No data is
// copied: the returned vector points into the storage of `raw_bytes`. The
// length of `raw_bytes` is expected to be a multiple of the instruction size.
Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
  const int byte_length = raw_bytes.length();
  int inst_count = byte_length / sizeof(RegExpInstruction);
  // There must not be a partial instruction at the end.
  DCHECK_EQ(sizeof(RegExpInstruction) * inst_count, byte_length);

  auto* first_inst =
      reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
  return Vector<RegExpInstruction>(first_inst, inst_count);
}
// Compiles a regexp syntax tree into experimental bytecode, i.e. a sequence of
// `RegExpInstruction`s. The tree is traversed with the visitor methods below;
// each of them appends the code for its node to `code_`. Node types for which
// the visitor is `UNREACHABLE()` must not occur in the tree.
class Compiler : private RegExpVisitor {
public:
// Compiles `tree` and returns the bytecode as a newly allocated ByteArray.
// The code generated for `tree` is followed by a single ACCEPT instruction.
// `zone` is used for the temporary instruction list.
static Handle<ByteArray> Compile(RegExpTree* tree, Isolate* isolate,
Zone* zone) {
Compiler compiler(zone);
tree->Accept(&compiler, nullptr);
compiler.code_.Add(RegExpInstruction::Accept(), zone);
// Copy the instructions bytewise into a heap-allocated byte array.
int byte_length = sizeof(RegExpInstruction) * compiler.code_.length();
Handle<ByteArray> array = isolate->factory()->NewByteArray(byte_length);
MemCopy(array->GetDataStartAddress(), compiler.code_.begin(), byte_length);
return array;
}
private:
// TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
// the `tree` size we're going to compile?
explicit Compiler(Zone* zone) : zone_(zone), code_(0, zone) {}
// Generate a disjunction of code fragments compiled by a function `gen_alt`.
// `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
// 1` and should push code corresponding to the ith alternative onto `code_`.
template <class F>
void CompileDisjunction(int alt_num, F gen_alt) {
// An alternative a0 | a1 | a2 is compiled into
// FORK <a2>
// FORK <a1>
// <a0>
// JMP $end
// <a1>
// JMP $end
// <a2>
// where $end is the index of the next instruction after <a2>.
//
// By the semantics of the FORK instruction (see above at definition and
// semantics), the forked thread has lower priority than the current
// thread. This means that with the code we're generating here, the thread
// matching the alternative a0 is indeed the thread with the highest
// priority, followed by the thread for a1 and so on.
// NOTE(review): For an empty disjunction no code is emitted at all, so it
// behaves like an empty match rather than like a failure. Presumably callers
// must make sure that a disjunction which should never match (e.g. an empty
// character class) does not end up here -- confirm.
if (alt_num == 0) {
return;
}
// Record the index of the first of the alt_num - 1 fork instructions in the
// beginning.
int forks_begin = code_.length();
// Add FORKs to alts[alt_num - 1], alts[alt_num - 2], ..., alts[1].
for (int i = alt_num - 1; i != 0; --i) {
// The FORK's address is patched once we know the address of the ith
// alternative.
code_.Add(RegExpInstruction::Fork(-1), zone_);
}
// List containing the index of the final JMP instruction after each
// alternative but the last one.
ZoneList<int> jmp_indices(alt_num - 1, zone_);
for (int i = 0; i != alt_num; ++i) {
if (i != 0) {
// If this is not the first alternative, we have to patch the
// corresponding FORK statement in the beginning. The FORKs were emitted
// in reverse order, so the FORK for alternative i is the
// (alt_num - 1 - i)th one.
code_[forks_begin + alt_num - 1 - i].payload.pc = code_.length();
}
gen_alt(i);
if (i != alt_num - 1) {
// If this is not the last alternative, we have to emit a JMP past the
// remaining alternatives. We don't know this address yet, so we have
// to patch it once all alternatives are emitted.
jmp_indices.Add(code_.length(), zone_);
code_.Add(RegExpInstruction::Jmp(-1), zone_);
}
}
// All alternatives are emitted. Now we can patch the JMP instruction
// after each but the last alternative.
int end_index = code_.length();
for (int jmp_index : jmp_indices) {
code_[jmp_index].payload.pc = end_index;
}
}
// A disjunction node is compiled as the disjunction of its alternatives.
void* VisitDisjunction(RegExpDisjunction* node, void*) override {
ZoneList<RegExpTree*>& alts = *node->alternatives();
CompileDisjunction(alts.length(),
[&](int i) { alts[i]->Accept(this, nullptr); });
return nullptr;
}
// An alternative is a sequence: the code of its children is concatenated.
void* VisitAlternative(RegExpAlternative* node, void*) override {
for (RegExpTree* child : *node->nodes()) {
child->Accept(this, nullptr);
}
return nullptr;
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
// TODO(mbid,v8:10765): Support this case.
UNREACHABLE();
}
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
// A character class is compiled as Disjunction over its `CharacterRange`s.
ZoneList<CharacterRange>* ranges = node->ranges(zone_);
CharacterRange::Canonicalize(ranges);
if (node->is_negated()) {
// Capacity 2 for the common case where we compute the complement of a
// single interval range that doesn't contain 0 and kMaxCodePoint.
ZoneList<CharacterRange>* negated =
zone_->New<ZoneList<CharacterRange>>(2, zone_);
CharacterRange::Negate(ranges, negated, zone_);
ranges = negated;
}
CompileDisjunction(ranges->length(), [&](int i) {
// We don't support utf16 for now, so we can only compile ranges that can
// be specified by (complements of) ranges with uc16 bounds.
// NOTE(review): The checks on `from` and `to` below are debug-only. In
// release builds a `from` above kMaxSupportedCodepoint would be truncated
// by the cast to uc16 -- presumably such classes are rejected before
// compilation; confirm.
STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());
uc32 from = (*ranges)[i].from();
DCHECK_LE(from, kMaxSupportedCodepoint);
uc16 from_uc16 = static_cast<uc16>(from);
uc32 to = (*ranges)[i].to();
DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
// Clamp the upper bound to the largest supported codepoint.
uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));
Uc16Range range{from_uc16, to_uc16};
code_.Add(RegExpInstruction::ConsumeRange(range), zone_);
});
return nullptr;
}
// An atom is compiled into one single-character CONSUME_RANGE per character.
void* VisitAtom(RegExpAtom* node, void*) override {
for (uc16 c : node->data()) {
code_.Add(RegExpInstruction::ConsumeRange(Uc16Range{c, c}), zone_);
}
return nullptr;
}
void* VisitQuantifier(RegExpQuantifier* node, void*) override {
// TODO(mbid,v8:10765): For now we support a quantifier of the form /x*/,
// i.e. greedy match of any number of /x/. See also the comment in
// `CanBeHandledVisitor::VisitQuantifier`.
DCHECK_EQ(node->min(), 0);
DCHECK_EQ(node->max(), RegExpTree::kInfinity);
DCHECK(node->is_greedy());
// The repetition of /x/ is compiled into
//
// a: FORK d
// b: <x>
// c: JMP a
// d: ...
//
// Note that a FORKed thread has lower priority than the main thread, so
// this will indeed match greedily.
int initial_fork_index = code_.length();
// The FORK's address is patched once we're done.
code_.Add(RegExpInstruction::Fork(-1), zone_);
node->body()->Accept(this, nullptr);
code_.Add(RegExpInstruction::Jmp(initial_fork_index), zone_);
int end_index = code_.length();
code_[initial_fork_index].payload.pc = end_index;
return nullptr;
}
void* VisitCapture(RegExpCapture* node, void*) override {
// TODO(mbid,v8:10765): Support this case.
UNREACHABLE();
}
// A group generates no code of its own, only the code of its body.
void* VisitGroup(RegExpGroup* node, void*) override {
node->body()->Accept(this, nullptr);
return nullptr;
}
void* VisitLookaround(RegExpLookaround* node, void*) override {
// TODO(mbid,v8:10765): Support this case.
UNREACHABLE();
}
void* VisitBackReference(RegExpBackReference* node, void*) override {
UNREACHABLE();
}
// The empty regexp generates no code.
void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
// A text node is a sequence: the code of its elements is concatenated.
void* VisitText(RegExpText* node, void*) override {
for (TextElement& text_el : *node->elements()) {
text_el.tree()->Accept(this, nullptr);
}
return nullptr;
}
private:
// Zone in which `code_` and temporary range lists are allocated.
Zone* zone_;
// The instructions generated so far.
ZoneList<RegExpInstruction> code_;
};
} // namespace
void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL);
// TODO(mbid,v8:10765): Actually compile here.
Handle<FixedArray> data =
Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
#ifdef VERIFY_HEAP
re->JSRegExpVerify(isolate);
#endif
Handle<String> source(re->Pattern(), isolate);
if (FLAG_trace_experimental_regexp_engine) {
StdoutStream{} << "Compiling experimental regexp " << *source << std::endl;
}
Zone zone(isolate->allocator(), ZONE_NAME);
// Parse and compile the regexp source.
RegExpCompileData parse_result;
JSRegExp::Flags flags = re->GetFlags();
FlatStringReader reader(isolate, source);
DCHECK(!isolate->has_pending_exception());
data->set(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
data->set(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
// The pattern was already parsed during initialization, so it should never
// fail here:
bool parse_success =
RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result);
CHECK(parse_success);
data->set(JSRegExp::kExperimentalPatternIndex,
data->get(JSRegExp::kSourceIndex));
Handle<ByteArray> bytecode =
Compiler::Compile(parse_result.tree, isolate, &zone);
re->SetDataAt(JSRegExp::kIrregexpLatin1BytecodeIndex, *bytecode);
re->SetDataAt(JSRegExp::kIrregexpUC16BytecodeIndex, *bytecode);
Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
}
struct match_range {
int32_t begin;
int32_t end;
namespace {
// A half-open range in the input string denoting a (sub)match. Used to access
// output registers of a regexp match grouped by [begin, end) pairs.
struct MatchRange {
int32_t begin; // inclusive
int32_t end; // exclusive
};
// Returns the number of matches.
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
int32_t* output_registers,
int32_t output_register_count,
int32_t subject_index) {
String needle =
String::cast(regexp.DataAt(JSRegExp::kExperimentalPatternIndex));
template <class Character>
class NfaInterpreter {
// Executes a bytecode program in breadth-first mode, without backtracking.
// `Character` can be instantiated with `uint8_t` or `uc16` for one byte or
// two byte input strings.
//
// In contrast to the backtracking implementation, this has linear time
// complexity in the length of the input string. Breadth-first mode means
// that threads are executed in lockstep with respect to their input
// position, i.e. the threads share a common input index. This is similar
// to breadth-first simulation of a non-deterministic finite automaton (nfa),
// hence the name of the class.
//
// To follow the semantics of a backtracking VM implementation, we have to be
// careful about whether we stop execution when a thread executes ACCEPT.
// For example, consider execution of the bytecode generated by the regexp
//
// r = /abc|..|[a-c]{10,}/
//
// on input "abcccccccccccccc". Clearly the three alternatives
// - /abc/
// - /../
// - /[a-c]{10,}/
// all match this input. A backtracking implementation will report "abc" as
// match, because it explores the first alternative before the others.
//
// However, if we execute breadth first, then we execute the 3 threads
// - t1, which tries to match /abc/
// - t2, which tries to match /../
// - t3, which tries to match /[a-c]{10,}/
// in lockstep i.e. by iterating over the input and feeding all threads one
// character at a time. t2 will execute an ACCEPT after two characters,
// while t1 will only execute ACCEPT after three characters. Thus we find a
// match for the second alternative before a match of the first alternative.
//
// This shows that we cannot always stop searching as soon as some thread t
// executes ACCEPT: If there is a thread u with higher priority than t, then
// it must be finished first. If u produces a match, then we can discard the
// match of t because matches produced by threads with higher priority are
// preferred over matches of threads with lower priority. On the other hand,
// we are allowed to abort all threads with lower priority than t if t
// produces a match: Such threads can only produce worse matches. In the
// example above, we can abort t3 after two characters because of t2's match.
//
// Thus the interpreter keeps track of a priority-ordered list of threads.
// If a thread ACCEPTs, all threads with lower priority are discarded, and
// the search continues with the threads with higher priority. If no threads
// with higher priority are left, we return the match that was produced by the
// ACCEPTing thread with highest priority.
public:
NfaInterpreter(Vector<const RegExpInstruction> bytecode,
Vector<const Character> input, int32_t input_index)
: bytecode_(bytecode),
input_(input),
input_index_(input_index),
pc_last_input_index_(bytecode.size()),
active_threads_(),
blocked_threads_(),
best_match_(base::nullopt) {
DCHECK(!bytecode_.empty());
DCHECK_GE(input_index_, 0);
DCHECK_LE(input_index_, input_.length());
if (FLAG_trace_experimental_regexp_engine) {
std::cout << "Searching for " << output_register_count / 2
<< " occurences of " << needle << " in " << subject << std::endl;
std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
}
DCHECK(needle.IsFlat());
DCHECK(subject.IsFlat());
// Finds up to `max_match_num` matches and writes their boundaries to
// `matches_out`. The search begins at the current input index. Returns the
// number of matches found.
int FindMatches(MatchRange* matches_out, int max_match_num) {
int match_num;
for (match_num = 0; match_num != max_match_num; ++match_num) {
base::Optional<MatchRange> match = FindNextMatch();
if (!match.has_value()) {
break;
}
const int needle_len = needle.length();
const int subject_len = subject.length();
matches_out[match_num] = *match;
SetInputIndex(match->end);
}
return match_num;
}
DCHECK_GT(needle_len, 0);
private:
// The state of a "thread" executing experimental regexp bytecode. (Not to
// be confused with an OS thread.)
struct InterpreterThread {
// This thread's program counter, i.e. the index within `bytecode_` of the
// next instruction to be executed.
int32_t pc;
// The index in the input string where this thread started executing.
int32_t match_begin;
};
DCHECK_EQ(output_register_count % 2, 0);
// Change the current input index for future calls to `FindNextMatch`.
void SetInputIndex(int new_input_index) {
DCHECK_GE(input_index_, 0);
DCHECK_LE(input_index_, input_.length());
if (subject_index + needle_len > subject_len) {
return 0;
input_index_ = new_input_index;
}
match_range* matches = reinterpret_cast<match_range*>(output_registers);
const int32_t max_match_num = output_register_count / 2;
// Find the next match, beginning the search at `input_index_`.
base::Optional<MatchRange> FindNextMatch() {
// Returns the value `best_match_` holds when the search loop stops
// (base::nullopt if no thread executed ACCEPT). Before returning,
// `best_match_`, `blocked_threads_` and `active_threads_` are reset so that
// the entry assertions below hold again on the next call.
//
// NOTE(review): several statements in this body (`needle_len`, `in_state`,
// `needle_content`, `subject_content`, `subject_index`, `match_num`, ...) do
// not refer to any member used by the thread-based search; they look like
// removed lines of the previous string-search implementation that the diff
// view interleaved here -- confirm before editing.
DCHECK(active_threads_.empty());
// TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
// here? As long as
//
// pc_last_input_index_[pc] < input_index_
//
// for all possible program counters pc that are reachable without input
// from pc = 0 and
//
// pc_last_input_index_[k] <= input_index_
//
// for all k > 0 hold I think everything should be fine. Maybe we can do
// something about this in `SetInputIndex`.
// Reset every entry to -1 so that `IsPcProcessed`, which compares an entry
// for equality with `input_index_`, reports "not processed" for every pc.
std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
// `state_num` does not overflow because the max length of strings is
// strictly less than INT_MAX.
const int state_num = needle_len + 1;
const int start_state = 0;
const int accepting_state = needle_len;
// TODO(mbid,v8:10765): We probably don't want to allocate a new vector here
// in every execution.
std::vector<int8_t> in_state(state_num, false);
in_state[start_state] = true;
DCHECK(blocked_threads_.empty());
DCHECK(active_threads_.empty());
DCHECK_EQ(best_match_, base::nullopt);
DisallowHeapAllocation no_gc;
String::FlatContent needle_content = needle.GetFlatContent(no_gc);
String::FlatContent subject_content = subject.GetFlatContent(no_gc);
// All threads start at bytecode 0.
PushActiveThreadUnchecked(InterpreterThread{0, input_index_});
// Run the initial thread, potentially forking new threads, until every
// thread is blocked without further input.
RunActiveThreads();
DCHECK(needle_content.IsFlat());
DCHECK(subject_content.IsFlat());
// We stop if one of the following conditions hold:
// - We have exhausted the entire input.
// - We have found a match at some point, and there are no remaining
// threads with higher priority than the thread that produced the match.
// Threads with low priority have been aborted earlier, and the remaining
// threads are blocked here, so the latter simply means that
// `blocked_threads_` is empty.
while (input_index_ != input_.length() &&
!(best_match_.has_value() && blocked_threads_.empty())) {
DCHECK(active_threads_.empty());
// Read the next character, then advance: from here on `input_index_` points
// *after* `input_char`, as `FlushBlockedThreads` requires.
uc16 input_char = input_[input_index_];
++input_index_;
int32_t match_num = 0;
while (subject_index != subject_len && match_num != max_match_num) {
uc16 subject_char = subject_content.Get(subject_index);
// If we haven't found a match yet, we add a thread with least priority
// that attempts a match starting after `input_char`.
if (!best_match_.has_value()) {
// NOTE(review): unlike the initial thread above, this thread is appended
// directly and not via PushActiveThread*, so pc 0 is not marked as processed
// at this point -- presumably intentional; confirm.
active_threads_.emplace_back(InterpreterThread{0, input_index_});
}
// We unblock all blocked_threads_ by feeding them the input char.
FlushBlockedThreads(input_char);
for (int needle_index = needle_len - 1; needle_index >= 0; --needle_index) {
uc16 needle_char = needle_content.Get(needle_index);
if (in_state[needle_index] && needle_char == subject_char) {
in_state[needle_index + 1] = true;
} else {
in_state[needle_index + 1] = false;
// Run all threads until they block or accept.
RunActiveThreads();
}
// Clean up the data structures we used.
// Copy the result out first: `best_match_` must be nullopt again on the
// next call (see the DCHECK_EQ near the top).
base::Optional<MatchRange> result = best_match_;
best_match_ = base::nullopt;
blocked_threads_.clear();
active_threads_.clear();
return result;
}
// Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
// instruction, or its PC value was already processed.
// - If processing of `t` can't continue because of CONSUME_RANGE, it is
// pushed on `blocked_threads_`.
// - If `t` executes ACCEPT, set `best_match_` according to `t.match_begin`
// and the current input index. All remaining `active_threads_` are discarded.
void RunActiveThread(InterpreterThread t) {
// Each loop iteration executes the instruction at `t.pc`. A `break` out of
// the switch continues with the (updated) thread; a `return` ends it.
while (true) {
RegExpInstruction inst = bytecode_[t.pc];
switch (inst.opcode) {
case RegExpInstruction::CONSUME_RANGE: {
// The thread needs an input character: park it on `blocked_threads_`, which
// `FlushBlockedThreads` drains when a character is fed.
blocked_threads_.emplace_back(t);
return;
}
case RegExpInstruction::FORK: {
// `fork` continues at the payload target, `t` at the next instruction.
InterpreterThread fork = t;
fork.pc = inst.payload.pc;
++t.pc;
// t has higher priority than fork. If t.pc hasn't been processed, we
// push fork on the active_threads_ stack and continue directly with
// t. Otherwise we continue directly with fork if possible.
if (!IsPcProcessed(t.pc)) {
MarkPcProcessed(t.pc);
PushActiveThread(fork);
break;
} else if (!IsPcProcessed(fork.pc)) {
t = fork;
MarkPcProcessed(t.pc);
break;
}
// Both continuations were already processed at this input index, so this
// thread is dropped.
return;
}
case RegExpInstruction::JMP:
// Follow the jump unless its target was already processed at this input
// index.
t.pc = inst.payload.pc;
if (IsPcProcessed(t.pc)) return;
MarkPcProcessed(t.pc);
break;
case RegExpInstruction::ACCEPT:
// Record the match from `t.match_begin` to the current input index and
// discard every thread still waiting on `active_threads_`.
best_match_ = MatchRange{t.match_begin, input_index_};
active_threads_.clear();
return;
}
}
// NOTE(review): the statements below use `in_state`, `matches`, `match_num`,
// `subject_index` and `needle_len`; they look like removed lines of the
// previous string-search implementation that the diff view interleaved here
// -- confirm before editing.
if (in_state[accepting_state]) {
match_range& match = matches[match_num];
match.end = subject_index + 1;
match.begin = match.end - needle_len;
if (FLAG_trace_experimental_regexp_engine) {
std::cout << "Found match at [" << match.begin << ", " << match.end
<< ")" << std::endl;
}
// Run each active thread until it can't continue without further input.
// `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
// high to low priority.
void RunActiveThreads() {
while (!active_threads_.empty()) {
InterpreterThread t = active_threads_.back();
active_threads_.pop_back();
RunActiveThread(t);
}
}
// Unblock all blocked_threads_ by feeding them an `input_char`. Should only
// be called with `input_index_` pointing to the character *after*
// `input_char` so that `pc_last_input_index_` is updated correctly.
void FlushBlockedThreads(uc16 input_char) {
// The threads in blocked_threads_ are sorted from high to low priority,
// but active_threads_ needs to be sorted from low to high priority, so we
// need to activate blocked threads in reverse order.
//
// TODO(mbid,v8:10765): base::SmallVector doesn't support `rbegin()` and
// `rend()`, should we implement that instead of this awkward iteration?
// Maybe we could at least use an int i and check for i >= 0, but
// SmallVectors don't have length() methods.
// `i` runs from size() down to 1 and the element is read at `i - 1`, so the
// unsigned counter never has to go below zero.
for (size_t i = blocked_threads_.size(); i > 0; --i) {
InterpreterThread t = blocked_threads_[i - 1];
RegExpInstruction inst = bytecode_[t.pc];
DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
Uc16Range range = inst.payload.consume_range;
// The range is inclusive on both ends. A thread whose range contains
// `input_char` moves past its CONSUME_RANGE and becomes active again; any
// other thread is not re-activated and disappears with the clear() below.
if (input_char >= range.min && input_char <= range.max) {
++t.pc;
PushActiveThreadUnchecked(t);
}
// NOTE(review): the statements on `match_num`, `in_state` and
// `subject_index` in the rest of this body look like removed lines of the
// previous string-search implementation that the diff view interleaved here
// -- confirm before editing.
++match_num;
in_state.assign(state_num, false);
in_state[start_state] = true;
}
++subject_index;
// Every blocked thread has been handled for this character.
blocked_threads_.clear();
}
// It is redundant to have two threads t, t0 execute at the same PC value,
// because one of t, t0 matches iff the other does. We can thus discard
// the one with lower priority. We check whether a thread executed at some
// PC value by recording for every possible value of PC what the value of
// input_index_ was the last time a thread executed at PC. If a thread
// tries to continue execution at a PC value that we have seen before at
// the current input index, we abort it. (We execute threads with higher
// priority first, so the second thread is guaranteed to have lower
// priority.)
//
// Check whether we've seen an active thread with a given pc value since the
// last increment of `input_index_`.
bool IsPcProcessed(int pc) {
  // A pc counts as processed when the index recorded for it equals the
  // current input index; a recorded index must never be ahead of it.
  const int recorded_index = pc_last_input_index_[pc];
  DCHECK_LE(recorded_index, input_index_);
  return recorded_index == input_index_;
}
// Mark a pc as having been processed since the last increment of
// `input_index_`.
void MarkPcProcessed(int pc) {
  // Stamp the slot for `pc` with the current input index; the previous stamp
  // must not be ahead of it.
  int& recorded_index = pc_last_input_index_[pc];
  DCHECK_LE(recorded_index, input_index_);
  recorded_index = input_index_;
}
return match_num;
// Functions to push a thread `t` onto the list of active threads, but only
// if `t.pc` was not already the pc of some other thread at the current
// subject index.
void PushActiveThreadUnchecked(InterpreterThread t) {
  // The caller guarantees that no thread has run at this pc for the current
  // input index; this is only asserted, not handled.
  const int thread_pc = t.pc;
  DCHECK(!IsPcProcessed(thread_pc));

  // Record the pc as processed, then put the thread on top of the stack.
  MarkPcProcessed(thread_pc);
  active_threads_.emplace_back(t);
}
void PushActiveThread(InterpreterThread t) {
  // Silently drop `t` if its pc was already processed at the current input
  // index; otherwise hand it to the unchecked variant.
  if (!IsPcProcessed(t.pc)) {
    PushActiveThreadUnchecked(t);
  }
}
Vector<const RegExpInstruction> bytecode_;
Vector<const Character> input_;
int input_index_;
// TODO(mbid,v8:10765): The following `SmallVector`s have somewhat
// arbitrarily chosen small capacity sizes; should benchmark to find a good
// value.
// pc_last_input_index_[k] records the value of input_index_ the last
// time a thread t such that t.pc == k was activated, i.e. put on
// active_threads_. Thus pc_last_input_index_.size() == bytecode_.size(). See
// also `RunActiveThread`.
base::SmallVector<int, 64> pc_last_input_index_;
// Active threads can potentially (but not necessarily) continue without
// input. Sorted from low to high priority.
base::SmallVector<InterpreterThread, 64> active_threads_;
// The pc of a blocked thread points to an instruction that consumes a
// character. Sorted from high to low priority (so the opposite of
// `active_threads_`).
base::SmallVector<InterpreterThread, 64> blocked_threads_;
// The best match found so far during the current search. If several threads
// ACCEPTed, then this will be the match of the accepting thread with highest
// priority.
base::Optional<MatchRange> best_match_;
};
} // namespace
// Returns the number of matches.
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
                                    int32_t* output_registers,
                                    int32_t output_register_count,
                                    int32_t subject_index) {
  // `no_gc` stays in scope for the whole function; it is also the token
  // required to obtain the subject's flat content below.
  DisallowHeapAllocation no_gc;

  DCHECK(FLAG_enable_experimental_regexp_engine);

  // Optional tracing of the pattern that is about to be executed.
  if (FLAG_trace_experimental_regexp_engine) {
    String source = String::cast(regexp.DataAt(JSRegExp::kSourceIndex));
    StdoutStream{} << "Executing experimental regexp " << source << std::endl;
  }

  // The program is read from the regexp's Latin1 bytecode slot and is used
  // for one-byte and two-byte subjects alike.
  ByteArray raw_bytecode =
      ByteArray::cast(regexp.DataAt(JSRegExp::kIrregexpLatin1BytecodeIndex));
  Vector<RegExpInstruction> program = AsInstructionSequence(raw_bytecode);

  if (FLAG_print_regexp_bytecode) {
    StdoutStream{} << "Bytecode:" << std::endl;
    StdoutStream{} << program << std::endl;
  }

  DCHECK(subject.IsFlat());
  String::FlatContent flat_subject = subject.GetFlatContent(no_gc);

  // The register array is reinterpreted as an array of MatchRange entries,
  // two int32 registers per entry, hence the even-count requirement.
  DCHECK_EQ(output_register_count % 2, 0);
  MatchRange* const match_buffer =
      reinterpret_cast<MatchRange*>(output_registers);
  const int32_t match_capacity = output_register_count / 2;

  // Instantiate the interpreter for the subject's character width. Anything
  // that is not one-byte takes the uc16 path.
  if (!flat_subject.IsOneByte()) {
    NfaInterpreter<uc16> interpreter(program, flat_subject.ToUC16Vector(),
                                     subject_index);
    return interpreter.FindMatches(match_buffer, match_capacity);
  }

  NfaInterpreter<uint8_t> interpreter(program, flat_subject.ToOneByteVector(),
                                      subject_index);
  return interpreter.FindMatches(match_buffer, match_capacity);
}
int32_t ExperimentalRegExp::MatchForCallFromJs(
......@@ -128,6 +1005,8 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
Address input_end, int* output_registers, int32_t output_register_count,
Address backtrack_stack, RegExp::CallOrigin call_origin, Isolate* isolate,
Address regexp) {
DCHECK(FLAG_enable_experimental_regexp_engine);
DCHECK_NOT_NULL(isolate);
DCHECK_NOT_NULL(output_registers);
DCHECK(call_origin == RegExp::CallOrigin::kFromJs);
......@@ -148,17 +1027,25 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
MaybeHandle<Object> ExperimentalRegExp::Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int subject_index, Handle<RegExpMatchInfo> last_match_info) {
regexp->DataAt(JSRegExp::kExperimentalPatternIndex);
if (!IsCompiled(regexp)) {
DCHECK(FLAG_enable_experimental_regexp_engine);
DCHECK_EQ(regexp->TypeTag(), JSRegExp::EXPERIMENTAL);
#ifdef VERIFY_HEAP
regexp->JSRegExpVerify(isolate);
#endif
if (!IsCompiled(regexp, isolate)) {
Compile(isolate, regexp);
}
DCHECK(IsCompiled(regexp, isolate));
subject = String::Flatten(isolate, subject);
match_range match;
MatchRange match;
int32_t* output_registers = &match.begin;
int32_t output_register_count = sizeof(match_range) / sizeof(int32_t);
int32_t output_register_count = sizeof(MatchRange) / sizeof(int32_t);
int capture_count = regexp->CaptureCount();
......
......@@ -12,11 +12,18 @@ namespace internal {
class ExperimentalRegExp final : public AllStatic {
public:
// Initialization & Compilation:
// Initialization & Compilation
// -------------------------------------------------------------------------
// Check whether a parsed regexp pattern can be compiled and executed by the
// EXPERIMENTAL engine.
// TODO(mbid, v8:10765): This walks the RegExpTree, but it could also be
// checked on the fly in the parser. Not done currently because walking the
// AST again is more flexible and less error prone (but less performant).
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone);
static void Initialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
int capture_count);
static bool IsCompiled(Handle<JSRegExp> re);
static bool IsCompiled(Handle<JSRegExp> re, Isolate* isolate);
static void Compile(Isolate* isolate, Handle<JSRegExp> re);
// Execution:
......
......@@ -174,13 +174,10 @@ MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
bool has_been_compiled = false;
if (FLAG_enable_experimental_regexp_engine && parse_result.simple &&
!IgnoreCase(flags) && !IsSticky(flags)) {
// Parse-tree is a single atom that is equal to the pattern. For now we let
// the experimental regexp engine deal with this case instead of string
// search via ATOM (modulo some performance-related heuristic).
int capture_count = 0;
ExperimentalRegExp::Initialize(isolate, re, pattern, flags, capture_count);
if (FLAG_enable_experimental_regexp_engine &&
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, &zone)) {
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count);
has_been_compiled = true;
} else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
!HasFewDifferentCharacters(pattern)) {
......@@ -979,7 +976,7 @@ int32_t* RegExpGlobalCache::FetchNext() {
register_array_, register_array_size_);
break;
case JSRegExp::EXPERIMENTAL: {
if (!ExperimentalRegExp::IsCompiled(regexp_)) {
if (!ExperimentalRegExp::IsCompiled(regexp_, isolate_)) {
ExperimentalRegExp::Compile(isolate_, regexp_);
}
DisallowHeapAllocation no_gc;
......
......@@ -1254,6 +1254,28 @@ RUNTIME_FUNCTION(Runtime_RegexpHasNativeCode) {
return isolate->heap()->ToBoolean(result);
}
// Runtime helper: returns the name of the given regexp's type tag as a
// string ("NOT_COMPILED", "ATOM", "IRREGEXP" or "EXPERIMENTAL").
RUNTIME_FUNCTION(Runtime_RegexpTypeTag) {
  HandleScope shs(isolate);
  DCHECK_EQ(1, args.length());
  CONVERT_ARG_CHECKED(JSRegExp, regexp, 0);

  // Map the tag to the spelling of its enumerator. There is deliberately no
  // default case, matching the original switch over the listed tags.
  const char* tag_name;
  switch (regexp.TypeTag()) {
    case JSRegExp::NOT_COMPILED: {
      tag_name = "NOT_COMPILED";
      break;
    }
    case JSRegExp::ATOM: {
      tag_name = "ATOM";
      break;
    }
    case JSRegExp::IRREGEXP: {
      tag_name = "IRREGEXP";
      break;
    }
    case JSRegExp::EXPERIMENTAL: {
      tag_name = "EXPERIMENTAL";
      break;
    }
  }

  Handle<String> tag_string =
      isolate->factory()->NewStringFromAsciiChecked(tag_name);
  return *tag_string;
}
#define ELEMENTS_KIND_CHECK_RUNTIME_FUNCTION(Name) \
RUNTIME_FUNCTION(Runtime_Has##Name) { \
CONVERT_ARG_CHECKED(JSObject, obj, 0); \
......
......@@ -516,6 +516,7 @@ namespace internal {
F(IsWasmTrapHandlerEnabled, 0, 1) \
F(RegexpHasBytecode, 2, 1) \
F(RegexpHasNativeCode, 2, 1) \
F(RegexpTypeTag, 1, 1) \
F(MapIteratorProtector, 0, 1) \
F(NeverOptimizeFunction, 1, 1) \
F(NotifyContextDisposed, 0, 1) \
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --allow-natives-syntax --enable-experimental-regexp-engine
// Runs `regexp.exec(subject)` once and checks both the returned match array
// and the regexp's `lastIndex` afterwards.
//
//   regexp            - the RegExp under test.
//   subject           - the string to match against.
//   expectedResult    - expected elements of the array returned by exec().
//   expectedLastIndex - expected value of regexp.lastIndex after exec().
function Test(regexp, subject, expectedResult, expectedLastIndex) {
  // The type-tag check is disabled in the original; the reason is not visible
  // here.
  // assertEquals("EXPERIMENTAL", %RegexpTypeTag(regexp));
  var result = regexp.exec(subject);
  // mjsunit assertions take (expected, found); passing them in that order
  // keeps the "expected ... found ..." failure messages accurate.
  assertArrayEquals(expectedResult, result);
  assertEquals(expectedLastIndex, regexp.lastIndex);
}
// The empty regexp.
Test(new RegExp(""), "asdf", [""], 0);
// Plain patterns without special operators.
Test(/asdf1/, "123asdf1xyz", ["asdf1"], 0);
// Escaped operators, otherwise plain string:
Test(/\*\.\(\[\]\?/, "123*.([]?123", ["*.([]?"], 0);
// Some two byte values:
Test(/쁰d섊/, "123쁰d섊abc", ["쁰d섊"], 0);
// A pattern with surrogates but without unicode flag:
Test(/💩f/, "123💩f", ["💩f"], 0);
// Disjunctions.
Test(/asdf|123/, "xyz123asdf", ["123"], 0);
// The leftmost match wins: "a" at index 1 comes before "123" at index 2.
Test(/asdf|123|fj|f|a/, "da123", ["a"], 0);
// The empty first alternative already matches at index 0.
Test(/|123/, "123", [""], 0);
// Character ranges.
Test(/[abc]/, "123asdf", ["a"], 0);
Test(/[0-9]/, "asdf123xyz", ["1"], 0);
Test(/[^0-9]/, "123!xyz", ["!"], 0);
Test(/\w\d/, "?a??a3!!!", ["a3"], 0);
// [💩] without unicode flag is a character range matching one of the two
// surrogate characters that make up 💩. The leading surrogate is 0xD83D.
Test(/[💩]/, "f💩", [String.fromCodePoint(0xD83D)], 0);
// Greedy quantifier for 0 or more matches.
// `x*` matches the empty string at index 0, so the later "x" is never reached.
Test(/x*/, "asdfxk", [""], 0);
Test(/asdf*/, "aasdfffk", ["asdfff"], 0);
// Non-capturing groups and nested operators.
Test(/(?:)/, "asdf", [""], 0);
Test(/(?:asdf)/, "123asdfxyz", ["asdf"], 0);
Test(/(?:asdf)|123/, "xyz123asdf", ["123"], 0);
Test(/asdf(?:[0-9]|(?:xy|x)*)*/, "kkkasdf5xyx8xyyky", ["asdf5xyx8xy"], 0);
// The global flag.
// Only here is a non-zero lastIndex expected: the first "asdf" occupies
// indices 2..5, so lastIndex ends up at 6.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
......@@ -6,6 +6,7 @@
// RegExp.prototype.replace with a function as an argument.
// Flags: --regexp-tier-up --regexp-tier-up-ticks=5
// Flags: --allow-natives-syntax --no-force-slow-path --no-regexp-interpret-all
// Flags: --no-enable-experimental-regexp-engine
const kLatin1 = true;
const kUnicode = false;
......
......@@ -6,6 +6,7 @@
// RegExp.prototype.replace with a function as an argument.
// Flags: --regexp-tier-up --regexp-tier-up-ticks=1
// Flags: --allow-natives-syntax --no-force-slow-path --no-regexp-interpret-all
// Flags: --no-enable-experimental-regexp-engine
const kLatin1 = true;
const kUnicode = false;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment