Commit 98b8ca89 authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support capture groups in experimental engine

This commit adds support for capture groups (as in e.g. /x(123|abc)y/)
in the experimental regexp engine.  Now every InterpreterThread owns a
register array containing (sub)match boundaries. There is a new
instruction to record the current input index in some register.

Submatches in quantifier bodies should be reported only if they occur
during the last repetition.  Thus we reset those registers before
attempting to match the body of a quantifier.  This is implemented with
another new instruction.

Because of concerns for the growing size of the NfaInterpreter object
(which is allocated on the stack), this commit replaces the
`SmallVector` members of the NfaInterpreter with zone-allocated arrays.
Register arrays, which for a fixed regexp are all the same size, are
allocated with a RecyclingZoneAllocator for cheap memory reclamation via
a linked list of equally-sized free blocks.

Possible optimizations for management of register array memory:
1. If there are few registers per thread, then it is likely faster to
   store them inline in the InterpreterThread struct.
2. re2 implements copy-on-write:  InterpreterThreads can share the same
   register array. If a thread attempts to write to a shared register
   array, the register array is cloned first.
3. The register at index 1 contains the end of the match; this is only
   written to right before an ACCEPT statement.  We could make ACCEPT
   equivalent to what's currently CAPTURE 1 followed by ACCEPT.  We
   could then save the memory for register 1 for threads that haven't
   finished yet.  This is particularly interesting if optimization 1 then
   kicks in.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: I2c0503206ce331e13ac9912945bb66736d740197
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2390770
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69929}
parent 10ffb113
...@@ -257,11 +257,15 @@ TNode<JSRegExpResult> RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo( ...@@ -257,11 +257,15 @@ TNode<JSRegExpResult> RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(
TNode<FixedArray> data = TNode<FixedArray> data =
CAST(LoadObjectField(regexp, JSRegExp::kDataOffset)); CAST(LoadObjectField(regexp, JSRegExp::kDataOffset));
// We reach this point only if captures exist, implying that this is an // We reach this point only if captures exist, implying that the assigned
// IRREGEXP JSRegExp. // regexp engine must be able to handle captures.
CSA_ASSERT(this, CSA_ASSERT(
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)), this,
SmiConstant(JSRegExp::IRREGEXP))); Word32Or(
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)),
SmiConstant(JSRegExp::IRREGEXP)),
SmiEqual(CAST(LoadFixedArrayElement(data, JSRegExp::kTagIndex)),
SmiConstant(JSRegExp::EXPERIMENTAL))));
// The names fixed array associates names at even indices with a capture // The names fixed array associates names at even indices with a capture
// index at odd indices. // index at odd indices.
......
...@@ -1242,10 +1242,8 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) { ...@@ -1242,10 +1242,8 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
CHECK_EQ(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex), CHECK_EQ(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex),
uninitialized); uninitialized);
// TODO(mbid,v8:10765): Once the EXPERIMENTAL regexps support captures, CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi());
// the capture count should be allowed to be a Smi >= 0. CHECK_GE(Smi::ToInt(arr.get(JSRegExp::kIrregexpCaptureCountIndex)), 0);
CHECK_EQ(arr.get(JSRegExp::kIrregexpCaptureCountIndex), Smi::FromInt(0));
CHECK_EQ(arr.get(JSRegExp::kIrregexpCaptureNameMapIndex), uninitialized);
CHECK_EQ(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex), CHECK_EQ(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex),
uninitialized); uninitialized);
CHECK_EQ(arr.get(JSRegExp::kIrregexpBacktrackLimit), uninitialized); CHECK_EQ(arr.get(JSRegExp::kIrregexpBacktrackLimit), uninitialized);
...@@ -1282,6 +1280,7 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) { ...@@ -1282,6 +1280,7 @@ void JSRegExp::JSRegExpVerify(Isolate* isolate) {
CHECK_IMPLIES(uc16_data.IsSmi(), uc16_bytecode.IsSmi()); CHECK_IMPLIES(uc16_data.IsSmi(), uc16_bytecode.IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpCaptureCountIndex).IsSmi());
CHECK_GE(Smi::ToInt(arr.get(JSRegExp::kIrregexpCaptureCountIndex)), 0);
CHECK(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpMaxRegisterCountIndex).IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpTicksUntilTierUpIndex).IsSmi());
CHECK(arr.get(JSRegExp::kIrregexpBacktrackLimit).IsSmi()); CHECK(arr.get(JSRegExp::kIrregexpBacktrackLimit).IsSmi());
......
...@@ -67,7 +67,7 @@ String JSRegExp::Pattern() { ...@@ -67,7 +67,7 @@ String JSRegExp::Pattern() {
Object JSRegExp::CaptureNameMap() { Object JSRegExp::CaptureNameMap() {
DCHECK(this->data().IsFixedArray()); DCHECK(this->data().IsFixedArray());
DCHECK_EQ(TypeTag(), IRREGEXP); DCHECK(TypeSupportsCaptures(TypeTag()));
Object value = DataAt(kIrregexpCaptureNameMapIndex); Object value = DataAt(kIrregexpCaptureNameMapIndex);
DCHECK_NE(value, Smi::FromInt(JSRegExp::kUninitializedValue)); DCHECK_NE(value, Smi::FromInt(JSRegExp::kUninitializedValue));
return value; return value;
...@@ -85,6 +85,14 @@ void JSRegExp::SetDataAt(int index, Object value) { ...@@ -85,6 +85,14 @@ void JSRegExp::SetDataAt(int index, Object value) {
FixedArray::cast(data()).set(index, value); FixedArray::cast(data()).set(index, value);
} }
void JSRegExp::SetCaptureNameMap(Handle<FixedArray> capture_name_map) {
if (capture_name_map.is_null()) {
SetDataAt(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::zero());
} else {
SetDataAt(JSRegExp::kIrregexpCaptureNameMapIndex, *capture_name_map);
}
}
bool JSRegExp::HasCompiledCode() const { bool JSRegExp::HasCompiledCode() const {
if (TypeTag() != IRREGEXP) return false; if (TypeTag() != IRREGEXP) return false;
Smi uninitialized = Smi::FromInt(kUninitializedValue); Smi uninitialized = Smi::FromInt(kUninitializedValue);
......
...@@ -89,6 +89,9 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> { ...@@ -89,6 +89,9 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
void MarkTierUpForNextExec(); void MarkTierUpForNextExec();
inline Type TypeTag() const; inline Type TypeTag() const;
static bool TypeSupportsCaptures(Type t) {
return t == IRREGEXP || t == EXPERIMENTAL;
}
// Maximum number of captures allowed. // Maximum number of captures allowed.
static constexpr int kMaxCaptures = 1 << 16; static constexpr int kMaxCaptures = 1 << 16;
...@@ -105,6 +108,7 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> { ...@@ -105,6 +108,7 @@ class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
inline Object DataAt(int index) const; inline Object DataAt(int index) const;
// Set implementation data after the object has been prepared. // Set implementation data after the object has been prepared.
inline void SetDataAt(int index, Object value); inline void SetDataAt(int index, Object value);
inline void SetCaptureNameMap(Handle<FixedArray> capture_name_map);
static constexpr int code_index(bool is_latin1) { static constexpr int code_index(bool is_latin1) {
return is_latin1 ? kIrregexpLatin1CodeIndex : kIrregexpUC16CodeIndex; return is_latin1 ? kIrregexpLatin1CodeIndex : kIrregexpUC16CodeIndex;
......
...@@ -41,6 +41,12 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) { ...@@ -41,6 +41,12 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
case RegExpInstruction::ACCEPT: case RegExpInstruction::ACCEPT:
os << "ACCEPT"; os << "ACCEPT";
break; break;
case RegExpInstruction::SET_REGISTER_TO_CP:
os << "SET_REGISTER_TO_CP " << inst.payload.register_index;
break;
case RegExpInstruction::CLEAR_REGISTER:
os << "CLEAR_REGISTER " << inst.payload.register_index;
break;
} }
return os; return os;
} }
......
...@@ -46,6 +46,10 @@ ...@@ -46,6 +46,10 @@
// - JMP: Instead of incrementing the PC value after execution of this // - JMP: Instead of incrementing the PC value after execution of this
// instruction by 1, set PC of this thread to the value specified in the // instruction by 1, set PC of this thread to the value specified in the
// instruction payload and continue there. // instruction payload and continue there.
// - SET_REGISTER_TO_CP: Set a register specified in the payload to the current
// position (CP) within the input, then continue with the next instruction.
// - CLEAR_REGISTER: Clear the register specified in the payload by resetting
// it to the initial value -1.
// //
// Special care must be exercised with respect to thread priority. It is // Special care must be exercised with respect to thread priority. It is
// possible that more than one thread executes an ACCEPT statement. The output // possible that more than one thread executes an ACCEPT statement. The output
...@@ -91,6 +95,8 @@ struct RegExpInstruction { ...@@ -91,6 +95,8 @@ struct RegExpInstruction {
FORK, FORK,
JMP, JMP,
ACCEPT, ACCEPT,
SET_REGISTER_TO_CP,
CLEAR_REGISTER,
}; };
struct Uc16Range { struct Uc16Range {
...@@ -125,12 +131,28 @@ struct RegExpInstruction { ...@@ -125,12 +131,28 @@ struct RegExpInstruction {
return result; return result;
} }
static RegExpInstruction SetRegisterToCp(int32_t register_index) {
RegExpInstruction result;
result.opcode = SET_REGISTER_TO_CP;
result.payload.register_index = register_index;
return result;
}
static RegExpInstruction ClearRegister(int32_t register_index) {
RegExpInstruction result;
result.opcode = CLEAR_REGISTER;
result.payload.register_index = register_index;
return result;
}
Opcode opcode; Opcode opcode;
union { union {
// Payload of CONSUME_RANGE: // Payload of CONSUME_RANGE:
Uc16Range consume_range; Uc16Range consume_range;
// Payload of FORK and JMP, the next/forked program counter (pc): // Payload of FORK and JMP, the next/forked program counter (pc):
int32_t pc; int32_t pc;
// Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER:
int32_t register_index;
} payload; } payload;
STATIC_ASSERT(sizeof(payload) == 4); STATIC_ASSERT(sizeof(payload) == 4);
}; };
......
...@@ -21,9 +21,7 @@ class CanBeHandledVisitor final : private RegExpVisitor { ...@@ -21,9 +21,7 @@ class CanBeHandledVisitor final : private RegExpVisitor {
public: public:
static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count, static bool Check(RegExpTree* node, JSRegExp::Flags flags, int capture_count,
Zone* zone) { Zone* zone) {
if (!AreSuitableFlags(flags) || capture_count > 0) { if (!AreSuitableFlags(flags)) return false;
return false;
}
CanBeHandledVisitor visitor(zone); CanBeHandledVisitor visitor(zone);
node->Accept(&visitor, nullptr); node->Accept(&visitor, nullptr);
return visitor.result_; return visitor.result_;
...@@ -151,9 +149,7 @@ class CanBeHandledVisitor final : private RegExpVisitor { ...@@ -151,9 +149,7 @@ class CanBeHandledVisitor final : private RegExpVisitor {
} }
void* VisitCapture(RegExpCapture* node, void*) override { void* VisitCapture(RegExpCapture* node, void*) override {
// TODO(mbid, v8:10765): This can be implemented with the NFA interpreter, node->body()->Accept(this, nullptr);
// but not with the lazy DFA. See also re2.
result_ = false;
return nullptr; return nullptr;
} }
...@@ -287,7 +283,9 @@ class CompileVisitor : private RegExpVisitor { ...@@ -287,7 +283,9 @@ class CompileVisitor : private RegExpVisitor {
Zone* zone) { Zone* zone) {
CompileVisitor compiler(zone); CompileVisitor compiler(zone);
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(0), zone);
tree->Accept(&compiler, nullptr); tree->Accept(&compiler, nullptr);
compiler.code_.Add(RegExpInstruction::SetRegisterToCp(1), zone);
compiler.code_.Add(RegExpInstruction::Accept(), zone); compiler.code_.Add(RegExpInstruction::Accept(), zone);
return std::move(compiler.code_); return std::move(compiler.code_);
...@@ -404,11 +402,35 @@ class CompileVisitor : private RegExpVisitor { ...@@ -404,11 +402,35 @@ class CompileVisitor : private RegExpVisitor {
return nullptr; return nullptr;
} }
void ClearRegisters(Interval indices) {
if (indices.is_empty()) return;
DCHECK_EQ(indices.from() % 2, 0);
DCHECK_EQ(indices.to() % 2, 1);
for (int i = indices.from(); i <= indices.to(); i += 2) {
// It suffices to clear the register containing the `begin` of a capture
// because this indicates that the capture is undefined, regardless of
// the value in the `end` register.
code_.Add(RegExpInstruction::ClearRegister(i), zone_);
}
}
void* VisitQuantifier(RegExpQuantifier* node, void*) override { void* VisitQuantifier(RegExpQuantifier* node, void*) override {
// First repeat the body `min()` times. // Emit the body, but clear registers occurring in body first.
for (int i = 0; i != node->min(); ++i) { //
// TODO(mbid,v8:10765): It's not always necessary to a) capture registers
// and b) clear them. For example, we don't have to capture anything for
// the first 4 repetitions if node->min() >= 5, and then we don't have to
// clear registers in the first node->min() repetitions.
// Later, and if node->min() == 0, we don't have to clear registers before
// the first optional repetition.
Interval body_registers = node->body()->CaptureRegisters();
auto emit_body = [&]() {
ClearRegisters(body_registers);
node->body()->Accept(this, nullptr); node->body()->Accept(this, nullptr);
} };
// First repeat the body `min()` times.
for (int i = 0; i != node->min(); ++i) emit_body();
switch (node->quantifier_type()) { switch (node->quantifier_type()) {
case RegExpQuantifier::POSSESSIVE: case RegExpQuantifier::POSSESSIVE:
...@@ -430,7 +452,7 @@ class CompileVisitor : private RegExpVisitor { ...@@ -430,7 +452,7 @@ class CompileVisitor : private RegExpVisitor {
DeferredLabel end; DeferredLabel end;
AddForkTo(end, code_, zone_); AddForkTo(end, code_, zone_);
node->body()->Accept(this, nullptr); emit_body();
AddJmpTo(begin, code_, zone_); AddJmpTo(begin, code_, zone_);
std::move(end).Bind(code_); std::move(end).Bind(code_);
...@@ -452,7 +474,7 @@ class CompileVisitor : private RegExpVisitor { ...@@ -452,7 +474,7 @@ class CompileVisitor : private RegExpVisitor {
DeferredLabel end; DeferredLabel end;
for (int i = node->min(); i != node->max(); ++i) { for (int i = node->min(); i != node->max(); ++i) {
AddForkTo(end, code_, zone_); AddForkTo(end, code_, zone_);
node->body()->Accept(this, nullptr); emit_body();
} }
std::move(end).Bind(code_); std::move(end).Bind(code_);
} }
...@@ -478,7 +500,7 @@ class CompileVisitor : private RegExpVisitor { ...@@ -478,7 +500,7 @@ class CompileVisitor : private RegExpVisitor {
DCHECK_EQ(body.index(), code_.length()); DCHECK_EQ(body.index(), code_.length());
node->body()->Accept(this, nullptr); emit_body();
AddForkTo(body, code_, zone_); AddForkTo(body, code_, zone_);
std::move(end).Bind(code_); std::move(end).Bind(code_);
...@@ -509,20 +531,24 @@ class CompileVisitor : private RegExpVisitor { ...@@ -509,20 +531,24 @@ class CompileVisitor : private RegExpVisitor {
DCHECK_EQ(body.index(), code_.length()); DCHECK_EQ(body.index(), code_.length());
node->body()->Accept(this, nullptr); emit_body();
} }
std::move(end).Bind(code_); std::move(end).Bind(code_);
} }
break; break;
} }
} }
return nullptr; return nullptr;
} }
void* VisitCapture(RegExpCapture* node, void*) override { void* VisitCapture(RegExpCapture* node, void*) override {
// TODO(mbid,v8:10765): Support this case. int index = node->index();
UNREACHABLE(); int start_register = RegExpCapture::StartRegister(index);
int end_register = RegExpCapture::EndRegister(index);
code_.Add(RegExpInstruction::SetRegisterToCp(start_register), zone_);
node->body()->Accept(this, nullptr);
code_.Add(RegExpInstruction::SetRegisterToCp(end_register), zone_);
return nullptr;
} }
void* VisitGroup(RegExpGroup* node, void*) override { void* VisitGroup(RegExpGroup* node, void*) override {
......
...@@ -11,15 +11,10 @@ ...@@ -11,15 +11,10 @@
namespace v8 { namespace v8 {
namespace internal { namespace internal {
class Zone;
class ExperimentalRegExpInterpreter final : public AllStatic { class ExperimentalRegExpInterpreter final : public AllStatic {
public: public:
// A half-open range in an a string denoting a (sub)match. Used to access
// output registers of regexp execution grouped by [begin, end) pairs.
struct MatchRange {
int32_t begin; // inclusive
int32_t end; // exclusive
};
// Executes a bytecode program in breadth-first NFA mode, without // Executes a bytecode program in breadth-first NFA mode, without
// backtracking, to find matching substrings. Trys to find up to // backtracking, to find matching substrings. Trys to find up to
// `max_match_num` matches in `input`, starting at `start_index`. Returns // `max_match_num` matches in `input`, starting at `start_index`. Returns
...@@ -27,11 +22,14 @@ class ExperimentalRegExpInterpreter final : public AllStatic { ...@@ -27,11 +22,14 @@ class ExperimentalRegExpInterpreter final : public AllStatic {
// are written to `matches_out`. Provided in variants for one-byte and // are written to `matches_out`. Provided in variants for one-byte and
// two-byte strings. // two-byte strings.
static int FindMatchesNfaOneByte(Vector<const RegExpInstruction> bytecode, static int FindMatchesNfaOneByte(Vector<const RegExpInstruction> bytecode,
int capture_count,
Vector<const uint8_t> input, int start_index, Vector<const uint8_t> input, int start_index,
MatchRange* matches_out, int max_match_num); int32_t* output_registers,
int output_register_count, Zone* zone);
static int FindMatchesNfaTwoByte(Vector<const RegExpInstruction> bytecode, static int FindMatchesNfaTwoByte(Vector<const RegExpInstruction> bytecode,
Vector<const uc16> input, int start_index, int capture_count, Vector<const uc16> input,
MatchRange* matches_out, int max_match_num); int start_index, int32_t* output_registers,
int output_register_count, Zone* zone);
}; };
} // namespace internal } // namespace internal
......
...@@ -44,7 +44,7 @@ bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re, Isolate* isolate) { ...@@ -44,7 +44,7 @@ bool ExperimentalRegExp::IsCompiled(Handle<JSRegExp> re, Isolate* isolate) {
Smi::FromInt(JSRegExp::kUninitializedValue); Smi::FromInt(JSRegExp::kUninitializedValue);
} }
void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) { bool ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL); DCHECK_EQ(re->TypeTag(), JSRegExp::EXPERIMENTAL);
#ifdef VERIFY_HEAP #ifdef VERIFY_HEAP
re->JSRegExpVerify(isolate); re->JSRegExpVerify(isolate);
...@@ -63,11 +63,15 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) { ...@@ -63,11 +63,15 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
FlatStringReader reader(isolate, source); FlatStringReader reader(isolate, source);
DCHECK(!isolate->has_pending_exception()); DCHECK(!isolate->has_pending_exception());
// The pattern was already parsed during initialization, so it should never
// fail here:
bool parse_success = bool parse_success =
RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result); RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, &parse_result);
CHECK(parse_success); if (!parse_success) {
// The pattern was already parsed successfully during initialization, so
// the only way parsing can fail now is because of stack overflow.
CHECK_EQ(parse_result.error, RegExpError::kStackOverflow);
USE(RegExp::ThrowRegExpException(isolate, re, source, parse_result.error));
return false;
}
ZoneList<RegExpInstruction> bytecode = ZoneList<RegExpInstruction> bytecode =
ExperimentalRegExpCompiler::Compile(parse_result.tree, flags, &zone); ExperimentalRegExpCompiler::Compile(parse_result.tree, flags, &zone);
...@@ -84,6 +88,10 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) { ...@@ -84,6 +88,10 @@ void ExperimentalRegExp::Compile(Isolate* isolate, Handle<JSRegExp> re) {
Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline); Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline); re->SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline); re->SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
re->SetCaptureNameMap(parse_result.capture_name_map);
return true;
} }
Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) { Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
...@@ -94,11 +102,9 @@ Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) { ...@@ -94,11 +102,9 @@ Vector<RegExpInstruction> AsInstructionSequence(ByteArray raw_bytes) {
return Vector<RegExpInstruction>(inst_begin, inst_num); return Vector<RegExpInstruction>(inst_begin, inst_num);
} }
using MatchRange = ExperimentalRegExpInterpreter::MatchRange;
// Returns the number of matches. // Returns the number of matches.
int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject, int32_t ExperimentalRegExp::ExecRaw(Isolate* isolate, JSRegExp regexp,
int32_t* output_registers, String subject, int32_t* output_registers,
int32_t output_register_count, int32_t output_register_count,
int32_t subject_index) { int32_t subject_index) {
DisallowHeapAllocation no_gc; DisallowHeapAllocation no_gc;
...@@ -118,21 +124,22 @@ int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject, ...@@ -118,21 +124,22 @@ int32_t ExperimentalRegExp::ExecRaw(JSRegExp regexp, String subject,
StdoutStream{} << bytecode << std::endl; StdoutStream{} << bytecode << std::endl;
} }
int register_count_per_match =
JSRegExp::RegistersForCaptureCount(regexp.CaptureCount());
DCHECK(subject.IsFlat()); DCHECK(subject.IsFlat());
String::FlatContent subject_content = subject.GetFlatContent(no_gc); String::FlatContent subject_content = subject.GetFlatContent(no_gc);
DCHECK_EQ(output_register_count % 2, 0); Zone zone(isolate->allocator(), ZONE_NAME);
MatchRange* matches = reinterpret_cast<MatchRange*>(output_registers);
const int32_t max_match_num = output_register_count / 2;
if (subject_content.IsOneByte()) { if (subject_content.IsOneByte()) {
return ExperimentalRegExpInterpreter::FindMatchesNfaOneByte( return ExperimentalRegExpInterpreter::FindMatchesNfaOneByte(
bytecode, subject_content.ToOneByteVector(), subject_index, matches, bytecode, register_count_per_match, subject_content.ToOneByteVector(),
max_match_num); subject_index, output_registers, output_register_count, &zone);
} else { } else {
return ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte( return ExperimentalRegExpInterpreter::FindMatchesNfaTwoByte(
bytecode, subject_content.ToUC16Vector(), subject_index, matches, bytecode, register_count_per_match, subject_content.ToUC16Vector(),
max_match_num); subject_index, output_registers, output_register_count, &zone);
} }
} }
...@@ -156,7 +163,7 @@ int32_t ExperimentalRegExp::MatchForCallFromJs( ...@@ -156,7 +163,7 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
JSRegExp regexp_obj = JSRegExp::cast(Object(regexp)); JSRegExp regexp_obj = JSRegExp::cast(Object(regexp));
return ExecRaw(regexp_obj, subject_string, output_registers, return ExecRaw(isolate, regexp_obj, subject_string, output_registers,
output_register_count, start_position); output_register_count, start_position);
} }
...@@ -170,22 +177,28 @@ MaybeHandle<Object> ExperimentalRegExp::Exec( ...@@ -170,22 +177,28 @@ MaybeHandle<Object> ExperimentalRegExp::Exec(
regexp->JSRegExpVerify(isolate); regexp->JSRegExpVerify(isolate);
#endif #endif
if (!IsCompiled(regexp, isolate)) { if (!IsCompiled(regexp, isolate) && !Compile(isolate, regexp)) {
Compile(isolate, regexp); DCHECK(isolate->has_pending_exception());
return MaybeHandle<Object>();
} }
DCHECK(IsCompiled(regexp, isolate)); DCHECK(IsCompiled(regexp, isolate));
subject = String::Flatten(isolate, subject); subject = String::Flatten(isolate, subject);
MatchRange match;
int32_t* output_registers = &match.begin;
int32_t output_register_count = sizeof(MatchRange) / sizeof(int32_t);
int capture_count = regexp->CaptureCount(); int capture_count = regexp->CaptureCount();
int output_register_count = JSRegExp::RegistersForCaptureCount(capture_count);
int32_t* output_registers;
std::unique_ptr<int32_t[]> output_registers_release;
if (output_register_count <= Isolate::kJSRegexpStaticOffsetsVectorSize) {
output_registers = isolate->jsregexp_static_offsets_vector();
} else {
output_registers = NewArray<int32_t>(output_register_count);
output_registers_release.reset(output_registers);
}
int num_matches = ExecRaw(*regexp, *subject, output_registers, int num_matches = ExecRaw(isolate, *regexp, *subject, output_registers,
output_register_count, subject_index); output_register_count, subject_index);
if (num_matches == 0) { if (num_matches == 0) {
......
...@@ -25,7 +25,8 @@ class ExperimentalRegExp final : public AllStatic { ...@@ -25,7 +25,8 @@ class ExperimentalRegExp final : public AllStatic {
Handle<String> pattern, JSRegExp::Flags flags, Handle<String> pattern, JSRegExp::Flags flags,
int capture_count); int capture_count);
static bool IsCompiled(Handle<JSRegExp> re, Isolate* isolate); static bool IsCompiled(Handle<JSRegExp> re, Isolate* isolate);
static void Compile(Isolate* isolate, Handle<JSRegExp> re); V8_WARN_UNUSED_RESULT
static bool Compile(Isolate* isolate, Handle<JSRegExp> re);
// Execution: // Execution:
static int32_t MatchForCallFromJs(Address subject, int32_t start_position, static int32_t MatchForCallFromJs(Address subject, int32_t start_position,
...@@ -38,7 +39,7 @@ class ExperimentalRegExp final : public AllStatic { ...@@ -38,7 +39,7 @@ class ExperimentalRegExp final : public AllStatic {
static MaybeHandle<Object> Exec(Isolate* isolate, Handle<JSRegExp> regexp, static MaybeHandle<Object> Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index, Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info); Handle<RegExpMatchInfo> last_match_info);
static int32_t ExecRaw(JSRegExp regexp, String subject, static int32_t ExecRaw(Isolate* isolate, JSRegExp regexp, String subject,
int32_t* output_registers, int32_t* output_registers,
int32_t output_register_count, int32_t subject_index); int32_t output_register_count, int32_t subject_index);
......
This diff is collapsed.
...@@ -74,6 +74,13 @@ class RegExp final : public AllStatic { ...@@ -74,6 +74,13 @@ class RegExp final : public AllStatic {
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
JSRegExp::Flags flags, uint32_t backtrack_limit); JSRegExp::Flags flags, uint32_t backtrack_limit);
// Ensures that a regexp is fully compiled and ready to be executed on a
// subject string. Returns true on success. Return false on failure, and
// then an exception will be pending.
V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> subject);
enum CallOrigin : int { enum CallOrigin : int {
kFromRuntime = 0, kFromRuntime = 0,
kFromJs = 1, kFromJs = 1,
...@@ -97,16 +104,6 @@ class RegExp final : public AllStatic { ...@@ -97,16 +104,6 @@ class RegExp final : public AllStatic {
RE_EXCEPTION = kInternalRegExpException, RE_EXCEPTION = kInternalRegExpException,
}; };
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject);
// Set last match info. If match is nullptr, then setting captures is // Set last match info. If match is nullptr, then setting captures is
// omitted. // omitted.
static Handle<RegExpMatchInfo> SetLastMatchInfo( static Handle<RegExpMatchInfo> SetLastMatchInfo(
...@@ -124,6 +121,14 @@ class RegExp final : public AllStatic { ...@@ -124,6 +121,14 @@ class RegExp final : public AllStatic {
RegExpNode* node); RegExpNode* node);
static const int kRegExpTooLargeToOptimize = 20 * KB; static const int kRegExpTooLargeToOptimize = 20 * KB;
V8_WARN_UNUSED_RESULT
static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> pattern,
RegExpError error);
static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
RegExpError error_text);
}; };
// Uses a special global mode of irregexp-generated code to perform a global // Uses a special global mode of irregexp-generated code to perform a global
......
...@@ -322,7 +322,7 @@ bool CompiledReplacement::Compile(Isolate* isolate, Handle<JSRegExp> regexp, ...@@ -322,7 +322,7 @@ bool CompiledReplacement::Compile(Isolate* isolate, Handle<JSRegExp> regexp,
FixedArray capture_name_map; FixedArray capture_name_map;
if (capture_count > 0) { if (capture_count > 0) {
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); DCHECK(JSRegExp::TypeSupportsCaptures(regexp->TypeTag()));
Object maybe_capture_name_map = regexp->CaptureNameMap(); Object maybe_capture_name_map = regexp->CaptureNameMap();
if (maybe_capture_name_map.IsFixedArray()) { if (maybe_capture_name_map.IsFixedArray()) {
capture_name_map = FixedArray::cast(maybe_capture_name_map); capture_name_map = FixedArray::cast(maybe_capture_name_map);
...@@ -611,13 +611,9 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString( ...@@ -611,13 +611,9 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString(
int capture_count = regexp->CaptureCount(); int capture_count = regexp->CaptureCount();
int subject_length = subject->length(); int subject_length = subject->length();
JSRegExp::Type typeTag = regexp->TypeTag(); // Ensure the RegExp is compiled so we can access the capture-name map.
if (typeTag == JSRegExp::IRREGEXP) { if (!RegExp::EnsureFullyCompiled(isolate, regexp, subject)) {
// Ensure the RegExp is compiled so we can access the capture-name map. return ReadOnlyRoots(isolate).exception();
if (RegExp::IrregexpPrepare(isolate, regexp, subject) == -1) {
DCHECK(isolate->has_pending_exception());
return ReadOnlyRoots(isolate).exception();
}
} }
// CompiledReplacement uses zone allocation. // CompiledReplacement uses zone allocation.
...@@ -627,7 +623,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString( ...@@ -627,7 +623,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString(
isolate, regexp, replacement, capture_count, subject_length); isolate, regexp, replacement, capture_count, subject_length);
// Shortcut for simple non-regexp global replacements // Shortcut for simple non-regexp global replacements
if (typeTag == JSRegExp::ATOM && simple_replace) { if (regexp->TypeTag() == JSRegExp::ATOM && simple_replace) {
if (subject->IsOneByteRepresentation() && if (subject->IsOneByteRepresentation() &&
replacement->IsOneByteRepresentation()) { replacement->IsOneByteRepresentation()) {
return StringReplaceGlobalAtomRegExpWithString<SeqOneByteString>( return StringReplaceGlobalAtomRegExpWithString<SeqOneByteString>(
...@@ -1460,8 +1456,7 @@ RUNTIME_FUNCTION(Runtime_StringReplaceNonGlobalRegExpWithFunction) { ...@@ -1460,8 +1456,7 @@ RUNTIME_FUNCTION(Runtime_StringReplaceNonGlobalRegExpWithFunction) {
bool has_named_captures = false; bool has_named_captures = false;
Handle<FixedArray> capture_map; Handle<FixedArray> capture_map;
if (m > 1) { if (m > 1) {
// The existence of capture groups implies IRREGEXP kind. DCHECK(JSRegExp::TypeSupportsCaptures(regexp->TypeTag()));
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
Object maybe_capture_map = regexp->CaptureNameMap(); Object maybe_capture_map = regexp->CaptureNameMap();
if (maybe_capture_map.IsFixedArray()) { if (maybe_capture_map.IsFixedArray()) {
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// Flags: --allow-natives-syntax // Flags: --allow-natives-syntax --no-enable-experimental-regexp-engine
const kNoBacktrackLimit = 0; // To match JSRegExp::kNoBacktrackLimit. const kNoBacktrackLimit = 0; // To match JSRegExp::kNoBacktrackLimit.
const re0 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", kNoBacktrackLimit); const re0 = %NewRegExpWithBacktrackLimit("(\\d+)+x", "", kNoBacktrackLimit);
......
...@@ -60,5 +60,16 @@ Test(/(?:asdf)/, "123asdfxyz", ["asdf"], 0); ...@@ -60,5 +60,16 @@ Test(/(?:asdf)/, "123asdfxyz", ["asdf"], 0);
Test(/(?:asdf)|123/, "xyz123asdf", ["123"], 0); Test(/(?:asdf)|123/, "xyz123asdf", ["123"], 0);
Test(/asdf(?:[0-9]|(?:xy|x)*)*/, "kkkasdf5xyx8xyyky", ["asdf5xyx8xy"], 0); Test(/asdf(?:[0-9]|(?:xy|x)*)*/, "kkkasdf5xyx8xyyky", ["asdf5xyx8xy"], 0);
// Capturing groups.
Test(/()/, "asdf", ["", ""], 0);
Test(/(123)/, "asdf123xyz", ["123", "123"], 0);
Test(/asdf(123)xyz/, "asdf123xyz", ["asdf123xyz", "123"], 0);
Test(/(123|xyz)/, "123", ["123", "123"], 0);
Test(/(123|xyz)/, "xyz", ["xyz", "xyz"], 0);
Test(/(123)|(xyz)/, "123", ["123", "123", undefined], 0);
Test(/(123)|(xyz)/, "xyz", ["xyz", undefined, "xyz"], 0);
Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
// The global flag. // The global flag.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6); Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment