Commit e2aa1a89 authored by Martin Bidlingmaier's avatar Martin Bidlingmaier Committed by Commit Bot

[regexp] Split experimental regexp code into multiple files

Bug: v8:10765
Change-Id: I49e425d861d900ab66b6f7801cddec8a7175ac03
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2385462
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: 's avatarJakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69637}
parent 7c912ffa
......@@ -3067,6 +3067,12 @@ v8_source_set("v8_base_without_compiler") {
"src/profiler/tick-sample.h",
"src/profiler/tracing-cpu-profiler.cc",
"src/profiler/tracing-cpu-profiler.h",
"src/regexp/experimental/experimental-bytecode.cc",
"src/regexp/experimental/experimental-bytecode.h",
"src/regexp/experimental/experimental-compiler.cc",
"src/regexp/experimental/experimental-compiler.h",
"src/regexp/experimental/experimental-interpreter.cc",
"src/regexp/experimental/experimental-interpreter.h",
"src/regexp/experimental/experimental.cc",
"src/regexp/experimental/experimental.h",
"src/regexp/property-sequences.cc",
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/experimental/experimental-bytecode.h"
#include <iomanip>
namespace v8 {
namespace internal {
namespace {
std::ostream& PrintAsciiOrHex(std::ostream& os, uc16 c) {
if (c < 128 && std::isprint(c)) {
os << static_cast<char>(c);
} else {
os << "0x" << std::hex << static_cast<int>(c);
}
return os;
}
} // namespace
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
switch (inst.opcode) {
case RegExpInstruction::CONSUME_RANGE: {
os << "CONSUME_RANGE [";
PrintAsciiOrHex(os, inst.payload.consume_range.min);
os << ", ";
PrintAsciiOrHex(os, inst.payload.consume_range.max);
os << "]";
break;
}
case RegExpInstruction::FORK:
os << "FORK " << inst.payload.pc;
break;
case RegExpInstruction::JMP:
os << "JMP " << inst.payload.pc;
break;
case RegExpInstruction::ACCEPT:
os << "ACCEPT";
break;
}
return os;
}
namespace {
// The maximum number of digits required to display a non-negative number < n
// in base 10.
int DigitsRequiredBelow(int n) {
DCHECK_GE(n, 0);
int result = 1;
for (int i = 10; i < n; i *= 10) {
result += 1;
}
return result;
}
} // namespace
std::ostream& operator<<(std::ostream& os,
Vector<const RegExpInstruction> insts) {
int inst_num = insts.length();
int line_digit_num = DigitsRequiredBelow(inst_num);
for (int i = 0; i != inst_num; ++i) {
const RegExpInstruction& inst = insts[i];
os << std::setfill('0') << std::setw(line_digit_num) << i << ": " << inst
<< std::endl;
}
return os;
}
} // namespace internal
} // namespace v8
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
#include <ios>
#include "src/utils/vector.h"
// ----------------------------------------------------------------------------
// Definition and semantics of the EXPERIMENTAL bytecode.
// Background:
// - Russ Cox's blog post series on regular expression matching, in particular
// https://swtch.com/~rsc/regexp/regexp2.html
// - The re2 regular regexp library: https://github.com/google/re2
//
// This comment describes the bytecode used by the experimental regexp engine
// and its abstract semantics in terms of a VM. An implementation of the
// semantics that avoids exponential runtime can be found in `NfaInterpreter`.
//
// The experimental bytecode describes a non-deterministic finite automaton. It
// runs on a multithreaded virtual machine (VM), i.e. in several threads
// concurrently. (These "threads" don't need to be actual operating system
// threads.) Apart from a list of threads, the VM maintains an immutable
// shared input string which threads can read from. Each thread is given by a
// program counter (PC, index of the current instruction), a fixed number of
// registers of indices into the input string, and a monotonically increasing
// index which represents the current position within the input string.
//
// For the precise encoding of the instruction set, see the definition `struct
// RegExpInstruction` below. Currently we support the following instructions:
// - CONSUME_RANGE: Check whether the codepoint of the current character is
// contained in a non-empty closed interval [min, max] specified in the
// instruction payload. Abort this thread if false, otherwise advance the
// input position by 1 and continue with the next instruction.
// - ACCEPT: Stop this thread and signify the end of a match at the current
// input position.
// - FORK: If executed by a thread t, spawn a new thread t0 whose register
// values and input position agree with those of t, but whose PC value is set
// to the value specified in the instruction payload. The register values of
// t and t0 agree directly after the FORK, but they can diverge. Thread t
// continues with the instruction directly after the current FORK
// instruction.
// - JMP: Instead of incrementing the PC value after execution of this
// instruction by 1, set PC of this thread to the value specified in the
// instruction payload and continue there.
//
// Special care must be exercised with respect to thread priority. It is
// possible that more than one thread executes an ACCEPT statement. The output
// of the program is given by the contents of the matching thread's registers,
// so this is ambiguous in case of multiple matches. To resolve the ambiguity,
// every implementation of the VM must output the match that a backtracking
// implementation would output (i.e. behave the same as Irregexp).
//
// A backtracking implementation of the VM maintains a stack of postponed
// threads. Upon encountering a FORK statement, this VM will create a copy of
// the current thread, set the copy's PC value according to the instruction
// payload, and push it to the stack of postponed threads. The VM will then
// continue execution of the current thread.
//
// If at some point a thread t executes a MATCH statement, the VM stops and
// outputs the registers of t. Postponed threads are discarded. On the other
// hand, if a thread t is aborted because some input character didn't pass a
// check, then the VM pops the topmost postponed thread and continues execution
// with this thread. If there are no postponed threads, then the VM outputs
// failure, i.e. no matches.
//
// Equivalently, we can describe the behavior of the backtracking VM in terms
// of priority: Threads are linearly ordered by priority, and matches generated
// by threads with high priority must be preferred over matches generated by
// threads with low priority, regardless of the chronological order in which
// matches were found. If a thread t executes a FORK statement and spawns a
// thread t0, then the priority of t0 is such that the following holds:
// * t0 < t, i.e. t0 has lower priority than t.
// * For all threads u such that u != t and u != t0, we have t0 < u iff t < u,
// i.e. the t0 compares to other threads the same as t.
// For example, if there are currently 3 threads s, t, u such that s < t < u,
// then after t executes a fork, the thread priorities will be s < t0 < t < u.
namespace v8 {
namespace internal {
// Bytecode format.
// Currently very simple fixed-size: The opcode is encoded in the first 4
// bytes, the payload takes another 4 bytes.
struct RegExpInstruction {
enum Opcode : int32_t {
CONSUME_RANGE,
FORK,
JMP,
ACCEPT,
};
struct Uc16Range {
uc16 min; // Inclusive.
uc16 max; // Inclusive.
};
static RegExpInstruction ConsumeRange(Uc16Range consume_range) {
RegExpInstruction result;
result.opcode = CONSUME_RANGE;
result.payload.consume_range = consume_range;
return result;
}
static RegExpInstruction Fork(int32_t alt_index) {
RegExpInstruction result;
result.opcode = FORK;
result.payload.pc = alt_index;
return result;
}
static RegExpInstruction Jmp(int32_t alt_index) {
RegExpInstruction result;
result.opcode = JMP;
result.payload.pc = alt_index;
return result;
}
static RegExpInstruction Accept() {
RegExpInstruction result;
result.opcode = ACCEPT;
return result;
}
Opcode opcode;
union {
// Payload of CONSUME_RANGE:
Uc16Range consume_range;
// Payload of FORK and JMP, the next/forked program counter (pc):
int32_t pc;
} payload;
STATIC_ASSERT(sizeof(payload) == 4);
};
STATIC_ASSERT(sizeof(RegExpInstruction) == 8);
// TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3
// bits, so the remaining 29/30 bits can be used as payload. Problem: The
// payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so
// this wouldn't fit. We could encode the payload of a CONSUME_RANGE
// instruction by the start of the interval and its length instead, and then
// only allows lengths that fit into 14/13 bits. A longer range can then be
// encoded as a disjunction of smaller ranges.
//
// Another thought: CONSUME_RANGEs are only valid if the payloads are such that
// min <= max. Thus there are
//
// 2^16 + 2^16 - 1 + ... + 1
// = 2^16 * (2^16 + 1) / 2
// = 2^31 + 2^15
//
// valid payloads for a CONSUME_RANGE instruction. If we want to fit
// instructions into 4 bytes, we would still have almost 2^31 instructions left
// over if we encode everything as tight as possible. For example, we could
// use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT,
// and then still have almost 2^30 instructions left over for something like
// zero-width assertions and captures.
std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst);
std::ostream& operator<<(std::ostream& os,
Vector<const RegExpInstruction> insts);
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_
This diff is collapsed.
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
#include "src/regexp/experimental/experimental-bytecode.h"
#include "src/regexp/regexp-ast.h"
#include "src/zone/zone-list.h"
namespace v8 {
namespace internal {
class ExperimentalRegExpCompiler final : public AllStatic {
public:
// Checks whether a given RegExpTree can be compiled into an experimental
// bytecode program. This mostly amounts to the absence of back references,
// but see the definition.
// TODO(mbid,v8:10765): Currently more things are not handled, e.g. some
// quantifiers and unicode.
static bool CanBeHandled(RegExpTree* tree, JSRegExp::Flags flags, Zone* zone);
// Compile regexp into a bytecode program. The regexp must be handlable by
// the experimental engine; see`CanBeHandled`. The program is returned as a
// ZoneList backed by the same Zone that is used in the RegExpTree argument.
static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
JSRegExp::Flags flags, Zone* zone);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_COMPILER_H_
This diff is collapsed.
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
#define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
#include "src/regexp/experimental/experimental-bytecode.h"
#include "src/utils/vector.h"
namespace v8 {
namespace internal {
class ExperimentalRegExpInterpreter final : public AllStatic {
public:
// A half-open range in an a string denoting a (sub)match. Used to access
// output registers of regexp execution grouped by [begin, end) pairs.
struct MatchRange {
int32_t begin; // inclusive
int32_t end; // exclusive
};
// Executes a bytecode program in breadth-first NFA mode, without
// backtracking, to find matching substrings. Trys to find up to
// `max_match_num` matches in `input`, starting at `start_index`. Returns
// the actual number of matches found. The boundaires of matching subranges
// are written to `matches_out`. Provided in variants for one-byte and
// two-byte strings.
static int FindMatchesNfaOneByte(Vector<const RegExpInstruction> bytecode,
Vector<const uint8_t> input, int start_index,
MatchRange* matches_out, int max_match_num);
static int FindMatchesNfaTwoByte(Vector<const RegExpInstruction> bytecode,
Vector<const uc16> input, int start_index,
MatchRange* matches_out, int max_match_num);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_INTERPRETER_H_
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment