Commit e83511c2, authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support assertions in experimental engine

Assertions are implemented with the new ASSERTION instruction.  The nfa
interpreter evaluates the assertion based on the current context in the
subject string every time a thread executes ASSERTION.  This is
analogous to what re2 and rust/regex do.

Alternatives to this approach:
- The interpreter could calculate eagerly for all assertion types
  whether they are satisfied whenever the current input position is
  advanced.  This would make evaluating the ASSERTION instruction itself
  cheaper, but at the cost of making every advance in the input string
  more expensive.  I suspect this would be slower on average because
  assertions are not so common that we typically evaluate >= 2
  assertions at every input position.
- Assertions in a regexp could be desugared into CONSUME_RANGE
  instructions, so that no new instruction would be necessary.  For
  example, the word boundary assertion \b is satisfied at a given
  position/state if we have just consumed a word character and will
  consume a non-word character next, or vice-versa.  The tricky part
  about this is that the assertion itself should not consume input, so
  we'd have to split (automaton) states according to whether we've
  arrived at them via a word character or not.  The current compiler is
  not really equipped for this kind of transformation.  For {start,end}
  of {line,file} assertions, we'd need to introduce dummy characters
  indicating start/end of input (say, 0x10000 and 0x10001), which we feed
  to the interpreter before and after the actual input, respectively.
  I suspect that this approach wouldn't make much of a difference for
  NFA execution. It would likely speed up (lazy) DFA execution though
  because assertions would be dealt with in the fast path.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: Ic2012c943e0ce54eb8662789fb3d4c1b6cd8d606
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2398644
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70026}
parent c4fd16e9
......@@ -32,6 +32,29 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
os << "]";
break;
}
case RegExpInstruction::ASSERTION:
os << "ASSERTION ";
switch (inst.payload.assertion_type) {
case RegExpAssertion::START_OF_INPUT:
os << "START_OF_INPUT";
break;
case RegExpAssertion::END_OF_INPUT:
os << "END_OF_INPUT";
break;
case RegExpAssertion::START_OF_LINE:
os << "START_OF_LINE";
break;
case RegExpAssertion::END_OF_LINE:
os << "END_OF_LINE";
break;
case RegExpAssertion::BOUNDARY:
os << "BOUNDARY";
break;
case RegExpAssertion::NON_BOUNDARY:
os << "NON_BOUNDARY";
break;
}
break;
case RegExpInstruction::FORK:
os << "FORK " << inst.payload.pc;
break;
......
......@@ -7,6 +7,7 @@
#include <ios>
#include "src/regexp/regexp-ast.h"
#include "src/utils/vector.h"
// ----------------------------------------------------------------------------
......@@ -91,12 +92,13 @@ namespace internal {
// bytes, the payload takes another 4 bytes.
struct RegExpInstruction {
// Instruction kinds of the experimental regexp bytecode, kept in alphabetical
// order.  Every enumerator is listed exactly once; repeating a name (as in
// the previously unsorted tail of this list) is a redefinition error.
enum Opcode : int32_t {
  ACCEPT,
  ASSERTION,
  CLEAR_REGISTER,
  CONSUME_RANGE,
  FORK,
  JMP,
  SET_REGISTER_TO_CP,
};
struct Uc16Range {
......@@ -145,6 +147,13 @@ struct RegExpInstruction {
return result;
}
// Builds an ASSERTION instruction whose payload records which kind of
// assertion (`t`) is to be checked.
static RegExpInstruction Assertion(RegExpAssertion::AssertionType t) {
  RegExpInstruction inst;
  inst.payload.assertion_type = t;
  inst.opcode = ASSERTION;
  return inst;
}
Opcode opcode;
union {
// Payload of CONSUME_RANGE:
......@@ -153,6 +162,8 @@ struct RegExpInstruction {
int32_t pc;
// Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER:
int32_t register_index;
// Payload of ASSERTION:
RegExpAssertion::AssertionType assertion_type;
} payload;
STATIC_ASSERT(sizeof(payload) == 4);
};
......
......@@ -74,9 +74,12 @@ class CanBeHandledVisitor final : private RegExpVisitor {
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
  // An assertion keeps the regexp eligible only if it is not START_OF_INPUT
  // and its flags are suitable.  The result is only ever narrowed here: once
  // result_ is false it stays false.
  // TODO(mbid,v8:10765): Once regexps that we shouldn't try to match at
  // every input position (e.g. sticky) are supported, we should also support
  // START_OF_INPUT.
  result_ = result_ &&
            node->assertion_type() != RegExpAssertion::START_OF_INPUT &&
            AreSuitableFlags(node->flags());
  return nullptr;
}
......@@ -357,8 +360,8 @@ class CompileVisitor : private RegExpVisitor {
}
void* VisitAssertion(RegExpAssertion* node, void*) override {
  // Emit one ASSERTION instruction carrying the node's assertion type.
  code_.Add(RegExpInstruction::Assertion(node->assertion_type()), zone_);
  return nullptr;
}
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
......
......@@ -6,6 +6,7 @@
#include "src/base/optional.h"
#include "src/regexp/experimental/experimental.h"
#include "src/strings/char-predicates-inl.h"
#include "src/zone/zone-allocator.h"
#include "src/zone/zone-list-inl.h"
......@@ -16,6 +17,39 @@ namespace {
constexpr int kUndefinedRegisterValue = -1;
// Decides whether the zero-width assertion `type` holds at index `position`
// of `context`.  `position` must lie in [0, context.length()]; the value
// context.length() denotes the position just past the last character.
template <class Character>
bool SatisfiesAssertion(RegExpAssertion::AssertionType type,
                        Vector<const Character> context, int position) {
  DCHECK_LE(position, context.length());
  DCHECK_GE(position, 0);

  const bool at_start = position == 0;
  const bool at_end = position == context.length();

  switch (type) {
    case RegExpAssertion::START_OF_INPUT:
      return at_start;
    case RegExpAssertion::END_OF_INPUT:
      return at_end;
    case RegExpAssertion::START_OF_LINE:
      // Either the very beginning, or right after a line terminator.
      return at_start || unibrow::IsLineTerminator(context[position - 1]);
    case RegExpAssertion::END_OF_LINE:
      // Either the very end, or right before a line terminator.
      return at_end || unibrow::IsLineTerminator(context[position]);
    case RegExpAssertion::BOUNDARY:
    case RegExpAssertion::NON_BOUNDARY: {
      // A missing neighbour (before the first or after the last character)
      // counts as a non-word character, so an empty context never contains a
      // boundary.
      const bool word_before =
          !at_start && IsRegExpWord(context[position - 1]);
      const bool word_after = !at_end && IsRegExpWord(context[position]);
      const bool is_boundary = word_before != word_after;
      return is_boundary == (type == RegExpAssertion::BOUNDARY);
    }
  }
}
template <class Character>
class NfaInterpreter {
// Executes a bytecode program in breadth-first mode, without backtracking.
......@@ -239,6 +273,14 @@ class NfaInterpreter {
blocked_threads_.Add(t, zone_);
return;
}
case RegExpInstruction::ASSERTION:
if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
input_index_)) {
DestroyThread(t);
return;
}
++t.pc;
break;
case RegExpInstruction::FORK: {
InterpreterThread fork{inst.payload.pc,
NewRegisterArrayUninitialized()};
......
......@@ -69,14 +69,8 @@ inline constexpr uc32 ToAsciiLower(uc32 c) {
return c | (IsAsciiUpper(c) << 5);
}
inline constexpr bool IsRegExpWord(uc16 c) {
return base::IsInRange(AsciiAlphaToLower(c), 'a', 'z') || IsDecimalDigit(c) ||
(c == '_');
}
inline constexpr bool IsRegExpNewline(uc16 c) {
// CR LF LS PS
return c != 0x000A && c != 0x000D && c != 0x2028 && c != 0x2029;
inline constexpr bool IsRegExpWord(uc32 c) {
return IsAlphaNumeric(c) || c == '_';
}
// Constexpr cache table for character flags.
......
......@@ -24,7 +24,6 @@ inline constexpr bool IsHexDigit(uc32 c);
inline constexpr bool IsOctalDigit(uc32 c);
inline constexpr bool IsBinaryDigit(uc32 c);
inline constexpr bool IsRegExpWord(uc32 c);
inline constexpr bool IsRegExpNewline(uc32 c);
inline constexpr bool IsAsciiLower(uc32 ch);
inline constexpr bool IsAsciiUpper(uc32 ch);
......
......@@ -477,35 +477,25 @@ TEST(Errors) {
ExpectError("\\ka", kInvalidNamedReference, true);
}
// True iff `c` is an ASCII decimal digit ('0' through '9').
static bool IsDigit(uc32 c) { return ('0' <= c && c <= '9'); }

// Complement of IsDigit.
static bool NotDigit(uc32 c) { return !IsDigit(c); }
static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
static bool IsWhiteSpaceOrLineTerminator(uc32 c) {
// According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
// WhiteSpace (7.2) and LineTerminator (7.3) values.
return v8::internal::IsWhiteSpaceOrLineTerminator(c);
}
static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
static bool NotWhiteSpaceNorLineTermiantor(uc32 c) {
return !IsWhiteSpaceOrLineTerminator(c);
}
// Complement of IsRegExpWord.
static bool NotWord(uc32 c) { return !IsRegExpWord(c); }

// Complement of unibrow::IsLineTerminator.
static bool NotLineTerminator(uc32 c) { return !unibrow::IsLineTerminator(c); }
static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
static void TestCharacterClassEscapes(uc32 c, bool(pred)(uc32 c)) {
Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME);
ZoneList<CharacterRange>* ranges =
zone.New<ZoneList<CharacterRange>>(2, &zone);
......@@ -520,9 +510,8 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
}
}
TEST(CharacterClassEscapes) {
TestCharacterClassEscapes('.', IsRegExpNewline);
TestCharacterClassEscapes('.', NotLineTerminator);
TestCharacterClassEscapes('d', IsDigit);
TestCharacterClassEscapes('D', NotDigit);
TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
......
......@@ -7,7 +7,11 @@
function Test(regexp, subject, expectedResult, expectedLastIndex) {
assertEquals(%RegexpTypeTag(regexp), "EXPERIMENTAL");
var result = regexp.exec(subject);
assertArrayEquals(expectedResult, result);
if (result instanceof Array && expectedResult instanceof Array) {
assertArrayEquals(expectedResult, result);
} else {
assertEquals(expectedResult, result);
}
assertEquals(expectedLastIndex, regexp.lastIndex);
}
......@@ -71,5 +75,16 @@ Test(/(123)|(xyz)/, "xyz", ["xyz", undefined, "xyz"], 0);
Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
// Assertions.
// Each call passes (regexp, subject, expected exec() result, expected
// lastIndex); see Test() above.
// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the
// multiline flag.
// \b holds between the word character 'f' and the non-word character '-'.
Test(/asdf\b/, "asdf---", ["asdf"], 0);
// 'f' and 'g' are both word characters, so \b fails and nothing matches.
Test(/asdf\b/, "asdfg", null, 0);
// "asdf" is followed by a space (a boundary), so \B rejects it; "asdg" is
// followed by 'g' (no boundary) and is the match.
Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0);
// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored
// regexps.
//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
// Only the second candidate, "asdg", ends at the end of the input.
Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0);
// The global flag.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment