Commit e83511c2 authored by Martin Bidlingmaier, committed by Commit Bot

[regexp] Support assertions in experimental engine

Assertions are implemented with the new ASSERTION instruction.  The nfa
interpreter evaluates the assertion based on the current context in the
subject string every time a thread executes ASSERTION.  This is
analogous to what re2 and rust/regex do.

Alternatives to this approach:
- The interpreter could calculate eagerly for all assertion types
  whether they are satisfied whenever the current input position is
  advanced.  This would make evaluating the ASSERTION instruction itself
  cheaper, but at the cost of making every advance in the input string
  more expensive.  I suspect this would be slower on average because
  assertions are not so common that we typically evaluate >= 2
  assertions at every input position.
- Assertions in a regexp could be desugared into CONSUME_RANGE
  instructions, so that no new instruction would be necessary.  For
  example, the word boundary assertion \b is satisfied at a given
  position/state if we have just consumed a word character and will
  consume a non-word character next, or vice-versa.  The tricky part
  about this is that the assertion itself should not consume input, so
  we'd have to split (automaton) states according to whether we've
  arrived at them via a word character or not.  The current compiler is
  not really equipped for this kind of transformation.  For {start,end}
  of {line,file} assertions, we'd need to introduce dummy characters
  indicating start/end of input (say, 0x10000 and 0x10001) which we feed
  to the interpreter before and after the actual input, respectively.
  I suspect that this approach wouldn't make much of a difference for
  NFA execution. It would likely speed up (lazy) DFA execution though
  because assertions would be dealt with in the fast path.

Cq-Include-Trybots: luci.v8.try:v8_linux64_fyi_rel_ng
Bug: v8:10765
Change-Id: Ic2012c943e0ce54eb8662789fb3d4c1b6cd8d606
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2398644
Commit-Queue: Martin Bidlingmaier <mbid@google.com>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70026}
parent c4fd16e9
...@@ -32,6 +32,29 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) { ...@@ -32,6 +32,29 @@ std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst) {
os << "]"; os << "]";
break; break;
} }
case RegExpInstruction::ASSERTION:
os << "ASSERTION ";
switch (inst.payload.assertion_type) {
case RegExpAssertion::START_OF_INPUT:
os << "START_OF_INPUT";
break;
case RegExpAssertion::END_OF_INPUT:
os << "END_OF_INPUT";
break;
case RegExpAssertion::START_OF_LINE:
os << "START_OF_LINE";
break;
case RegExpAssertion::END_OF_LINE:
os << "END_OF_LINE";
break;
case RegExpAssertion::BOUNDARY:
os << "BOUNDARY";
break;
case RegExpAssertion::NON_BOUNDARY:
os << "NON_BOUNDARY";
break;
}
break;
case RegExpInstruction::FORK: case RegExpInstruction::FORK:
os << "FORK " << inst.payload.pc; os << "FORK " << inst.payload.pc;
break; break;
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <ios> #include <ios>
#include "src/regexp/regexp-ast.h"
#include "src/utils/vector.h" #include "src/utils/vector.h"
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
...@@ -91,12 +92,13 @@ namespace internal { ...@@ -91,12 +92,13 @@ namespace internal {
// bytes, the payload takes another 4 bytes. // bytes, the payload takes another 4 bytes.
struct RegExpInstruction { struct RegExpInstruction {
enum Opcode : int32_t { enum Opcode : int32_t {
ACCEPT,
ASSERTION,
CLEAR_REGISTER,
CONSUME_RANGE, CONSUME_RANGE,
FORK, FORK,
JMP, JMP,
ACCEPT,
SET_REGISTER_TO_CP, SET_REGISTER_TO_CP,
CLEAR_REGISTER,
}; };
struct Uc16Range { struct Uc16Range {
...@@ -145,6 +147,13 @@ struct RegExpInstruction { ...@@ -145,6 +147,13 @@ struct RegExpInstruction {
return result; return result;
} }
// Factory for an ASSERTION instruction. The opcode is set to ASSERTION and
// the assertion type `t` is stored in the payload's `assertion_type` member.
static RegExpInstruction Assertion(RegExpAssertion::AssertionType t) {
RegExpInstruction result;
result.opcode = ASSERTION;
result.payload.assertion_type = t;
return result;
}
Opcode opcode; Opcode opcode;
union { union {
// Payload of CONSUME_RANGE: // Payload of CONSUME_RANGE:
...@@ -153,6 +162,8 @@ struct RegExpInstruction { ...@@ -153,6 +162,8 @@ struct RegExpInstruction {
int32_t pc; int32_t pc;
// Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER: // Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER:
int32_t register_index; int32_t register_index;
// Payload of ASSERTION:
RegExpAssertion::AssertionType assertion_type;
} payload; } payload;
STATIC_ASSERT(sizeof(payload) == 4); STATIC_ASSERT(sizeof(payload) == 4);
}; };
......
...@@ -74,9 +74,12 @@ class CanBeHandledVisitor final : private RegExpVisitor { ...@@ -74,9 +74,12 @@ class CanBeHandledVisitor final : private RegExpVisitor {
} }
void* VisitAssertion(RegExpAssertion* node, void*) override { void* VisitAssertion(RegExpAssertion* node, void*) override {
// TODO(mbid, v8:10765): We should be able to support at least some // TODO(mbid,v8:10765): Once regexps that we shouldn't try to match at
// assertions. re2 does, too. // every input position (e.g. sticky) are supported, we should also support
result_ = false; // START_OF_INPUT.
result_ = result_ &&
node->assertion_type() != RegExpAssertion::START_OF_INPUT &&
AreSuitableFlags(node->flags());
return nullptr; return nullptr;
} }
...@@ -357,8 +360,8 @@ class CompileVisitor : private RegExpVisitor { ...@@ -357,8 +360,8 @@ class CompileVisitor : private RegExpVisitor {
} }
void* VisitAssertion(RegExpAssertion* node, void*) override { void* VisitAssertion(RegExpAssertion* node, void*) override {
// TODO(mbid,v8:10765): Support this case. code_.Add(RegExpInstruction::Assertion(node->assertion_type()), zone_);
UNREACHABLE(); return nullptr;
} }
void* VisitCharacterClass(RegExpCharacterClass* node, void*) override { void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "src/base/optional.h" #include "src/base/optional.h"
#include "src/regexp/experimental/experimental.h" #include "src/regexp/experimental/experimental.h"
#include "src/strings/char-predicates-inl.h"
#include "src/zone/zone-allocator.h" #include "src/zone/zone-allocator.h"
#include "src/zone/zone-list-inl.h" #include "src/zone/zone-list-inl.h"
...@@ -16,6 +17,39 @@ namespace { ...@@ -16,6 +17,39 @@ namespace {
constexpr int kUndefinedRegisterValue = -1; constexpr int kUndefinedRegisterValue = -1;
// Returns whether the assertion `type` holds at index `position` of `context`.
//
// `position` denotes a gap between characters: the character before it is
// `context[position - 1]` and the character after it is `context[position]`.
// It must lie in the range [0, context.length()], which the DCHECKs verify.
// No input is consumed; only the characters adjacent to `position` are read.
template <class Character>
bool SatisfiesAssertion(RegExpAssertion::AssertionType type,
Vector<const Character> context, int position) {
DCHECK_LE(position, context.length());
DCHECK_GE(position, 0);
switch (type) {
case RegExpAssertion::START_OF_INPUT:
return position == 0;
case RegExpAssertion::END_OF_INPUT:
return position == context.length();
case RegExpAssertion::START_OF_LINE:
// Holds at the very start, or directly after a line terminator.
if (position == 0) return true;
return unibrow::IsLineTerminator(context[position - 1]);
case RegExpAssertion::END_OF_LINE:
// Holds at the very end, or directly before a line terminator.
if (position == context.length()) return true;
return unibrow::IsLineTerminator(context[position]);
case RegExpAssertion::BOUNDARY:
// Empty input has no word character, hence no boundary anywhere.
if (context.length() == 0) {
return false;
} else if (position == 0) {
// No character precedes the position: a boundary exists iff the first
// character is a word character.
return IsRegExpWord(context[position]);
} else if (position == context.length()) {
// No character follows the position: a boundary exists iff the last
// character is a word character.
return IsRegExpWord(context[position - 1]);
} else {
// Interior position: a boundary exists iff exactly one of the two
// adjacent characters is a word character.
return IsRegExpWord(context[position - 1]) !=
IsRegExpWord(context[position]);
}
case RegExpAssertion::NON_BOUNDARY:
// Defined as the exact negation of BOUNDARY.
return !SatisfiesAssertion(RegExpAssertion::BOUNDARY, context, position);
}
}
template <class Character> template <class Character>
class NfaInterpreter { class NfaInterpreter {
// Executes a bytecode program in breadth-first mode, without backtracking. // Executes a bytecode program in breadth-first mode, without backtracking.
...@@ -239,6 +273,14 @@ class NfaInterpreter { ...@@ -239,6 +273,14 @@ class NfaInterpreter {
blocked_threads_.Add(t, zone_); blocked_threads_.Add(t, zone_);
return; return;
} }
case RegExpInstruction::ASSERTION:
if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
input_index_)) {
DestroyThread(t);
return;
}
++t.pc;
break;
case RegExpInstruction::FORK: { case RegExpInstruction::FORK: {
InterpreterThread fork{inst.payload.pc, InterpreterThread fork{inst.payload.pc,
NewRegisterArrayUninitialized()}; NewRegisterArrayUninitialized()};
......
...@@ -69,14 +69,8 @@ inline constexpr uc32 ToAsciiLower(uc32 c) { ...@@ -69,14 +69,8 @@ inline constexpr uc32 ToAsciiLower(uc32 c) {
return c | (IsAsciiUpper(c) << 5); return c | (IsAsciiUpper(c) << 5);
} }
inline constexpr bool IsRegExpWord(uc16 c) { inline constexpr bool IsRegExpWord(uc32 c) {
return base::IsInRange(AsciiAlphaToLower(c), 'a', 'z') || IsDecimalDigit(c) || return IsAlphaNumeric(c) || c == '_';
(c == '_');
}
inline constexpr bool IsRegExpNewline(uc16 c) {
// CR LF LS PS
return c != 0x000A && c != 0x000D && c != 0x2028 && c != 0x2029;
} }
// Constexpr cache table for character flags. // Constexpr cache table for character flags.
......
...@@ -24,7 +24,6 @@ inline constexpr bool IsHexDigit(uc32 c); ...@@ -24,7 +24,6 @@ inline constexpr bool IsHexDigit(uc32 c);
inline constexpr bool IsOctalDigit(uc32 c); inline constexpr bool IsOctalDigit(uc32 c);
inline constexpr bool IsBinaryDigit(uc32 c); inline constexpr bool IsBinaryDigit(uc32 c);
inline constexpr bool IsRegExpWord(uc32 c); inline constexpr bool IsRegExpWord(uc32 c);
inline constexpr bool IsRegExpNewline(uc32 c);
inline constexpr bool IsAsciiLower(uc32 ch); inline constexpr bool IsAsciiLower(uc32 ch);
inline constexpr bool IsAsciiUpper(uc32 ch); inline constexpr bool IsAsciiUpper(uc32 ch);
......
...@@ -477,35 +477,25 @@ TEST(Errors) { ...@@ -477,35 +477,25 @@ TEST(Errors) {
ExpectError("\\ka", kInvalidNamedReference, true); ExpectError("\\ka", kInvalidNamedReference, true);
} }
static bool IsDigit(uc32 c) { return ('0' <= c && c <= '9'); }
static bool IsDigit(uc16 c) { static bool NotDigit(uc32 c) { return !IsDigit(c); }
return ('0' <= c && c <= '9');
}
static bool NotDigit(uc16 c) {
return !IsDigit(c);
}
static bool IsWhiteSpaceOrLineTerminator(uc16 c) { static bool IsWhiteSpaceOrLineTerminator(uc32 c) {
// According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
// WhiteSpace (7.2) and LineTerminator (7.3) values. // WhiteSpace (7.2) and LineTerminator (7.3) values.
return v8::internal::IsWhiteSpaceOrLineTerminator(c); return v8::internal::IsWhiteSpaceOrLineTerminator(c);
} }
static bool NotWhiteSpaceNorLineTermiantor(uc32 c) {
static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
return !IsWhiteSpaceOrLineTerminator(c); return !IsWhiteSpaceOrLineTerminator(c);
} }
static bool NotWord(uc32 c) { return !IsRegExpWord(c); }
static bool NotWord(uc16 c) { static bool NotLineTerminator(uc32 c) { return !unibrow::IsLineTerminator(c); }
return !IsRegExpWord(c);
}
static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { static void TestCharacterClassEscapes(uc32 c, bool(pred)(uc32 c)) {
Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME); Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME);
ZoneList<CharacterRange>* ranges = ZoneList<CharacterRange>* ranges =
zone.New<ZoneList<CharacterRange>>(2, &zone); zone.New<ZoneList<CharacterRange>>(2, &zone);
...@@ -520,9 +510,8 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { ...@@ -520,9 +510,8 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
} }
} }
TEST(CharacterClassEscapes) { TEST(CharacterClassEscapes) {
TestCharacterClassEscapes('.', IsRegExpNewline); TestCharacterClassEscapes('.', NotLineTerminator);
TestCharacterClassEscapes('d', IsDigit); TestCharacterClassEscapes('d', IsDigit);
TestCharacterClassEscapes('D', NotDigit); TestCharacterClassEscapes('D', NotDigit);
TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator); TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
......
...@@ -7,7 +7,11 @@ ...@@ -7,7 +7,11 @@
function Test(regexp, subject, expectedResult, expectedLastIndex) { function Test(regexp, subject, expectedResult, expectedLastIndex) {
assertEquals(%RegexpTypeTag(regexp), "EXPERIMENTAL"); assertEquals(%RegexpTypeTag(regexp), "EXPERIMENTAL");
var result = regexp.exec(subject); var result = regexp.exec(subject);
assertArrayEquals(expectedResult, result); if (result instanceof Array && expectedResult instanceof Array) {
assertArrayEquals(expectedResult, result);
} else {
assertEquals(expectedResult, result);
}
assertEquals(expectedLastIndex, regexp.lastIndex); assertEquals(expectedLastIndex, regexp.lastIndex);
} }
...@@ -71,5 +75,16 @@ Test(/(123)|(xyz)/, "xyz", ["xyz", undefined, "xyz"], 0); ...@@ -71,5 +75,16 @@ Test(/(123)|(xyz)/, "xyz", ["xyz", undefined, "xyz"], 0);
Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0); Test(/(?:(123)|(xyz))*/, "xyz123", ["xyz123", "123", undefined], 0);
Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0); Test(/((123)|(xyz)*)*/, "xyz123xyz", ["xyz123xyz", "xyz", undefined, "xyz"], 0);
// Assertions.
// TODO(mbid,v8:10765): Once supported, we should also check ^ and $ with the
// multiline flag.
Test(/asdf\b/, "asdf---", ["asdf"], 0);
Test(/asdf\b/, "asdfg", null, 0);
Test(/asd[fg]\B/, "asdf asdgg", ["asdg"], 0);
// TODO(mbid,v8:10765): The ^ assertion should work once we support anchored
// regexps.
//Test(/^asd[fg]/, "asdf asdgg", ["asdf"], 0);
Test(/asd[fg]$/, "asdf asdg", ["asdg"], 0);
// The global flag. // The global flag.
Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6); Test(/asdf/g, "fjasdfkkasdf", ["asdf"], 6);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment