Commit 7d849870 authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Reorganize and deduplicate in the regexp parser

The parser is organized in a somewhat tricky way s.t. it can be
hard to map the implementation back to the specified grammar.

In particular, the logic for CharacterClassEscape, ClassEscape,
and CharacterEscape was implemented twice - once inside a character
class, once outside.

This CL refactors related logic to have only a single implementation.

As a drive-by, fix one inconsistency related to \k inside
a character class.

Fixed: v8:10602
Change-Id: I5858840159694fa6f8d1aa857027db80754e3dfd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3178966
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77114}
parent d6fb96ae
...@@ -23,6 +23,13 @@ namespace internal { ...@@ -23,6 +23,13 @@ namespace internal {
namespace { namespace {
// Whether we're currently inside the ClassEscape production
// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
//
// Passed to ParseCharacterEscape so that the single shared implementation can
// distinguish the two contexts it is called from.
enum class InClassEscapeState {
// Parsing an escape inside a character class (passed by ParseClassEscape).
// In this state, without /u, `\c` additionally accepts digits and '_' as
// control letters (Annex B ClassControlLetter).
kInClass,
// Parsing an escape outside of a character class (passed by
// ParseDisjunction).
kNotInClass,
};
// A BufferedZoneList is an automatically growing list, just like (and backed // A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing // by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list, // a single element. The last element added is stored outside the backing list,
...@@ -255,10 +262,6 @@ class RegExpParserImpl final { ...@@ -255,10 +262,6 @@ class RegExpParserImpl final {
// out parameters. // out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out); bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handle specially.
base::uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number, // Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is. // and sets the value if it is.
bool ParseHexEscape(int length, base::uc32* value); bool ParseHexEscape(int length, base::uc32* value);
...@@ -286,8 +289,14 @@ class RegExpParserImpl final { ...@@ -286,8 +289,14 @@ class RegExpParserImpl final {
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, base::uc32* char_out, bool add_unicode_case_equivalents, base::uc32* char_out,
bool* is_class_escape); bool* is_class_escape);
// Returns true iff parsing was successful.
char ParseClassEscape(); bool TryParseCharacterClassEscape(base::uc32 next,
ZoneList<CharacterRange>* ranges,
Zone* zone,
bool add_unicode_case_equivalents);
// Parses and returns a single escaped character.
base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
bool* is_escaped_unicode_character);
RegExpTree* ReportError(RegExpError error); RegExpTree* ReportError(RegExpError error);
void Advance(); void Advance();
...@@ -687,62 +696,19 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { ...@@ -687,62 +696,19 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
switch (Next()) { switch (Next()) {
case kEndMarker: case kEndMarker:
return ReportError(RegExpError::kEscapeAtEndOfPattern); return ReportError(RegExpError::kEscapeAtEndOfPattern);
case 'b':
Advance(2);
builder->AddAssertion(zone()->template New<RegExpAssertion>(
RegExpAssertion::BOUNDARY));
continue;
case 'B':
Advance(2);
builder->AddAssertion(zone()->template New<RegExpAssertion>(
RegExpAssertion::NON_BOUNDARY));
continue;
// AtomEscape :: // AtomEscape ::
// CharacterClassEscape // [+UnicodeMode] DecimalEscape
// [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
// of DecimalEscape is ≤ NcapturingParens
// CharacterEscape (some cases of this mixed in too)
// //
// CharacterClassEscape :: one of // TODO(jgruber): It may make sense to disentangle all the different
// d D s S w W // cases and make the structure mirror the spec, e.g. for AtomEscape:
case 'd': //
case 'D': // if (TryParseDecimalEscape(...)) return;
case 's': // if (TryParseCharacterClassEscape(...)) return;
case 'S': // if (TryParseCharacterEscape(...)) return;
case 'w': // if (TryParseGroupName(...)) return;
case 'W': {
base::uc32 c = Next();
Advance(2);
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
CharacterRange::AddClassEscape(
c, ranges, unicode() && builder->ignore_case(), zone());
RegExpCharacterClass* cc =
zone()->template New<RegExpCharacterClass>(zone(), ranges);
builder->AddCharacterClass(cc);
break;
}
case 'p':
case 'P': {
base::uc32 p = Next();
Advance(2);
if (unicode()) {
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
ZoneVector<char> name_1(zone());
ZoneVector<char> name_2(zone());
if (ParsePropertyClassName(&name_1, &name_2)) {
if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
RegExpCharacterClass* cc =
zone()->template New<RegExpCharacterClass>(zone(),
ranges);
builder->AddCharacterClass(cc);
break;
}
}
return ReportError(RegExpError::kInvalidPropertyName);
} else {
builder->AddCharacter(p);
}
break;
}
case '1': case '1':
case '2': case '2':
case '3': case '3':
...@@ -753,7 +719,8 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { ...@@ -753,7 +719,8 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
case '8': case '8':
case '9': { case '9': {
int index = 0; int index = 0;
bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); const bool is_backref =
ParseBackReferenceIndex(&index CHECK_FAILED);
if (is_backref) { if (is_backref) {
if (state->IsInsideCaptureGroup(index)) { if (state->IsInsideCaptureGroup(index)) {
// The back reference is inside the capture group it refers to. // The back reference is inside the capture group it refers to.
...@@ -793,76 +760,48 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { ...@@ -793,76 +760,48 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
builder->AddCharacter(octal); builder->AddCharacter(octal);
break; break;
} }
// ControlEscape :: one of case 'b':
// f n r t v
case 'f':
Advance(2);
builder->AddCharacter('\f');
break;
case 'n':
Advance(2);
builder->AddCharacter('\n');
break;
case 'r':
Advance(2);
builder->AddCharacter('\r');
break;
case 't':
Advance(2);
builder->AddCharacter('\t');
break;
case 'v':
Advance(2);
builder->AddCharacter('\v');
break;
case 'c': {
Advance();
base::uc32 controlLetter = Next();
// Special case if it is an ASCII letter.
// Convert lower case letters to uppercase.
base::uc32 letter = controlLetter & ~('a' ^ 'A');
if (letter < 'A' || 'Z' < letter) {
// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
// Read the backslash as a literal character instead of as
// starting an escape.
// ES#prod-annexB-ExtendedPatternCharacter
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(RegExpError::kInvalidUnicodeEscape);
}
builder->AddCharacter('\\');
} else {
Advance(2); Advance(2);
builder->AddCharacter(controlLetter & 0x1F); builder->AddAssertion(zone()->template New<RegExpAssertion>(
} RegExpAssertion::BOUNDARY));
break; continue;
} case 'B':
case 'x': {
Advance(2); Advance(2);
base::uc32 value; builder->AddAssertion(zone()->template New<RegExpAssertion>(
if (ParseHexEscape(2, &value)) { RegExpAssertion::NON_BOUNDARY));
builder->AddCharacter(value); continue;
} else if (!unicode()) { // AtomEscape ::
builder->AddCharacter('x'); // CharacterClassEscape
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
case 'p':
case 'P': {
base::uc32 next = Next();
ZoneList<CharacterRange>* ranges =
zone()->template New<ZoneList<CharacterRange>>(2, zone());
bool add_unicode_case_equivalents =
unicode() && builder->ignore_case();
bool parsed_character_class_escape = TryParseCharacterClassEscape(
next, ranges, zone(),
add_unicode_case_equivalents CHECK_FAILED);
if (parsed_character_class_escape) {
RegExpCharacterClass* cc =
zone()->template New<RegExpCharacterClass>(zone(), ranges);
builder->AddCharacterClass(cc);
} else { } else {
// With /u, invalid escapes are not treated as identity escapes. CHECK(!unicode());
return ReportError(RegExpError::kInvalidEscape);
}
break;
}
case 'u': {
Advance(2); Advance(2);
base::uc32 value; builder->AddCharacter(next); // IdentityEscape.
if (ParseUnicodeEscape(&value)) {
builder->AddEscapedUnicodeCharacter(value);
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(RegExpError::kInvalidUnicodeEscape);
} }
break; break;
} }
// AtomEscape ::
// k GroupName
case 'k': case 'k':
// Either an identity escape or a named back-reference. The two // Either an identity escape or a named back-reference. The two
// interpretations are mutually exclusive: '\k' is interpreted as // interpretations are mutually exclusive: '\k' is interpreted as
...@@ -875,18 +814,21 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { ...@@ -875,18 +814,21 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
break; break;
} }
V8_FALLTHROUGH; V8_FALLTHROUGH;
default: // AtomEscape ::
Advance(); // CharacterEscape
// With /u, no identity escapes except for syntax characters default: {
// are allowed. Otherwise, all identity escapes are allowed. bool is_escaped_unicode_character = false;
if (!unicode() || IsSyntaxCharacterOrSlash(current())) { base::uc32 c = ParseCharacterEscape(
builder->AddCharacter(current()); InClassEscapeState::kNotInClass,
Advance(); &is_escaped_unicode_character CHECK_FAILED);
if (is_escaped_unicode_character) {
builder->AddEscapedUnicodeCharacter(c);
} else { } else {
return ReportError(RegExpError::kInvalidEscape); builder->AddCharacter(c);
} }
break; break;
} }
}
break; break;
case '{': { case '{': {
int dummy; int dummy;
...@@ -1044,6 +986,9 @@ static bool IsSpecialClassEscape(base::uc32 c) { ...@@ -1044,6 +986,9 @@ static bool IsSpecialClassEscape(base::uc32 c) {
// is called when needed. It can see the difference between capturing and // is called when needed. It can see the difference between capturing and
// noncapturing parentheses and can skip character classes and backslash-escaped // noncapturing parentheses and can skip character classes and backslash-escaped
// characters. // characters.
//
// Important: The scanner has to be in a consistent state when calling
// ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
template <class CharT> template <class CharT>
void RegExpParserImpl<CharT>::ScanForCaptures() { void RegExpParserImpl<CharT>::ScanForCaptures() {
DCHECK(!is_scanned_for_captures_); DCHECK(!is_scanned_for_captures_);
...@@ -1295,14 +1240,14 @@ template <class CharT> ...@@ -1295,14 +1240,14 @@ template <class CharT>
RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
// The index for the capture groups are one-based. Its index in the list is // The index for the capture groups are one-based. Its index in the list is
// zero-based. // zero-based.
int know_captures = const int known_captures =
is_scanned_for_captures_ ? capture_count_ : captures_started_; is_scanned_for_captures_ ? capture_count_ : captures_started_;
DCHECK(index <= know_captures); DCHECK(index <= known_captures);
if (captures_ == nullptr) { if (captures_ == nullptr) {
captures_ = captures_ =
zone()->template New<ZoneList<RegExpCapture*>>(know_captures, zone()); zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
} }
while (captures_->length() < know_captures) { while (captures_->length() < known_captures) {
captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
zone()); zone());
} }
...@@ -1768,15 +1713,19 @@ bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, ...@@ -1768,15 +1713,19 @@ bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
return true; return true;
} }
// https://tc39.es/ecma262/#prod-CharacterEscape
template <class CharT> template <class CharT>
base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
InClassEscapeState in_class_escape_state,
bool* is_escaped_unicode_character) {
DCHECK_EQ('\\', current()); DCHECK_EQ('\\', current());
DCHECK(has_next() && !IsSpecialClassEscape(Next())); DCHECK(has_next() && !IsSpecialClassEscape(Next()));
Advance(); Advance();
switch (current()) {
case 'b': const base::uc32 c = current();
Advance(); switch (c) {
return '\b'; // CharacterEscape ::
// ControlEscape :: one of // ControlEscape :: one of
// f n r t v // f n r t v
case 'f': case 'f':
...@@ -1794,12 +1743,11 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { ...@@ -1794,12 +1743,11 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
case 'v': case 'v':
Advance(); Advance();
return '\v'; return '\v';
// CharacterEscape ::
// c ControlLetter
case 'c': { case 'c': {
base::uc32 controlLetter = Next(); base::uc32 controlLetter = Next();
base::uc32 letter = controlLetter & ~('A' ^ 'a'); base::uc32 letter = controlLetter & ~('A' ^ 'a');
// Inside a character class, we also accept digits and underscore as
// control characters, unless with /u. See Annex B:
// ES#prod-annexB-ClassControlLetter
if (letter >= 'A' && letter <= 'Z') { if (letter >= 'A' && letter <= 'Z') {
Advance(2); Advance(2);
// Control letters mapped to ASCII control characters in the range // Control letters mapped to ASCII control characters in the range
...@@ -1808,22 +1756,29 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { ...@@ -1808,22 +1756,29 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
} }
if (unicode()) { if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes. // With /u, invalid escapes are not treated as identity escapes.
ReportError(RegExpError::kInvalidClassEscape); ReportError(RegExpError::kInvalidUnicodeEscape);
return 0; return 0;
} }
if (in_class_escape_state == InClassEscapeState::kInClass) {
// Inside a character class, we also accept digits and underscore as
// control characters, unless with /u. See Annex B:
// ES#prod-annexB-ClassControlLetter
if ((controlLetter >= '0' && controlLetter <= '9') || if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_') { controlLetter == '_') {
Advance(2); Advance(2);
return controlLetter & 0x1F; return controlLetter & 0x1F;
} }
}
// We match JSC in reading the backslash as a literal // We match JSC in reading the backslash as a literal
// character instead of as starting an escape. // character instead of as starting an escape.
// TODO(v8:6201): Not yet covered by the spec.
return '\\'; return '\\';
} }
// CharacterEscape ::
// 0 [lookahead ∉ DecimalDigit]
// [~UnicodeMode] LegacyOctalEscapeSequence
case '0': case '0':
// With /u, \0 is interpreted as NUL if not followed by another digit. // \0 is interpreted as NUL if not followed by another digit.
if (unicode() && !(Next() >= '0' && Next() <= '9')) { if (Next() < '0' || Next() > '9') {
Advance(); Advance();
return 0; return 0;
} }
...@@ -1845,6 +1800,8 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { ...@@ -1845,6 +1800,8 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
return 0; return 0;
} }
return ParseOctalLiteral(); return ParseOctalLiteral();
// CharacterEscape ::
// HexEscapeSequence
case 'x': { case 'x': {
Advance(); Advance();
base::uc32 value; base::uc32 value;
...@@ -1858,10 +1815,15 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { ...@@ -1858,10 +1815,15 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
// as an identity escape. // as an identity escape.
return 'x'; return 'x';
} }
// CharacterEscape ::
// RegExpUnicodeEscapeSequence [?UnicodeMode]
case 'u': { case 'u': {
Advance(); Advance();
base::uc32 value; base::uc32 value;
if (ParseUnicodeEscape(&value)) return value; if (ParseUnicodeEscape(&value)) {
*is_escaped_unicode_character = true;
return value;
}
if (unicode()) { if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes. // With /u, invalid escapes are not treated as identity escapes.
ReportError(RegExpError::kInvalidUnicodeEscape); ReportError(RegExpError::kInvalidUnicodeEscape);
...@@ -1871,48 +1833,108 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() { ...@@ -1871,48 +1833,108 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
// as an identity escape. // as an identity escape.
return 'u'; return 'u';
} }
default: { default:
base::uc32 result = current(); break;
// With /u, no identity escapes except for syntax characters and '-' are }
// allowed. Otherwise, all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { // CharacterEscape ::
// IdentityEscape[?UnicodeMode, ?N]
//
// * With /u, no identity escapes except for syntax characters are
// allowed.
// * Without /u:
// * '\c' is not an IdentityEscape.
// * '\k' is not an IdentityEscape when named captures exist.
// * Otherwise, all identity escapes are allowed.
if (unicode()) {
if (!IsSyntaxCharacterOrSlash(c)) {
ReportError(RegExpError::kInvalidEscape);
return 0;
}
Advance(); Advance();
return result; return c;
} }
DCHECK(!unicode());
if (c == 'c') {
ReportError(RegExpError::kInvalidEscape); ReportError(RegExpError::kInvalidEscape);
return 0; return 0;
} }
Advance();
// Note: It's important to Advance before the HasNamedCaptures call s.t. we
// don't start scanning in the middle of an escape.
if (HasNamedCaptures() && c == 'k') {
ReportError(RegExpError::kInvalidEscape);
return 0;
} }
UNREACHABLE(); return c;
} }
// https://tc39.es/ecma262/#prod-ClassEscape
template <class CharT> template <class CharT>
void RegExpParserImpl<CharT>::ParseClassEscape( void RegExpParserImpl<CharT>::ParseClassEscape(
ZoneList<CharacterRange>* ranges, Zone* zone, ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, base::uc32* char_out, bool add_unicode_case_equivalents, base::uc32* char_out,
bool* is_class_escape) { bool* is_class_escape) {
base::uc32 current_char = current(); *is_class_escape = false;
if (current_char == '\\') {
switch (Next()) { if (current() != '\\') {
case 'w': // Not a ClassEscape.
case 'W': *char_out = current();
case 'd': Advance();
case 'D': return;
case 's': }
case 'S': {
CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges, const base::uc32 next = Next();
add_unicode_case_equivalents, zone); switch (next) {
case 'b':
*char_out = '\b';
Advance(2);
return;
case '-':
if (unicode()) {
*char_out = next;
Advance(2); Advance(2);
*is_class_escape = true;
return; return;
} }
break;
case kEndMarker: case kEndMarker:
ReportError(RegExpError::kEscapeAtEndOfPattern); ReportError(RegExpError::kEscapeAtEndOfPattern);
return; return;
default:
break;
}
*is_class_escape = TryParseCharacterClassEscape(next, ranges, zone,
add_unicode_case_equivalents);
if (*is_class_escape) return;
bool dummy = false; // Unused.
*char_out = ParseCharacterEscape(InClassEscapeState::kInClass, &dummy);
}
// https://tc39.es/ecma262/#prod-CharacterClassEscape
template <class CharT>
bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
base::uc32 next, ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents) {
DCHECK_EQ(current(), '\\');
DCHECK_EQ(Next(), next);
switch (next) {
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
CharacterRange::AddClassEscape(static_cast<char>(next), ranges,
add_unicode_case_equivalents, zone);
Advance(2);
return true;
case 'p': case 'p':
case 'P': case 'P': {
if (unicode()) { if (!unicode()) return false;
bool negate = Next() == 'P'; bool negate = next == 'P';
Advance(2); Advance(2);
ZoneVector<char> name_1(zone); ZoneVector<char> name_1(zone);
ZoneVector<char> name_2(zone); ZoneVector<char> name_2(zone);
...@@ -1920,19 +1942,10 @@ void RegExpParserImpl<CharT>::ParseClassEscape( ...@@ -1920,19 +1942,10 @@ void RegExpParserImpl<CharT>::ParseClassEscape(
!AddPropertyClassRange(ranges, negate, name_1, name_2)) { !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
ReportError(RegExpError::kInvalidClassPropertyName); ReportError(RegExpError::kInvalidClassPropertyName);
} }
*is_class_escape = true; return true;
return;
} }
break;
default: default:
break; return false;
}
*char_out = ParseClassCharacterEscape();
*is_class_escape = false;
} else {
Advance();
*char_out = current_char;
*is_class_escape = false;
} }
} }
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
assertThrows(String.raw`/[\k](?<a>)/.exec()`);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment