[regexp] Reorganize and deduplicate in the regexp parser

The parser is organized in a somewhat tricky way s.t. it can be hard to map the implementation back to the specified grammar. In particular, the logic for CharacterClassEscape, ClassEscape, and CharacterEscape was implemented twice - once inside a character class, once outside. This CL refactors related logic to have only a single implementation. As a drive-by, fix one related inconsistency related to \k inside a character class.

Fixed: v8:10602
Change-Id: I5858840159694fa6f8d1aa857027db80754e3dfd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3178966
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77114}

[regexp] Reorganize and deduplicate in the regexp parser
The parser is organized in a somewhat tricky way s.t. it can be hard to map the implementation back to the specified grammar. In particular, the logic for CharacterClassEscape, ClassEscape, and CharacterEscape was implemented twice - once inside a character class, once outside. This CL refactors related logic to have only a single implementation. As a drive-by, fix one related inconsistency related to \k inside a character class.

Fixed: v8:10602
Change-Id: I5858840159694fa6f8d1aa857027db80754e3dfd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3178966
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77114}
7d849870 · Jakob Gruber · V8 LUCI CQ · d6fb96ae · 7d849870 · 7d849870
Commit 7d849870 authored Sep 28, 2021 by Jakob Gruber Committed by V8 LUCI CQ Sep 28, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 220 additions and 202 deletions

regexp-parser.cc src/regexp/regexp-parser.cc +215 -202

regress-v8-10602.js test/mjsunit/regress/regress-v8-10602.js +5 -0

No files found.
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -23,6 +23,13 @@ namespace internal {

 namespace {

+// Whether we're currently inside the ClassEscape production
+// (tc39.es/ecma262/#prod-annexB-ClassEscape).
+enum class InClassEscapeState {
+  kInClass,
+  kNotInClass,
+};
+
 // A BufferedZoneList is an automatically growing list, just like (and backed
 // by) a ZoneList, that is optimized for the case of adding and removing
 // a single element. The last element added is stored outside the backing list,
@@ -255,10 +262,6 @@ class RegExpParserImpl final {
  // out parameters.
  bool ParseIntervalQuantifier(int* min_out, int* max_out);

-  // Parses and returns a single escaped character.  The character
-  // must not be 'b' or 'B' since they are usually handle specially.
-  base::uc32 ParseClassCharacterEscape();
-
  // Checks whether the following is a length-digit hexadecimal number,
  // and sets the value if it is.
  bool ParseHexEscape(int length, base::uc32* value);
@@ -286,8 +289,14 @@ class RegExpParserImpl final {
  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
                        bool add_unicode_case_equivalents, base::uc32* char_out,
                        bool* is_class_escape);
-
-  char ParseClassEscape();
+  // Returns true iff parsing was successful.
+  bool TryParseCharacterClassEscape(base::uc32 next,
+                                    ZoneList<CharacterRange>* ranges,
+                                    Zone* zone,
+                                    bool add_unicode_case_equivalents);
+  // Parses and returns a single escaped character.
+  base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
+                                  bool* is_escaped_unicode_character);

  RegExpTree* ReportError(RegExpError error);
  void Advance();
@@ -687,62 +696,19 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
        switch (Next()) {
          case kEndMarker:
            return ReportError(RegExpError::kEscapeAtEndOfPattern);
-          case 'b':
-            Advance(2);
-            builder->AddAssertion(zone()->template New<RegExpAssertion>(
-                RegExpAssertion::BOUNDARY));
-            continue;
-          case 'B':
-            Advance(2);
-            builder->AddAssertion(zone()->template New<RegExpAssertion>(
-                RegExpAssertion::NON_BOUNDARY));
-            continue;
          // AtomEscape ::
-          //   CharacterClassEscape
+          //   [+UnicodeMode] DecimalEscape
+          //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
+          //                  of DecimalEscape is ≤ NcapturingParens
+          //   CharacterEscape (some cases of this mixed in too)
          //
-          // CharacterClassEscape :: one of
-          //   d D s S w W
-          case 'd':
-          case 'D':
-          case 's':
-          case 'S':
-          case 'w':
-          case 'W': {
-            base::uc32 c = Next();
-            Advance(2);
-            ZoneList<CharacterRange>* ranges =
-                zone()->template New<ZoneList<CharacterRange>>(2, zone());
-            CharacterRange::AddClassEscape(
-                c, ranges, unicode() && builder->ignore_case(), zone());
-            RegExpCharacterClass* cc =
-                zone()->template New<RegExpCharacterClass>(zone(), ranges);
-            builder->AddCharacterClass(cc);
-            break;
-          }
-          case 'p':
-          case 'P': {
-            base::uc32 p = Next();
-            Advance(2);
-            if (unicode()) {
-              ZoneList<CharacterRange>* ranges =
-                  zone()->template New<ZoneList<CharacterRange>>(2, zone());
-              ZoneVector<char> name_1(zone());
-              ZoneVector<char> name_2(zone());
-              if (ParsePropertyClassName(&name_1, &name_2)) {
-                if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
-                  RegExpCharacterClass* cc =
-                      zone()->template New<RegExpCharacterClass>(zone(),
-                                                                 ranges);
-                  builder->AddCharacterClass(cc);
-                  break;
-                }
-              }
-              return ReportError(RegExpError::kInvalidPropertyName);
-            } else {
-              builder->AddCharacter(p);
-            }
-            break;
-          }
+          // TODO(jgruber): It may make sense to disentangle all the different
+          // cases and make the structure mirror the spec, e.g. for AtomEscape:
+          //
+          //  if (TryParseDecimalEscape(...)) return;
+          //  if (TryParseCharacterClassEscape(...)) return;
+          //  if (TryParseCharacterEscape(...)) return;
+          //  if (TryParseGroupName(...)) return;
          case '1':
          case '2':
          case '3':
@@ -753,7 +719,8 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
          case '8':
          case '9': {
            int index = 0;
-            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+            const bool is_backref =
+                ParseBackReferenceIndex(&index CHECK_FAILED);
            if (is_backref) {
              if (state->IsInsideCaptureGroup(index)) {
                // The back reference is inside the capture group it refers to.
@@ -793,76 +760,48 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
            builder->AddCharacter(octal);
            break;
          }
-          // ControlEscape :: one of
-          //   f n r t v
-          case 'f':
-            Advance(2);
-            builder->AddCharacter('\f');
-            break;
-          case 'n':
-            Advance(2);
-            builder->AddCharacter('\n');
-            break;
-          case 'r':
-            Advance(2);
-            builder->AddCharacter('\r');
-            break;
-          case 't':
+          case 'b':
            Advance(2);
-            builder->AddCharacter('\t');
-            break;
-          case 'v':
+            builder->AddAssertion(zone()->template New<RegExpAssertion>(
+                RegExpAssertion::BOUNDARY));
+            continue;
+          case 'B':
            Advance(2);
-            builder->AddCharacter('\v');
-            break;
-          case 'c': {
-            Advance();
-            base::uc32 controlLetter = Next();
-            // Special case if it is an ASCII letter.
-            // Convert lower case letters to uppercase.
-            base::uc32 letter = controlLetter & ~('a' ^ 'A');
-            if (letter < 'A' || 'Z' < letter) {
-              // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
-              // Read the backslash as a literal character instead of as
-              // starting an escape.
-              // ES#prod-annexB-ExtendedPatternCharacter
-              if (unicode()) {
-                // With /u, invalid escapes are not treated as identity escapes.
-                return ReportError(RegExpError::kInvalidUnicodeEscape);
-              }
-              builder->AddCharacter('\\');
+            builder->AddAssertion(zone()->template New<RegExpAssertion>(
+                RegExpAssertion::NON_BOUNDARY));
+            continue;
+          // AtomEscape ::
+          //   CharacterClassEscape
+          case 'd':
+          case 'D':
+          case 's':
+          case 'S':
+          case 'w':
+          case 'W':
+          case 'p':
+          case 'P': {
+            base::uc32 next = Next();
+            ZoneList<CharacterRange>* ranges =
+                zone()->template New<ZoneList<CharacterRange>>(2, zone());
+            bool add_unicode_case_equivalents =
+                unicode() && builder->ignore_case();
+            bool parsed_character_class_escape = TryParseCharacterClassEscape(
+                next, ranges, zone(),
+                add_unicode_case_equivalents CHECK_FAILED);
+
+            if (parsed_character_class_escape) {
+              RegExpCharacterClass* cc =
+                  zone()->template New<RegExpCharacterClass>(zone(), ranges);
+              builder->AddCharacterClass(cc);
            } else {
+              CHECK(!unicode());
              Advance(2);
-              builder->AddCharacter(controlLetter & 0x1F);
-            }
-            break;
-          }
-          case 'x': {
-            Advance(2);
-            base::uc32 value;
-            if (ParseHexEscape(2, &value)) {
-              builder->AddCharacter(value);
-            } else if (!unicode()) {
-              builder->AddCharacter('x');
-            } else {
-              // With /u, invalid escapes are not treated as identity escapes.
-              return ReportError(RegExpError::kInvalidEscape);
-            }
-            break;
-          }
-          case 'u': {
-            Advance(2);
-            base::uc32 value;
-            if (ParseUnicodeEscape(&value)) {
-              builder->AddEscapedUnicodeCharacter(value);
-            } else if (!unicode()) {
-              builder->AddCharacter('u');
-            } else {
-              // With /u, invalid escapes are not treated as identity escapes.
-              return ReportError(RegExpError::kInvalidUnicodeEscape);
+              builder->AddCharacter(next);  // IdentityEscape.
            }
            break;
          }
+          // AtomEscape ::
+          //   k GroupName
          case 'k':
            // Either an identity escape or a named back-reference.  The two
            // interpretations are mutually exclusive: '\k' is interpreted as
@@ -875,17 +814,20 @@ RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
              break;
            }
            V8_FALLTHROUGH;
-          default:
-            Advance();
-            // With /u, no identity escapes except for syntax characters
-            // are allowed. Otherwise, all identity escapes are allowed.
-            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
-              builder->AddCharacter(current());
-              Advance();
+          // AtomEscape ::
+          //   CharacterEscape
+          default: {
+            bool is_escaped_unicode_character = false;
+            base::uc32 c = ParseCharacterEscape(
+                InClassEscapeState::kNotInClass,
+                &is_escaped_unicode_character CHECK_FAILED);
+            if (is_escaped_unicode_character) {
+              builder->AddEscapedUnicodeCharacter(c);
            } else {
-              return ReportError(RegExpError::kInvalidEscape);
+              builder->AddCharacter(c);
            }
            break;
+          }
        }
        break;
      case '{': {
@@ -1044,6 +986,9 @@ static bool IsSpecialClassEscape(base::uc32 c) {
 // is called when needed.  It can see the difference between capturing and
 // noncapturing parentheses and can skip character classes and backslash-escaped
 // characters.
+//
+// Important: The scanner has to be in a consistent state when calling
+// ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
 template <class CharT>
 void RegExpParserImpl<CharT>::ScanForCaptures() {
  DCHECK(!is_scanned_for_captures_);
@@ -1295,14 +1240,14 @@ template <class CharT>
 RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
  // The index for the capture groups are one-based. Its index in the list is
  // zero-based.
-  int know_captures =
+  const int known_captures =
      is_scanned_for_captures_ ? capture_count_ : captures_started_;
-  DCHECK(index <= know_captures);
+  DCHECK(index <= known_captures);
  if (captures_ == nullptr) {
    captures_ =
-        zone()->template New<ZoneList<RegExpCapture*>>(know_captures, zone());
+        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
  }
-  while (captures_->length() < know_captures) {
+  while (captures_->length() < known_captures) {
    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
                   zone());
  }
@@ -1768,17 +1713,21 @@ bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
  return true;
 }

+// https://tc39.es/ecma262/#prod-CharacterEscape
 template <class CharT>
-base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
+base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
+    InClassEscapeState in_class_escape_state,
+    bool* is_escaped_unicode_character) {
  DCHECK_EQ('\\', current());
  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
+
  Advance();
-  switch (current()) {
-    case 'b':
-      Advance();
-      return '\b';
-    // ControlEscape :: one of
-    //   f n r t v
+
+  const base::uc32 c = current();
+  switch (c) {
+    // CharacterEscape ::
+    //   ControlEscape :: one of
+    //     f n r t v
    case 'f':
      Advance();
      return '\f';
@@ -1794,12 +1743,11 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
    case 'v':
      Advance();
      return '\v';
+    // CharacterEscape ::
+    //   c ControlLetter
    case 'c': {
      base::uc32 controlLetter = Next();
      base::uc32 letter = controlLetter & ~('A' ^ 'a');
-      // Inside a character class, we also accept digits and underscore as
-      // control characters, unless with /u. See Annex B:
-      // ES#prod-annexB-ClassControlLetter
      if (letter >= 'A' && letter <= 'Z') {
        Advance(2);
        // Control letters mapped to ASCII control characters in the range
@@ -1808,22 +1756,29 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
      }
      if (unicode()) {
        // With /u, invalid escapes are not treated as identity escapes.
-        ReportError(RegExpError::kInvalidClassEscape);
+        ReportError(RegExpError::kInvalidUnicodeEscape);
        return 0;
      }
-      if ((controlLetter >= '0' && controlLetter <= '9') ||
-          controlLetter == '_') {
-        Advance(2);
-        return controlLetter & 0x1F;
+      if (in_class_escape_state == InClassEscapeState::kInClass) {
+        // Inside a character class, we also accept digits and underscore as
+        // control characters, unless with /u. See Annex B:
+        // ES#prod-annexB-ClassControlLetter
+        if ((controlLetter >= '0' && controlLetter <= '9') ||
+            controlLetter == '_') {
+          Advance(2);
+          return controlLetter & 0x1F;
+        }
      }
      // We match JSC in reading the backslash as a literal
      // character instead of as starting an escape.
-      // TODO(v8:6201): Not yet covered by the spec.
      return '\\';
    }
+    // CharacterEscape ::
+    //   0 [lookahead ∉ DecimalDigit]
+    //   [~UnicodeMode] LegacyOctalEscapeSequence
    case '0':
-      // With /u, \0 is interpreted as NUL if not followed by another digit.
-      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+      // \0 is interpreted as NUL if not followed by another digit.
+      if (Next() < '0' || Next() > '9') {
        Advance();
        return 0;
      }
@@ -1845,6 +1800,8 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
        return 0;
      }
      return ParseOctalLiteral();
+    // CharacterEscape ::
+    //   HexEscapeSequence
    case 'x': {
      Advance();
      base::uc32 value;
@@ -1858,10 +1815,15 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
      // as an identity escape.
      return 'x';
    }
+    // CharacterEscape ::
+    //   RegExpUnicodeEscapeSequence [?UnicodeMode]
    case 'u': {
      Advance();
      base::uc32 value;
-      if (ParseUnicodeEscape(&value)) return value;
+      if (ParseUnicodeEscape(&value)) {
+        *is_escaped_unicode_character = true;
+        return value;
+      }
      if (unicode()) {
        // With /u, invalid escapes are not treated as identity escapes.
        ReportError(RegExpError::kInvalidUnicodeEscape);
@@ -1871,68 +1833,119 @@ base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
      // as an identity escape.
      return 'u';
    }
-    default: {
-      base::uc32 result = current();
-      // With /u, no identity escapes except for syntax characters and '-' are
-      // allowed. Otherwise, all identity escapes are allowed.
-      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
-        Advance();
-        return result;
-      }
+    default:
+      break;
+  }
+
+  // CharacterEscape ::
+  //   IdentityEscape[?UnicodeMode, ?N]
+  //
+  // * With /u, no identity escapes except for syntax characters are
+  //   allowed.
+  // * Without /u:
+  //   * '\c' is not an IdentityEscape.
+  //   * '\k' is not an IdentityEscape when named captures exist.
+  //   * Otherwise, all identity escapes are allowed.
+  if (unicode()) {
+    if (!IsSyntaxCharacterOrSlash(c)) {
      ReportError(RegExpError::kInvalidEscape);
      return 0;
    }
+    Advance();
+    return c;
  }
-  UNREACHABLE();
+  DCHECK(!unicode());
+  if (c == 'c') {
+    ReportError(RegExpError::kInvalidEscape);
+    return 0;
+  }
+  Advance();
+  // Note: It's important to Advance before the HasNamedCaptures call s.t. we
+  // don't start scanning in the middle of an escape.
+  if (HasNamedCaptures() && c == 'k') {
+    ReportError(RegExpError::kInvalidEscape);
+    return 0;
+  }
+  return c;
 }

+// https://tc39.es/ecma262/#prod-ClassEscape
 template <class CharT>
 void RegExpParserImpl<CharT>::ParseClassEscape(
    ZoneList<CharacterRange>* ranges, Zone* zone,
    bool add_unicode_case_equivalents, base::uc32* char_out,
    bool* is_class_escape) {
-  base::uc32 current_char = current();
-  if (current_char == '\\') {
-    switch (Next()) {
-      case 'w':
-      case 'W':
-      case 'd':
-      case 'D':
-      case 's':
-      case 'S': {
-        CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
-                                       add_unicode_case_equivalents, zone);
+  *is_class_escape = false;
+
+  if (current() != '\\') {
+    // Not a ClassEscape.
+    *char_out = current();
+    Advance();
+    return;
+  }
+
+  const base::uc32 next = Next();
+  switch (next) {
+    case 'b':
+      *char_out = '\b';
+      Advance(2);
+      return;
+    case '-':
+      if (unicode()) {
+        *char_out = next;
        Advance(2);
-        *is_class_escape = true;
        return;
      }
-      case kEndMarker:
-        ReportError(RegExpError::kEscapeAtEndOfPattern);
-        return;
-      case 'p':
-      case 'P':
-        if (unicode()) {
-          bool negate = Next() == 'P';
-          Advance(2);
-          ZoneVector<char> name_1(zone);
-          ZoneVector<char> name_2(zone);
-          if (!ParsePropertyClassName(&name_1, &name_2) ||
-              !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
-            ReportError(RegExpError::kInvalidClassPropertyName);
-          }
-          *is_class_escape = true;
-          return;
-        }
-        break;
-      default:
-        break;
+      break;
+    case kEndMarker:
+      ReportError(RegExpError::kEscapeAtEndOfPattern);
+      return;
+    default:
+      break;
+  }
+
+  *is_class_escape = TryParseCharacterClassEscape(next, ranges, zone,
+                                                  add_unicode_case_equivalents);
+  if (*is_class_escape) return;
+
+  bool dummy = false;  // Unused.
+  *char_out = ParseCharacterEscape(InClassEscapeState::kInClass, &dummy);
+}
+
+// https://tc39.es/ecma262/#prod-CharacterClassEscape
+template <class CharT>
+bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
+    base::uc32 next, ZoneList<CharacterRange>* ranges, Zone* zone,
+    bool add_unicode_case_equivalents) {
+  DCHECK_EQ(current(), '\\');
+  DCHECK_EQ(Next(), next);
+
+  switch (next) {
+    case 'd':
+    case 'D':
+    case 's':
+    case 'S':
+    case 'w':
+    case 'W':
+      CharacterRange::AddClassEscape(static_cast<char>(next), ranges,
+                                     add_unicode_case_equivalents, zone);
+      Advance(2);
+      return true;
+    case 'p':
+    case 'P': {
+      if (!unicode()) return false;
+      bool negate = next == 'P';
+      Advance(2);
+      ZoneVector<char> name_1(zone);
+      ZoneVector<char> name_2(zone);
+      if (!ParsePropertyClassName(&name_1, &name_2) ||
+          !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
+        ReportError(RegExpError::kInvalidClassPropertyName);
+      }
+      return true;
    }
-    *char_out = ParseClassCharacterEscape();
-    *is_class_escape = false;
-  } else {
-    Advance();
-    *char_out = current_char;
-    *is_class_escape = false;
+    default:
+      return false;
  }
 }


--- a/test/mjsunit/regress/regress-v8-10602.js
+++ b/test/mjsunit/regress/regress-v8-10602.js
+// Copyright 2021 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+assertThrows(String.raw`/[\k](?<a>)/.exec()`);