Commit f67dd50a authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Update capture name parsing for recent spec changes

Capture group names were extended in

https://github.com/tc39/ecma262/pull/1869/files
https://github.com/tc39/ecma262/pull/1932/files

RegExpIdentifierName now explicitly enables unicode (+U) for
unicode escape sequences; likewise, surrogate pairs are now allowed
unconditionally.

The implementation simply switches on unicode temporarily while
parsing a capture group name.

Good news everyone, /(?<𝒜>.)/ is now a legal pattern.

Bug: v8:10384
Change-Id: Ida805998eb91ed717b2e05d81d52c1ed61104e3f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3233234
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Leszek Swirski <leszeks@chromium.org>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77722}
parent 37355e2d
......@@ -234,17 +234,18 @@ class RegExpParserImpl final {
RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void RewindByOneCodepoint(); // Rewinds to before the previous Advance().
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
bool simple() const { return simple_; }
bool contains_anchor() const { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
bool unicode() const { return IsUnicode(top_level_flags_); }
int captures_started() const { return captures_started_; }
int position() const { return next_pos_ - 1; }
bool failed() const { return failed_; }
bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; }
static bool IsSyntaxCharacterOrSlash(base::uc32 c);
......@@ -280,9 +281,9 @@ class RegExpParserImpl final {
Zone* zone() const { return zone_; }
base::uc32 current() { return current_; }
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < input_length(); }
base::uc32 current() const { return current_; }
bool has_more() const { return has_more_; }
bool has_next() const { return next_pos_ < input_length(); }
base::uc32 Next();
template <bool update_position>
base::uc32 ReadNext();
......@@ -301,6 +302,22 @@ class RegExpParserImpl final {
}
};
// RAII helper that makes the parser behave as if the unicode flag were set
// for the lifetime of the scope: the constructor sets the parser's
// `force_unicode_` to true and the destructor resets it to false.
//
// Scopes must not be nested on the same parser; the DCHECKs below verify
// that the flag is clear on entry and still set on exit.
class ForceUnicodeScope final {
 public:
  explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
      : parser_(parser) {
    DCHECK(!parser_->force_unicode_);
    parser_->force_unicode_ = true;
  }
  ~ForceUnicodeScope() {
    DCHECK(parser_->force_unicode_);
    parser_->force_unicode_ = false;
  }

  // Non-copyable: the destructor of a copy would clear the flag while the
  // original scope is still alive, and the original's destructor would then
  // fail its DCHECK.
  ForceUnicodeScope(const ForceUnicodeScope&) = delete;
  ForceUnicodeScope& operator=(const ForceUnicodeScope&) = delete;

 private:
  // Parser whose `force_unicode_` flag this scope toggles; not owned.
  RegExpParserImpl<CharT>* const parser_;
};
const DisallowGarbageCollection no_gc_;
Zone* const zone_;
RegExpError error_ = RegExpError::kNone;
......@@ -312,6 +329,7 @@ class RegExpParserImpl final {
const int input_length_;
base::uc32 current_;
const RegExpFlags top_level_flags_;
bool force_unicode_ = false; // Force parser to act as if unicode were set.
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
......@@ -422,6 +440,17 @@ void RegExpParserImpl<CharT>::Advance() {
}
}
template <class CharT>
void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
  // Nothing to undo when the parser is sitting on the end marker.
  if (current() == kEndMarker) return;

  // A code point above the largest non-surrogate code unit value lies outside
  // the basic multilingual plane and was read as a lead + trail surrogate,
  // i.e. two code units; every other code point occupies a single code unit.
  const bool is_surrogate_pair =
      current() > unibrow::Utf16::kMaxNonSurrogateCharCode;
  const int code_units = is_surrogate_pair ? 2 : 1;

  // Step backwards by that many code units to undo the last Advance.
  Advance(-code_units);
}
template <class CharT>
void RegExpParserImpl<CharT>::Reset(int pos) {
next_pos_ = pos;
......@@ -435,11 +464,6 @@ void RegExpParserImpl<CharT>::Advance(int dist) {
Advance();
}
template <class CharT>
bool RegExpParserImpl<CharT>::simple() {
return simple_;
}
template <class CharT>
bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
switch (c) {
......@@ -1048,48 +1072,73 @@ void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
template <class CharT>
const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
// Due to special Advance requirements (see the next comment), rewind by one
// such that names starting with a surrogate pair are parsed correctly for
// patterns where the unicode flag is unset.
//
// Note that we use this odd pattern of rewinding the last advance in order
// to adhere to the common parser behavior of expecting `current` to point at
// the first candidate character for a function (e.g. when entering ParseFoo,
// `current` should point at the first character of Foo).
RewindByOneCodepoint();
ZoneVector<base::uc16>* name =
zone()->template New<ZoneVector<base::uc16>>(zone());
bool at_start = true;
while (true) {
base::uc32 c = current();
Advance();
// Convert unicode escapes.
if (c == '\\' && current() == 'u') {
{
// Advance behavior inside this function is tricky since
// RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
// and thus allows surrogate pairs and \u{}-style escapes even in
// non-unicode patterns. Therefore Advance within the capture group name
// has to force-enable unicode, and outside the name revert to default
// behavior.
ForceUnicodeScope force_unicode(this);
bool at_start = true;
while (true) {
Advance();
if (!ParseUnicodeEscape(&c)) {
ReportError(RegExpError::kInvalidUnicodeEscape);
return nullptr;
}
}
base::uc32 c = current();
// The backslash char is misclassified as both ID_Start and ID_Continue.
if (c == '\\') {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
// Convert unicode escapes.
if (c == '\\' && Next() == 'u') {
Advance(2);
if (!ParseUnicodeEscape(&c)) {
ReportError(RegExpError::kInvalidUnicodeEscape);
return nullptr;
}
RewindByOneCodepoint();
}
if (at_start) {
if (!IsIdentifierStart(c)) {
// The backslash char is misclassified as both ID_Start and ID_Continue.
if (c == '\\') {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
push_code_unit(name, c);
at_start = false;
} else {
if (c == '>') {
break;
} else if (IsIdentifierPart(c)) {
if (at_start) {
if (!IsIdentifierStart(c)) {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
push_code_unit(name, c);
at_start = false;
} else {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
if (c == '>') {
break;
} else if (IsIdentifierPart(c)) {
push_code_unit(name, c);
} else {
ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
}
}
}
// This final advance goes back into the state of pointing at the next
// relevant char, which the rest of the parser expects. See also the previous
// comments in this function.
Advance();
return name;
}
......@@ -2045,7 +2094,6 @@ void RegExpBuilder::FlushPendingSurrogate() {
}
}
void RegExpBuilder::FlushCharacters() {
FlushPendingSurrogate();
pending_empty_ = false;
......@@ -2057,7 +2105,6 @@ void RegExpBuilder::FlushCharacters() {
}
}
void RegExpBuilder::FlushText() {
FlushCharacters();
int num_text = text_.length();
......@@ -2113,7 +2160,6 @@ void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
// Records that an empty term is pending by setting `pending_empty_`.
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (NeedsDesugaringForUnicode(cc)) {
// With /u, character class needs to be desugared, so it
......@@ -2144,24 +2190,20 @@ void RegExpBuilder::AddAtom(RegExpTree* term) {
LAST(ADD_ATOM);
}
// Appends `term` to the builder's list of terms.
void RegExpBuilder::AddTerm(RegExpTree* term) {
// Flush any buffered text first so that it precedes `term` in `terms_`.
FlushText();
terms_.Add(term, zone());
// NOTE(review): LAST(...) presumably records the kind of the most recent
// addition; confirm against the macro definition.
LAST(ADD_ATOM);
}
// Appends the assertion node `assert` to the builder's list of terms.
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
// Flush any buffered text first so that it precedes the assertion in `terms_`.
FlushText();
terms_.Add(assert, zone());
// NOTE(review): LAST(...) presumably records the kind of the most recent
// addition; confirm against the macro definition.
LAST(ADD_ASSERT);
}
// Begins a new alternative; simply forwards to FlushTerms().
void RegExpBuilder::NewAlternative() { FlushTerms(); }
void RegExpBuilder::FlushTerms() {
FlushText();
int num_terms = terms_.length();
......@@ -2179,7 +2221,6 @@ void RegExpBuilder::FlushTerms() {
LAST(ADD_NONE);
}
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
if (!unicode()) return false;
// TODO(yangguo): we could be smarter than this. Case-insensitivity does not
......
......@@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
assertThrows("/(?<$𐒤>a)/", SyntaxError);
assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups._);
assertThrows('/(?<❤>a)/', SyntaxError);
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
......@@ -209,18 +209,18 @@ assertThrows("/(?<a\\uD801\uDCA4>.)/", SyntaxError);
assertThrows("/(?<a\\uD801>.)/", SyntaxError);
assertThrows("/(?<a\\uDCA4>.)/", SyntaxError);
assertTrue(/(?<\u0041>.)/.test("a"));
assertThrows("/(?<\\u{0041}>.)/", SyntaxError);
assertThrows("/(?<a\\u{104A4}>.)/", SyntaxError);
assertTrue(/(?<\u{0041}>.)/.test("a"));
assertTrue(/(?<a\u{104A4}>.)/.test("a"));
assertThrows("/(?<a\\u{10FFFF}>.)/", SyntaxError);
assertThrows("/(?<a\\uD801>.)/", SyntaxError); // Lead
assertThrows("/(?<a\\uDCA4>.)/", SyntaxError); // Trail;
assertThrows("/(?<a\uD801>.)/", SyntaxError); // Lead
assertThrows("/(?<a\uDCA4>.)/", SyntaxError); // Trail
assertThrows("/(?<\\u{0041}>.)/", SyntaxError); // Non-surrogate
assertThrows("/(?<a\\u{104A4}>.)/", SyntaxError); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\u{0041}>.)").test("a")); // Non-surrogate
assertThrows("(?<a\u{104A4}>.)", SyntaxError); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\\u0041>.)").test("a")); // Non-surrogate
assertThrows("/(?<a\\uD801>.)/", SyntaxError); // Lead
assertThrows("/(?<a\\uDCA4>.)/", SyntaxError); // Trail
assertThrows("/(?<a\uD801>.)/", SyntaxError); // Lead
assertThrows("/(?<a\uDCA4>.)/", SyntaxError); // Trail
assertTrue(/(?<\u{0041}>.)/.test("a")); // Non-surrogate
assertTrue(/(?<a\u{104A4}>.)/.test("a")); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\u{0041}>.)").test("a")); // Non-surrogate
assertTrue(RegExp("(?<a\u{104A4}>.)").test("a")); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\\u0041>.)").test("a")); // Non-surrogate
// @@replace with a callable replacement argument (no named captures).
{
......
......@@ -421,8 +421,8 @@
['no_i18n', {
# Case-insensitive unicode regexp relies on case mapping provided by ICU.
'es6/unicode-regexp-ignore-case': [FAIL],
'regress/regress-5036': [FAIL],
'es7/regexp-ui-word': [FAIL],
'regress/regress-5036': [FAIL],
# Desugaring regexp property class relies on ICU. Anything goes as long as we
# don't crash.
......@@ -430,8 +430,9 @@
'regress/regress-1262423': [PASS,FAIL],
'regress/regress-793588': [PASS,FAIL],
# noi18n build cannot parse characters in supplementary plane.
# The noi18n build cannot parse characters in supplementary plane.
'harmony/regexp-named-captures': [FAIL],
'regress/regress-v8-10384': [FAIL],
# noi18n cannot turn on ICU backend for Date. Anything goes as long as we
# don't crash.
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
const s = "x";
assertEquals(/(?<\ud835\udc9c>.)/.exec(s).groups["\u{1d49c}"], s);
assertEquals(/(?<\ud835\udc9c>.)/u.exec(s).groups["\u{1d49c}"], s);
assertEquals(/(?<\u{1d49c}>.)/.exec(s).groups["\u{1d49c}"], s);
assertEquals(/(?<\u{1d49c}>.)/u.exec(s).groups["\u{1d49c}"], s);
assertEquals(/(?<𝒜>.)/.exec(s).groups["\u{1d49c}"], s);
assertEquals(/(?<𝒜>.)/u.exec(s).groups["\u{1d49c}"], s);
......@@ -73,9 +73,6 @@
'language/expressions/assignment/destructuring/iterator-destructuring-property-reference-target-evaluation-order': [FAIL],
'language/expressions/assignment/destructuring/keyed-destructuring-property-reference-target-evaluation-order': [FAIL],
# https://bugs.chromium.org/p/v8/issues/detail?id=10379
'built-ins/RegExp/named-groups/non-unicode-property-names-valid': [FAIL],
# https://bugs.chromium.org/p/v8/issues/detail?id=4628
'language/eval-code/direct/async-gen-func-decl-fn-body-cntns-arguments-func-decl-declare-arguments': [FAIL],
'language/eval-code/direct/async-gen-func-decl-fn-body-cntns-arguments-func-decl-declare-arguments-and-assign': [FAIL],
......@@ -534,10 +531,10 @@
# https://bugs.chromium.org/p/v8/issues/detail?id=7833
#
# Test262 needs to expose CanBlock
# Test262 needs to expose CanBlock.
'built-ins/Atomics/wait/bigint/cannot-suspend-throws': [SKIP],
'built-ins/Atomics/wait/cannot-suspend-throws': [SKIP],
# Flaky
# Flaky.
'built-ins/Atomics/wait/undefined-index-defaults-to-zero': [SKIP],
##################### DELIBERATE INCOMPATIBILITIES #####################
......@@ -564,7 +561,7 @@
'built-ins/Date/prototype/setFullYear/new-value-time-clip': [PASS, FAIL],
'built-ins/Date/prototype/setMonth/new-value-time-clip': [PASS, FAIL],
# Test against internals of harness; we plug in differently
# Test against internals of harness; we plug in differently.
'harness/detachArrayBuffer': [SKIP],
'harness/detachArrayBuffer-host-detachArrayBuffer': [SKIP],
......@@ -606,7 +603,7 @@
# Unicode regexp case mapping is not available with i18n turned off.
'language/literals/regexp/u-case-mapping': [SKIP],
# Unicode in capture group
# Unicode in capture group.
'built-ins/RegExp/prototype/Symbol.replace/named-groups': [FAIL],
# BUG(v8:4437).
......@@ -632,13 +629,14 @@
'intl402/String/prototype/toLocaleUpperCase/special_casing_Lithuanian': [FAIL],
'intl402/String/prototype/toLocaleUpperCase/special_casing_Turkish': [FAIL],
# Unicode features unavaible without i18n, ie property escapes.
# Unicode features unavailable without i18n, e.g. property escapes.
'built-ins/RegExp/property-escapes/*': [SKIP],
'built-ins/RegExp/named-groups/unicode-property-names': [SKIP],
'built-ins/RegExp/named-groups/unicode-property-names-valid': [SKIP],
'built-ins/RegExp/named-groups/non-unicode-property-names-valid': [FAIL],
'built-ins/RegExp/match-indices/indices-array-unicode-property-names': [SKIP],
# Unicode in identifiers
# Unicode in identifiers.
'language/identifiers/part-unicode-*': [FAIL],
'language/identifiers/start-unicode-1*': [FAIL],
'language/identifiers/start-unicode-5*': [FAIL],
......@@ -646,7 +644,6 @@
'language/identifiers/start-unicode-7*': [FAIL],
'language/identifiers/start-unicode-8*': [FAIL],
'language/identifiers/start-unicode-9*': [FAIL],
}], # no_i18n == True
['arch == arm or arch == mipsel or arch == mips or arch == arm64 or arch == mips64 or arch == mips64el', {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment