Commit a3be9e78 authored by jgruber, committed by Commit bot

[regexp] Allow named captures and back-references in non-unicode patterns

Previously, named captures (and related functionality) were restricted to
unicode-mode regexps.

This CL extends that support to non-unicode patterns. Named groups are
supported regardless of the mode, and named back-references are supported if
the regexp is in unicode mode or if it contains a named capture (otherwise '\k'
is treated as an identity escape).

BUG=v8:5437,v8:6192

Review-Url: https://codereview.chromium.org/2788873002
Cr-Commit-Position: refs/heads/master@{#44324}
parent ee81214a
......@@ -40,6 +40,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
has_named_captures_(false),
failed_(false) {
DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
Advance();
......@@ -325,7 +326,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break;
}
}
if (FLAG_harmony_regexp_named_captures && unicode()) {
if (FLAG_harmony_regexp_named_captures) {
has_named_captures_ = true;
is_named_capture = true;
Advance();
break;
......@@ -541,7 +543,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break;
}
case 'k':
if (FLAG_harmony_regexp_named_captures && unicode()) {
// Either an identity escape or a named back-reference. The two
// interpretations are mutually exclusive: '\k' is interpreted as
// an identity escape for non-unicode patterns without named
// capture groups, and as the beginning of a named back-reference
// in all other cases.
if (FLAG_harmony_regexp_named_captures &&
(unicode() || HasNamedCaptures())) {
Advance(2);
ParseNamedBackReference(builder, state CHECK_FAILED);
break;
......@@ -657,6 +665,8 @@ static bool IsSpecialClassEscape(uc32 c) {
// noncapturing parentheses and can skip character classes and backslash-escaped
// characters.
void RegExpParser::ScanForCaptures() {
DCHECK(!is_scanned_for_captures_);
const int saved_position = position();
// Start with captures started previous to current position
int capture_count = captures_started();
// Add count of captures after this position.
......@@ -692,11 +702,19 @@ void RegExpParser::ScanForCaptures() {
Advance();
if (current() != '<') break;
// TODO(jgruber): To be more future-proof we could test for
// IdentifierStart here once it becomes clear whether group names
// allow unicode escapes.
Advance();
if (current() == '=' || current() == '!') break;
if (FLAG_harmony_regexp_lookbehind) {
// TODO(jgruber): To be more future-proof we could test for
// IdentifierStart here once it becomes clear whether group names
// allow unicode escapes.
// https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance();
if (current() == '=' || current() == '!') break;
}
// Found a possible named capture. It could turn out to be a syntax
// error (e.g. an unterminated or invalid name), but that distinction
// does not matter for our purposes.
has_named_captures_ = true;
}
capture_count++;
break;
......@@ -704,6 +722,7 @@ void RegExpParser::ScanForCaptures() {
}
capture_count_ = capture_count;
is_scanned_for_captures_ = true;
Reset(saved_position);
}
......@@ -729,11 +748,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
}
}
if (value > captures_started()) {
if (!is_scanned_for_captures_) {
int saved_position = position();
ScanForCaptures();
Reset(saved_position);
}
if (!is_scanned_for_captures_) ScanForCaptures();
if (value > capture_count_) {
Reset(start);
return false;
......@@ -754,7 +769,6 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK(unicode());
ZoneVector<uc16>* name =
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
......@@ -766,6 +780,8 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
// Convert unicode escapes.
if (c == '\\' && current() == 'u') {
// TODO(jgruber): Reconsider this once the spec has settled.
// https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance();
if (!ParseUnicodeEscape(&c)) {
ReportError(CStrVector("Invalid Unicode escape sequence"));
......@@ -798,7 +814,6 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
int index) {
DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK(unicode());
DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name);
......@@ -806,6 +821,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
} else {
// Check for duplicates and bail if we find any.
// TODO(jgruber): O(n^2).
for (const auto& named_capture : *named_captures_) {
if (*named_capture->name() == *name) {
ReportError(CStrVector("Duplicate capture group name"));
......@@ -920,6 +936,16 @@ Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
return array;
}
// Reports whether the pattern contains a named capture group.
//
// A 'true' flag is already conclusive, since it is only ever set once a
// (possible) named group has been seen. A 'false' flag is conclusive only
// after ScanForCaptures has looked at the rest of the pattern, so the scan is
// run lazily here, and at most once, when neither condition holds yet.
bool RegExpParser::HasNamedCaptures() {
  const bool answer_is_known = has_named_captures_ || is_scanned_for_captures_;
  if (!answer_is_known) {
    // ScanForCaptures records its result in has_named_captures_ and marks
    // the scan as done.
    ScanForCaptures();
    DCHECK(is_scanned_for_captures_);
  }
  return has_named_captures_;
}
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue;
......
......@@ -293,6 +293,10 @@ class RegExpParser BASE_EMBEDDED {
Handle<FixedArray> CreateCaptureNameMap();
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
bool HasNamedCaptures();
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
......@@ -319,12 +323,12 @@ class RegExpParser BASE_EMBEDDED {
bool unicode_;
int next_pos_;
int captures_started_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
int capture_count_; // Only valid after we have scanned for captures.
bool has_more_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool has_named_captures_; // Only valid after we have scanned for captures.
bool failed_;
};
......
// Copyright 2015 the V8 project authors. All rights reserved.
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-named-captures --harmony-regexp-lookbehind
// Malformed named captures.
assertThrows("/(?<>a)/u"); // Empty name.
assertThrows("/(?<aa)/u"); // Unterminated name.
assertThrows("/(?<42a>a)/u"); // Name starting with digits.
assertThrows("/(?<:a>a)/u"); // Name starting with invalid char.
assertThrows("/(?<a:>a)/u"); // Name containing with invalid char.
assertThrows("/(?<a>a)(?<a>a)/u"); // Duplicate name.
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/u"); // Duplicate name.
assertThrows("/\\k<a>/u"); // Invalid reference.
assertThrows("/(?<a>a)\\k<ab>/u"); // Invalid reference.
assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference.
assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference.
// Fallback behavior in non-unicode mode.
assertThrows("/(?<>a)/u", SyntaxError); // Empty name.
assertThrows("/(?<aa)/u", SyntaxError); // Unterminated name.
assertThrows("/(?<42a>a)/u", SyntaxError); // Name starting with digits.
assertThrows("/(?<:a>a)/u", SyntaxError); // Name starting with invalid char.
assertThrows("/(?<a:>a)/u", SyntaxError); // Name containing invalid char.
assertThrows("/(?<a>a)(?<a>a)/u", SyntaxError); // Duplicate name.
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/u", SyntaxError); // Duplicate name.
assertThrows("/\\k<a>/u", SyntaxError); // Invalid reference.
assertThrows("/\\k<a/u", SyntaxError); // Unterminated reference.
assertThrows("/\\k/u", SyntaxError); // Lone \k.
assertThrows("/(?<a>.)\\k/u", SyntaxError); // Lone \k.
assertThrows("/(?<a>.)\\k<a/u", SyntaxError); // Unterminated reference.
assertThrows("/(?<a>.)\\k<b>/u", SyntaxError); // Invalid reference.
assertThrows("/(?<a>a)\\k<ab>/u", SyntaxError); // Invalid reference.
assertThrows("/(?<ab>a)\\k<a>/u", SyntaxError); // Invalid reference.
assertThrows("/\\k<a>(?<ab>a)/u", SyntaxError); // Invalid reference.
assertThrows("/(?<a>\\a)/u", SyntaxError); // Identity escape in capture.
// Behavior in non-unicode mode.
assertThrows("/(?<>a)/", SyntaxError);
assertThrows("/(?<aa)/", SyntaxError);
assertThrows("/(?<42a>a)/", SyntaxError);
......@@ -25,12 +31,35 @@ assertThrows("/(?<:a>a)/", SyntaxError);
assertThrows("/(?<a:>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/", SyntaxError);
assertTrue(/\k<a>/.test("k<a>"));
assertTrue(/\k<4>/.test("k<4>"));
assertTrue(/\k<a/.test("k<a"));
assertTrue(/\k/.test("k"));
assertThrows("/(?<a>.)\\k/", SyntaxError);
assertThrows("/(?<a>.)\\k<a/", SyntaxError);
assertThrows("/(?<a>.)\\k<b>/", SyntaxError);
assertThrows("/(?<a>a)\\k<ab>/", SyntaxError);
assertThrows("/(?<ab>a)\\k<a>/", SyntaxError);
assertThrows("/\\k<a>(?<ab>a)/", SyntaxError);
assertThrows("/\\k<a(?<a>a)/", SyntaxError);
assertTrue(/(?<a>\a)/.test("a"));
assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/));
assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/));
assertEquals({a: "a", b: "b", c: "c"},
/(?<a>.)(?<b>.)(?<c>.)\k<c>\k<b>\k<a>/.exec("abccba").groups);
// A couple of corner cases around '\k' as named back-references vs. identity
// escapes.
assertTrue(/\k<a>(?<=>)a/.test("k<a>a"));
assertTrue(/\k<a>(?<!a)a/.test("k<a>a"));
assertTrue(/\k<a>(<a>x)/.test("k<a><a>x"));
assertTrue(/\k<a>(?<a>x)/.test("x"));
assertThrows("/\\k<a>(?<b>x)/", SyntaxError);
assertThrows("/\\k<a(?<a>.)/", SyntaxError);
assertThrows("/\\k(?<a>.)/", SyntaxError);
// Basic named groups.
assertEquals(["a", "a"], "bab".match(/(?<a>a)/u));
assertEquals(["a", "a"], "bab".match(/(?<a42>a)/u));
......@@ -43,6 +72,17 @@ assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/u));
assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/u));
assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/u));
assertEquals(["a", "a"], "bab".match(/(?<a>a)/));
assertEquals(["a", "a"], "bab".match(/(?<a42>a)/));
assertEquals(["a", "a"], "bab".match(/(?<_>a)/));
assertEquals(["a", "a"], "bab".match(/(?<$>a)/));
assertEquals(["bab", "a"], "bab".match(/.(?<$>a)./));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(.)/));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(?<b>.)/));
assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/));
assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/));
assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a42>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<_>a)/u));
......@@ -81,6 +121,9 @@ assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u));
assertEquals({a: "a", b: "b"},
"bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u).groups);
assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/));
assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/));
// Reference properties.
assertEquals("a", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.a);
assertEquals("b", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.b);
......@@ -89,6 +132,8 @@ assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>|(?<c>c)/u.exec("aba").groups.c);
// Unicode names.
assertEquals("a", /(?<π>a)/u.exec("bab").groups.π);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").groups.π);
assertEquals("a", /(?<π>a)/u.exec("bab").groups.\u03C0);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").groups.\u03C0);
assertEquals("a", /(?<$>a)/u.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/u.exec("bab").groups._);
......@@ -99,6 +144,14 @@ assertEquals("a", /(?<ಠ_ಠ>a)/u.exec("bab").groups.ಠ_ಠ);
assertThrows('/(?<❤>a)/u', SyntaxError);
assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
assertThrows("/(?<$𐒤>a)/", SyntaxError);
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups._);
assertThrows('/(?<❤>a)/', SyntaxError);
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
// Interaction with lookbehind assertions.
assertEquals(["f", "c"], "abcdef".match(/(?<=(?<a>\w){3})f/u));
assertEquals({a: "c"}, "abcdef".match(/(?<=(?<a>\w){3})f/u).groups);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment