Commit a3be9e78 authored by jgruber, committed by Commit bot

[regexp] Allow named captures and back-references in non-unicode patterns

Previously, named captures (and related functionality) were restricted to
unicode-mode regexps.

This CL extends that support to non-unicode patterns. Named groups are
supported regardless of the mode, and named back-references are supported if
the regexp is in unicode mode or if it contains a named capture (otherwise '\k'
is treated as an identity escape).

BUG=v8:5437,v8:6192

Review-Url: https://codereview.chromium.org/2788873002
Cr-Commit-Position: refs/heads/master@{#44324}
parent ee81214a
...@@ -40,6 +40,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, ...@@ -40,6 +40,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
simple_(false), simple_(false),
contains_anchor_(false), contains_anchor_(false),
is_scanned_for_captures_(false), is_scanned_for_captures_(false),
has_named_captures_(false),
failed_(false) { failed_(false) {
DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall); DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
Advance(); Advance();
...@@ -325,7 +326,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -325,7 +326,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break; break;
} }
} }
if (FLAG_harmony_regexp_named_captures && unicode()) { if (FLAG_harmony_regexp_named_captures) {
has_named_captures_ = true;
is_named_capture = true; is_named_capture = true;
Advance(); Advance();
break; break;
...@@ -541,7 +543,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -541,7 +543,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break; break;
} }
case 'k': case 'k':
if (FLAG_harmony_regexp_named_captures && unicode()) { // Either an identity escape or a named back-reference. The two
// interpretations are mutually exclusive: '\k' is interpreted as
// an identity escape for non-unicode patterns without named
// capture groups, and as the beginning of a named back-reference
// in all other cases.
if (FLAG_harmony_regexp_named_captures &&
(unicode() || HasNamedCaptures())) {
Advance(2); Advance(2);
ParseNamedBackReference(builder, state CHECK_FAILED); ParseNamedBackReference(builder, state CHECK_FAILED);
break; break;
...@@ -657,6 +665,8 @@ static bool IsSpecialClassEscape(uc32 c) { ...@@ -657,6 +665,8 @@ static bool IsSpecialClassEscape(uc32 c) {
// noncapturing parentheses and can skip character classes and backslash-escaped // noncapturing parentheses and can skip character classes and backslash-escaped
// characters. // characters.
void RegExpParser::ScanForCaptures() { void RegExpParser::ScanForCaptures() {
DCHECK(!is_scanned_for_captures_);
const int saved_position = position();
// Start with captures started previous to current position // Start with captures started previous to current position
int capture_count = captures_started(); int capture_count = captures_started();
// Add count of captures after this position. // Add count of captures after this position.
...@@ -692,11 +702,19 @@ void RegExpParser::ScanForCaptures() { ...@@ -692,11 +702,19 @@ void RegExpParser::ScanForCaptures() {
Advance(); Advance();
if (current() != '<') break; if (current() != '<') break;
// TODO(jgruber): To be more future-proof we could test for if (FLAG_harmony_regexp_lookbehind) {
// IdentifierStart here once it becomes clear whether group names // TODO(jgruber): To be more future-proof we could test for
// allow unicode escapes. // IdentifierStart here once it becomes clear whether group names
Advance(); // allow unicode escapes.
if (current() == '=' || current() == '!') break; // https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance();
if (current() == '=' || current() == '!') break;
}
// Found a possible named capture. It could turn out to be a syntax
// error (e.g. an unterminated or invalid name), but that distinction
// does not matter for our purposes.
has_named_captures_ = true;
} }
capture_count++; capture_count++;
break; break;
...@@ -704,6 +722,7 @@ void RegExpParser::ScanForCaptures() { ...@@ -704,6 +722,7 @@ void RegExpParser::ScanForCaptures() {
} }
capture_count_ = capture_count; capture_count_ = capture_count;
is_scanned_for_captures_ = true; is_scanned_for_captures_ = true;
Reset(saved_position);
} }
...@@ -729,11 +748,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) { ...@@ -729,11 +748,7 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
} }
} }
if (value > captures_started()) { if (value > captures_started()) {
if (!is_scanned_for_captures_) { if (!is_scanned_for_captures_) ScanForCaptures();
int saved_position = position();
ScanForCaptures();
Reset(saved_position);
}
if (value > capture_count_) { if (value > capture_count_) {
Reset(start); Reset(start);
return false; return false;
...@@ -754,7 +769,6 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { ...@@ -754,7 +769,6 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
DCHECK(FLAG_harmony_regexp_named_captures); DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK(unicode());
ZoneVector<uc16>* name = ZoneVector<uc16>* name =
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
...@@ -766,6 +780,8 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { ...@@ -766,6 +780,8 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
// Convert unicode escapes. // Convert unicode escapes.
if (c == '\\' && current() == 'u') { if (c == '\\' && current() == 'u') {
// TODO(jgruber): Reconsider this once the spec has settled.
// https://github.com/tc39/proposal-regexp-named-groups/issues/23
Advance(); Advance();
if (!ParseUnicodeEscape(&c)) { if (!ParseUnicodeEscape(&c)) {
ReportError(CStrVector("Invalid Unicode escape sequence")); ReportError(CStrVector("Invalid Unicode escape sequence"));
...@@ -798,7 +814,6 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { ...@@ -798,7 +814,6 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
int index) { int index) {
DCHECK(FLAG_harmony_regexp_named_captures); DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK(unicode());
DCHECK(0 < index && index <= captures_started_); DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name); DCHECK_NOT_NULL(name);
...@@ -806,6 +821,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, ...@@ -806,6 +821,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
} else { } else {
// Check for duplicates and bail if we find any. // Check for duplicates and bail if we find any.
// TODO(jgruber): O(n^2).
for (const auto& named_capture : *named_captures_) { for (const auto& named_capture : *named_captures_) {
if (*named_capture->name() == *name) { if (*named_capture->name() == *name) {
ReportError(CStrVector("Duplicate capture group name")); ReportError(CStrVector("Duplicate capture group name"));
...@@ -920,6 +936,16 @@ Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { ...@@ -920,6 +936,16 @@ Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
return array; return array;
} }
// Reports whether the pattern contains a named capture group.
//
// The cached flag is trusted when it is already true, or when the capture
// scan has already run. Otherwise the answer is still unknown, so
// ScanForCaptures() is run first; it records whether a possible named
// capture was seen and marks the scan as done.
bool RegExpParser::HasNamedCaptures() {
  const bool answer_unknown =
      !has_named_captures_ && !is_scanned_for_captures_;
  if (answer_unknown) {
    ScanForCaptures();
    DCHECK(is_scanned_for_captures_);
  }
  return has_named_captures_;
}
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue; if (s->group_type() != CAPTURE) continue;
......
...@@ -293,6 +293,10 @@ class RegExpParser BASE_EMBEDDED { ...@@ -293,6 +293,10 @@ class RegExpParser BASE_EMBEDDED {
Handle<FixedArray> CreateCaptureNameMap(); Handle<FixedArray> CreateCaptureNameMap();
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
bool HasNamedCaptures();
Isolate* isolate() { return isolate_; } Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; } Zone* zone() const { return zone_; }
...@@ -319,12 +323,12 @@ class RegExpParser BASE_EMBEDDED { ...@@ -319,12 +323,12 @@ class RegExpParser BASE_EMBEDDED {
bool unicode_; bool unicode_;
int next_pos_; int next_pos_;
int captures_started_; int captures_started_;
// The capture count is only valid after we have scanned for captures. int capture_count_; // Only valid after we have scanned for captures.
int capture_count_;
bool has_more_; bool has_more_;
bool simple_; bool simple_;
bool contains_anchor_; bool contains_anchor_;
bool is_scanned_for_captures_; bool is_scanned_for_captures_;
bool has_named_captures_; // Only valid after we have scanned for captures.
bool failed_; bool failed_;
}; };
......
// Copyright 2015 the V8 project authors. All rights reserved. // Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// Flags: --harmony-regexp-named-captures --harmony-regexp-lookbehind // Flags: --harmony-regexp-named-captures --harmony-regexp-lookbehind
// Malformed named captures. // Malformed named captures.
assertThrows("/(?<>a)/u"); // Empty name. assertThrows("/(?<>a)/u", SyntaxError); // Empty name.
assertThrows("/(?<aa)/u"); // Unterminated name. assertThrows("/(?<aa)/u", SyntaxError); // Unterminated name.
assertThrows("/(?<42a>a)/u"); // Name starting with digits. assertThrows("/(?<42a>a)/u", SyntaxError); // Name starting with digits.
assertThrows("/(?<:a>a)/u"); // Name starting with invalid char. assertThrows("/(?<:a>a)/u", SyntaxError); // Name starting with invalid char.
assertThrows("/(?<a:>a)/u"); // Name containing invalid char. assertThrows("/(?<a:>a)/u", SyntaxError); // Name containing invalid char.
assertThrows("/(?<a>a)(?<a>a)/u"); // Duplicate name. assertThrows("/(?<a>a)(?<a>a)/u", SyntaxError); // Duplicate name.
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/u"); // Duplicate name. assertThrows("/(?<a>a)(?<b>b)(?<a>a)/u", SyntaxError); // Duplicate name.
assertThrows("/\\k<a>/u"); // Invalid reference. assertThrows("/\\k<a>/u", SyntaxError); // Invalid reference.
assertThrows("/(?<a>a)\\k<ab>/u"); // Invalid reference. assertThrows("/\\k<a/u", SyntaxError); // Unterminated reference.
assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference. assertThrows("/\\k/u", SyntaxError); // Lone \k.
assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference. assertThrows("/(?<a>.)\\k/u", SyntaxError); // Lone \k.
assertThrows("/(?<a>.)\\k<a/u", SyntaxError); // Unterminated reference.
// Fallback behavior in non-unicode mode. assertThrows("/(?<a>.)\\k<b>/u", SyntaxError); // Invalid reference.
assertThrows("/(?<a>a)\\k<ab>/u", SyntaxError); // Invalid reference.
assertThrows("/(?<ab>a)\\k<a>/u", SyntaxError); // Invalid reference.
assertThrows("/\\k<a>(?<ab>a)/u", SyntaxError); // Invalid reference.
assertThrows("/(?<a>\\a)/u", SyntaxError); // Identity escape in capture.
// Behavior in non-unicode mode.
assertThrows("/(?<>a)/", SyntaxError); assertThrows("/(?<>a)/", SyntaxError);
assertThrows("/(?<aa)/", SyntaxError); assertThrows("/(?<aa)/", SyntaxError);
assertThrows("/(?<42a>a)/", SyntaxError); assertThrows("/(?<42a>a)/", SyntaxError);
...@@ -25,12 +31,35 @@ assertThrows("/(?<:a>a)/", SyntaxError); ...@@ -25,12 +31,35 @@ assertThrows("/(?<:a>a)/", SyntaxError);
assertThrows("/(?<a:>a)/", SyntaxError); assertThrows("/(?<a:>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<a>a)/", SyntaxError); assertThrows("/(?<a>a)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/", SyntaxError); assertThrows("/(?<a>a)(?<b>b)(?<a>a)/", SyntaxError);
assertTrue(/\k<a>/.test("k<a>"));
assertTrue(/\k<4>/.test("k<4>"));
assertTrue(/\k<a/.test("k<a"));
assertTrue(/\k/.test("k"));
assertThrows("/(?<a>.)\\k/", SyntaxError);
assertThrows("/(?<a>.)\\k<a/", SyntaxError);
assertThrows("/(?<a>.)\\k<b>/", SyntaxError);
assertThrows("/(?<a>a)\\k<ab>/", SyntaxError); assertThrows("/(?<a>a)\\k<ab>/", SyntaxError);
assertThrows("/(?<ab>a)\\k<a>/", SyntaxError); assertThrows("/(?<ab>a)\\k<a>/", SyntaxError);
assertThrows("/\\k<a>(?<ab>a)/", SyntaxError);
assertThrows("/\\k<a(?<a>a)/", SyntaxError);
assertTrue(/(?<a>\a)/.test("a"));
assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/)); assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/));
assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/)); assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/));
assertEquals({a: "a", b: "b", c: "c"},
/(?<a>.)(?<b>.)(?<c>.)\k<c>\k<b>\k<a>/.exec("abccba").groups);
// A couple of corner cases around '\k' as named back-references vs. identity
// escapes.
assertTrue(/\k<a>(?<=>)a/.test("k<a>a"));
assertTrue(/\k<a>(?<!a)a/.test("k<a>a"));
assertTrue(/\k<a>(<a>x)/.test("k<a><a>x"));
assertTrue(/\k<a>(?<a>x)/.test("x"));
assertThrows("/\\k<a>(?<b>x)/", SyntaxError);
assertThrows("/\\k<a(?<a>.)/", SyntaxError);
assertThrows("/\\k(?<a>.)/", SyntaxError);
// Basic named groups. // Basic named groups.
assertEquals(["a", "a"], "bab".match(/(?<a>a)/u)); assertEquals(["a", "a"], "bab".match(/(?<a>a)/u));
assertEquals(["a", "a"], "bab".match(/(?<a42>a)/u)); assertEquals(["a", "a"], "bab".match(/(?<a42>a)/u));
...@@ -43,6 +72,17 @@ assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/u)); ...@@ -43,6 +72,17 @@ assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/u));
assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/u)); assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/u));
assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/u)); assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/u));
assertEquals(["a", "a"], "bab".match(/(?<a>a)/));
assertEquals(["a", "a"], "bab".match(/(?<a42>a)/));
assertEquals(["a", "a"], "bab".match(/(?<_>a)/));
assertEquals(["a", "a"], "bab".match(/(?<$>a)/));
assertEquals(["bab", "a"], "bab".match(/.(?<$>a)./));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(.)/));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(?<b>.)/));
assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/));
assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/));
assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a>a)/u)); assertEquals("bab".match(/(a)/u), "bab".match(/(?<a>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a42>a)/u)); assertEquals("bab".match(/(a)/u), "bab".match(/(?<a42>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<_>a)/u)); assertEquals("bab".match(/(a)/u), "bab".match(/(?<_>a)/u));
...@@ -81,6 +121,9 @@ assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u)); ...@@ -81,6 +121,9 @@ assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u));
assertEquals({a: "a", b: "b"}, assertEquals({a: "a", b: "b"},
"bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u).groups); "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u).groups);
assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/));
assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/));
// Reference properties. // Reference properties.
assertEquals("a", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.a); assertEquals("a", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.a);
assertEquals("b", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.b); assertEquals("b", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").groups.b);
...@@ -89,6 +132,8 @@ assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>|(?<c>c)/u.exec("aba").groups.c); ...@@ -89,6 +132,8 @@ assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>|(?<c>c)/u.exec("aba").groups.c);
// Unicode names. // Unicode names.
assertEquals("a", /(?<π>a)/u.exec("bab").groups.π); assertEquals("a", /(?<π>a)/u.exec("bab").groups.π);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").groups.π);
assertEquals("a", /(?<π>a)/u.exec("bab").groups.\u03C0);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").groups.\u03C0); assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").groups.\u03C0);
assertEquals("a", /(?<$>a)/u.exec("bab").groups.$); assertEquals("a", /(?<$>a)/u.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/u.exec("bab").groups._); assertEquals("a", /(?<_>a)/u.exec("bab").groups._);
...@@ -99,6 +144,14 @@ assertEquals("a", /(?<ಠ_ಠ>a)/u.exec("bab").groups.ಠ_ಠ); ...@@ -99,6 +144,14 @@ assertEquals("a", /(?<ಠ_ಠ>a)/u.exec("bab").groups.ಠ_ಠ);
assertThrows('/(?<❤>a)/u', SyntaxError); assertThrows('/(?<❤>a)/u', SyntaxError);
assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start. assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
assertThrows("/(?<$𐒤>a)/", SyntaxError);
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
assertThrows('/(?<❤>a)/', SyntaxError);
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
// Interaction with lookbehind assertions. // Interaction with lookbehind assertions.
assertEquals(["f", "c"], "abcdef".match(/(?<=(?<a>\w){3})f/u)); assertEquals(["f", "c"], "abcdef".match(/(?<=(?<a>\w){3})f/u));
assertEquals({a: "c"}, "abcdef".match(/(?<=(?<a>\w){3})f/u).groups); assertEquals({a: "c"}, "abcdef".match(/(?<=(?<a>\w){3})f/u).groups);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment