Commit ae23436c authored by jgruber's avatar jgruber Committed by Commit bot

[regexp] Experimental support for regexp named captures

Named capture groups may be specified using the /(?<name>pattern)/u
syntax, with named backreferences specified as /\k<name>/u. They're
hidden behind the --harmony-regexp-named-captures flag, and are only
enabled for unicode regexps.

R=yangguo@chromium.org
BUG=

Review-Url: https://codereview.chromium.org/2050343002
Cr-Commit-Position: refs/heads/master@{#36986}
parent 5c5985b8
......@@ -2645,6 +2645,7 @@ EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_for_in)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_iterator_close)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_regexp_exec)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_regexp_lookbehind)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_regexp_named_captures)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_regexp_property)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_function_name)
EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_function_sent)
......@@ -3243,6 +3244,7 @@ bool Genesis::InstallExperimentalNatives() {
static const char* harmony_regexp_lookbehind_natives[] = {nullptr};
static const char* harmony_instanceof_natives[] = {nullptr};
static const char* harmony_restrictive_declarations_natives[] = {nullptr};
static const char* harmony_regexp_named_captures_natives[] = {nullptr};
static const char* harmony_regexp_property_natives[] = {nullptr};
static const char* harmony_function_name_natives[] = {nullptr};
static const char* harmony_function_sent_natives[] = {nullptr};
......
......@@ -2385,6 +2385,7 @@ void Factory::SetRegExpIrregexpData(Handle<JSRegExp> regexp,
store->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(0));
store->set(JSRegExp::kIrregexpCaptureCountIndex,
Smi::FromInt(capture_count));
store->set(JSRegExp::kIrregexpCaptureNameMapIndex, uninitialized);
regexp->set_data(*store);
}
......
......@@ -205,6 +205,7 @@ DEFINE_IMPLICATION(es_staging, harmony_async_await)
V(harmony_do_expressions, "harmony do-expressions") \
V(harmony_restrictive_generators, \
"harmony restrictions on generator declarations") \
V(harmony_regexp_named_captures, "harmony regexp named captures") \
V(harmony_regexp_property, "harmony unicode regexp property classes") \
V(harmony_for_in, "harmony for-in syntax") \
V(harmony_async_await, "harmony async-await")
......
......@@ -7935,7 +7935,6 @@ class JSRegExp: public JSObject {
// NOT_COMPILED: Initial value. No data has been stored in the JSRegExp yet.
// ATOM: A simple string to match against using an indexOf operation.
// IRREGEXP: Compiled with Irregexp.
// IRREGEXP_NATIVE: Compiled to native code with Irregexp.
enum Type { NOT_COMPILED, ATOM, IRREGEXP };
enum Flag {
kNone = 0,
......@@ -8028,8 +8027,11 @@ class JSRegExp: public JSObject {
static const int kIrregexpMaxRegisterCountIndex = kDataIndex + 4;
// Number of captures in the compiled regexp.
static const int kIrregexpCaptureCountIndex = kDataIndex + 5;
// Maps names of named capture groups (at indices 2i) to their corresponding
// capture group indices (at indices 2i + 1).
static const int kIrregexpCaptureNameMapIndex = kDataIndex + 6;
static const int kIrregexpDataSize = kIrregexpCaptureCountIndex + 1;
static const int kIrregexpDataSize = kIrregexpCaptureNameMapIndex + 1;
// Offsets directly into the data fixed array.
static const int kDataTagOffset =
......
......@@ -397,6 +397,7 @@ bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
data->set(JSRegExp::code_index(is_one_byte), result.code);
SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
int register_max = IrregexpMaxRegisterCount(*data);
if (result.num_registers > register_max) {
SetIrregexpMaxRegisterCount(*data, result.num_registers);
......@@ -416,6 +417,14 @@ void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
}
// Records the capture name map in the regexp data array at
// JSRegExp::kIrregexpCaptureNameMapIndex. A null handle is stored as Smi zero.
void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
                                           Handle<FixedArray> value) {
  if (!value.is_null()) {
    // A map was supplied; store the underlying FixedArray.
    re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
    return;
  }
  // No map supplied: store the Smi zero placeholder instead.
  re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
}
int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
......
......@@ -196,6 +196,8 @@ class RegExpImpl {
// For acting on the JSRegExp data FixedArray.
static int IrregexpMaxRegisterCount(FixedArray* re);
static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
static void SetIrregexpCaptureNameMap(FixedArray* re,
Handle<FixedArray> value);
static int IrregexpNumberOfCaptures(FixedArray* re);
static int IrregexpNumberOfRegisters(FixedArray* re);
static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
......@@ -1530,6 +1532,7 @@ struct RegExpCompileData {
RegExpNode* node;
bool simple;
bool contains_anchor;
Handle<FixedArray> capture_name_map;
Handle<String> error;
int capture_count;
};
......
......@@ -7,6 +7,7 @@
#include "src/objects.h"
#include "src/utils.h"
#include "src/zone-containers.h"
#include "src/zone.h"
namespace v8 {
......@@ -412,7 +413,8 @@ class RegExpQuantifier final : public RegExpTree {
class RegExpCapture final : public RegExpTree {
public:
explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
explicit RegExpCapture(int index)
: body_(NULL), index_(index), name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
static RegExpNode* ToNode(RegExpTree* body, int index,
......@@ -427,12 +429,15 @@ class RegExpCapture final : public RegExpTree {
RegExpTree* body() { return body_; }
void set_body(RegExpTree* body) { body_ = body; }
int index() { return index_; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
static int StartRegister(int index) { return index * 2; }
static int EndRegister(int index) { return index * 2 + 1; }
private:
RegExpTree* body_;
int index_;
const ZoneVector<uc16>* name_;
};
......@@ -489,7 +494,9 @@ class RegExpLookaround final : public RegExpTree {
class RegExpBackReference final : public RegExpTree {
public:
explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
explicit RegExpBackReference(RegExpCapture* capture)
: capture_(capture), name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpBackReference* AsBackReference() override;
......@@ -500,9 +507,13 @@ class RegExpBackReference final : public RegExpTree {
int max_match() override { return kInfinity; }
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
void set_capture(RegExpCapture* capture) { capture_ = capture; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
private:
RegExpCapture* capture_;
const ZoneVector<uc16>* name_;
};
......
This diff is collapsed.
......@@ -222,13 +222,15 @@ class RegExpParser BASE_EMBEDDED {
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index, bool ignore_case,
int disjunction_capture_index,
const ZoneVector<uc16>* capture_name, bool ignore_case,
bool unicode, Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
disjunction_capture_index_(disjunction_capture_index),
capture_name_(capture_name) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
bool IsSubexpression() { return previous_state_ != NULL; }
......@@ -242,9 +244,16 @@ class RegExpParser BASE_EMBEDDED {
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
// The name of the current sub-expression, if group_type is CAPTURE. Only
// used for named captures.
const ZoneVector<uc16>* capture_name() { return capture_name_; }
bool IsNamedCapture() const { return capture_name_ != nullptr; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index);
// Check whether the parser is inside a capture group with the given name.
bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
private:
// Linked list implementation of stack of states.
......@@ -257,11 +266,32 @@ class RegExpParser BASE_EMBEDDED {
RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
// Stored capture name (if any).
const ZoneVector<uc16>* capture_name_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
// Creates a new named capture at the specified index. Must be called exactly
// once for each named capture. Fails if a capture with the same name is
// encountered.
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
// Parses the name of a capture group (?<name>pattern). The name must adhere
// to IdentifierName in the ECMAScript standard.
const ZoneVector<uc16>* ParseCaptureGroupName();
bool ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state);
// After the initial parsing pass, patch corresponding RegExpCapture objects
// into all RegExpBackReferences. This is done after initial parsing in order
// to avoid complicating cases in which references come before the capture.
void PatchNamedBackReferences();
Handle<FixedArray> CreateCaptureNameMap();
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
......@@ -278,6 +308,8 @@ class RegExpParser BASE_EMBEDDED {
Zone* zone_;
Handle<String>* error_;
ZoneList<RegExpCapture*>* captures_;
ZoneList<RegExpCapture*>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
FlatStringReader* in_;
uc32 current_;
bool ignore_case_;
......
......@@ -438,6 +438,23 @@ void TestRegExpParser(bool lookbehind) {
CHECK_MIN_MAX("a(?=b)c", 2, 2);
CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
FLAG_harmony_regexp_named_captures = true;
CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<a>",
"(: (^ 'x') (^ 'x') (^ 'x') (<- 1))", true);
CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<b>",
"(: (^ 'x') (^ 'x') (^ 'x') (<- 2))", true);
CheckParseEq("(?<a>x)(?<b>x)(?<c>x)\\k<c>",
"(: (^ 'x') (^ 'x') (^ 'x') (<- 3))", true);
CheckParseEq("(?<a>a)\\k<a>", "(: (^ 'a') (<- 1))", true);
CheckParseEq("(?<a>a\\k<a>)", "(^ 'a')", true);
CheckParseEq("(?<a>\\k<a>a)", "(^ 'a')", true);
CheckParseEq("(?<a>\\k<b>)(?<b>\\k<a>)", "(: (^ (<- 2)) (^ (<- 1)))", true);
CheckParseEq("\\k<a>(?<a>a)", "(: (<- 1) (^ 'a'))", true);
CheckParseEq("(?<\\u{03C0}>a)", "(^ 'a')", true);
CheckParseEq("(?<\\u03C0>a)", "(^ 'a')", true);
FLAG_harmony_regexp_named_captures = false;
}
......@@ -450,7 +467,6 @@ TEST(ParserWithoutLookbehind) {
TestRegExpParser(true); // Lookbehind enabled.
}
TEST(ParserRegression) {
CheckParseEq("[A-Z$-][x]", "(! [A-Z $ -] [x])");
CheckParseEq("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
......@@ -458,14 +474,16 @@ TEST(ParserRegression) {
CheckParseEq("a|", "(| 'a' %)");
}
static void ExpectError(const char* input,
const char* expected) {
static void ExpectError(const char* input, const char* expected,
bool unicode = false) {
v8::HandleScope scope(CcTest::isolate());
Zone zone(CcTest::i_isolate()->allocator());
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(!v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
JSRegExp::Flags flags = JSRegExp::kNone;
if (unicode) flags |= JSRegExp::kUnicode;
CHECK(!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
&reader, flags, &result));
CHECK(result.tree == NULL);
CHECK(!result.error.is_null());
v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
......@@ -499,6 +517,23 @@ TEST(Errors) {
os << "()";
}
ExpectError(os.str().c_str(), kTooManyCaptures);
FLAG_harmony_regexp_named_captures = true;
const char* kInvalidCaptureName = "Invalid capture group name";
ExpectError("(?<>.)", kInvalidCaptureName, true);
ExpectError("(?<1>.)", kInvalidCaptureName, true);
ExpectError("(?<_%>.)", kInvalidCaptureName, true);
ExpectError("\\k<a", kInvalidCaptureName, true);
const char* kDuplicateCaptureName = "Duplicate capture group name";
ExpectError("(?<a>.)(?<a>.)", kDuplicateCaptureName, true);
const char* kInvalidUnicodeEscape = "Invalid Unicode escape sequence";
ExpectError("(?<\\u{FISK}", kInvalidUnicodeEscape, true);
const char* kInvalidCaptureReferenced = "Invalid named capture referenced";
ExpectError("\\k<a>", kInvalidCaptureReferenced, true);
ExpectError("(?<b>)\\k<a>", kInvalidCaptureReferenced, true);
const char* kInvalidNamedReference = "Invalid named reference";
ExpectError("\\ka", kInvalidNamedReference, true);
FLAG_harmony_regexp_named_captures = false;
}
......
// Copyright 2015 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-named-captures
// Malformed named captures. Every pattern in this group carries the /u flag
// and is expected to throw.
assertThrows("/(?<>a)/u"); // Empty name.
assertThrows("/(?<aa)/u"); // Unterminated name.
assertThrows("/(?<42a>a)/u"); // Name starting with digits.
assertThrows("/(?<:a>a)/u"); // Name starting with invalid char.
assertThrows("/(?<a:>a)/u"); // Name containing an invalid char.
assertThrows("/(?<a>a)(?<a>a)/u"); // Duplicate name.
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/u"); // Duplicate name.
assertThrows("/\\k<a>/u"); // Invalid reference: no group is named "a".
assertThrows("/(?<a>a)\\k<ab>/u"); // Invalid reference: no group is named "ab".
assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference: no group is named "a".
assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference: no group is named "a".
// Fallback behavior in non-unicode mode. Without the /u flag the
// (?<name>...) patterns below are still expected to throw.
assertThrows("/(?<>a)/");
assertThrows("/(?<aa)/");
assertThrows("/(?<42a>a)/");
assertThrows("/(?<:a>a)/");
assertThrows("/(?<a:>a)/");
assertThrows("/(?<a>a)(?<a>a)/");
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/");
assertThrows("/(?<a>a)\\k<ab>/");
assertThrows("/(?<ab>a)\\k<a>/");
// A bare \k<...> without the /u flag matches the literal text after the
// backslash instead of acting as a named back reference.
assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/));
assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/));
// Basic named groups. The expected arrays hold the full match followed by
// each capture in order.
assertEquals(["a", "a"], "bab".match(/(?<a>a)/u));
assertEquals(["a", "a"], "bab".match(/(?<a42>a)/u));
assertEquals(["a", "a"], "bab".match(/(?<_>a)/u));
assertEquals(["a", "a"], "bab".match(/(?<$>a)/u));
assertEquals(["bab", "a"], "bab".match(/.(?<$>a)./u));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(.)/u));
assertEquals(["bab", "a", "b"], "bab".match(/.(?<a>a)(?<b>.)/u));
assertEquals(["bab", "ab"], "bab".match(/.(?<a>\w\w)/u));
assertEquals(["bab", "bab"], "bab".match(/(?<a>\w\w\w)/u));
assertEquals(["bab", "ba", "b"], "bab".match(/(?<a>\w\w)(?<b>\w)/u));
// The same patterns must yield results equal to their unnamed-group
// counterparts.
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<a42>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<_>a)/u));
assertEquals("bab".match(/(a)/u), "bab".match(/(?<$>a)/u));
assertEquals("bab".match(/.(a)./u), "bab".match(/.(?<$>a)./u));
assertEquals("bab".match(/.(a)(.)/u), "bab".match(/.(?<a>a)(.)/u));
assertEquals("bab".match(/.(a)(.)/u), "bab".match(/.(?<a>a)(?<b>.)/u));
assertEquals("bab".match(/.(\w\w)/u), "bab".match(/.(?<a>\w\w)/u));
assertEquals("bab".match(/(\w\w\w)/u), "bab".match(/(?<a>\w\w\w)/u));
assertEquals("bab".match(/(\w\w)(\w)/u), "bab".match(/(?<a>\w\w)(?<b>\w)/u));
// Numbered back references (\1, \2) used together with named groups.
assertEquals(["bab", "b"], "bab".match(/(?<b>b).\1/u));
assertEquals(["baba", "b", "a"], "baba".match(/(.)(?<a>a)\1\2/u));
assertEquals(["baba", "b", "a", "b", "a"],
"baba".match(/(.)(?<a>a)(?<b>\1)(\2)/u));
// Group bodies that themselves begin with '<' or '>'.
assertEquals(["<a", "<"], "<a".match(/(?<lt><)a/u));
assertEquals([">a", ">"], ">a".match(/(?<gt>>)a/u));
// Named references: \k<b> must match the text captured by group "b".
assertEquals(["bab", "b"], "bab".match(/(?<b>.).\k<b>/u));
assertNull("baa".match(/(?<b>.).\k<b>/u));
// Nested groups: captures are listed outermost first.
assertEquals(["bab", "bab", "ab", "b"], "bab".match(/(?<a>.(?<b>.(?<c>.)))/u));
// Reference inside group: \k<a> appears within the body of group "a" itself.
assertEquals(["bab", "b"], "bab".match(/(?<a>\k<a>\w)../u));
// Reference before group: \k<a> appears before group "a" in the pattern.
assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/u));
assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment