Commit ee94fa11, authored by jgruber, committed by Commit bot

[regexp] Store named captures on the regexp result

This implements storing named captures on the regexp result object, in a
new 'group' property. For instance, /(?<a>.)/u.exec("b") will return a
result such that:

result.group.a  // "b"

The spec proposal is not yet final, so this may still change in the future.

BUG=v8:5437

Review-Url: https://codereview.chromium.org/2630233003
Cr-Original-Commit-Position: refs/heads/master@{#42532}
Committed: https://chromium.googlesource.com/v8/v8/+/70000946eb2a9155679528702a766219a1fcf154
Review-Url: https://codereview.chromium.org/2630233003
Cr-Commit-Position: refs/heads/master@{#42570}
parent 24b9fc3a
...@@ -33,8 +33,9 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler { ...@@ -33,8 +33,9 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler {
void StoreLastIndex(Node* context, Node* regexp, Node* value, void StoreLastIndex(Node* context, Node* regexp, Node* value,
bool is_fastpath); bool is_fastpath);
Node* ConstructNewResultFromMatchInfo(Node* context, Node* match_info, Node* ConstructNewResultFromMatchInfo(Node* const context, Node* const regexp,
Node* string); Node* const match_info,
Node* const string);
Node* RegExpPrototypeExecBodyWithoutResult(Node* const context, Node* RegExpPrototypeExecBodyWithoutResult(Node* const context,
Node* const regexp, Node* const regexp,
...@@ -141,10 +142,10 @@ void RegExpBuiltinsAssembler::StoreLastIndex(Node* context, Node* regexp, ...@@ -141,10 +142,10 @@ void RegExpBuiltinsAssembler::StoreLastIndex(Node* context, Node* regexp,
} }
} }
Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context, Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(
Node* match_info, Node* const context, Node* const regexp, Node* const match_info,
Node* string) { Node* const string) {
Label out(this); Label named_captures(this), out(this);
Node* const num_indices = SmiUntag(LoadFixedArrayElement( Node* const num_indices = SmiUntag(LoadFixedArrayElement(
match_info, RegExpMatchInfo::kNumberOfCapturesIndex)); match_info, RegExpMatchInfo::kNumberOfCapturesIndex));
...@@ -164,7 +165,8 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context, ...@@ -164,7 +165,8 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
StoreFixedArrayElement(result_elements, 0, first, SKIP_WRITE_BARRIER); StoreFixedArrayElement(result_elements, 0, first, SKIP_WRITE_BARRIER);
GotoIf(SmiEqual(num_results, SmiConstant(Smi::FromInt(1))), &out); // If no captures exist we can skip named capture handling as well.
GotoIf(SmiEqual(num_results, SmiConstant(1)), &out);
// Store all remaining captures. // Store all remaining captures.
Node* const limit = IntPtrAdd( Node* const limit = IntPtrAdd(
...@@ -187,7 +189,7 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context, ...@@ -187,7 +189,7 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
Node* const start = LoadFixedArrayElement(match_info, from_cursor); Node* const start = LoadFixedArrayElement(match_info, from_cursor);
Label next_iter(this); Label next_iter(this);
GotoIf(SmiEqual(start, SmiConstant(Smi::FromInt(-1))), &next_iter); GotoIf(SmiEqual(start, SmiConstant(-1)), &next_iter);
Node* const from_cursor_plus1 = IntPtrAdd(from_cursor, IntPtrConstant(1)); Node* const from_cursor_plus1 = IntPtrAdd(from_cursor, IntPtrConstant(1));
Node* const end = LoadFixedArrayElement(match_info, from_cursor_plus1); Node* const end = LoadFixedArrayElement(match_info, from_cursor_plus1);
...@@ -199,7 +201,85 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context, ...@@ -199,7 +201,85 @@ Node* RegExpBuiltinsAssembler::ConstructNewResultFromMatchInfo(Node* context,
Bind(&next_iter); Bind(&next_iter);
var_from_cursor.Bind(IntPtrAdd(from_cursor, IntPtrConstant(2))); var_from_cursor.Bind(IntPtrAdd(from_cursor, IntPtrConstant(2)));
var_to_cursor.Bind(IntPtrAdd(to_cursor, IntPtrConstant(1))); var_to_cursor.Bind(IntPtrAdd(to_cursor, IntPtrConstant(1)));
Branch(UintPtrLessThan(var_from_cursor.value(), limit), &loop, &out); Branch(UintPtrLessThan(var_from_cursor.value(), limit), &loop,
&named_captures);
}
Bind(&named_captures);
{
// We reach this point only if captures exist, implying that this is an
// IRREGEXP JSRegExp.
CSA_ASSERT(this, HasInstanceType(regexp, JS_REGEXP_TYPE));
CSA_ASSERT(this, SmiGreaterThan(num_results, SmiConstant(1)));
// Preparations for named capture properties. Exit early if the result does
// not have any named captures to minimize performance impact.
Node* const data = LoadObjectField(regexp, JSRegExp::kDataOffset);
CSA_ASSERT(this, SmiEqual(LoadFixedArrayElement(data, JSRegExp::kTagIndex),
SmiConstant(JSRegExp::IRREGEXP)));
// The names fixed array associates names at even indices with a capture
// index at odd indices.
Node* const names =
LoadFixedArrayElement(data, JSRegExp::kIrregexpCaptureNameMapIndex);
GotoIf(SmiEqual(names, SmiConstant(0)), &out);
// Allocate a new object to store the named capture properties.
// TODO(jgruber): Could be optimized by adding the object map to the heap
// root list.
Node* const native_context = LoadNativeContext(context);
Node* const object_function =
LoadContextElement(native_context, Context::OBJECT_FUNCTION_INDEX);
Node* const object_function_map = LoadObjectField(
object_function, JSFunction::kPrototypeOrInitialMapOffset);
Node* const group_object = AllocateJSObjectFromMap(object_function_map);
// Store it on the result as a 'group' property.
{
Node* const name = HeapConstant(isolate()->factory()->group_string());
Node* const language_mode = SmiConstant(Smi::FromInt(STRICT));
CallRuntime(Runtime::kSetProperty, context, result, name, group_object,
language_mode);
}
// One or more named captures exist, add a property for each one.
CSA_ASSERT(this, HasInstanceType(names, FIXED_ARRAY_TYPE));
Node* const names_length = LoadAndUntagFixedArrayBaseLength(names);
CSA_ASSERT(this, IntPtrGreaterThan(names_length, IntPtrConstant(0)));
Variable var_i(this, MachineType::PointerRepresentation());
var_i.Bind(IntPtrConstant(0));
Variable* vars[] = {&var_i};
const int vars_count = sizeof(vars) / sizeof(vars[0]);
Label loop(this, vars_count, vars);
Goto(&loop);
Bind(&loop);
{
Node* const i = var_i.value();
Node* const i_plus_1 = IntPtrAdd(i, IntPtrConstant(1));
Node* const i_plus_2 = IntPtrAdd(i_plus_1, IntPtrConstant(1));
Node* const name = LoadFixedArrayElement(names, i);
Node* const index = LoadFixedArrayElement(names, i_plus_1);
Node* const capture =
LoadFixedArrayElement(result_elements, SmiUntag(index));
Node* const language_mode = SmiConstant(Smi::FromInt(STRICT));
CallRuntime(Runtime::kSetProperty, context, group_object, name, capture,
language_mode);
var_i.Bind(i_plus_2);
Branch(IntPtrGreaterThanOrEqual(var_i.value(), names_length), &out,
&loop);
}
} }
Bind(&out); Bind(&out);
...@@ -352,7 +432,7 @@ Node* RegExpBuiltinsAssembler::RegExpPrototypeExecBody(Node* const context, ...@@ -352,7 +432,7 @@ Node* RegExpBuiltinsAssembler::RegExpPrototypeExecBody(Node* const context,
{ {
Node* const match_indices = indices_or_null; Node* const match_indices = indices_or_null;
Node* const result = Node* const result =
ConstructNewResultFromMatchInfo(context, match_indices, string); ConstructNewResultFromMatchInfo(context, regexp, match_indices, string);
var_result.Bind(result); var_result.Bind(result);
Goto(&out); Goto(&out);
} }
...@@ -2522,7 +2602,7 @@ TF_BUILTIN(RegExpInternalMatch, RegExpBuiltinsAssembler) { ...@@ -2522,7 +2602,7 @@ TF_BUILTIN(RegExpInternalMatch, RegExpBuiltinsAssembler) {
Bind(&if_matched); Bind(&if_matched);
{ {
Node* result = Node* result =
ConstructNewResultFromMatchInfo(context, match_indices, string); ConstructNewResultFromMatchInfo(context, regexp, match_indices, string);
Return(result); Return(result);
} }
} }
......
...@@ -88,6 +88,7 @@ ...@@ -88,6 +88,7 @@
V(get_string, "get") \ V(get_string, "get") \
V(get_space_string, "get ") \ V(get_space_string, "get ") \
V(global_string, "global") \ V(global_string, "global") \
V(group_string, "group") \
V(has_string, "has") \ V(has_string, "has") \
V(hour_string, "hour") \ V(hour_string, "hour") \
V(ignoreCase_string, "ignoreCase") \ V(ignoreCase_string, "ignoreCase") \
......
...@@ -770,6 +770,15 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, ...@@ -770,6 +770,15 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
DCHECK(0 < index && index <= captures_started_); DCHECK(0 < index && index <= captures_started_);
DCHECK_NOT_NULL(name); DCHECK_NOT_NULL(name);
// Disallow captures named '__proto__'.
static const char16_t proto_string[] = u"__proto__";
if (name->size() == arraysize(proto_string) - 1) {
if (std::equal(name->begin(), name->end(), &proto_string[0])) {
ReportError(CStrVector("Illegal capture group name"));
return false;
}
}
if (named_captures_ == nullptr) { if (named_captures_ == nullptr) {
named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
} else { } else {
......
...@@ -18,15 +18,15 @@ assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference. ...@@ -18,15 +18,15 @@ assertThrows("/(?<ab>a)\\k<a>/u"); // Invalid reference.
assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference. assertThrows("/\\k<a>(?<ab>a)/u"); // Invalid reference.
// Fallback behavior in non-unicode mode. // Fallback behavior in non-unicode mode.
assertThrows("/(?<>a)/"); assertThrows("/(?<>a)/", SyntaxError);
assertThrows("/(?<aa)/"); assertThrows("/(?<aa)/", SyntaxError);
assertThrows("/(?<42a>a)/"); assertThrows("/(?<42a>a)/", SyntaxError);
assertThrows("/(?<:a>a)/"); assertThrows("/(?<:a>a)/", SyntaxError);
assertThrows("/(?<a:>a)/"); assertThrows("/(?<a:>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<a>a)/"); assertThrows("/(?<a>a)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)(?<b>b)(?<a>a)/"); assertThrows("/(?<a>a)(?<b>b)(?<a>a)/", SyntaxError);
assertThrows("/(?<a>a)\\k<ab>/"); assertThrows("/(?<a>a)\\k<ab>/", SyntaxError);
assertThrows("/(?<ab>a)\\k<a>/"); assertThrows("/(?<ab>a)\\k<a>/", SyntaxError);
assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/)); assertEquals(["k<a>"], "xxxk<a>xxx".match(/\k<a>/));
assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/)); assertEquals(["k<a"], "xxxk<a>xxx".match(/\k<a/));
...@@ -74,3 +74,29 @@ assertEquals(["bab", "b"], "bab".match(/(?<a>\k<a>\w)../u)); ...@@ -74,3 +74,29 @@ assertEquals(["bab", "b"], "bab".match(/(?<a>\k<a>\w)../u));
// Reference before group. // Reference before group.
assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/u)); assertEquals(["bab", "b"], "bab".match(/\k<a>(?<a>b)\w\k<a>/u));
assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u)); assertEquals(["bab", "b", "a"], "bab".match(/(?<b>b)\k<a>(?<a>a)\k<b>/u));
// Reference properties.
assertEquals("a", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.a);
assertEquals("b", /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.b);
assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.c);
assertFalse(/(?<a>a)(?<b>b)\k<a>/u.exec("aba").group.hasOwnProperty("c"));
assertEquals(undefined, /(?<a>a)(?<b>b)\k<a>|(?<c>c)/u.exec("aba").group.c);
assertTrue(/(?<a>a)(?<b>b)\k<a>|(?<c>c)/u
.exec("aba").group.hasOwnProperty("c"));
// Unicode names.
assertEquals("a", /(?<π>a)/u.exec("bab").group.π);
assertEquals("a", /(?<\u{03C0}>a)/u.exec("bab").group.\u03C0);
assertEquals("a", /(?<$>a)/u.exec("bab").group.$);
assertEquals("a", /(?<_>a)/u.exec("bab").group._);
assertEquals("a", /(?<$𐒤>a)/u.exec("bab").group.$𐒤);
assertEquals("a", /(?<_\u200C>a)/u.exec("bab").group._\u200C);
assertEquals("a", /(?<_\u200D>a)/u.exec("bab").group._\u200D);
assertEquals("a", /(?<ಠ_ಠ>a)/u.exec("bab").group.ಠ_ಠ);
assertThrows('/(?<❤>a)/u', SyntaxError);
assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
// Capture name conflicts.
assertThrows(() => /(?<__proto__>a)/u, SyntaxError);
assertEquals("a", /(?<__proto_>a)/u.exec("a").group.__proto_);
assertEquals("a", /(?<__proto___>a)/u.exec("a").group.__proto___);
...@@ -156,6 +156,9 @@ ...@@ -156,6 +156,9 @@
# desugaring regexp property class relies on ICU. # desugaring regexp property class relies on ICU.
'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]], 'harmony/regexp-property-*': [PASS, ['no_i18n == True', FAIL]],
# noi18n build cannot parse characters in supplementary plane.
'harmony/regexp-named-captures': [PASS, ['no_i18n == True', FAIL]],
# Allocates a large array buffer, which TSAN sometimes cannot handle. # Allocates a large array buffer, which TSAN sometimes cannot handle.
'regress/regress-599717': [PASS, ['tsan', SKIP]], 'regress/regress-599717': [PASS, ['tsan', SKIP]],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment