Commit 51fcfd58, authored by Jakob Gruber, committed by Commit Bot

[regexp] Don't update last match info in @@split special case

V8 implements a fast-path for RegExp.prototype.split which diverges
from the spec: instead of creating a new sticky regexp instance
`splitter` and running it in a loop, we reuse the existing non-sticky
regexp without looping through each character.

This works fine in most cases, but we run into issues when matching at
the very end of the string. According to the spec, matches at the end
of the string are impossible in @@split, but in our fast-path
implementation they can happen.

The obvious fix would be to remove our fast-path, but this comes with
high performance costs. The fix implemented in this CL adds a special
flag to `exec` so that matches at the end of the string can be treated
as failures. This is only relevant for @@split.

Bug: chromium:1075514
Change-Id: Ifb790ed116793998d7aeb37e307f3f3f764023d3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2681950
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Shu-yu Guo <syg@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72644}
parent fc8743da
......@@ -18,7 +18,6 @@
#include "src/objects/js-regexp-string-iterator.h"
#include "src/objects/js-regexp.h"
#include "src/objects/regexp-match-info.h"
#include "src/regexp/regexp.h"
namespace v8 {
namespace internal {
......@@ -436,7 +435,8 @@ void RegExpBuiltinsAssembler::GetStringPointers(
TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
TNode<Context> context, TNode<JSRegExp> regexp, TNode<String> string,
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info) {
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info,
RegExp::ExecQuirks exec_quirks) {
ToDirectStringAssembler to_direct(state(), string);
TVARIABLE(HeapObject, var_result);
......@@ -676,6 +676,14 @@ TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
BIND(&if_success);
{
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
static constexpr int kMatchStartOffset = 0;
TNode<IntPtrT> value = ChangeInt32ToIntPtr(UncheckedCast<Int32T>(
Load(MachineType::Int32(), static_offsets_vector_address,
IntPtrConstant(kMatchStartOffset))));
GotoIf(UintPtrGreaterThanOrEqual(value, int_string_length), &if_failure);
}
// Check that the last match info has space for the capture registers and
// the additional information. Ensure no overflow in add.
STATIC_ASSERT(FixedArray::kMaxLength < kMaxInt - FixedArray::kLengthOffset);
......@@ -747,15 +755,22 @@ TNode<HeapObject> RegExpBuiltinsAssembler::RegExpExecInternal(
BIND(&retry_experimental);
{
var_result =
CAST(CallRuntime(Runtime::kRegExpExperimentalOneshotExec, context,
regexp, string, last_index, match_info));
auto target_fn =
exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure
? Runtime::kRegExpExperimentalOneshotExecTreatMatchAtEndAsFailure
: Runtime::kRegExpExperimentalOneshotExec;
var_result = CAST(CallRuntime(target_fn, context, regexp, string,
last_index, match_info));
Goto(&out);
}
BIND(&runtime);
{
var_result = CAST(CallRuntime(Runtime::kRegExpExec, context, regexp, string,
auto target_fn =
exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure
? Runtime::kRegExpExecTreatMatchAtEndAsFailure
: Runtime::kRegExpExec;
var_result = CAST(CallRuntime(target_fn, context, regexp, string,
last_index, match_info));
Goto(&out);
}
......@@ -951,6 +966,14 @@ TF_BUILTIN(RegExpExecAtom, RegExpBuiltinsAssembler) {
const TNode<String> needle_string =
CAST(UnsafeLoadFixedArrayElement(data, JSRegExp::kAtomPatternIndex));
// ATOM patterns are guaranteed to not be the empty string (these are
// intercepted and replaced in JSRegExp::Initialize).
//
// This is especially relevant for crbug.com/1075514: atom patterns are
// non-empty and thus guaranteed not to match at the end of the string.
CSA_ASSERT(this, IntPtrGreaterThan(LoadStringLengthAsWord(needle_string),
IntPtrConstant(0)));
const TNode<Smi> match_from =
CAST(CallBuiltin(Builtins::kStringIndexOf, context, subject_string,
needle_string, last_index));
......@@ -1609,9 +1632,9 @@ TNode<JSArray> RegExpBuiltinsAssembler::RegExpPrototypeSplitBody(
const TNode<Object> last_match_info = LoadContextElement(
native_context, Context::REGEXP_LAST_MATCH_INFO_INDEX);
const TNode<HeapObject> match_indices_ho =
CAST(CallBuiltin(Builtins::kRegExpExecInternal, context, regexp, string,
next_search_from, last_match_info));
const TNode<HeapObject> match_indices_ho = RegExpExecInternal(
context, regexp, string, next_search_from, CAST(last_match_info),
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure);
// We're done if no match was found.
{
......@@ -1623,16 +1646,9 @@ TNode<JSArray> RegExpBuiltinsAssembler::RegExpPrototypeSplitBody(
TNode<FixedArray> match_indices = CAST(match_indices_ho);
const TNode<Smi> match_from = CAST(UnsafeLoadFixedArrayElement(
match_indices, RegExpMatchInfo::kFirstCaptureIndex));
// We're done if the match starts beyond the string.
{
Label next(this);
Branch(SmiEqual(match_from, string_length), &push_suffix_and_out, &next);
BIND(&next);
}
const TNode<Smi> match_to = CAST(UnsafeLoadFixedArrayElement(
match_indices, RegExpMatchInfo::kFirstCaptureIndex + 1));
CSA_ASSERT(this, SmiNotEqual(match_from, string_length));
// Advance index and continue if the match is empty.
{
......
......@@ -8,6 +8,7 @@
#include "src/base/optional.h"
#include "src/codegen/code-stub-assembler.h"
#include "src/common/message-template.h"
#include "src/regexp/regexp.h"
namespace v8 {
namespace internal {
......@@ -51,11 +52,10 @@ class RegExpBuiltinsAssembler : public CodeStubAssembler {
TVariable<RawPtrT>* var_string_end);
// Low level logic around the actual call into pattern matching code.
TNode<HeapObject> RegExpExecInternal(TNode<Context> context,
TNode<JSRegExp> regexp,
TNode<String> string,
TNode<Number> last_index,
TNode<RegExpMatchInfo> match_info);
TNode<HeapObject> RegExpExecInternal(
TNode<Context> context, TNode<JSRegExp> regexp, TNode<String> string,
TNode<Number> last_index, TNode<RegExpMatchInfo> match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
TNode<JSRegExpResult> ConstructNewResultFromMatchInfo(
TNode<Context> context, TNode<JSRegExp> regexp,
......
......@@ -214,7 +214,8 @@ int32_t ExperimentalRegExp::MatchForCallFromJs(
MaybeHandle<Object> ExperimentalRegExp::Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int subject_index, Handle<RegExpMatchInfo> last_match_info) {
int subject_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK(FLAG_enable_experimental_regexp_engine);
DCHECK_EQ(regexp->TypeTag(), JSRegExp::EXPERIMENTAL);
#ifdef VERIFY_HEAP
......@@ -248,6 +249,11 @@ MaybeHandle<Object> ExperimentalRegExp::Exec(
if (num_matches > 0) {
DCHECK_EQ(num_matches, 1);
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
} else if (num_matches == 0) {
......@@ -285,7 +291,8 @@ int32_t ExperimentalRegExp::OneshotExecRaw(Isolate* isolate,
MaybeHandle<Object> ExperimentalRegExp::OneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int subject_index, Handle<RegExpMatchInfo> last_match_info) {
int subject_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK(FLAG_enable_experimental_regexp_engine_on_excessive_backtracks);
DCHECK_NE(regexp->TypeTag(), JSRegExp::NOT_COMPILED);
......@@ -306,6 +313,11 @@ MaybeHandle<Object> ExperimentalRegExp::OneshotExec(
if (num_matches > 0) {
DCHECK_EQ(num_matches, 1);
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
} else if (num_matches == 0) {
......
......@@ -36,9 +36,10 @@ class ExperimentalRegExp final : public AllStatic {
Address backtrack_stack,
RegExp::CallOrigin call_origin,
Isolate* isolate, Address regexp);
static MaybeHandle<Object> Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info);
static MaybeHandle<Object> Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static int32_t ExecRaw(Isolate* isolate, RegExp::CallOrigin call_origin,
JSRegExp regexp, String subject,
int32_t* output_registers,
......@@ -48,7 +49,8 @@ class ExperimentalRegExp final : public AllStatic {
// its type tag. The regexp itself is not changed (apart from lastIndex).
static MaybeHandle<Object> OneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static int32_t OneshotExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject,
int32_t* output_registers,
......
......@@ -76,7 +76,8 @@ class RegExpImpl final : public AllStatic {
// Returns an empty handle in case of an exception.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject, bool is_one_byte);
......@@ -268,15 +269,17 @@ bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
// static
MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info) {
int index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
}
// static
MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info) {
Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks) {
switch (regexp->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
......@@ -285,10 +288,10 @@ MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
last_match_info);
case JSRegExp::IRREGEXP:
return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
case JSRegExp::EXPERIMENTAL:
return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
last_match_info);
last_match_info, exec_quirks);
}
}
......@@ -641,7 +644,8 @@ int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
MaybeHandle<Object> RegExpImpl::IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int previous_index, Handle<RegExpMatchInfo> last_match_info) {
int previous_index, Handle<RegExpMatchInfo> last_match_info,
RegExp::ExecQuirks exec_quirks) {
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
subject = String::Flatten(isolate, subject);
......@@ -691,6 +695,11 @@ MaybeHandle<Object> RegExpImpl::IrregexpExec(
output_registers, required_registers);
if (res == RegExp::RE_SUCCESS) {
if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
if (output_registers[0] >= subject->length()) {
return isolate->factory()->null_value();
}
}
int capture_count = regexp->CaptureCount();
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
......
......@@ -86,16 +86,28 @@ class RegExp final : public AllStatic {
kFromJs = 1,
};
// Optional behavior tweaks for RegExp::Exec and
// RegExp::ExperimentalOneshotExec. Both entry points default to kNone.
enum class ExecQuirks {
// No special handling; the default when a caller does not pass a quirk.
kNone,
// Used to work around an issue in the RegExpPrototypeSplit fast path,
// which diverges from the spec by not creating a sticky copy of the RegExp
// instance and calling `exec` in a loop. If called in this context, we
// must not update the last_match_info on a successful match at the subject
// string end. See crbug.com/1075514 for more information.
//
// With this quirk set, the exec implementations return the null value
// (i.e. report "no match") when the match start is >= the subject length.
kTreatMatchAtEndAsFailure,
};
// See ECMA-262 section 15.10.6.2.
// This function calls the garbage collector if necessary.
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
int index, Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks = ExecQuirks::kNone);
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object>
ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info);
Handle<RegExpMatchInfo> last_match_info,
ExecQuirks exec_quirks = ExecQuirks::kNone);
// Integral return values used throughout regexp code layers.
static constexpr int kInternalRegExpFailure = 0;
......
......@@ -861,6 +861,36 @@ RUNTIME_FUNCTION(Runtime_StringSplit) {
return *result;
}
namespace {
// Shared implementation behind the RegExpExec runtime entry points: validates
// `index` against `subject`, bumps the runtime-entry counter and forwards to
// RegExp::Exec with the requested `exec_quirks`.
MaybeHandle<Object> RegExpExec(Isolate* isolate, Handle<JSRegExp> regexp,
                               Handle<String> subject, int32_t index,
                               Handle<RegExpMatchInfo> last_match_info,
                               RegExp::ExecQuirks exec_quirks) {
  // JS-side callers are expected to only pass an index within
  // [0, subject->length()] (so it always fits a Smi). This is enforced with
  // hard CHECKs regardless, as a security measure.
  CHECK_LE(0, index);
  CHECK_GE(subject->length(), index);

  // Account for this call having entered the runtime.
  auto* const counters = isolate->counters();
  counters->regexp_entry_runtime()->Increment();

  MaybeHandle<Object> result = RegExp::Exec(isolate, regexp, subject, index,
                                            last_match_info, exec_quirks);
  return result;
}
// Shared implementation behind the RegExpExperimentalOneshotExec runtime entry
// points: validates `index` against `subject`, bumps the runtime-entry counter
// and forwards to RegExp::ExperimentalOneshotExec with `exec_quirks`.
MaybeHandle<Object> ExperimentalOneshotExec(
    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
    int32_t index, Handle<RegExpMatchInfo> last_match_info,
    RegExp::ExecQuirks exec_quirks) {
  // JS-side callers are expected to only pass an index within
  // [0, subject->length()] (so it always fits a Smi). This is enforced with
  // hard CHECKs regardless, as a security measure.
  CHECK_LE(0, index);
  CHECK_GE(subject->length(), index);

  // Account for this call having entered the runtime.
  auto* const counters = isolate->counters();
  counters->regexp_entry_runtime()->Increment();

  MaybeHandle<Object> result = RegExp::ExperimentalOneshotExec(
      isolate, regexp, subject, index, last_match_info, exec_quirks);
  return result;
}
} // namespace
RUNTIME_FUNCTION(Runtime_RegExpExec) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
......@@ -868,13 +898,21 @@ RUNTIME_FUNCTION(Runtime_RegExpExec) {
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Due to the way the JS calls are constructed this must be less than the
// length of a string, i.e. it is always a Smi. We check anyway for security.
CHECK_LE(0, index);
CHECK_GE(subject->length(), index);
isolate->counters()->regexp_entry_runtime()->Increment();
RETURN_RESULT_OR_FAILURE(
isolate, RegExp::Exec(isolate, regexp, subject, index, last_match_info));
isolate, RegExpExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kNone));
}
// Runtime entry point that behaves like Runtime_RegExpExec, but passes
// RegExp::ExecQuirks::kTreatMatchAtEndAsFailure to the shared RegExpExec
// helper. RegExpBuiltinsAssembler::RegExpExecInternal selects this function
// for its runtime fallback when invoked with that quirk.
//
// Arguments (4): regexp, subject string, start index (int32), last match info.
RUNTIME_FUNCTION(Runtime_RegExpExecTreatMatchAtEndAsFailure) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0);
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Index validation and counter updates happen inside the RegExpExec helper.
RETURN_RESULT_OR_FAILURE(
isolate, RegExpExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure));
}
RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) {
......@@ -884,14 +922,24 @@ RUNTIME_FUNCTION(Runtime_RegExpExperimentalOneshotExec) {
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Due to the way the JS calls are constructed this must be less than the
// length of a string, i.e. it is always a Smi. We check anyway for security.
CHECK_LE(0, index);
CHECK_GE(subject->length(), index);
isolate->counters()->regexp_entry_runtime()->Increment();
RETURN_RESULT_OR_FAILURE(
isolate, RegExp::ExperimentalOneshotExec(isolate, regexp, subject, index,
last_match_info));
isolate,
ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kNone));
}
// Runtime entry point that behaves like Runtime_RegExpExperimentalOneshotExec,
// but passes RegExp::ExecQuirks::kTreatMatchAtEndAsFailure to the shared
// ExperimentalOneshotExec helper. RegExpBuiltinsAssembler::RegExpExecInternal
// selects this function for its experimental-retry path when invoked with
// that quirk.
//
// Arguments (4): regexp, subject string, start index (int32), last match info.
RUNTIME_FUNCTION(
Runtime_RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure) {
HandleScope scope(isolate);
DCHECK_EQ(4, args.length());
CONVERT_ARG_HANDLE_CHECKED(JSRegExp, regexp, 0);
CONVERT_ARG_HANDLE_CHECKED(String, subject, 1);
CONVERT_INT32_ARG_CHECKED(index, 2);
CONVERT_ARG_HANDLE_CHECKED(RegExpMatchInfo, last_match_info, 3);
// Index validation and counter updates happen inside the
// ExperimentalOneshotExec helper.
RETURN_RESULT_OR_FAILURE(
isolate,
ExperimentalOneshotExec(isolate, regexp, subject, index, last_match_info,
RegExp::ExecQuirks::kTreatMatchAtEndAsFailure));
}
RUNTIME_FUNCTION(Runtime_RegExpBuildIndices) {
......
......@@ -384,16 +384,18 @@ namespace internal {
F(JSProxyGetTarget, 1, 1) \
F(SetPropertyWithReceiver, 4, 1)
#define FOR_EACH_INTRINSIC_REGEXP(F, I) \
I(IsRegExp, 1, 1) \
F(RegExpBuildIndices, 3, 1) \
F(RegExpExec, 4, 1) \
F(RegExpExperimentalOneshotExec, 4, 1) \
F(RegExpExecMultiple, 4, 1) \
F(RegExpInitializeAndCompile, 3, 1) \
F(RegExpReplaceRT, 3, 1) \
F(RegExpSplit, 3, 1) \
F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \
#define FOR_EACH_INTRINSIC_REGEXP(F, I) \
I(IsRegExp, 1, 1) \
F(RegExpBuildIndices, 3, 1) \
F(RegExpExec, 4, 1) \
F(RegExpExecTreatMatchAtEndAsFailure, 4, 1) \
F(RegExpExperimentalOneshotExec, 4, 1) \
F(RegExpExperimentalOneshotExecTreatMatchAtEndAsFailure, 4, 1) \
F(RegExpExecMultiple, 4, 1) \
F(RegExpInitializeAndCompile, 3, 1) \
F(RegExpReplaceRT, 3, 1) \
F(RegExpSplit, 3, 1) \
F(StringReplaceNonGlobalRegExpWithFunction, 3, 1) \
F(StringSplit, 3, 1)
#define FOR_EACH_INTRINSIC_SCOPES(F, I) \
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Regression test for crbug.com/1075514: a match at the very end of the
// subject inside the @@split fast path must be treated as a failure and must
// not update the last match info (observable through RegExp.input).
const re = /$/;

// Splits "a" on `re` and verifies both the result and that the last match
// info was left untouched.
function splitAndCheckLastMatchInfo() {
  assertEquals(["a"], "a".split(re));
  assertEquals("", RegExp.input);
}

// The runtime path (Runtime::kRegExpExec).
splitAndCheckLastMatchInfo();

// Runtime / compilation to generated code.
splitAndCheckLastMatchInfo();

// Generated code.
splitAndCheckLastMatchInfo();

// Once again just because we can.
splitAndCheckLastMatchInfo();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment