Commit eea168f8 authored by Jakob Gruber's avatar Jakob Gruber Committed by Commit Bot

Reland "[regexp] Correctly escape a backslash-newline sequence"

This is a reland of 7d1f95d6

The reland fixes a performance issue in that we incorrectly marked
every pattern containing a backslash as needing to be escaped,
resulting in a new string allocation instead of reusing the existing
string.

Original change's description:
> [regexp] Correctly escape a backslash-newline sequence
>
> When printing the source string, a backslash-newline sequence ('\\\n',
> '\\\r', '\\\u2028', '\\\u2029') should be formatted as '\n', '\r',
> '\u2028', '\u2029', respectively. Prior to this CL it was formatted as
> a backslash followed by the literal newline character.
>
> Bug: v8:8615
> Change-Id: Iac90195c56ea1707ea8469066b0cc967ea87fc73
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2016583
> Commit-Queue: Jakob Gruber <jgruber@chromium.org>
> Reviewed-by: Georg Neis <neis@chromium.org>
> Auto-Submit: Jakob Gruber <jgruber@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#65986}

Bug: v8:8615,chromium:1046678
Change-Id: I5d75904f1ea543ec679649668e54749821116442
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2074159
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: 's avatarGeorg Neis <neis@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66476}
parent 3bfc6e3c
......@@ -6247,37 +6247,59 @@ void JSRegExp::MarkTierUpForNextExec() {
namespace {
bool IsLineTerminator(int c) {
// Expected to return true for '\n', '\r', 0x2028, and 0x2029.
return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
}
// TODO(jgruber): Consider merging CountAdditionalEscapeChars and
// WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
// and move related code closer to each other.
template <typename Char>
int CountRequiredEscapes(Handle<String> source) {
int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
DisallowHeapAllocation no_gc;
int escapes = 0;
bool needs_escapes = false;
bool in_char_class = false;
Vector<const Char> src = source->GetCharVector<Char>(no_gc);
for (int i = 0; i < src.length(); i++) {
const Char c = src[i];
if (c == '\\') {
// Escape. Skip next character;
i++;
if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
// This '\' is ignored since the next character itself will be escaped.
escapes--;
} else {
// Escape. Skip next character, which will be copied verbatim;
i++;
}
} else if (c == '/' && !in_char_class) {
// Not escaped forward-slash needs escape.
needs_escapes = true;
escapes++;
} else if (c == '[') {
in_char_class = true;
} else if (c == ']') {
in_char_class = false;
} else if (c == '\n') {
needs_escapes = true;
escapes++;
} else if (c == '\r') {
needs_escapes = true;
escapes++;
} else if (static_cast<int>(c) == 0x2028) {
needs_escapes = true;
escapes += std::strlen("\\u2028") - 1;
} else if (static_cast<int>(c) == 0x2029) {
needs_escapes = true;
escapes += std::strlen("\\u2029") - 1;
} else {
DCHECK(!unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c)));
DCHECK(!IsLineTerminator(c));
}
}
DCHECK(!in_char_class);
DCHECK_GE(escapes, 0);
DCHECK_IMPLIES(escapes != 0, needs_escapes);
*needs_escapes_out = needs_escapes;
return escapes;
}
......@@ -6297,33 +6319,42 @@ Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
int d = 0;
bool in_char_class = false;
while (s < src.length()) {
if (src[s] == '\\') {
// Escape. Copy this and next character.
dst[d++] = src[s++];
const Char c = src[s];
if (c == '\\') {
if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
// This '\' is ignored since the next character itself will be escaped.
s++;
continue;
} else {
// Escape. Copy this and next character.
dst[d++] = src[s++];
}
if (s == src.length()) break;
} else if (src[s] == '/' && !in_char_class) {
} else if (c == '/' && !in_char_class) {
// Not escaped forward-slash needs escape.
dst[d++] = '\\';
} else if (src[s] == '[') {
} else if (c == '[') {
in_char_class = true;
} else if (src[s] == ']') {
} else if (c == ']') {
in_char_class = false;
} else if (src[s] == '\n') {
} else if (c == '\n') {
WriteStringToCharVector(dst, &d, "\\n");
s++;
continue;
} else if (src[s] == '\r') {
} else if (c == '\r') {
WriteStringToCharVector(dst, &d, "\\r");
s++;
continue;
} else if (static_cast<int>(src[s]) == 0x2028) {
} else if (static_cast<int>(c) == 0x2028) {
WriteStringToCharVector(dst, &d, "\\u2028");
s++;
continue;
} else if (static_cast<int>(src[s]) == 0x2029) {
} else if (static_cast<int>(c) == 0x2029) {
WriteStringToCharVector(dst, &d, "\\u2029");
s++;
continue;
} else {
DCHECK(!IsLineTerminator(c));
}
dst[d++] = src[s++];
}
......@@ -6337,10 +6368,12 @@ MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
DCHECK(source->IsFlat());
if (source->length() == 0) return isolate->factory()->query_colon_string();
bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
int escapes = one_byte ? CountRequiredEscapes<uint8_t>(source)
: CountRequiredEscapes<uc16>(source);
if (escapes == 0) return source;
int length = source->length() + escapes;
bool needs_escapes = false;
int additional_escape_chars =
one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
: CountAdditionalEscapeChars<uc16>(source, &needs_escapes);
if (!needs_escapes) return source;
int length = source->length() + additional_escape_chars;
if (one_byte) {
Handle<SeqOneByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
......
......@@ -56,6 +56,14 @@ RUNTIME_FUNCTION(Runtime_StrictNotEqual) {
return isolate->heap()->ToBoolean(!x.StrictEquals(y));
}
RUNTIME_FUNCTION(Runtime_ReferenceEqual) {
SealHandleScope scope(isolate);
DCHECK_EQ(2, args.length());
CONVERT_ARG_CHECKED(Object, x, 0);
CONVERT_ARG_CHECKED(Object, y, 1);
return isolate->heap()->ToBoolean(x == y);
}
RUNTIME_FUNCTION(Runtime_LessThan) {
HandleScope scope(isolate);
DCHECK_EQ(2, args.length());
......
......@@ -349,7 +349,8 @@ namespace internal {
F(LessThanOrEqual, 2, 1) \
F(NotEqual, 2, 1) \
F(StrictEqual, 2, 1) \
F(StrictNotEqual, 2, 1)
F(StrictNotEqual, 2, 1) \
F(ReferenceEqual, 2, 1)
#define FOR_EACH_INTRINSIC_PROMISE(F, I) \
F(EnqueueMicrotask, 1, 1) \
......
......@@ -25,6 +25,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Flags: --allow-natives-syntax
function testEscape(str, regex) {
assertEquals("foo:bar:baz", str.split(regex).join(":"));
}
......@@ -835,3 +837,13 @@ assertEquals("[/]]", /[/]]/.source);
assertEquals("[[/]]", /[[/]]/.source);
assertEquals("[[\\/]", /[[\/]/.source);
assertEquals("[[\\/]]", /[[\/]]/.source);
assertEquals("\\n", new RegExp("\\\n").source);
assertEquals("\\r", new RegExp("\\\r").source);
assertEquals("\\u2028", new RegExp("\\\u2028").source);
assertEquals("\\u2029", new RegExp("\\\u2029").source);
{
// No escapes needed, the original string should be reused as `.source`.
const pattern = "\\n";
assertTrue(%ReferenceEqual(pattern, new RegExp(pattern).source));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment