Commit eea168f8 authored by Jakob Gruber's avatar Jakob Gruber Committed by Commit Bot

Reland "[regexp] Correctly escape a backslash-newline sequence"

This is a reland of 7d1f95d6

The reland fixes a performance issue in that we incorrectly marked
every pattern containing a backslash as needing to be escaped,
resulting in a new string allocation instead of reusing the existing
string.

Original change's description:
> [regexp] Correctly escape a backslash-newline sequence
>
> When printing the source string, a backslash-newline sequence ('\\\n',
> '\\\r', '\\\u2028', '\\\u2029') should be formatted as '\n', '\r',
> '\u2028', '\u2029', respectively. Prior to this CL it was formatted as
> a backslash followed by the literal newline character.
>
> Bug: v8:8615
> Change-Id: Iac90195c56ea1707ea8469066b0cc967ea87fc73
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2016583
> Commit-Queue: Jakob Gruber <jgruber@chromium.org>
> Reviewed-by: Georg Neis <neis@chromium.org>
> Auto-Submit: Jakob Gruber <jgruber@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#65986}

Bug: v8:8615,chromium:1046678
Change-Id: I5d75904f1ea543ec679649668e54749821116442
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2074159
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: 's avatarGeorg Neis <neis@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66476}
parent 3bfc6e3c
...@@ -6247,37 +6247,59 @@ void JSRegExp::MarkTierUpForNextExec() { ...@@ -6247,37 +6247,59 @@ void JSRegExp::MarkTierUpForNextExec() {
namespace { namespace {
bool IsLineTerminator(int c) {
// Expected to return true for '\n', '\r', 0x2028, and 0x2029.
return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
}
// TODO(jgruber): Consider merging CountAdditionalEscapeChars and
// WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
// and move related code closer to each other.
template <typename Char> template <typename Char>
int CountRequiredEscapes(Handle<String> source) { int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
DisallowHeapAllocation no_gc; DisallowHeapAllocation no_gc;
int escapes = 0; int escapes = 0;
bool needs_escapes = false;
bool in_char_class = false; bool in_char_class = false;
Vector<const Char> src = source->GetCharVector<Char>(no_gc); Vector<const Char> src = source->GetCharVector<Char>(no_gc);
for (int i = 0; i < src.length(); i++) { for (int i = 0; i < src.length(); i++) {
const Char c = src[i]; const Char c = src[i];
if (c == '\\') { if (c == '\\') {
// Escape. Skip next character; if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
// This '\' is ignored since the next character itself will be escaped.
escapes--;
} else {
// Escape. Skip next character, which will be copied verbatim;
i++; i++;
}
} else if (c == '/' && !in_char_class) { } else if (c == '/' && !in_char_class) {
// Not escaped forward-slash needs escape. // Not escaped forward-slash needs escape.
needs_escapes = true;
escapes++; escapes++;
} else if (c == '[') { } else if (c == '[') {
in_char_class = true; in_char_class = true;
} else if (c == ']') { } else if (c == ']') {
in_char_class = false; in_char_class = false;
} else if (c == '\n') { } else if (c == '\n') {
needs_escapes = true;
escapes++; escapes++;
} else if (c == '\r') { } else if (c == '\r') {
needs_escapes = true;
escapes++; escapes++;
} else if (static_cast<int>(c) == 0x2028) { } else if (static_cast<int>(c) == 0x2028) {
needs_escapes = true;
escapes += std::strlen("\\u2028") - 1; escapes += std::strlen("\\u2028") - 1;
} else if (static_cast<int>(c) == 0x2029) { } else if (static_cast<int>(c) == 0x2029) {
needs_escapes = true;
escapes += std::strlen("\\u2029") - 1; escapes += std::strlen("\\u2029") - 1;
} else { } else {
DCHECK(!unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c))); DCHECK(!IsLineTerminator(c));
} }
} }
DCHECK(!in_char_class); DCHECK(!in_char_class);
DCHECK_GE(escapes, 0);
DCHECK_IMPLIES(escapes != 0, needs_escapes);
*needs_escapes_out = needs_escapes;
return escapes; return escapes;
} }
...@@ -6297,33 +6319,42 @@ Handle<StringType> WriteEscapedRegExpSource(Handle<String> source, ...@@ -6297,33 +6319,42 @@ Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
int d = 0; int d = 0;
bool in_char_class = false; bool in_char_class = false;
while (s < src.length()) { while (s < src.length()) {
if (src[s] == '\\') { const Char c = src[s];
if (c == '\\') {
if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
// This '\' is ignored since the next character itself will be escaped.
s++;
continue;
} else {
// Escape. Copy this and next character. // Escape. Copy this and next character.
dst[d++] = src[s++]; dst[d++] = src[s++];
}
if (s == src.length()) break; if (s == src.length()) break;
} else if (src[s] == '/' && !in_char_class) { } else if (c == '/' && !in_char_class) {
// Not escaped forward-slash needs escape. // Not escaped forward-slash needs escape.
dst[d++] = '\\'; dst[d++] = '\\';
} else if (src[s] == '[') { } else if (c == '[') {
in_char_class = true; in_char_class = true;
} else if (src[s] == ']') { } else if (c == ']') {
in_char_class = false; in_char_class = false;
} else if (src[s] == '\n') { } else if (c == '\n') {
WriteStringToCharVector(dst, &d, "\\n"); WriteStringToCharVector(dst, &d, "\\n");
s++; s++;
continue; continue;
} else if (src[s] == '\r') { } else if (c == '\r') {
WriteStringToCharVector(dst, &d, "\\r"); WriteStringToCharVector(dst, &d, "\\r");
s++; s++;
continue; continue;
} else if (static_cast<int>(src[s]) == 0x2028) { } else if (static_cast<int>(c) == 0x2028) {
WriteStringToCharVector(dst, &d, "\\u2028"); WriteStringToCharVector(dst, &d, "\\u2028");
s++; s++;
continue; continue;
} else if (static_cast<int>(src[s]) == 0x2029) { } else if (static_cast<int>(c) == 0x2029) {
WriteStringToCharVector(dst, &d, "\\u2029"); WriteStringToCharVector(dst, &d, "\\u2029");
s++; s++;
continue; continue;
} else {
DCHECK(!IsLineTerminator(c));
} }
dst[d++] = src[s++]; dst[d++] = src[s++];
} }
...@@ -6337,10 +6368,12 @@ MaybeHandle<String> EscapeRegExpSource(Isolate* isolate, ...@@ -6337,10 +6368,12 @@ MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
DCHECK(source->IsFlat()); DCHECK(source->IsFlat());
if (source->length() == 0) return isolate->factory()->query_colon_string(); if (source->length() == 0) return isolate->factory()->query_colon_string();
bool one_byte = String::IsOneByteRepresentationUnderneath(*source); bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
int escapes = one_byte ? CountRequiredEscapes<uint8_t>(source) bool needs_escapes = false;
: CountRequiredEscapes<uc16>(source); int additional_escape_chars =
if (escapes == 0) return source; one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
int length = source->length() + escapes; : CountAdditionalEscapeChars<uc16>(source, &needs_escapes);
if (!needs_escapes) return source;
int length = source->length() + additional_escape_chars;
if (one_byte) { if (one_byte) {
Handle<SeqOneByteString> result; Handle<SeqOneByteString> result;
ASSIGN_RETURN_ON_EXCEPTION(isolate, result, ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
......
...@@ -56,6 +56,14 @@ RUNTIME_FUNCTION(Runtime_StrictNotEqual) { ...@@ -56,6 +56,14 @@ RUNTIME_FUNCTION(Runtime_StrictNotEqual) {
return isolate->heap()->ToBoolean(!x.StrictEquals(y)); return isolate->heap()->ToBoolean(!x.StrictEquals(y));
} }
RUNTIME_FUNCTION(Runtime_ReferenceEqual) {
SealHandleScope scope(isolate);
DCHECK_EQ(2, args.length());
CONVERT_ARG_CHECKED(Object, x, 0);
CONVERT_ARG_CHECKED(Object, y, 1);
return isolate->heap()->ToBoolean(x == y);
}
RUNTIME_FUNCTION(Runtime_LessThan) { RUNTIME_FUNCTION(Runtime_LessThan) {
HandleScope scope(isolate); HandleScope scope(isolate);
DCHECK_EQ(2, args.length()); DCHECK_EQ(2, args.length());
......
...@@ -349,7 +349,8 @@ namespace internal { ...@@ -349,7 +349,8 @@ namespace internal {
F(LessThanOrEqual, 2, 1) \ F(LessThanOrEqual, 2, 1) \
F(NotEqual, 2, 1) \ F(NotEqual, 2, 1) \
F(StrictEqual, 2, 1) \ F(StrictEqual, 2, 1) \
F(StrictNotEqual, 2, 1) F(StrictNotEqual, 2, 1) \
F(ReferenceEqual, 2, 1)
#define FOR_EACH_INTRINSIC_PROMISE(F, I) \ #define FOR_EACH_INTRINSIC_PROMISE(F, I) \
F(EnqueueMicrotask, 1, 1) \ F(EnqueueMicrotask, 1, 1) \
......
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Flags: --allow-natives-syntax
function testEscape(str, regex) { function testEscape(str, regex) {
assertEquals("foo:bar:baz", str.split(regex).join(":")); assertEquals("foo:bar:baz", str.split(regex).join(":"));
} }
...@@ -835,3 +837,13 @@ assertEquals("[/]]", /[/]]/.source); ...@@ -835,3 +837,13 @@ assertEquals("[/]]", /[/]]/.source);
assertEquals("[[/]]", /[[/]]/.source); assertEquals("[[/]]", /[[/]]/.source);
assertEquals("[[\\/]", /[[\/]/.source); assertEquals("[[\\/]", /[[\/]/.source);
assertEquals("[[\\/]]", /[[\/]]/.source); assertEquals("[[\\/]]", /[[\/]]/.source);
assertEquals("\\n", new RegExp("\\\n").source);
assertEquals("\\r", new RegExp("\\\r").source);
assertEquals("\\u2028", new RegExp("\\\u2028").source);
assertEquals("\\u2029", new RegExp("\\\u2029").source);
{
// No escapes needed, the original string should be reused as `.source`.
const pattern = "\\n";
assertTrue(%ReferenceEqual(pattern, new RegExp(pattern).source));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment