[regexp] Don't attempt to match '^' before the start of the string

This fixes an invalid assumption when emitting code for matching '^' (start of line) in multiline regexps and '\b', '\B' in general.

What we used to do: if the current trace's cp_offset (the offset from the current position) was non-zero, we assumed that we were looking at subject string index 1 or greater (i.e., not at the start of the string or before). This is no longer valid since cp_offsets can now be negative.

This CL changes the logic to omit start- and bounds-checks only for strictly positive cp_offsets, where the above assumption still holds.

Bug: chromium:996391
Change-Id: I79be4fc295c6f0b63e41c13d1e91fdd00f2f2b42
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1771794
Commit-Queue: Erik Corry <erikcorry@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Erik Corry <erikcorry@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63424}

[regexp] Don't attempt to match '^' before the start of the string
This fixes an invalid assumption when emitting code for matching '^' (start of line) in multiline regexps and '\b', '\B' in general.

What we used to do: if the current trace's cp_offset (the offset from the current position) was non-zero, we assumed that we were looking at subject string index 1 or greater (i.e., not at the start of the string or before). This is no longer valid since cp_offsets can now be negative.

This CL changes the logic to omit start- and bounds-checks only for strictly positive cp_offsets, where the above assumption still holds.

Bug: chromium:996391
Change-Id: I79be4fc295c6f0b63e41c13d1e91fdd00f2f2b42
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1771794
Commit-Queue: Erik Corry <erikcorry@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Erik Corry <erikcorry@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63424}
1990b1e1 · Jakob Gruber · Commit Bot · 967d0820 · 1990b1e1 · 1990b1e1
Commit 1990b1e1 authored Aug 28, 2019 by Jakob Gruber Committed by Commit Bot Aug 28, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 61 additions and 21 deletions

regexp-compiler.cc src/regexp/regexp-compiler.cc +52 -21

regress-996391.js test/mjsunit/regress/regress-996391.js +9 -0

No files found.
--- a/src/regexp/regexp-compiler.cc
+++ b/src/regexp/regexp-compiler.cc
@@ -2047,9 +2047,11 @@ void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
  }
 }

+namespace {
+
 // Check for [0-9A-Z_a-z].
-static void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
-                          Label* non_word, bool fall_through_on_word) {
+void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
+                   Label* non_word, bool fall_through_on_word) {
  if (assembler->CheckSpecialCharacterClass(
          fall_through_on_word ? 'w' : 'W',
          fall_through_on_word ? non_word : word)) {
@@ -2071,24 +2073,37 @@ static void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,

 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
 // that matches newline or the start of input).
-static void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success,
-                    Trace* trace) {
+void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
  RegExpMacroAssembler* assembler = compiler->macro_assembler();
-  // We will be loading the previous character into the current character
-  // register.
+
+  // We will load the previous character into the current character register.
  Trace new_trace(*trace);
  new_trace.InvalidateCurrentCharacter();

+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      new_trace.cp_offset() <= 0;
+
  Label ok;
-  if (new_trace.cp_offset() == 0) {
-    // The start of input counts as a newline in this context, so skip to
-    // ok if we are at the start.
-    assembler->CheckAtStart(&ok);
+  if (may_be_at_or_before_subject_string_start) {
+    // The start of input counts as a newline in this context, so skip to ok if
+    // we are at the start.
+    // TODO(jgruber): It would be less awkward to use CheckAtStart here, but
+    // that currently does not support a non-zero cp_offset.
+    Label not_at_start;
+    assembler->CheckNotAtStart(new_trace.cp_offset(), &not_at_start);
+    assembler->GoTo(&ok);
+    assembler->Bind(&not_at_start);
  }
-  // We already checked that we are not at the start of input so it must be
-  // OK to load the previous character.
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
-                                  new_trace.backtrack(), false);
+                                  new_trace.backtrack(), can_skip_bounds_check);
  if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
    // Newline means \n, \r, 0x2028 or 0x2029.
    if (!compiler->one_byte()) {
@@ -2101,6 +2116,8 @@ static void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success,
  on_success->Emit(compiler, &new_trace);
 }

+}  // namespace
+
 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
  RegExpMacroAssembler* assembler = compiler->macro_assembler();
@@ -2156,21 +2173,35 @@ void AssertionNode::BacktrackIfPrevious(
  Trace new_trace(*trace);
  new_trace.InvalidateCurrentCharacter();

-  Label fall_through, dummy;
-
+  Label fall_through;
  Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
                                                        : &fall_through;
  Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
                                                    : new_trace.backtrack();

-  if (new_trace.cp_offset() == 0) {
+  // A positive (> 0) cp_offset means we've already successfully matched a
+  // non-empty-width part of the pattern, and thus cannot be at or before the
+  // start of the subject string. We can thus skip both at-start and
+  // bounds-checks when loading the one-character lookbehind.
+  const bool may_be_at_or_before_subject_string_start =
+      new_trace.cp_offset() <= 0;
+
+  if (may_be_at_or_before_subject_string_start) {
    // The start of input counts as a non-word character, so the question is
    // decided if we are at the start.
-    assembler->CheckAtStart(non_word);
-  }
-  // We already checked that we are not at the start of input so it must be
-  // OK to load the previous character.
-  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
+    // TODO(jgruber): It would be less awkward to use CheckAtStart here, but
+    // that currently does not support a non-zero cp_offset.
+    Label not_at_start;
+    assembler->CheckNotAtStart(new_trace.cp_offset(), &not_at_start);
+    assembler->GoTo(non_word);
+    assembler->Bind(&not_at_start);
+  }
+
+  // If we've already checked that we are not at the start of input, it's okay
+  // to load the previous character without bounds checks.
+  const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
+                                  can_skip_bounds_check);
  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);

  assembler->Bind(&fall_through);

--- a/test/mjsunit/regress/regress-996391.js
+++ b/test/mjsunit/regress/regress-996391.js
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Flags: --allow-natives-syntax --regexp-interpret-all
+
+assertArrayEquals(["o"], /.(?<!^.)/m.exec("foobar"));
+assertArrayEquals(["o"], /.(?<!\b.)/m.exec("foobar"));
+assertArrayEquals(["f"], /.(?<!\B.)/m.exec("foobar"));