Commit 49fda47c authored by yangguo's avatar yangguo Committed by Commit bot

[regexp] back refs must not start/end in the middle of a surrogate pair

R=littledan@chromium.org
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1601653006

Cr-Commit-Position: refs/heads/master@{#33540}
parent 85ba94f2
...@@ -3588,6 +3588,7 @@ class AlternativeGenerationList { ...@@ -3588,6 +3588,7 @@ class AlternativeGenerationList {
AlternativeGeneration a_few_alt_gens_[kAFew]; AlternativeGeneration a_few_alt_gens_[kAFew];
}; };
static const uc32 kRangeEndMarker = 0x110000; static const uc32 kRangeEndMarker = 0x110000;
// The '2' variant has inclusive from and exclusive to. // The '2' variant has inclusive from and exclusive to.
...@@ -4398,6 +4399,11 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { ...@@ -4398,6 +4399,11 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
} }
// We are going to advance backward, so we may end up at the start. // We are going to advance backward, so we may end up at the start.
if (read_backward()) trace->set_at_start(Trace::UNKNOWN); if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
// Check that the back reference does not end inside a surrogate pair.
if (compiler->unicode() && !compiler->one_byte()) {
assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
}
on_success()->Emit(compiler, trace); on_success()->Emit(compiler, trace);
} }
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "src/allocation.h" #include "src/allocation.h"
#include "src/assembler.h" #include "src/assembler.h"
#include "src/regexp/regexp-ast.h" #include "src/regexp/regexp-ast.h"
#include "src/regexp/regexp-macro-assembler.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -19,15 +20,6 @@ class RegExpNode; ...@@ -19,15 +20,6 @@ class RegExpNode;
class RegExpTree; class RegExpTree;
class BoyerMooreLookahead; class BoyerMooreLookahead;
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
class RegExpImpl { class RegExpImpl {
public: public:
// Whether V8 is compiled with native regexp support or not. // Whether V8 is compiled with native regexp support or not.
......
...@@ -88,6 +88,19 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, ...@@ -88,6 +88,19 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
} }
// Emits a check that the position at |cp_offset| is not in the middle of a
// surrogate pair, i.e. it is not the case that the character at |cp_offset|
// is a trail surrogate while the character just before it is a lead
// surrogate. |on_failure| is the label handed to the final range check and
// is the target used when the position does split a pair.
// NOTE(review): the helpers called here are defined outside this function;
// the comments below assume each one branches to its label argument when its
// named condition holds (and that LoadCurrentCharacter branches when the
// offset cannot be loaded) -- confirm against their declarations.
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
Label* on_failure) {
// Local label bound at the end of the emitted sequence; every "this position
// is fine" exit below targets it.
Label ok;
// Check that current character is not a trail surrogate.
LoadCurrentCharacter(cp_offset, &ok);
CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
// Check that previous character is not a lead surrogate.
// Only reached when the character at |cp_offset| is a trail surrogate, so a
// lead surrogate at |cp_offset| - 1 means the position splits a pair.
LoadCurrentCharacter(cp_offset - 1, &ok);
CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
Bind(&ok);
}
#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM. #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
......
...@@ -11,6 +11,13 @@ ...@@ -11,6 +11,13 @@
namespace v8 { namespace v8 {
namespace internal { namespace internal {
// First and last code unit values of the UTF-16 lead (high) surrogate range.
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
// First and last code unit values of the UTF-16 trail (low) surrogate range.
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
// First and last code points outside the Basic Multilingual Plane, i.e. the
// code points that UTF-16 encodes as a lead/trail surrogate pair.
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
struct DisjunctDecisionRow { struct DisjunctDecisionRow {
RegExpCharacterClass cc; RegExpCharacterClass cc;
Label* on_match; Label* on_match;
...@@ -152,6 +159,9 @@ class RegExpMacroAssembler { ...@@ -152,6 +159,9 @@ class RegExpMacroAssembler {
Address byte_offset2, Address byte_offset2,
size_t byte_length, Isolate* isolate); size_t byte_length, Isolate* isolate);
// Check that we are not in the middle of a surrogate pair.
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
// Controls the generation of large inlined constants in the code. // Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; } bool slow_safe() { return slow_safe_compiler_; }
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps --harmony-regexp-lookbehind
// Back reference does not end in the middle of a surrogate pair.
// Expands the shorthand used by the test cases in this file:
//   'L' -> lead surrogate U+D800, 'l' -> lead surrogate U+D801,
//   'T' -> trail surrogate U+DC00, '.' -> the "match anything" class [^].
// The substitutions are applied one after another, in the order listed.
function replace(string) {
  var substitutions = [
    [/L/g, "\ud800"],
    [/l/g, "\ud801"],
    [/T/g, "\udc00"],
    [/\./g, "[^]"]
  ];
  var expanded = string;
  for (var i = 0; i < substitutions.length; i++) {
    expanded = expanded.replace(substitutions[i][0], substitutions[i][1]);
  }
  return expanded;
}
// Runs one case: expands the shorthand (see replace) in the expected result,
// the subject and the pattern source, compiles the pattern with the "u" flag
// and asserts that exec() on the subject yields the expected result.
// A null expectation means "no match" and is passed through unexpanded.
function test(expectation, regexp_source, subject) {
  var expected = (expectation === null) ? null : expectation.map(replace);
  var input = replace(subject);
  var pattern = new RegExp(replace(regexp_source), "u");
  assertEquals(expected, pattern.exec(input));
}
// Back reference does not end in the middle of a surrogate pair.
// In the subjects below, L/l stand for lead surrogates, T for a trail
// surrogate and '.' in a pattern for [^] (see replace). A null expectation
// means the pattern must not match at all.
test(null, "(L)\\1", "LLT");
test(["LLTLl", "L", "l"], "(L).*\\1(.)", "LLTLl");
test(null, "(aL).*\\1", "aLaLT");
test(["aLaLTaLl", "aL", "l"], "(aL).*\\1(.)", "aLaLTaLl");
// Longer subject: the capture must be repeated at a place where the repeat
// is followed by something other than T, otherwise it would end inside "LT".
var s = "TabcLxLTabcLxTabcLTyTabcLz";
test([s, "TabcL", "z"], "([^x]+).*\\1(.)", s);
// Back reference does not start in the middle of a surrogate pair.
test(["TLTabTc", "T", "c"], "(T).*\\1(.)", "TLTabTc");
// Lookbehinds.
test(null, "(?<=\\1(T)x)", "LTTx");
test(["", "b", "T"], "(?<=(.)\\2.*(T)x)", "bTaLTTx");
test(null, "(?<=\\1.*(L)x)", "LTLx");
test(["", "b", "L"], "(?<=(.)\\2.*(L)x)", "bLaLTLx");
// Captures made of lone or paired surrogates, repeated after optional 'x's.
// In each subject a match would appear to require treating one half of an
// "LT" pair as a standalone character, so every case expects null.
test(null, "([^x]+)x*\\1", "LxLT");
test(null, "([^x]+)x*\\1", "TxLT");
test(null, "([^x]+)x*\\1", "LTxL");
test(null, "([^x]+)x*\\1", "LTxT");
test(null, "([^x]+)x*\\1", "xLxLT");
test(null, "([^x]+)x*\\1", "xTxLT");
test(null, "([^x]+)x*\\1", "xLTxL");
test(null, "([^x]+)x*\\1", "xLTxT");
test(null, "([^x]+)x*\\1", "xxxLxxLTxx");
test(null, "([^x]+)x*\\1", "xxxTxxLTxx");
test(null, "([^x]+)x*\\1", "xxxLTxxLxx");
test(null, "([^x]+)x*\\1", "xxxLTxxTxx");
// Positive control: the captured "LTT" is repeated intact, so this matches.
test(["LTTxxLTT", "LTT"], "([^x]+)x*\\1", "xxxLTTxxLTTxx");
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment