Commit 57d202d8 authored by yangguo's avatar yangguo Committed by Commit bot

[regexp] correctly advance zero length matches for global/unicode.

R=erik.corry@gmail.com
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1630633002

Cr-Commit-Position: refs/heads/master@{#33550}
parent 66e2a786
...@@ -805,9 +805,12 @@ Handle<HeapObject> RegExpMacroAssemblerARM::GetCode(Handle<String> source) { ...@@ -805,9 +805,12 @@ Handle<HeapObject> RegExpMacroAssemblerARM::GetCode(Handle<String> source) {
__ cmp(current_input_offset(), Operand::Zero()); __ cmp(current_input_offset(), Operand::Zero());
__ b(eq, &exit_label_); __ b(eq, &exit_label_);
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
__ add(current_input_offset(), __ add(current_input_offset(),
current_input_offset(), current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1)); Operand((mode_ == UC16) ? 2 : 1));
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ b(&load_char_start_regexp); __ b(&load_char_start_regexp);
......
...@@ -998,9 +998,12 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) { ...@@ -998,9 +998,12 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) {
// Offset from the end is zero if we already reached the end. // Offset from the end is zero if we already reached the end.
__ Cbz(current_input_offset(), &return_w0); __ Cbz(current_input_offset(), &return_w0);
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
__ Add(current_input_offset(), __ Add(current_input_offset(),
current_input_offset(), current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1)); Operand((mode_ == UC16) ? 2 : 1));
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ B(&load_char_start_regexp); __ B(&load_char_start_regexp);
......
...@@ -829,13 +829,15 @@ Handle<HeapObject> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) { ...@@ -829,13 +829,15 @@ Handle<HeapObject> RegExpMacroAssemblerIA32::GetCode(Handle<String> source) {
__ test(edi, edi); __ test(edi, edi);
__ j(zero, &exit_label_, Label::kNear); __ j(zero, &exit_label_, Label::kNear);
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
if (mode_ == UC16) { if (mode_ == UC16) {
__ add(edi, Immediate(2)); __ add(edi, Immediate(2));
} else { } else {
__ inc(edi); __ inc(edi);
} }
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ jmp(&load_char_start_regexp); __ jmp(&load_char_start_regexp);
} else { } else {
__ mov(eax, Immediate(SUCCESS)); __ mov(eax, Immediate(SUCCESS));
......
...@@ -47,7 +47,10 @@ int32_t* RegExpImpl::GlobalCache::FetchNext() { ...@@ -47,7 +47,10 @@ int32_t* RegExpImpl::GlobalCache::FetchNext() {
register_array_size_); register_array_size_);
} else { } else {
int last_start_index = last_match[0]; int last_start_index = last_match[0];
if (last_start_index == last_end_index) last_end_index++; if (last_start_index == last_end_index) {
// Zero-length match. Advance by one code point.
last_end_index = AdvanceZeroLength(last_end_index);
}
if (last_end_index > subject_->length()) { if (last_end_index > subject_->length()) {
num_matches_ = 0; // Signal failed match. num_matches_ = 0; // Signal failed match.
return NULL; return NULL;
......
...@@ -638,7 +638,6 @@ Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, ...@@ -638,7 +638,6 @@ Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject, Handle<String> subject,
bool is_global,
Isolate* isolate) Isolate* isolate)
: register_array_(NULL), : register_array_(NULL),
register_array_size_(0), register_array_size_(0),
...@@ -663,7 +662,8 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, ...@@ -663,7 +662,8 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
} }
} }
if (is_global && !interpreted) { DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
if (!interpreted) {
register_array_size_ = register_array_size_ =
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
max_matches_ = register_array_size_ / registers_per_match_; max_matches_ = register_array_size_ / registers_per_match_;
...@@ -692,6 +692,16 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, ...@@ -692,6 +692,16 @@ RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
last_match[1] = 0; last_match[1] = 0;
} }
// Returns the index at which matching should resume after a zero-length match
// that ended at |last_index|. The result is normally |last_index| + 1. When the
// regexp has the unicode flag and |last_index| points at a lead surrogate that
// is immediately followed by a trail surrogate, the result is |last_index| + 2
// so that the whole surrogate pair is skipped.
int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
  const int next_index = last_index + 1;

  // Without the unicode flag we always step by a single code unit.
  const bool unicode = (regexp_->GetFlags() & JSRegExp::kUnicode) != 0;
  if (!unicode) return next_index;

  // A surrogate pair needs two code units; bail out if only one is left.
  if (next_index >= subject_->length()) return next_index;

  if (!unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index))) {
    return next_index;
  }
  if (!unibrow::Utf16::IsTrailSurrogate(subject_->Get(next_index))) {
    return next_index;
  }

  // Advance over the surrogate pair.
  return next_index + 1;
}
// ------------------------------------------------------------------- // -------------------------------------------------------------------
// Implementation of the Irregexp regular expression engine. // Implementation of the Irregexp regular expression engine.
...@@ -6623,6 +6633,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ...@@ -6623,6 +6633,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
bool ignore_case = flags & JSRegExp::kIgnoreCase; bool ignore_case = flags & JSRegExp::kIgnoreCase;
bool is_sticky = flags & JSRegExp::kSticky; bool is_sticky = flags & JSRegExp::kSticky;
bool is_global = flags & JSRegExp::kGlobal; bool is_global = flags & JSRegExp::kGlobal;
bool is_unicode = flags & JSRegExp::kUnicode;
RegExpCompiler compiler(isolate, zone, data->capture_count, flags, RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
is_one_byte); is_one_byte);
...@@ -6742,10 +6753,13 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ...@@ -6742,10 +6753,13 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
} }
if (is_global) { if (is_global) {
macro_assembler.set_global_mode( RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
(data->tree->min_match() > 0) if (data->tree->min_match() > 0) {
? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
: RegExpMacroAssembler::GLOBAL); } else if (is_unicode) {
mode = RegExpMacroAssembler::GLOBAL_UNICODE;
}
macro_assembler.set_global_mode(mode);
} }
return compiler.Assemble(&macro_assembler, return compiler.Assemble(&macro_assembler,
......
...@@ -122,7 +122,6 @@ class RegExpImpl { ...@@ -122,7 +122,6 @@ class RegExpImpl {
public: public:
GlobalCache(Handle<JSRegExp> regexp, GlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject, Handle<String> subject,
bool is_global,
Isolate* isolate); Isolate* isolate);
INLINE(~GlobalCache()); INLINE(~GlobalCache());
...@@ -138,6 +137,8 @@ class RegExpImpl { ...@@ -138,6 +137,8 @@ class RegExpImpl {
INLINE(bool HasException()) { return num_matches_ < 0; } INLINE(bool HasException()) { return num_matches_ < 0; }
private: private:
int AdvanceZeroLength(int last_index);
int num_matches_; int num_matches_;
int max_matches_; int max_matches_;
int current_match_index_; int current_match_index_;
......
...@@ -808,9 +808,12 @@ Handle<HeapObject> RegExpMacroAssemblerMIPS::GetCode(Handle<String> source) { ...@@ -808,9 +808,12 @@ Handle<HeapObject> RegExpMacroAssemblerMIPS::GetCode(Handle<String> source) {
__ Branch(&exit_label_, eq, current_input_offset(), __ Branch(&exit_label_, eq, current_input_offset(),
Operand(zero_reg)); Operand(zero_reg));
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
__ Addu(current_input_offset(), __ Addu(current_input_offset(),
current_input_offset(), current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1)); Operand((mode_ == UC16) ? 2 : 1));
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ Branch(&load_char_start_regexp); __ Branch(&load_char_start_regexp);
......
...@@ -848,9 +848,12 @@ Handle<HeapObject> RegExpMacroAssemblerMIPS::GetCode(Handle<String> source) { ...@@ -848,9 +848,12 @@ Handle<HeapObject> RegExpMacroAssemblerMIPS::GetCode(Handle<String> source) {
__ Branch(&exit_label_, eq, current_input_offset(), __ Branch(&exit_label_, eq, current_input_offset(),
Operand(zero_reg)); Operand(zero_reg));
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
__ Daddu(current_input_offset(), __ Daddu(current_input_offset(),
current_input_offset(), current_input_offset(),
Operand((mode_ == UC16) ? 2 : 1)); Operand((mode_ == UC16) ? 2 : 1));
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ Branch(&load_char_start_regexp); __ Branch(&load_char_start_regexp);
......
...@@ -166,21 +166,27 @@ class RegExpMacroAssembler { ...@@ -166,21 +166,27 @@ class RegExpMacroAssembler {
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; } bool slow_safe() { return slow_safe_compiler_; }
enum GlobalMode { NOT_GLOBAL, GLOBAL, GLOBAL_NO_ZERO_LENGTH_CHECK }; enum GlobalMode {
NOT_GLOBAL,
GLOBAL_NO_ZERO_LENGTH_CHECK,
GLOBAL,
GLOBAL_UNICODE
};
// Set whether the regular expression has the global flag. Exiting due to // Set whether the regular expression has the global flag. Exiting due to
// a failure in a global regexp may still mean success overall. // a failure in a global regexp may still mean success overall.
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
inline bool global() { return global_mode_ != NOT_GLOBAL; } inline bool global() { return global_mode_ != NOT_GLOBAL; }
inline bool global_with_zero_length_check() { inline bool global_with_zero_length_check() {
return global_mode_ == GLOBAL; return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
} }
// True only when the global mode has been set to GLOBAL_UNICODE.
inline bool global_unicode() {
  const bool is_global_unicode = (GLOBAL_UNICODE == global_mode_);
  return is_global_unicode;
}
Isolate* isolate() const { return isolate_; } Isolate* isolate() const { return isolate_; }
Zone* zone() const { return zone_; } Zone* zone() const { return zone_; }
private: private:
bool slow_safe_compiler_; bool slow_safe_compiler_;
bool global_mode_; GlobalMode global_mode_;
Isolate* isolate_; Isolate* isolate_;
Zone* zone_; Zone* zone_;
}; };
......
...@@ -877,11 +877,14 @@ Handle<HeapObject> RegExpMacroAssemblerX64::GetCode(Handle<String> source) { ...@@ -877,11 +877,14 @@ Handle<HeapObject> RegExpMacroAssemblerX64::GetCode(Handle<String> source) {
__ testp(rdi, rdi); __ testp(rdi, rdi);
__ j(zero, &exit_label_, Label::kNear); __ j(zero, &exit_label_, Label::kNear);
// Advance current position after a zero-length match. // Advance current position after a zero-length match.
Label advance;
__ bind(&advance);
if (mode_ == UC16) { if (mode_ == UC16) {
__ addq(rdi, Immediate(2)); __ addq(rdi, Immediate(2));
} else { } else {
__ incq(rdi); __ incq(rdi);
} }
if (global_unicode()) CheckNotInSurrogatePair(0, &advance);
} }
__ jmp(&load_char_start_regexp); __ jmp(&load_char_start_regexp);
......
...@@ -492,7 +492,7 @@ MUST_USE_RESULT static Object* StringReplaceGlobalRegExpWithString( ...@@ -492,7 +492,7 @@ MUST_USE_RESULT static Object* StringReplaceGlobalRegExpWithString(
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, true, isolate); RegExpImpl::GlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return isolate->heap()->exception(); if (global_cache.HasException()) return isolate->heap()->exception();
int32_t* current_match = global_cache.FetchNext(); int32_t* current_match = global_cache.FetchNext();
...@@ -568,7 +568,7 @@ MUST_USE_RESULT static Object* StringReplaceGlobalRegExpWithEmptyString( ...@@ -568,7 +568,7 @@ MUST_USE_RESULT static Object* StringReplaceGlobalRegExpWithEmptyString(
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, true, isolate); RegExpImpl::GlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return isolate->heap()->exception(); if (global_cache.HasException()) return isolate->heap()->exception();
int32_t* current_match = global_cache.FetchNext(); int32_t* current_match = global_cache.FetchNext();
...@@ -876,7 +876,7 @@ static Object* SearchRegExpMultiple(Isolate* isolate, Handle<String> subject, ...@@ -876,7 +876,7 @@ static Object* SearchRegExpMultiple(Isolate* isolate, Handle<String> subject,
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, true, isolate); RegExpImpl::GlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return isolate->heap()->exception(); if (global_cache.HasException()) return isolate->heap()->exception();
// Ensured in Runtime_RegExpExecMultiple. // Ensured in Runtime_RegExpExecMultiple.
......
...@@ -341,7 +341,7 @@ RUNTIME_FUNCTION(Runtime_StringMatch) { ...@@ -341,7 +341,7 @@ RUNTIME_FUNCTION(Runtime_StringMatch) {
RUNTIME_ASSERT(regexp_info->HasFastObjectElements()); RUNTIME_ASSERT(regexp_info->HasFastObjectElements());
RegExpImpl::GlobalCache global_cache(regexp, subject, true, isolate); RegExpImpl::GlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return isolate->heap()->exception(); if (global_cache.HasException()) return isolate->heap()->exception();
int capture_count = regexp->CaptureCount(); int capture_count = regexp->CaptureCount();
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Fixtures: L is a lead surrogate and T is a trail surrogate, so L + T is one
// surrogate pair occupying two code units. The subject L + T + L + T used
// below therefore has four code units.
var L = "\ud800";
var T = "\udc00";
var x = "x";
var r = /()/g; // Global, but not unicode.
// Zero-length matches do not advance lastIndex: after exec, lastIndex stays at
// the position where the empty match was found, even inside a surrogate pair.
assertEquals(["", ""], r.exec(L + T + L + T));
assertEquals(0, r.lastIndex);
r.lastIndex = 1;
assertEquals(["", ""], r.exec(L + T + L + T));
assertEquals(1, r.lastIndex);
var u = /()/ug; // Global and unicode.
// Zero-length matches do not advance lastIndex.
assertEquals(["", ""], u.exec(L + T + L + T));
assertEquals(0, u.lastIndex);
// Starting at index 1 (between L and T of the first pair), the unicode regexp
// is expected to report the empty match at index 0 rather than at index 1.
u.lastIndex = 1;
assertEquals(["", ""], u.exec(L + T + L + T));
assertEquals(0, u.lastIndex);
// However, with repeating matches, lastIndex does not matter.
// We do advance from match to match. Without the unicode flag the empty match
// is found before every code unit and at the end, i.e. also between L and T.
r.lastIndex = 2;
assertEquals(x + L + x + T + x + L + x + T + x,
(L + T + L + T).replace(r, "x"));
// With unicode flag, we advance code point by code point, so surrogate pairs
// are never split by a replacement.
u.lastIndex = 3;
assertEquals(x + L + T + x + L + T + x,
(L + T + L + T).replace(u, "x"));
// Test that exhausting the global match cache is fine.
assertEquals((x + L + T).repeat(1000) + x,
(L + T).repeat(1000).replace(u, "x"));
// Same thing for RegExp.prototype.match: the preset lastIndex is irrelevant.
// Non-unicode yields five empty matches (one per code unit boundary).
r.lastIndex = 1;
assertEquals(["","","","",""], (L + T + L + T).match(r));
r.lastIndex = 2;
assertEquals(["","","","",""], (L + T + L + T).match(r));
// Unicode yields three empty matches (one per code point boundary).
u.lastIndex = 1;
assertEquals(["","",""], (L + T + L + T).match(u));
u.lastIndex = 2;
assertEquals(["","",""], (L + T + L + T).match(u));
// 1000 surrogate pairs produce 1001 empty matches in unicode mode.
var expected = [];
for (var i = 0; i <= 1000; i++) expected.push("");
assertEquals(expected, (L + T).repeat(1000).match(u));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment