Commit 33c78c39 authored by yangguo's avatar yangguo Committed by Commit bot

[regexp] simplify unanchored advance for unicode regexps.

When doing advance at the start of an unanchored unicode regexp,
we do not have to care about surrogate pairs. If we actually advance
into the middle of a surrogate pair, the only choice is to also
consume trail surrogate as nothing else can match from there.

This reduces the emitted code slightly. By not having choice in the
loop, we do not have to push backtrack onto the stack, preventing
stack overflow.

R=erik.corry@gmail.com, erikcorry@chromium.org

Review URL: https://codereview.chromium.org/1676293003

Cr-Commit-Position: refs/heads/master@{#33838}
parent 5082eaee
......@@ -5085,34 +5085,18 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
result->AddAlternative(GuardedAlternative(match));
}
void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success) {
RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
RegExpNode* on_success) {
// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
DCHECK(!compiler->read_backward());
Zone* zone = compiler->zone();
// Advancing can either consume a BMP character or a trail surrogate.
ZoneList<CharacterRange>* bmp_and_trail =
new (zone) ZoneList<CharacterRange>(2, zone);
bmp_and_trail->Add(CharacterRange::Range(0, kLeadSurrogateStart - 1), zone);
bmp_and_trail->Add(
CharacterRange::Range(kLeadSurrogateEnd + 1, kNonBmpStart - 1), zone);
result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
zone, bmp_and_trail, false, on_success)));
// Or it could consume a lead optionally followed by a trail surrogate.
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
ChoiceNode* optional_trail = new (zone) ChoiceNode(2, zone);
optional_trail->AddAlternative(
GuardedAlternative(TextNode::CreateForCharacterRanges(
zone, trail_surrogates, false, on_success)));
optional_trail->AddAlternative(GuardedAlternative(on_success));
RegExpNode* optional_pair = TextNode::CreateForCharacterRanges(
zone, lead_surrogates, false, optional_trail);
result->AddAlternative(GuardedAlternative(optional_pair));
// Advance any character. If the character happens to be a lead surrogate and
// we advanced into the middle of a surrogate pair, it will work out, as
// nothing will match from there. We will have to advance again, consuming
// the associated trail surrogate.
ZoneList<CharacterRange>* range = CharacterRange::List(
zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
}
......@@ -5174,17 +5158,17 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
// No matches possible.
return new (zone) EndNode(EndNode::BACKTRACK, zone);
}
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
if (standard_type() == '*') {
AddUnanchoredAdvance(compiler, result, on_success);
return UnanchoredAdvance(compiler, on_success);
} else {
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
UnicodeRangeSplitter splitter(zone, ranges);
AddBmpCharacters(compiler, result, on_success, &splitter);
AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
return result;
}
return result;
} else {
return new (zone) TextNode(this, compiler->read_backward(), on_success);
}
......
// Copyright 2013 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
var s = "a".repeat(1E7) + "\u1234";
assertEquals(["\u1234", "\u1234"], /(\u1234)/u.exec(s));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment