Commit 46355724 authored by jgruber, committed by Commit bot

[regexp] Consider surrogate pairs when optimizing disjunctions

RationalizeConsecutiveAtoms optimizes ab|ac|az to a(?:b|c|z).
Ensure that this optimization does not split surrogate pairs in unicode
mode.

BUG=chromium:641091

Review-Url: https://codereview.chromium.org/2813893002
Cr-Commit-Position: refs/heads/master@{#44599}
parent 483812d4
......@@ -3327,9 +3327,8 @@ TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
RegExpNode* on_success) {
DCHECK_NOT_NULL(ranges);
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
zone);
elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
zone);
return new (zone) TextNode(elms, read_backward, on_success);
}
......@@ -3341,12 +3340,12 @@ TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
elms->Add(TextElement::CharClass(
new (zone) RegExpCharacterClass(lead_ranges, false)),
zone);
elms->Add(TextElement::CharClass(
new (zone) RegExpCharacterClass(trail_ranges, false)),
zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
zone);
elms->Add(
TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
zone);
return new (zone) TextNode(elms, read_backward, on_success);
}
......@@ -4851,7 +4850,7 @@ static bool CompareRanges(ZoneList<CharacterRange>* ranges,
bool RegExpCharacterClass::is_standard(Zone* zone) {
// TODO(lrn): Remove need for this function, by not throwing away information
// along the way.
if (is_negated_) {
if (is_negated()) {
return false;
}
if (set_.is_standard()) {
......@@ -5144,7 +5143,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
if (compiler->needs_unicode_case_equivalents()) {
AddUnicodeCaseEquivalents(ranges, zone);
}
if (compiler->unicode() && !compiler->one_byte()) {
if (compiler->unicode() && !compiler->one_byte() &&
!contains_split_surrogate()) {
if (is_negated()) {
ZoneList<CharacterRange>* negated =
new (zone) ZoneList<CharacterRange>(2, zone);
......@@ -5154,7 +5154,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
if (ranges->length() == 0) {
ranges->Add(CharacterRange::Everything(), zone);
RegExpCharacterClass* fail =
new (zone) RegExpCharacterClass(ranges, true);
new (zone) RegExpCharacterClass(ranges, NEGATED);
return new (zone) TextNode(fail, compiler->read_backward(), on_success);
}
if (standard_type() == '*') {
......@@ -5368,6 +5368,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
i++;
continue;
}
DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
bool contains_trail_surrogate =
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
int first_in_run = i;
i++;
while (i < length) {
......@@ -5375,6 +5378,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
if (atom->length() != 1) break;
DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
contains_trail_surrogate |=
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
i++;
}
if (i > first_in_run + 1) {
......@@ -5387,8 +5393,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(
DCHECK_EQ(old_atom->length(), 1);
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
}
RegExpCharacterClass::Flags flags;
if (compiler->unicode() && contains_trail_surrogate) {
flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
}
alternatives->at(write_posn++) =
new (zone) RegExpCharacterClass(ranges, false);
new (zone) RegExpCharacterClass(ranges, flags);
} else {
// Just copy any trivial alternatives.
for (int j = first_in_run; j < i; j++) {
......
......@@ -291,9 +291,20 @@ class RegExpAssertion final : public RegExpTree {
class RegExpCharacterClass final : public RegExpTree {
public:
RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
: set_(ranges), is_negated_(is_negated) {}
explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
// NEGATED: The character class is negated and should match everything but
// the specified ranges.
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
// surrogate and should not be unicode-desugared (crbug.com/641091).
enum Flag {
NEGATED = 1 << 0,
CONTAINS_SPLIT_SURROGATE = 1 << 1,
};
typedef base::Flags<Flag> Flags;
explicit RegExpCharacterClass(ZoneList<CharacterRange>* ranges,
Flags flags = Flags())
: set_(ranges), flags_(flags) {}
explicit RegExpCharacterClass(uc16 type) : set_(type), flags_(0) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpCharacterClass* AsCharacterClass() override;
......@@ -322,11 +333,14 @@ class RegExpCharacterClass final : public RegExpTree {
// * : All characters, for advancing unanchored regexp
uc16 standard_type() { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
bool is_negated() { return is_negated_; }
bool is_negated() const { return (flags_ & NEGATED) != 0; }
bool contains_split_surrogate() const {
return (flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
}
private:
CharacterSet set_;
bool is_negated_;
const Flags flags_;
};
......
......@@ -283,8 +283,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
CharacterRange::AddClassEscape('.', ranges, false, zone());
}
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, false);
RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
builder->AddCharacterClass(cc);
break;
}
......@@ -392,7 +391,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
CharacterRange::AddClassEscape(c, ranges,
unicode() && ignore_case(), zone());
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, false);
new (zone()) RegExpCharacterClass(ranges);
builder->AddCharacterClass(cc);
break;
}
......@@ -408,7 +407,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("Invalid property name"));
}
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, false);
new (zone()) RegExpCharacterClass(ranges);
builder->AddCharacterClass(cc);
} else {
// With /u, no identity escapes except for syntax characters
......@@ -1548,7 +1547,9 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
ranges->Add(CharacterRange::Everything(), zone());
is_negated = !is_negated;
}
return new (zone()) RegExpCharacterClass(ranges, is_negated);
RegExpCharacterClass::Flags flags;
if (is_negated) flags = RegExpCharacterClass::NEGATED;
return new (zone()) RegExpCharacterClass(ranges, flags);
}
......@@ -1722,7 +1723,7 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
AddTerm(new (zone()) RegExpCharacterClass(
CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
CharacterRange::List(zone(), CharacterRange::Singleton(c))));
}
......
// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Regression test for crbug.com/641091: with the /u flag, a disjunction whose
// alternatives are astral symbols (each stored as a surrogate pair) must match
// whole code points, including when a BMP character such as π is mixed in.
const subject = '🍤🍦🍋ππ🍋🍦🍤';

// Each entry pairs a global unicode pattern with the matches it must produce
// when run against `subject`.
const cases = [
  { pattern: /🍤/ug, expected: ["🍤", "🍤"] },
  { pattern: /🍤|🍦/ug, expected: ["🍤", "🍦", "🍦", "🍤"] },
  { pattern: /🍤|🍦|🍋/ug, expected: ["🍤", "🍦", "🍋", "🍋", "🍦", "🍤"] },
  { pattern: /🍤|🍦|π|🍋/ug,
    expected: ["🍤", "🍦", "🍋", "π", "π", "🍋", "🍦", "🍤"] },
];

for (const { pattern, expected } of cases) {
  assertEquals(expected, subject.match(pattern));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment