Commit e709aa24 authored by yangguo's avatar yangguo Committed by Commit bot

[regexp] implement character classes for unicode regexps.

We divide character ranges into
- BMP, matched normally.
- non-BMP, matched as alternatives of surrogate pair ranges.
- lone surrogates, matched with a lookaround assertion that it is indeed lone.

R=erik.corry@gmail.com
BUG=v8:2952
LOG=N

Committed: https://crrev.com/ea820ad5fa282a323a86fe20e64f83ee67ba5f04
Cr-Commit-Position: refs/heads/master@{#33432}

Review URL: https://codereview.chromium.org/1578253005

Cr-Commit-Position: refs/heads/master@{#33437}
parent fe19b11e
......@@ -8880,6 +8880,7 @@ class String: public Name {
static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
static const int kMaxUtf16CodeUnit = 0xffff;
static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
static const uc32 kMaxCodePoint = 0x10ffff;
// Value of hash field containing computed hash equal to zero.
static const int kEmptyStringHash = kIsNotArrayIndexMask;
......
......@@ -3,6 +3,7 @@
// found in the LICENSE file.
#include "src/ostreams.h"
#include "src/objects.h"
#if V8_OS_WIN
#if _MSC_VER < 1900
......@@ -60,6 +61,16 @@ std::ostream& PrintUC16(std::ostream& os, uint16_t c, bool (*pred)(uint16_t)) {
return os << buf;
}
// Writes the 32-bit character value |c| to |os|.
// Values that fit in a single UTF-16 code unit are handed to PrintUC16
// together with |pred|; larger values are written as a "\u{xxxxxx}" escape
// with at least six zero-padded hex digits.
std::ostream& PrintUC32(std::ostream& os, int32_t c, bool (*pred)(uint16_t)) {
  if (c <= String::kMaxUtf16CodeUnit) {
    return PrintUC16(os, static_cast<uint16_t>(c), pred);
  }
  // Worst case: "\u{" (3) + 8 hex digits + "}" (1) + NUL (1) == 13 bytes.
  char buf[13];
  // %x consumes an unsigned int, so convert explicitly instead of passing a
  // signed value through the varargs.
  snprintf(buf, sizeof(buf), "\\u{%06x}", static_cast<unsigned int>(c));
  return os << buf;
}
} // namespace
......@@ -81,5 +92,10 @@ std::ostream& operator<<(std::ostream& os, const AsUC16& c) {
return PrintUC16(os, c.value, IsPrint);
}
// Streams the value wrapped in |c| via PrintUC32, using IsPrint as the
// predicate.
std::ostream& operator<<(std::ostream& os, const AsUC32& c) {
  const int32_t wrapped_value = c.value;
  return PrintUC32(os, wrapped_value, IsPrint);
}
} // namespace internal
} // namespace v8
......@@ -50,6 +50,12 @@ struct AsUC16 {
};
// Thin wrapper around a 32-bit character value; it exists so that a
// dedicated operator<< overload can be selected when streaming the value.
struct AsUC32 {
  // The wrapped value, stored exactly as passed to the constructor.
  int32_t value;

  explicit AsUC32(int32_t v) : value(v) {}
};
struct AsReversiblyEscapedUC16 {
explicit AsReversiblyEscapedUC16(uint16_t v) : value(v) {}
uint16_t value;
......@@ -73,6 +79,10 @@ std::ostream& operator<<(std::ostream& os, const AsEscapedUC16ForJSON& c);
// of printable ASCII range.
std::ostream& operator<<(std::ostream& os, const AsUC16& c);
// Writes the given character to the output escaping everything outside
// of printable ASCII range.
std::ostream& operator<<(std::ostream& os, const AsUC32& c);
} // namespace internal
} // namespace v8
......
This diff is collapsed.
......@@ -265,28 +265,28 @@ class DispatchTable : public ZoneObject {
class Entry {
public:
Entry() : from_(0), to_(0), out_set_(NULL) { }
Entry(uc16 from, uc16 to, OutSet* out_set)
: from_(from), to_(to), out_set_(out_set) { }
uc16 from() { return from_; }
uc16 to() { return to_; }
void set_to(uc16 value) { to_ = value; }
Entry(uc32 from, uc32 to, OutSet* out_set)
: from_(from), to_(to), out_set_(out_set) {}
uc32 from() { return from_; }
uc32 to() { return to_; }
void set_to(uc32 value) { to_ = value; }
void AddValue(int value, Zone* zone) {
out_set_ = out_set_->Extend(value, zone);
}
OutSet* out_set() { return out_set_; }
private:
uc16 from_;
uc16 to_;
uc32 from_;
uc32 to_;
OutSet* out_set_;
};
class Config {
public:
typedef uc16 Key;
typedef uc32 Key;
typedef Entry Value;
static const uc16 kNoKey;
static const uc32 kNoKey;
static const Entry NoValue() { return Value(); }
static inline int Compare(uc16 a, uc16 b) {
static inline int Compare(uc32 a, uc32 b) {
if (a == b)
return 0;
else if (a < b)
......@@ -297,7 +297,7 @@ class DispatchTable : public ZoneObject {
};
void AddRange(CharacterRange range, int value, Zone* zone);
OutSet* Get(uc16 value);
OutSet* Get(uc32 value);
void Dump();
template <typename Callback>
......@@ -315,6 +315,34 @@ class DispatchTable : public ZoneObject {
};
// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
class UnicodeRangeSplitter {
 public:
  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
  void Call(uc32 from, DispatchTable::Entry entry);

  // Accessors for the per-category range lists. All of them are const so
  // they can be used uniformly through a const splitter.
  ZoneList<CharacterRange>* bmp() const { return bmp_; }
  ZoneList<CharacterRange>* lead_surrogates() const {
    return lead_surrogates_;
  }
  ZoneList<CharacterRange>* trail_surrogates() const {
    return trail_surrogates_;
  }
  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }

 private:
  // Category identifiers.
  // NOTE(review): presumably these are the values stored in table_ for the
  // base ranges and for each overlay category -- confirm in the .cc file.
  static const int kBase = 0;
  // Separate ranges into the four categories below.
  static const int kBmpCodePoints = 1;
  static const int kLeadSurrogates = 2;
  static const int kTrailSurrogates = 3;
  static const int kNonBmpCodePoints = 4;

  Zone* zone_;
  DispatchTable table_;
  ZoneList<CharacterRange>* bmp_;
  ZoneList<CharacterRange>* lead_surrogates_;
  ZoneList<CharacterRange>* trail_surrogates_;
  ZoneList<CharacterRange>* non_bmp_;
};
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
......@@ -690,6 +718,17 @@ class TextNode: public SeqRegExpNode {
read_backward_(read_backward) {
elms_->Add(TextElement::CharClass(that), zone());
}
// Create TextNode for a single character class for the given ranges.
static TextNode* CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success);
// Create TextNode for a surrogate pair with a range given for the
// lead and the trail surrogate each.
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success);
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
......@@ -813,8 +852,7 @@ class BackReferenceNode: public SeqRegExpNode {
class EndNode: public RegExpNode {
public:
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
explicit EndNode(Action action, Zone* zone)
: RegExpNode(zone), action_(action) { }
EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find,
......@@ -1505,8 +1543,8 @@ class RegExpEngine: public AllStatic {
};
static CompilationResult Compile(Isolate* isolate, Zone* zone,
RegExpCompileData* input, bool ignore_case,
bool global, bool multiline, bool sticky,
RegExpCompileData* input,
JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte);
......
......@@ -172,9 +172,9 @@ void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
os_ << AsUC16(that.from());
os_ << AsUC32(that.from());
if (!that.IsSingleton()) {
os_ << "-" << AsUC16(that.to());
os_ << "-" << AsUC32(that.to());
}
}
......
......@@ -5,6 +5,7 @@
#ifndef V8_REGEXP_REGEXP_AST_H_
#define V8_REGEXP_REGEXP_AST_H_
#include "src/objects.h"
#include "src/utils.h"
#include "src/zone.h"
......@@ -77,33 +78,38 @@ class CharacterRange {
CharacterRange() : from_(0), to_(0) {}
// For compatibility with the CHECK_OK macro
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
Zone* zone);
static Vector<const int> GetWordBounds();
static inline CharacterRange Singleton(uc16 value) {
static inline CharacterRange Singleton(uc32 value) {
return CharacterRange(value, value);
}
static inline CharacterRange Range(uc16 from, uc16 to) {
DCHECK(from <= to);
static inline CharacterRange Range(uc32 from, uc32 to) {
DCHECK(0 <= from && to <= String::kMaxCodePoint);
DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
return CharacterRange(from, to);
}
static inline CharacterRange Everything() {
return CharacterRange(0, 0xFFFF);
return CharacterRange(0, String::kMaxCodePoint);
}
bool Contains(uc16 i) { return from_ <= i && i <= to_; }
uc16 from() const { return from_; }
void set_from(uc16 value) { from_ = value; }
uc16 to() const { return to_; }
void set_to(uc16 value) { to_ = value; }
static inline ZoneList<CharacterRange>* List(Zone* zone,
CharacterRange range) {
ZoneList<CharacterRange>* list =
new (zone) ZoneList<CharacterRange>(1, zone);
list->Add(range, zone);
return list;
}
bool Contains(uc32 i) { return from_ <= i && i <= to_; }
uc32 from() const { return from_; }
void set_from(uc32 value) { from_ = value; }
uc32 to() const { return to_; }
void set_to(uc32 value) { to_ = value; }
bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
void AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, bool is_one_byte);
static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
ZoneList<CharacterRange>** included,
ZoneList<CharacterRange>** excluded, Zone* zone);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
......@@ -119,8 +125,8 @@ class CharacterRange {
static const int kPayloadMask = (1 << 24) - 1;
private:
uc16 from_;
uc16 to_;
uc32 from_;
uc32 to_;
};
......@@ -287,6 +293,7 @@ class RegExpCharacterClass final : public RegExpTree {
RegExpCharacterClass* AsCharacterClass() override;
bool IsCharacterClass() override;
bool IsTextElement() override { return true; }
bool NeedsDesugaringForUnicode(Zone* zone);
int min_match() override { return 1; }
int max_match() override { return 1; }
void AppendToText(RegExpText* text, Zone* zone) override;
......@@ -451,6 +458,22 @@ class RegExpLookaround final : public RegExpTree {
int capture_from() { return capture_from_; }
Type type() { return type_; }
class Builder {
public:
Builder(bool is_positive, RegExpNode* on_success,
int stack_pointer_register, int position_register,
int capture_register_count = 0, int capture_register_start = 0);
RegExpNode* on_match_success() { return on_match_success_; }
RegExpNode* ForMatch(RegExpNode* match);
private:
bool is_positive_;
RegExpNode* on_match_success_;
RegExpNode* on_success_;
int stack_pointer_register_;
int position_register_;
};
private:
RegExpTree* body_;
bool is_positive_;
......
This diff is collapsed.
......@@ -99,13 +99,15 @@ class BufferedZoneList {
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
public:
explicit RegExpBuilder(Zone* zone);
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max,
......@@ -113,14 +115,21 @@ class RegExpBuilder : public ZoneObject {
RegExpTree* ToRegExp();
private:
static const uc16 kNoPendingSurrogate = 0;
void AddLeadSurrogate(uc16 lead_surrogate);
void AddTrailSurrogate(uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters();
void FlushText();
void FlushTerms();
Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
Zone* zone_;
bool pending_empty_;
JSRegExp::Flags flags_;
ZoneList<uc16>* characters_;
uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
......@@ -135,12 +144,11 @@ class RegExpBuilder : public ZoneObject {
class RegExpParser BASE_EMBEDDED {
public:
RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
bool unicode, Isolate* isolate, Zone* zone);
RegExpParser(FlatStringReader* in, Handle<String>* error,
JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
bool multiline, bool unicode,
RegExpCompileData* result);
JSRegExp::Flags flags, RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
......@@ -183,6 +191,8 @@ class RegExpParser BASE_EMBEDDED {
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
static bool IsSyntaxCharacter(uc32 c);
......@@ -203,9 +213,10 @@ class RegExpParser BASE_EMBEDDED {
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index, Zone* zone)
int disjunction_capture_index, JSRegExp::Flags flags,
Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone)),
builder_(new (zone) RegExpBuilder(zone, flags)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
......@@ -249,6 +260,8 @@ class RegExpParser BASE_EMBEDDED {
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
template <bool update_position>
uc32 ReadNext();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
......@@ -258,13 +271,12 @@ class RegExpParser BASE_EMBEDDED {
ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_;
uc32 current_;
JSRegExp::Flags flags_;
int next_pos_;
int captures_started_;
// The capture count is only valid after we have scanned for captures.
int capture_count_;
bool has_more_;
bool multiline_;
bool unicode_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
......
......@@ -96,7 +96,7 @@ static bool CheckParse(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
return v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result);
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result);
}
......@@ -106,8 +106,10 @@ static void CheckParseEq(const char* input, const char* expected,
Zone zone;
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, unicode, &result));
JSRegExp::Flags flags = JSRegExp::kNone;
if (unicode) flags |= JSRegExp::kUnicode;
CHECK(v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
&reader, flags, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
std::ostringstream os;
......@@ -125,7 +127,7 @@ static bool CheckSimple(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result));
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
return result.simple;
......@@ -143,7 +145,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result));
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
int min_match = result.tree->min_match();
......@@ -206,8 +208,8 @@ void TestRegExpParser(bool lookbehind) {
}
CheckParseEq("()", "(^ %)");
CheckParseEq("(?=)", "(-> + %)");
CheckParseEq("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows
CheckParseEq("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252
CheckParseEq("[]", "^[\\x00-\\u{10ffff}]"); // Doesn't compile on windows
CheckParseEq("[^]", "[\\x00-\\u{10ffff}]"); // \uffff isn't in codepage 1252
CheckParseEq("[x]", "[x]");
CheckParseEq("[xyz]", "[x y z]");
CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
......@@ -316,6 +318,10 @@ void TestRegExpParser(bool lookbehind) {
CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true);
CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true);
CheckParseEq("\\ud808\\udf45*", "(# 0 - g '\\ud808\\udf45')", true);
CheckParseEq("[\\ud808\\udf45-\\ud809\\udccc]", "[\\u{012345}-\\u{0124cc}]",
true);
CHECK_SIMPLE("", false);
CHECK_SIMPLE("a", true);
CHECK_SIMPLE("a|b", false);
......@@ -454,7 +460,7 @@ static void ExpectError(const char* input,
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(!v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result));
CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree == NULL);
CHECK(!result.error.is_null());
v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
......@@ -523,7 +529,7 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
ZoneList<CharacterRange>* ranges =
new(&zone) ZoneList<CharacterRange>(2, &zone);
CharacterRange::AddClassEscape(c, ranges, &zone);
for (unsigned i = 0; i < (1 << 16); i++) {
for (uc32 i = 0; i < (1 << 16); i++) {
bool in_class = false;
for (int j = 0; !in_class && j < ranges->length(); j++) {
CharacterRange& range = ranges->at(j);
......@@ -550,17 +556,19 @@ static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
Isolate* isolate = CcTest::i_isolate();
FlatStringReader reader(isolate, CStrVector(input));
RegExpCompileData compile_data;
// Accumulate the requested flags. Each flag is OR-ed in so that asking for
// both multiline and unicode keeps both; plain assignment would let the
// unicode flag overwrite the multiline flag.
JSRegExp::Flags flags = JSRegExp::kNone;
if (multiline) flags |= JSRegExp::kMultiline;
if (unicode) flags |= JSRegExp::kUnicode;
if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone,
&reader, multiline, unicode,
&compile_data))
&reader, flags, &compile_data))
return NULL;
Handle<String> pattern = isolate->factory()
->NewStringFromUtf8(CStrVector(input))
.ToHandleChecked();
Handle<String> sample_subject =
isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
RegExpEngine::Compile(isolate, zone, &compile_data, false, false, multiline,
false, pattern, sample_subject, is_one_byte);
RegExpEngine::Compile(isolate, zone, &compile_data, flags, pattern,
sample_subject, is_one_byte);
return compile_data.node;
}
......@@ -1669,7 +1677,7 @@ TEST(CharacterRangeCaseIndependence) {
}
static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
static bool InClass(uc32 c, ZoneList<CharacterRange>* ranges) {
if (ranges == NULL)
return false;
for (int i = 0; i < ranges->length(); i++) {
......@@ -1681,29 +1689,46 @@ static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
}
TEST(CharClassDifference) {
TEST(UnicodeRangeSplitter) {
Zone zone;
ZoneList<CharacterRange>* base =
new(&zone) ZoneList<CharacterRange>(1, &zone);
base->Add(CharacterRange::Everything(), &zone);
Vector<const int> overlay = CharacterRange::GetWordBounds();
ZoneList<CharacterRange>* included = NULL;
ZoneList<CharacterRange>* excluded = NULL;
CharacterRange::Split(base, overlay, &included, &excluded, &zone);
for (int i = 0; i < (1 << 16); i++) {
bool in_base = InClass(i, base);
if (in_base) {
bool in_overlay = false;
for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
if (overlay[j] <= i && i < overlay[j+1])
in_overlay = true;
}
CHECK_EQ(in_overlay, InClass(i, included));
CHECK_EQ(!in_overlay, InClass(i, excluded));
} else {
CHECK(!InClass(i, included));
CHECK(!InClass(i, excluded));
}
UnicodeRangeSplitter splitter(&zone, base);
// Every loop below uses an inclusive upper bound so that the last code point
// of each category (0xdbff, 0xdfff, 0xffff, 0x10ffff) is verified as well.

// BMP, below the surrogate block.
for (uc32 c = 0; c < 0xd800; c++) {
  CHECK(InClass(c, splitter.bmp()));
  CHECK(!InClass(c, splitter.lead_surrogates()));
  CHECK(!InClass(c, splitter.trail_surrogates()));
  CHECK(!InClass(c, splitter.non_bmp()));
}
// Lead surrogates: 0xd800..0xdbff.
for (uc32 c = 0xd800; c <= 0xdbff; c++) {
  CHECK(!InClass(c, splitter.bmp()));
  CHECK(InClass(c, splitter.lead_surrogates()));
  CHECK(!InClass(c, splitter.trail_surrogates()));
  CHECK(!InClass(c, splitter.non_bmp()));
}
// Trail surrogates: 0xdc00..0xdfff.
for (uc32 c = 0xdc00; c <= 0xdfff; c++) {
  CHECK(!InClass(c, splitter.bmp()));
  CHECK(!InClass(c, splitter.lead_surrogates()));
  CHECK(InClass(c, splitter.trail_surrogates()));
  CHECK(!InClass(c, splitter.non_bmp()));
}
// BMP, above the surrogate block: 0xe000..0xffff.
for (uc32 c = 0xe000; c <= 0xffff; c++) {
  CHECK(InClass(c, splitter.bmp()));
  CHECK(!InClass(c, splitter.lead_surrogates()));
  CHECK(!InClass(c, splitter.trail_surrogates()));
  CHECK(!InClass(c, splitter.non_bmp()));
}
// Non-BMP: 0x10000..0x10ffff.
for (uc32 c = 0x10000; c <= 0x10ffff; c++) {
  CHECK(!InClass(c, splitter.bmp()));
  CHECK(!InClass(c, splitter.lead_surrogates()));
  CHECK(!InClass(c, splitter.trail_surrogates()));
  CHECK(InClass(c, splitter.non_bmp()));
}
}
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps --harmony-regexp-lookbehind
// Asserts that running |regexp| against |subject| with exec() yields
// |expectation|. A pattern supplied as a string (primitive or String object)
// is first compiled into a unicode ("u") RegExp.
function execl(expectation, regexp, subject) {
  // `instanceof String` is false for primitive strings, so typeof is checked
  // as well; otherwise a plain string pattern would reach .exec() uncompiled.
  if (typeof regexp === "string" || regexp instanceof String) {
    regexp = new RegExp(regexp, "u");
  }
  assertEquals(expectation, regexp.exec(subject));
}
// Compiles |regexp_source| as a unicode ("u") RegExp and delegates to execl
// to compare its exec() result on |subject| with |expectation|.
function execs(expectation, regexp_source, subject) {
  var compiled = new RegExp(regexp_source, "u");
  execl(expectation, compiled, subject);
}
// Character ranges.
execl(["A"], /[A-D]/u, "A");
execs(["A"], "[A-D]", "A");
execl(["ABCD"], /[A-D]+/u, "ZABCDEF");
execs(["ABCD"], "[A-D]+", "ZABCDEF");
execl(["\u{12345}"], /[\u1234-\u{12345}]/u, "\u{12345}");
execs(["\u{12345}"], "[\u1234-\u{12345}]", "\u{12345}");
execl(null, /[^\u1234-\u{12345}]/u, "\u{12345}");
execs(null, "[^\u1234-\u{12345}]", "\u{12345}");
execl(["\u{1234}"], /[\u1234-\u{12345}]/u, "\u{1234}");
execs(["\u{1234}"], "[\u1234-\u{12345}]", "\u{1234}");
execl(null, /[^\u1234-\u{12345}]/u, "\u{1234}");
execs(null, "[^\u1234-\u{12345}]", "\u{1234}");
execl(null, /[\u1234-\u{12345}]/u, "\u{1233}");
execs(null, "[\u1234-\u{12345}]", "\u{1233}");
execl(["\u{1233}"], /[^\u1234-\u{12345}]/u, "\u{1233}");
execs(["\u{1233}"], "[^\u1234-\u{12345}]", "\u{1233}");
execl(["\u{12346}"], /[^\u1234-\u{12345}]/u, "\u{12346}");
execs(["\u{12346}"], "[^\u1234-\u{12345}]", "\u{12346}");
execl(null, /[\u1234-\u{12345}]/u, "\u{12346}");
execs(null, "[\u1234-\u{12345}]", "\u{12346}");
execl(["\u{12342}"], /[\u{12340}-\u{12345}]/u, "\u{12342}");
execs(["\u{12342}"], "[\u{12340}-\u{12345}]", "\u{12342}");
execl(["\u{12342}"], /[\ud808\udf40-\ud808\udf45]/u, "\u{12342}");
execs(["\u{12342}"], "[\ud808\udf40-\ud808\udf45]", "\u{12342}");
execl(null, /[^\u{12340}-\u{12345}]/u, "\u{12342}");
execs(null, "[^\u{12340}-\u{12345}]", "\u{12342}");
execl(null, /[^\ud808\udf40-\ud808\udf45]/u, "\u{12342}");
execs(null, "[^\ud808\udf40-\ud808\udf45]", "\u{12342}");
execl(["\u{ffff}"], /[\u{ff80}-\u{12345}]/u, "\u{ffff}");
execs(["\u{ffff}"], "[\u{ff80}-\u{12345}]", "\u{ffff}");
execl(["\u{ffff}"], /[\u{ff80}-\ud808\udf45]/u, "\u{ffff}");
execs(["\u{ffff}"], "[\u{ff80}-\ud808\udf45]", "\u{ffff}");
execl(null, /[^\u{ff80}-\u{12345}]/u, "\u{ffff}");
execs(null, "[^\u{ff80}-\u{12345}]", "\u{ffff}");
execl(null, /[^\u{ff80}-\ud808\udf45]/u, "\u{ffff}");
execs(null, "[^\u{ff80}-\ud808\udf45]", "\u{ffff}");
// Lone surrogate
execl(["\ud800"], /[^\u{ff80}-\u{12345}]/u, "\uff99\u{d800}A");
execs(["\udc00"], "[^\u{ff80}-\u{12345}]", "\uff99\u{dc00}A");
execl(["\udc01"], /[\u0100-\u{10ffff}]/u, "A\udc01");
execl(["\udc03"], /[\udc01-\udc03]/u, "\ud801\udc02\udc03");
execl(["\ud801"], /[\ud801-\ud803]/u, "\ud802\udc01\ud801");
// Paired surrogate.
execl(null, /[^\u{ff80}-\u{12345}]/u, "\u{d800}\u{dc00}");
execs(null, "[^\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}");
execl(["\ud800\udc00"], /[\u{ff80}-\u{12345}]/u, "\u{d800}\u{dc00}");
execs(["\ud800\udc00"], "[\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}");
execl(["foo\u{10e6d}bar"], /foo\ud803\ude6dbar/u, "foo\u{10e6d}bar");
// Lone surrogates
execl(["\ud801\ud801"], /\ud801+/u, "\ud801\udc01\ud801\ud801");
execl(["\udc01\udc01"], /\udc01+/u, "\ud801\ud801\udc01\udc01\udc01");
execl(["\udc02\udc03A"], /\W\WA/u, "\ud801\udc01A\udc02\udc03A");
execl(["\ud801\ud802"], /\ud801./u, "\ud801\udc01\ud801\ud802");
execl(["\udc02\udc03A"], /[\ud800-\udfff][\ud800-\udfff]A/u,
"\ud801\udc01A\udc02\udc03A");
// Character classes
execl(null, /\w/u, "\ud801\udc01");
execl(["\ud801"], /[^\w]/, "\ud801\udc01");
execl(["\ud801\udc01"], /[^\w]/u, "\ud801\udc01");
execl(["\ud801"], /\W/, "\ud801\udc01");
execl(["\ud801\udc01"], /\W/u, "\ud801\udc01");
execl(["\ud800X"], /.X/u, "\ud800XaX");
execl(["aX"], /.(?<!\ud800)X/u, "\ud800XaX");
execl(["aX"], /.(?<![\ud800-\ud900])X/u, "\ud800XaX");
execl(null, /[]/u, "\u1234");
execl(["0abc"], /[^]abc/u, "0abc");
execl(["\u1234abc"], /[^]abc/u, "\u1234abc");
execl(["\u{12345}abc"], /[^]abc/u, "\u{12345}abc");
// Backward matches of lone surrogates.
execl(["B", "\ud803A"], /(?<=([\ud800-\ud900]A))B/u,
"\ud801\udc00AB\udc00AB\ud802\ud803AB");
execl(["B", "\udc00A"], /(?<=([\ud800-\u{10300}]A))B/u,
"\ud801\udc00AB\udc00AB\ud802\ud803AB");
execl(["B", "\udc11A"], /(?<=([\udc00-\udd00]A))B/u,
"\ud801\udc00AB\udc11AB\ud802\ud803AB");
execl(["X", "\ud800C"], /(?<=(\ud800\w))X/u,
"\ud800\udc00AX\udc11BX\ud800\ud800CX");
execl(["C", "\ud800\ud800"], /(?<=(\ud800.))\w/u,
"\ud800\udc00AX\udc11BX\ud800\ud800CX");
execl(["X", "\udc01C"], /(?<=(\udc01\w))X/u,
"\ud800\udc01AX\udc11BX\udc01\udc01CX");
execl(["C", "\udc01\udc01"], /(?<=(\udc01.))./u,
"\ud800\udc01AX\udc11BX\udc01\udc01CX");
var L = "\ud800";
var T = "\udc00";
var X = "X";
// Whole-string check: wraps |src| in "^" ... "$", compiles it with the "u"
// flag, and asserts that test(|subject|) equals |expect|.
function testw(expect, src, subject) {
  var anchored_source = "^" + src + "$";
  var matched = new RegExp(anchored_source, "u").test(subject);
  assertEquals(expect, matched);
}
// Prefix check: prepends "^" to |src|, compiles it with the "u" flag, and
// asserts that test(|subject|) equals |expect|.
function tests(expect, src, subject) {
  var anchored_source = "^" + src;
  var matched = new RegExp(anchored_source, "u").test(subject);
  assertEquals(expect, matched);
}
testw(true, X, X);
testw(true, L, L);
testw(true, T, T);
testw(true, L + T, L + T);
testw(true, T + L, T + L);
testw(false, T, L + T);
testw(false, L, L + T);
testw(true, ".(?<=" + L + ")", L);
testw(true, ".(?<=" + T + ")", T);
testw(true, ".(?<=" + L + T + ")", L + T);
testw(true, ".(?<=" + L + T + ")", L + T);
tests(true, ".(?<=" + T + ")", T + L);
tests(false, ".(?<=" + L + ")", L + T);
tests(false, ".(?<=" + T + ")", L + T);
tests(true, "..(?<=" + T + ")", T + T + L);
tests(true, "..(?<=" + T + ")", X + T + L);
tests(true, "...(?<=" + L + ")", X + T + L);
tests(false, "...(?<=" + T + ")", X + L + T)
tests(true, "..(?<=" + L + T + ")", X + L + T)
tests(true, "..(?<=" + L + T + "(?<=" + L + T + "))", X + L + T);
tests(false, "..(?<=" + L + "(" + T + "))", X + L + T);
tests(false, ".*" + L, X + L + T);
tests(true, ".*" + L, X + L + L + T);
tests(false, ".*" + L, X + L + T + L + T);
tests(false, ".*" + T, X + L + T + L + T);
tests(true, ".*" + T, X + L + T + T + L + T);
......@@ -252,6 +252,30 @@ assertFalse(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{23456}"));
assertTrue(new RegExp("\u{12345}{3}", "u").test("\u{12345}\u{12345}\u{12345}"));
assertTrue(/\u{12345}{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertTrue(new RegExp("\u{12345}{3}").test("\u{12345}\udf45\udf45"));
assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\udf45\udf45"));
assertFalse(/\ud808\udf45{3}/u.test("\u{12345}\udf45\udf45"));
assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45"));
assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45"));
// Mixed escapes and literal surrogates.
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10002}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\ud800\udc03-\u{50001}\]+", "u").exec(
"\u{10003}\u{50001}"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment