Commit ea820ad5 authored by yangguo's avatar yangguo Committed by Commit bot

[regexp] implement character classes for unicode regexps.

We divide character ranges into
- BMP, matched normally.
- non-BMP, matched as alternatives of surrogate pair ranges.
- lone surrogates, matched with a lookaround assertion that the surrogate is indeed lone.

R=erik.corry@gmail.com
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1578253005

Cr-Commit-Position: refs/heads/master@{#33432}
parent 7f62e122
...@@ -8880,6 +8880,7 @@ class String: public Name { ...@@ -8880,6 +8880,7 @@ class String: public Name {
static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar; static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
static const int kMaxUtf16CodeUnit = 0xffff; static const int kMaxUtf16CodeUnit = 0xffff;
static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit; static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
static const uc32 kMaxCodePoint = 0x10ffff;
// Value of hash field containing computed hash equal to zero. // Value of hash field containing computed hash equal to zero.
static const int kEmptyStringHash = kIsNotArrayIndexMask; static const int kEmptyStringHash = kIsNotArrayIndexMask;
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
// found in the LICENSE file. // found in the LICENSE file.
#include "src/ostreams.h" #include "src/ostreams.h"
#include "src/objects.h"
#if V8_OS_WIN #if V8_OS_WIN
#if _MSC_VER < 1900 #if _MSC_VER < 1900
...@@ -60,6 +61,16 @@ std::ostream& PrintUC16(std::ostream& os, uint16_t c, bool (*pred)(uint16_t)) { ...@@ -60,6 +61,16 @@ std::ostream& PrintUC16(std::ostream& os, uint16_t c, bool (*pred)(uint16_t)) {
return os << buf; return os << buf;
} }
// Prints a 32-bit character value. Values that do not exceed
// String::kMaxUtf16CodeUnit are handed to PrintUC16 (together with the
// printability predicate); larger values are written as \u{xxxxxx}.
std::ostream& PrintUC32(std::ostream& os, int32_t c, bool (*pred)(uint16_t)) {
  if (c > String::kMaxUtf16CodeUnit) {
    // "\u{" + up to 8 hex digits + "}" + terminating NUL fits in 13 bytes.
    char escaped[13];
    snprintf(escaped, sizeof(escaped), "\\u{%06x}", c);
    return os << escaped;
  }
  return PrintUC16(os, static_cast<uint16_t>(c), pred);
}
} // namespace } // namespace
...@@ -81,5 +92,10 @@ std::ostream& operator<<(std::ostream& os, const AsUC16& c) { ...@@ -81,5 +92,10 @@ std::ostream& operator<<(std::ostream& os, const AsUC16& c) {
return PrintUC16(os, c.value, IsPrint); return PrintUC16(os, c.value, IsPrint);
} }
// Streams the value wrapped by AsUC32 through PrintUC32, using IsPrint as the
// printability predicate.
std::ostream& operator<<(std::ostream& os, const AsUC32& c) {
  const int32_t raw = c.value;
  return PrintUC32(os, raw, IsPrint);
}
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
...@@ -50,6 +50,12 @@ struct AsUC16 { ...@@ -50,6 +50,12 @@ struct AsUC16 {
}; };
// Tags a 32-bit character value so that it can be selected by a dedicated
// operator<< overload. Construction is explicit to avoid accidental
// conversions from plain integers.
struct AsUC32 {
  int32_t value;

  explicit AsUC32(int32_t v) : value{v} {}
};
struct AsReversiblyEscapedUC16 { struct AsReversiblyEscapedUC16 {
explicit AsReversiblyEscapedUC16(uint16_t v) : value(v) {} explicit AsReversiblyEscapedUC16(uint16_t v) : value(v) {}
uint16_t value; uint16_t value;
...@@ -73,6 +79,10 @@ std::ostream& operator<<(std::ostream& os, const AsEscapedUC16ForJSON& c); ...@@ -73,6 +79,10 @@ std::ostream& operator<<(std::ostream& os, const AsEscapedUC16ForJSON& c);
// of printable ASCII range. // of printable ASCII range.
std::ostream& operator<<(std::ostream& os, const AsUC16& c); std::ostream& operator<<(std::ostream& os, const AsUC16& c);
// Writes the given character to the output escaping everything outside
// of printable ASCII range.
std::ostream& operator<<(std::ostream& os, const AsUC32& c);
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
......
This diff is collapsed.
...@@ -265,28 +265,28 @@ class DispatchTable : public ZoneObject { ...@@ -265,28 +265,28 @@ class DispatchTable : public ZoneObject {
class Entry { class Entry {
public: public:
Entry() : from_(0), to_(0), out_set_(NULL) { } Entry() : from_(0), to_(0), out_set_(NULL) { }
Entry(uc16 from, uc16 to, OutSet* out_set) Entry(uc32 from, uc32 to, OutSet* out_set)
: from_(from), to_(to), out_set_(out_set) { } : from_(from), to_(to), out_set_(out_set) {}
uc16 from() { return from_; } uc32 from() { return from_; }
uc16 to() { return to_; } uc32 to() { return to_; }
void set_to(uc16 value) { to_ = value; } void set_to(uc32 value) { to_ = value; }
void AddValue(int value, Zone* zone) { void AddValue(int value, Zone* zone) {
out_set_ = out_set_->Extend(value, zone); out_set_ = out_set_->Extend(value, zone);
} }
OutSet* out_set() { return out_set_; } OutSet* out_set() { return out_set_; }
private: private:
uc16 from_; uc32 from_;
uc16 to_; uc32 to_;
OutSet* out_set_; OutSet* out_set_;
}; };
class Config { class Config {
public: public:
typedef uc16 Key; typedef uc32 Key;
typedef Entry Value; typedef Entry Value;
static const uc16 kNoKey; static const uc32 kNoKey;
static const Entry NoValue() { return Value(); } static const Entry NoValue() { return Value(); }
static inline int Compare(uc16 a, uc16 b) { static inline int Compare(uc32 a, uc32 b) {
if (a == b) if (a == b)
return 0; return 0;
else if (a < b) else if (a < b)
...@@ -297,7 +297,7 @@ class DispatchTable : public ZoneObject { ...@@ -297,7 +297,7 @@ class DispatchTable : public ZoneObject {
}; };
void AddRange(CharacterRange range, int value, Zone* zone); void AddRange(CharacterRange range, int value, Zone* zone);
OutSet* Get(uc16 value); OutSet* Get(uc32 value);
void Dump(); void Dump();
template <typename Callback> template <typename Callback>
...@@ -315,6 +315,34 @@ class DispatchTable : public ZoneObject { ...@@ -315,6 +315,34 @@ class DispatchTable : public ZoneObject {
}; };
// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
class UnicodeRangeSplitter {
 public:
  // Builds the splitter for the ranges in |base|, allocating in |zone|.
  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);

  // NOTE(review): the signature matches a per-entry callback for
  // DispatchTable; the caller is not visible here -- confirm.
  void Call(uc32 from, DispatchTable::Entry entry);

  // Accessors for the per-category range lists. All four are const-qualified
  // so that they can be used uniformly, including on a const splitter.
  ZoneList<CharacterRange>* bmp() const { return bmp_; }
  ZoneList<CharacterRange>* lead_surrogates() const {
    return lead_surrogates_;
  }
  ZoneList<CharacterRange>* trail_surrogates() const {
    return trail_surrogates_;
  }
  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }

 private:
  static const int kBase = 0;
  // Separate ranges into the following categories:
  static const int kBmpCodePoints = 1;
  static const int kLeadSurrogates = 2;
  static const int kTrailSurrogates = 3;
  static const int kNonBmpCodePoints = 4;

  Zone* zone_;
  DispatchTable table_;
  ZoneList<CharacterRange>* bmp_;
  ZoneList<CharacterRange>* lead_surrogates_;
  ZoneList<CharacterRange>* trail_surrogates_;
  ZoneList<CharacterRange>* non_bmp_;
};
#define FOR_EACH_NODE_TYPE(VISIT) \ #define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \ VISIT(End) \
VISIT(Action) \ VISIT(Action) \
...@@ -690,6 +718,17 @@ class TextNode: public SeqRegExpNode { ...@@ -690,6 +718,17 @@ class TextNode: public SeqRegExpNode {
read_backward_(read_backward) { read_backward_(read_backward) {
elms_->Add(TextElement::CharClass(that), zone()); elms_->Add(TextElement::CharClass(that), zone());
} }
// Create TextNode for a single character class for the given ranges.
static TextNode* CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success);
// Create TextNode for a surrogate pair with a range given for the
// lead and the trail surrogate each.
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success);
virtual void Accept(NodeVisitor* visitor); virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
...@@ -813,8 +852,7 @@ class BackReferenceNode: public SeqRegExpNode { ...@@ -813,8 +852,7 @@ class BackReferenceNode: public SeqRegExpNode {
class EndNode: public RegExpNode { class EndNode: public RegExpNode {
public: public:
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
explicit EndNode(Action action, Zone* zone) EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
: RegExpNode(zone), action_(action) { }
virtual void Accept(NodeVisitor* visitor); virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, virtual int EatsAtLeast(int still_to_find,
...@@ -1505,8 +1543,8 @@ class RegExpEngine: public AllStatic { ...@@ -1505,8 +1543,8 @@ class RegExpEngine: public AllStatic {
}; };
static CompilationResult Compile(Isolate* isolate, Zone* zone, static CompilationResult Compile(Isolate* isolate, Zone* zone,
RegExpCompileData* input, bool ignore_case, RegExpCompileData* input,
bool global, bool multiline, bool sticky, JSRegExp::Flags flags,
Handle<String> pattern, Handle<String> pattern,
Handle<String> sample_subject, Handle<String> sample_subject,
bool is_one_byte); bool is_one_byte);
......
...@@ -172,9 +172,9 @@ void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) { ...@@ -172,9 +172,9 @@ void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
void RegExpUnparser::VisitCharacterRange(CharacterRange that) { void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
os_ << AsUC16(that.from()); os_ << AsUC32(that.from());
if (!that.IsSingleton()) { if (!that.IsSingleton()) {
os_ << "-" << AsUC16(that.to()); os_ << "-" << AsUC32(that.to());
} }
} }
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef V8_REGEXP_REGEXP_AST_H_ #ifndef V8_REGEXP_REGEXP_AST_H_
#define V8_REGEXP_REGEXP_AST_H_ #define V8_REGEXP_REGEXP_AST_H_
#include "src/objects.h"
#include "src/utils.h" #include "src/utils.h"
#include "src/zone.h" #include "src/zone.h"
...@@ -77,33 +78,37 @@ class CharacterRange { ...@@ -77,33 +78,37 @@ class CharacterRange {
CharacterRange() : from_(0), to_(0) {} CharacterRange() : from_(0), to_(0) {}
// For compatibility with the CHECK_OK macro // For compatibility with the CHECK_OK macro
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
Zone* zone); Zone* zone);
static Vector<const int> GetWordBounds(); static Vector<const int> GetWordBounds();
static inline CharacterRange Singleton(uc16 value) { static inline CharacterRange Singleton(uc32 value) {
return CharacterRange(value, value); return CharacterRange(value, value);
} }
static inline CharacterRange Range(uc16 from, uc16 to) { static inline CharacterRange Range(uc32 from, uc32 to) {
DCHECK(from <= to); DCHECK(from <= to);
return CharacterRange(from, to); return CharacterRange(from, to);
} }
static inline CharacterRange Everything() { static inline CharacterRange Everything() {
return CharacterRange(0, 0xFFFF); return CharacterRange(0, String::kMaxCodePoint);
} }
bool Contains(uc16 i) { return from_ <= i && i <= to_; } static inline ZoneList<CharacterRange>* List(Zone* zone,
uc16 from() const { return from_; } CharacterRange range) {
void set_from(uc16 value) { from_ = value; } ZoneList<CharacterRange>* list =
uc16 to() const { return to_; } new (zone) ZoneList<CharacterRange>(1, zone);
void set_to(uc16 value) { to_ = value; } list->Add(range, zone);
return list;
}
bool Contains(uc32 i) { return from_ <= i && i <= to_; }
uc32 from() const { return from_; }
void set_from(uc32 value) { from_ = value; }
uc32 to() const { return to_; }
void set_to(uc32 value) { to_ = value; }
bool is_valid() { return from_ <= to_; } bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); } bool IsSingleton() { return (from_ == to_); }
void AddCaseEquivalents(Isolate* isolate, Zone* zone, void AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges, bool is_one_byte); ZoneList<CharacterRange>* ranges, bool is_one_byte);
static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
ZoneList<CharacterRange>** included,
ZoneList<CharacterRange>** excluded, Zone* zone);
// Whether a range list is in canonical form: Ranges ordered by from value, // Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent. // and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges); static bool IsCanonical(ZoneList<CharacterRange>* ranges);
...@@ -119,8 +124,8 @@ class CharacterRange { ...@@ -119,8 +124,8 @@ class CharacterRange {
static const int kPayloadMask = (1 << 24) - 1; static const int kPayloadMask = (1 << 24) - 1;
private: private:
uc16 from_; uc32 from_;
uc16 to_; uc32 to_;
}; };
...@@ -287,6 +292,7 @@ class RegExpCharacterClass final : public RegExpTree { ...@@ -287,6 +292,7 @@ class RegExpCharacterClass final : public RegExpTree {
RegExpCharacterClass* AsCharacterClass() override; RegExpCharacterClass* AsCharacterClass() override;
bool IsCharacterClass() override; bool IsCharacterClass() override;
bool IsTextElement() override { return true; } bool IsTextElement() override { return true; }
bool NeedsDesugaringForUnicode(Zone* zone);
int min_match() override { return 1; } int min_match() override { return 1; }
int max_match() override { return 1; } int max_match() override { return 1; }
void AppendToText(RegExpText* text, Zone* zone) override; void AppendToText(RegExpText* text, Zone* zone) override;
...@@ -451,6 +457,22 @@ class RegExpLookaround final : public RegExpTree { ...@@ -451,6 +457,22 @@ class RegExpLookaround final : public RegExpTree {
int capture_from() { return capture_from_; } int capture_from() { return capture_from_; }
Type type() { return type_; } Type type() { return type_; }
// Helper declared inside RegExpLookaround for assembling lookaround nodes.
// NOTE(review): only the declaration is visible here; the roles noted below
// are inferred from the member names and should be confirmed against the
// definition.
class Builder {
public:
// The two capture register arguments are optional and default to 0.
Builder(bool is_positive, RegExpNode* on_success,
int stack_pointer_register, int position_register,
int capture_register_count = 0, int capture_register_start = 0);
// Returns the node held in on_match_success_.
RegExpNode* on_match_success() { return on_match_success_; }
// Takes the node for the lookaround body. Presumably returns the entry node
// of the finished lookaround -- TODO confirm.
RegExpNode* ForMatch(RegExpNode* match);
private:
// Mirrors the is_positive constructor argument (presumably positive vs.
// negative lookaround -- confirm).
bool is_positive_;
RegExpNode* on_match_success_;
RegExpNode* on_success_;
int stack_pointer_register_;
int position_register_;
};
private: private:
RegExpTree* body_; RegExpTree* body_;
bool is_positive_; bool is_positive_;
......
This diff is collapsed.
...@@ -99,13 +99,15 @@ class BufferedZoneList { ...@@ -99,13 +99,15 @@ class BufferedZoneList {
// Accumulates RegExp atoms and assertions into lists of terms and alternatives. // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject { class RegExpBuilder : public ZoneObject {
public: public:
explicit RegExpBuilder(Zone* zone); RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
void AddCharacter(uc16 character); void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character); void AddUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a // "Adds" an empty expression. Does nothing except consume a
// following quantifier // following quantifier
void AddEmpty(); void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddAtom(RegExpTree* tree); void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree); void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|' void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max, void AddQuantifierToAtom(int min, int max,
...@@ -113,14 +115,21 @@ class RegExpBuilder : public ZoneObject { ...@@ -113,14 +115,21 @@ class RegExpBuilder : public ZoneObject {
RegExpTree* ToRegExp(); RegExpTree* ToRegExp();
private: private:
static const uc16 kNoPendingSurrogate = 0;
void AddLeadSurrogate(uc16 lead_surrogate);
void AddTrailSurrogate(uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters(); void FlushCharacters();
void FlushText(); void FlushText();
void FlushTerms(); void FlushTerms();
Zone* zone() const { return zone_; } Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
Zone* zone_; Zone* zone_;
bool pending_empty_; bool pending_empty_;
JSRegExp::Flags flags_;
ZoneList<uc16>* characters_; ZoneList<uc16>* characters_;
uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_; BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_; BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_; BufferedZoneList<RegExpTree, 2> alternatives_;
...@@ -135,12 +144,11 @@ class RegExpBuilder : public ZoneObject { ...@@ -135,12 +144,11 @@ class RegExpBuilder : public ZoneObject {
class RegExpParser BASE_EMBEDDED { class RegExpParser BASE_EMBEDDED {
public: public:
RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode, RegExpParser(FlatStringReader* in, Handle<String>* error,
bool unicode, Isolate* isolate, Zone* zone); JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
bool multiline, bool unicode, JSRegExp::Flags flags, RegExpCompileData* result);
RegExpCompileData* result);
RegExpTree* ParsePattern(); RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction(); RegExpTree* ParseDisjunction();
...@@ -183,6 +191,8 @@ class RegExpParser BASE_EMBEDDED { ...@@ -183,6 +191,8 @@ class RegExpParser BASE_EMBEDDED {
int captures_started() { return captures_started_; } int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; } int position() { return next_pos_ - 1; }
bool failed() { return failed_; } bool failed() { return failed_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
static bool IsSyntaxCharacter(uc32 c); static bool IsSyntaxCharacter(uc32 c);
...@@ -203,9 +213,10 @@ class RegExpParser BASE_EMBEDDED { ...@@ -203,9 +213,10 @@ class RegExpParser BASE_EMBEDDED {
RegExpParserState(RegExpParserState* previous_state, RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type, SubexpressionType group_type,
RegExpLookaround::Type lookaround_type, RegExpLookaround::Type lookaround_type,
int disjunction_capture_index, Zone* zone) int disjunction_capture_index, JSRegExp::Flags flags,
Zone* zone)
: previous_state_(previous_state), : previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone)), builder_(new (zone) RegExpBuilder(zone, flags)),
group_type_(group_type), group_type_(group_type),
lookaround_type_(lookaround_type), lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {} disjunction_capture_index_(disjunction_capture_index) {}
...@@ -249,6 +260,8 @@ class RegExpParser BASE_EMBEDDED { ...@@ -249,6 +260,8 @@ class RegExpParser BASE_EMBEDDED {
bool has_more() { return has_more_; } bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); } bool has_next() { return next_pos_ < in()->length(); }
uc32 Next(); uc32 Next();
template <bool update_position>
uc32 ReadNext();
FlatStringReader* in() { return in_; } FlatStringReader* in() { return in_; }
void ScanForCaptures(); void ScanForCaptures();
...@@ -258,13 +271,12 @@ class RegExpParser BASE_EMBEDDED { ...@@ -258,13 +271,12 @@ class RegExpParser BASE_EMBEDDED {
ZoneList<RegExpCapture*>* captures_; ZoneList<RegExpCapture*>* captures_;
FlatStringReader* in_; FlatStringReader* in_;
uc32 current_; uc32 current_;
JSRegExp::Flags flags_;
int next_pos_; int next_pos_;
int captures_started_; int captures_started_;
// The capture count is only valid after we have scanned for captures. // The capture count is only valid after we have scanned for captures.
int capture_count_; int capture_count_;
bool has_more_; bool has_more_;
bool multiline_;
bool unicode_;
bool simple_; bool simple_;
bool contains_anchor_; bool contains_anchor_;
bool is_scanned_for_captures_; bool is_scanned_for_captures_;
......
...@@ -96,7 +96,7 @@ static bool CheckParse(const char* input) { ...@@ -96,7 +96,7 @@ static bool CheckParse(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input)); FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result; RegExpCompileData result;
return v8::internal::RegExpParser::ParseRegExp( return v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result); CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result);
} }
...@@ -106,8 +106,10 @@ static void CheckParseEq(const char* input, const char* expected, ...@@ -106,8 +106,10 @@ static void CheckParseEq(const char* input, const char* expected,
Zone zone; Zone zone;
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input)); FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result; RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp( JSRegExp::Flags flags = JSRegExp::kNone;
CcTest::i_isolate(), &zone, &reader, false, unicode, &result)); if (unicode) flags |= JSRegExp::kUnicode;
CHECK(v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), &zone,
&reader, flags, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
std::ostringstream os; std::ostringstream os;
...@@ -125,7 +127,7 @@ static bool CheckSimple(const char* input) { ...@@ -125,7 +127,7 @@ static bool CheckSimple(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input)); FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result; RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp( CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result)); CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
return result.simple; return result.simple;
...@@ -143,7 +145,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) { ...@@ -143,7 +145,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input)); FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result; RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp( CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result)); CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
int min_match = result.tree->min_match(); int min_match = result.tree->min_match();
...@@ -206,8 +208,8 @@ void TestRegExpParser(bool lookbehind) { ...@@ -206,8 +208,8 @@ void TestRegExpParser(bool lookbehind) {
} }
CheckParseEq("()", "(^ %)"); CheckParseEq("()", "(^ %)");
CheckParseEq("(?=)", "(-> + %)"); CheckParseEq("(?=)", "(-> + %)");
CheckParseEq("[]", "^[\\x00-\\uffff]"); // Doesn't compile on windows CheckParseEq("[]", "^[\\x00-\\u{10ffff}]"); // Doesn't compile on windows
CheckParseEq("[^]", "[\\x00-\\uffff]"); // \uffff isn't in codepage 1252 CheckParseEq("[^]", "[\\x00-\\u{10ffff}]"); // \uffff isn't in codepage 1252
CheckParseEq("[x]", "[x]"); CheckParseEq("[x]", "[x]");
CheckParseEq("[xyz]", "[x y z]"); CheckParseEq("[xyz]", "[x y z]");
CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]"); CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
...@@ -316,6 +318,10 @@ void TestRegExpParser(bool lookbehind) { ...@@ -316,6 +318,10 @@ void TestRegExpParser(bool lookbehind) {
CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true); CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true);
CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true); CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true);
CheckParseEq("\\ud808\\udf45*", "(# 0 - g '\\ud808\\udf45')", true);
CheckParseEq("[\\ud808\\udf45-\\ud809\\udccc]", "[\\u{012345}-\\u{0124cc}]",
true);
CHECK_SIMPLE("", false); CHECK_SIMPLE("", false);
CHECK_SIMPLE("a", true); CHECK_SIMPLE("a", true);
CHECK_SIMPLE("a|b", false); CHECK_SIMPLE("a|b", false);
...@@ -454,7 +460,7 @@ static void ExpectError(const char* input, ...@@ -454,7 +460,7 @@ static void ExpectError(const char* input,
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input)); FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result; RegExpCompileData result;
CHECK(!v8::internal::RegExpParser::ParseRegExp( CHECK(!v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result)); CcTest::i_isolate(), &zone, &reader, JSRegExp::kNone, &result));
CHECK(result.tree == NULL); CHECK(result.tree == NULL);
CHECK(!result.error.is_null()); CHECK(!result.error.is_null());
v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS); v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
...@@ -523,7 +529,7 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) { ...@@ -523,7 +529,7 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
ZoneList<CharacterRange>* ranges = ZoneList<CharacterRange>* ranges =
new(&zone) ZoneList<CharacterRange>(2, &zone); new(&zone) ZoneList<CharacterRange>(2, &zone);
CharacterRange::AddClassEscape(c, ranges, &zone); CharacterRange::AddClassEscape(c, ranges, &zone);
for (unsigned i = 0; i < (1 << 16); i++) { for (uc32 i = 0; i < (1 << 16); i++) {
bool in_class = false; bool in_class = false;
for (int j = 0; !in_class && j < ranges->length(); j++) { for (int j = 0; !in_class && j < ranges->length(); j++) {
CharacterRange& range = ranges->at(j); CharacterRange& range = ranges->at(j);
...@@ -550,17 +556,19 @@ static RegExpNode* Compile(const char* input, bool multiline, bool unicode, ...@@ -550,17 +556,19 @@ static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
Isolate* isolate = CcTest::i_isolate(); Isolate* isolate = CcTest::i_isolate();
FlatStringReader reader(isolate, CStrVector(input)); FlatStringReader reader(isolate, CStrVector(input));
RegExpCompileData compile_data; RegExpCompileData compile_data;
JSRegExp::Flags flags = JSRegExp::kNone;
// Accumulate with |= so that requesting both multiline and unicode keeps both
// bits; plain assignment would let the unicode flag overwrite multiline.
if (multiline) flags |= JSRegExp::kMultiline;
if (unicode) flags |= JSRegExp::kUnicode;
if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone, if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone,
&reader, multiline, unicode, &reader, flags, &compile_data))
&compile_data))
return NULL; return NULL;
Handle<String> pattern = isolate->factory() Handle<String> pattern = isolate->factory()
->NewStringFromUtf8(CStrVector(input)) ->NewStringFromUtf8(CStrVector(input))
.ToHandleChecked(); .ToHandleChecked();
Handle<String> sample_subject = Handle<String> sample_subject =
isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked(); isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
RegExpEngine::Compile(isolate, zone, &compile_data, false, false, multiline, RegExpEngine::Compile(isolate, zone, &compile_data, flags, pattern,
false, pattern, sample_subject, is_one_byte); sample_subject, is_one_byte);
return compile_data.node; return compile_data.node;
} }
...@@ -1669,7 +1677,7 @@ TEST(CharacterRangeCaseIndependence) { ...@@ -1669,7 +1677,7 @@ TEST(CharacterRangeCaseIndependence) {
} }
static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { static bool InClass(uc32 c, ZoneList<CharacterRange>* ranges) {
if (ranges == NULL) if (ranges == NULL)
return false; return false;
for (int i = 0; i < ranges->length(); i++) { for (int i = 0; i < ranges->length(); i++) {
...@@ -1681,29 +1689,46 @@ static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) { ...@@ -1681,29 +1689,46 @@ static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
} }
TEST(CharClassDifference) { TEST(UnicodeRangeSplitter) {
Zone zone; Zone zone;
ZoneList<CharacterRange>* base = ZoneList<CharacterRange>* base =
new(&zone) ZoneList<CharacterRange>(1, &zone); new(&zone) ZoneList<CharacterRange>(1, &zone);
base->Add(CharacterRange::Everything(), &zone); base->Add(CharacterRange::Everything(), &zone);
Vector<const int> overlay = CharacterRange::GetWordBounds(); UnicodeRangeSplitter splitter(&zone, base);
ZoneList<CharacterRange>* included = NULL; // BMP
ZoneList<CharacterRange>* excluded = NULL; for (uc32 c = 0; c < 0xd800; c++) {
CharacterRange::Split(base, overlay, &included, &excluded, &zone); CHECK(InClass(c, splitter.bmp()));
for (int i = 0; i < (1 << 16); i++) { CHECK(!InClass(c, splitter.lead_surrogates()));
bool in_base = InClass(i, base); CHECK(!InClass(c, splitter.trail_surrogates()));
if (in_base) { CHECK(!InClass(c, splitter.non_bmp()));
bool in_overlay = false; }
for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { // Lead surrogates
if (overlay[j] <= i && i < overlay[j+1]) for (uc32 c = 0xd800; c < 0xdbff; c++) {
in_overlay = true; CHECK(!InClass(c, splitter.bmp()));
} CHECK(InClass(c, splitter.lead_surrogates()));
CHECK_EQ(in_overlay, InClass(i, included)); CHECK(!InClass(c, splitter.trail_surrogates()));
CHECK_EQ(!in_overlay, InClass(i, excluded)); CHECK(!InClass(c, splitter.non_bmp()));
} else { }
CHECK(!InClass(i, included)); // Trail surrogates
CHECK(!InClass(i, excluded)); for (uc32 c = 0xdc00; c < 0xdfff; c++) {
} CHECK(!InClass(c, splitter.bmp()));
CHECK(!InClass(c, splitter.lead_surrogates()));
CHECK(InClass(c, splitter.trail_surrogates()));
CHECK(!InClass(c, splitter.non_bmp()));
}
// BMP above the surrogate block. The upper bound is inclusive so that U+FFFF,
// the last BMP code point, is checked as well.
for (uc32 c = 0xe000; c <= 0xffff; c++) {
  CHECK(InClass(c, splitter.bmp()));
  CHECK(!InClass(c, splitter.lead_surrogates()));
  CHECK(!InClass(c, splitter.trail_surrogates()));
  CHECK(!InClass(c, splitter.non_bmp()));
}
// Non-BMP
for (uc32 c = 0x10000; c < 0x10ffff; c++) {
CHECK(!InClass(c, splitter.bmp()));
CHECK(!InClass(c, splitter.lead_surrogates()));
CHECK(!InClass(c, splitter.trail_surrogates()));
CHECK(InClass(c, splitter.non_bmp()));
} }
} }
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps --harmony-regexp-lookbehind
// Runs |regexp| against |subject| and asserts that the exec() result equals
// |expectation|. A pattern passed as a string (primitive or String object) is
// first compiled into a unicode ("u") RegExp.
function execl(expectation, regexp, subject) {
  // `instanceof String` alone is false for primitive strings, so also check
  // typeof; otherwise a string pattern would reach .exec() uncompiled.
  if (typeof regexp === "string" || regexp instanceof String) {
    regexp = new RegExp(regexp, "u");
  }
  assertEquals(expectation, regexp.exec(subject));
}
// Compiles |regexp_source| as a unicode ("u") RegExp and checks it against
// |subject| via execl.
function execs(expectation, regexp_source, subject) {
  const compiled = new RegExp(regexp_source, "u");
  execl(expectation, compiled, subject);
}
// Character ranges.
execl(["A"], /[A-D]/u, "A");
execs(["A"], "[A-D]", "A");
execl(["ABCD"], /[A-D]+/u, "ZABCDEF");
execs(["ABCD"], "[A-D]+", "ZABCDEF");
execl(["\u{12345}"], /[\u1234-\u{12345}]/u, "\u{12345}");
execs(["\u{12345}"], "[\u1234-\u{12345}]", "\u{12345}");
execl(null, /[^\u1234-\u{12345}]/u, "\u{12345}");
execs(null, "[^\u1234-\u{12345}]", "\u{12345}");
execl(["\u{1234}"], /[\u1234-\u{12345}]/u, "\u{1234}");
execs(["\u{1234}"], "[\u1234-\u{12345}]", "\u{1234}");
execl(null, /[^\u1234-\u{12345}]/u, "\u{1234}");
execs(null, "[^\u1234-\u{12345}]", "\u{1234}");
execl(null, /[\u1234-\u{12345}]/u, "\u{1233}");
execs(null, "[\u1234-\u{12345}]", "\u{1233}");
execl(["\u{1233}"], /[^\u1234-\u{12345}]/u, "\u{1233}");
execs(["\u{1233}"], "[^\u1234-\u{12345}]", "\u{1233}");
execl(["\u{12346}"], /[^\u1234-\u{12345}]/u, "\u{12346}");
execs(["\u{12346}"], "[^\u1234-\u{12345}]", "\u{12346}");
execl(null, /[\u1234-\u{12345}]/u, "\u{12346}");
execs(null, "[\u1234-\u{12345}]", "\u{12346}");
execl(["\u{12342}"], /[\u{12340}-\u{12345}]/u, "\u{12342}");
execs(["\u{12342}"], "[\u{12340}-\u{12345}]", "\u{12342}");
execl(null, /[^\u{12340}-\u{12345}]/u, "\u{12342}");
execs(null, "[^\u{12340}-\u{12345}]", "\u{12342}");
execl(["\u{ffff}"], /[\u{ff80}-\u{12345}]/u, "\u{ffff}");
execs(["\u{ffff}"], "[\u{ff80}-\u{12345}]", "\u{ffff}");
execl(null, /[^\u{ff80}-\u{12345}]/u, "\u{ffff}");
execs(null, "[^\u{ff80}-\u{12345}]", "\u{ffff}");
// Lone surrogates matched by character classes.
// U+FF99 lies inside the negated range, so the first match is the lone
// surrogate that follows it.
execl(["\ud800"], /[^\u{ff80}-\u{12345}]/u, "\uff99\u{d800}A");
execs(["\udc00"], "[^\u{ff80}-\u{12345}]", "\uff99\u{dc00}A");
// A lone trail surrogate falls inside the range U+0100..U+10FFFF.
execl(["\udc01"], /[\u0100-\u{10ffff}]/u, "A\udc01");
// The subject starts with a surrogate pair; the surrogate that is part of
// that pair must not match the class, only the unpaired one after it does.
execl(["\udc03"], /[\udc01-\udc03]/u, "\ud801\udc02\udc03");
execl(["\ud801"], /[\ud801-\ud803]/u, "\ud802\udc01\ud801");
// Paired surrogate.
// \u{d800}\u{dc00} in the subject forms the surrogate pair for U+10000, which
// lies inside the range U+FF80..U+12345 and is matched as one code point.
execl(null, /[^\u{ff80}-\u{12345}]/u, "\u{d800}\u{dc00}");
execs(null, "[^\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}");
execl(["\ud800\udc00"], /[\u{ff80}-\u{12345}]/u, "\u{d800}\u{dc00}");
execs(["\ud800\udc00"], "[\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}");
// An escaped lead/trail sequence in the pattern matches the code point it
// encodes (\ud803\ude6d is U+10E6D).
execl(["foo\u{10e6d}bar"], /foo\ud803\ude6dbar/u, "foo\u{10e6d}bar");
// Lone surrogates
// Every subject contains a surrogate pair ahead of the expected match. A
// surrogate escape in the pattern is expected to match only an unpaired
// surrogate, never one half of a pair.
execl(["\ud801\ud801"], /\ud801+/u, "\ud801\udc01\ud801\ud801");
execl(["\udc01\udc01"], /\udc01+/u, "\ud801\ud801\udc01\udc01\udc01");
// The leading pair counts as a single character, so "\W\WA" first fits at
// the two lone trail surrogates in front of the second "A".
execl(["\udc02\udc03A"], /\W\WA/u, "\ud801\udc01A\udc02\udc03A");
execl(["\ud801\ud802"], /\ud801./u, "\ud801\udc01\ud801\ud802");
// Same subject as the \W\WA case above, using an explicit class that covers
// the whole surrogate range.
execl(["\udc02\udc03A"], /[\ud800-\udfff][\ud800-\udfff]A/u,
"\ud801\udc01A\udc02\udc03A");
// Character classes
// The surrogate pair \ud801\udc01 (U+10401) is not a word character.
execl(null, /\w/u, "\ud801\udc01");
// Without the "u" flag the negated class matches only the first code unit;
// with it, the whole pair is matched.
execl(["\ud801"], /[^\w]/, "\ud801\udc01");
execl(["\ud801\udc01"], /[^\w]/u, "\ud801\udc01");
execl(["\ud801"], /\W/, "\ud801\udc01");
execl(["\ud801\udc01"], /\W/u, "\ud801\udc01");
// "." matches a lone lead surrogate; the negative lookbehinds reject that
// position, so the match moves on to "aX".
execl(["\ud800X"], /.X/u, "\ud800XaX");
execl(["aX"], /.(?<!\ud800)X/u, "\ud800XaX");
execl(["aX"], /.(?<![\ud800-\ud900])X/u, "\ud800XaX");
// The empty class matches nothing; [^] matches any single character,
// whether ASCII, BMP or non-BMP.
execl(null, /[]/u, "\u1234");
execl(["0abc"], /[^]abc/u, "0abc");
execl(["\u1234abc"], /[^]abc/u, "\u1234abc");
execl(["\u{12345}abc"], /[^]abc/u, "\u{12345}abc");
// Backward matches of lone surrogates.
// Every pattern wraps a capture group in a lookbehind; the second element of
// each expectation is the text captured by that group. The subjects mix
// surrogate pairs with lone surrogates so that only an unpaired surrogate
// can satisfy the surrogate part of the lookbehind.

// Lone surrogates matched through character classes.
execl(["B", "\ud803A"], /(?<=([\ud800-\ud900]A))B/u,
"\ud801\udc00AB\udc00AB\ud802\ud803AB");
execl(["B", "\udc00A"], /(?<=([\ud800-\u{10300}]A))B/u,
"\ud801\udc00AB\udc00AB\ud802\ud803AB");
execl(["B", "\udc11A"], /(?<=([\udc00-\udd00]A))B/u,
"\ud801\udc00AB\udc11AB\ud802\ud803AB");
// Lone lead surrogate escape inside the lookbehind.
execl(["X", "\ud800C"], /(?<=(\ud800\w))X/u,
"\ud800\udc00AX\udc11BX\ud800\ud800CX");
execl(["C", "\ud800\ud800"], /(?<=(\ud800.))\w/u,
"\ud800\udc00AX\udc11BX\ud800\ud800CX");
// Lone trail surrogate escape inside the lookbehind.
execl(["X", "\udc01C"], /(?<=(\udc01\w))X/u,
"\ud800\udc01AX\udc11BX\udc01\udc01CX");
execl(["C", "\udc01\udc01"], /(?<=(\udc01.))./u,
"\ud800\udc01AX\udc11BX\udc01\udc01CX");
// Building blocks for the generated patterns and subjects below:
// L is a lone lead surrogate, T a lone trail surrogate, X a plain letter.
var L = "\ud800",
    T = "\udc00",
    X = "X";
// Asserts whether |subject| consists of exactly one match of |src|: the
// pattern is anchored at both ends and compiled with the "u" flag, and the
// result of test() is compared with |expect|.
function testw(expect, src, subject) {
  var anchored = "^" + src + "$";
  assertEquals(expect, new RegExp(anchored, "u").test(subject));
}
// Asserts whether |subject| begins with a match of |src|: the pattern is
// anchored at the start only and compiled with the "u" flag, and the result
// of test() is compared with |expect|.
function tests(expect, src, subject) {
  var prefixPattern = "^" + src;
  assertEquals(expect, new RegExp(prefixPattern, "u").test(subject));
}
// Whole-string matches: a lone lead (L), a lone trail (T) and both orders of
// the two match themselves, but a single surrogate does not match the pair.
testw(true, X, X);
testw(true, L, L);
testw(true, T, T);
testw(true, L + T, L + T);
testw(true, T + L, T + L);
testw(false, T, L + T);
testw(false, L, L + T);
// "." followed by a lookbehind that re-checks what "." just consumed.
testw(true, ".(?<=" + L + ")", L);
testw(true, ".(?<=" + T + ")", T);
testw(true, ".(?<=" + L + T + ")", L + T);
// NOTE(review): this repeats the previous line verbatim; possibly a
// different pattern or subject was intended. Confirm.
testw(true, ".(?<=" + L + T + ")", L + T);
tests(true, ".(?<=" + T + ")", T + L);
// "." consumes the whole pair, so a lookbehind for a lone surrogate fails.
tests(false, ".(?<=" + L + ")", L + T);
tests(false, ".(?<=" + T + ")", L + T);
tests(true, "..(?<=" + T + ")", T + T + L);
tests(true, "..(?<=" + T + ")", X + T + L);
tests(true, "...(?<=" + L + ")", X + T + L);
tests(false, "...(?<=" + T + ")", X + L + T);
tests(true, "..(?<=" + L + T + ")", X + L + T);
tests(true, "..(?<=" + L + T + "(?<=" + L + T + "))", X + L + T);
// Wrapping T in a capture group separates it from L in the pattern, and the
// pair in the subject is then expected not to match.
tests(false, "..(?<=" + L + "(" + T + "))", X + L + T);
// Greedy ".*" followed by a surrogate: only a subject containing an
// unpaired occurrence of that surrogate is expected to match.
tests(false, ".*" + L, X + L + T);
tests(true, ".*" + L, X + L + L + T);
tests(false, ".*" + L, X + L + T + L + T);
tests(false, ".*" + T, X + L + T + L + T);
tests(true, ".*" + T, X + L + T + T + L + T);
...@@ -252,6 +252,30 @@ assertFalse(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{23456}")); ...@@ -252,6 +252,30 @@ assertFalse(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{23456}"));
assertTrue(new RegExp("\u{12345}{3}", "u").test("\u{12345}\u{12345}\u{12345}")); assertTrue(new RegExp("\u{12345}{3}", "u").test("\u{12345}\u{12345}\u{12345}"));
assertTrue(/\u{12345}{3}/u.test("\u{12345}\u{12345}\u{12345}")); assertTrue(/\u{12345}{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertTrue(new RegExp("\u{12345}{3}").test("\u{12345}\udf45\udf45")); assertTrue(new RegExp("\u{12345}{3}").test("\u{12345}\udf45\udf45"));
assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\udf45\udf45")); assertFalse(/\ud808\udf45{3}/u.test("\u{12345}\udf45\udf45"));
assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45")); assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45"));
assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45")); assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45"));
// Mixed escapes and literal surrogates.
// In the pattern source each surrogate is written either as a literal
// character (string escape, e.g. "\ud800") or as a regexp escape (e.g.
// "\\ud800"). Every combination is expected to behave as the single code
// point U+10000, so "+" repeats the whole pair.
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
// The same mixing inside a character class: the range runs from U+10003
// (\ud800\udc03) to U+50001 (\ud900\udc01). The "\]" in these string literals
// is simply "]" by the time the pattern is compiled.
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
// U+10002 is one below the start of the range, so only U+50001 matches.
assertEquals(["\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10002}\u{50001}"));
// Literal surrogate pair as range start, \u{...} string escape as range end.
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\ud800\udc03-\u{50001}\]+", "u").exec(
"\u{10003}\u{50001}"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment