// Copyright 2016 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_REGEXP_REGEXP_AST_H_ #define V8_REGEXP_REGEXP_AST_H_ #include "src/base/strings.h" #include "src/regexp/regexp-flags.h" #include "src/zone/zone-containers.h" #include "src/zone/zone-list.h" #include "src/zone/zone.h" namespace v8 { namespace internal { #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ VISIT(Disjunction) \ VISIT(Alternative) \ VISIT(Assertion) \ VISIT(CharacterClass) \ VISIT(Atom) \ VISIT(Quantifier) \ VISIT(Capture) \ VISIT(Group) \ VISIT(Lookaround) \ VISIT(BackReference) \ VISIT(Empty) \ VISIT(Text) #define FORWARD_DECLARE(Name) class RegExp##Name; FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) #undef FORWARD_DECLARE class RegExpCompiler; class RegExpNode; class RegExpTree; class RegExpVisitor { public: virtual ~RegExpVisitor() = default; #define MAKE_CASE(Name) \ virtual void* Visit##Name(RegExp##Name*, void* data) = 0; FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) #undef MAKE_CASE }; // A simple closed interval. class Interval { public: Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). Interval(int from, int to) : from_(from), to_(to) {} Interval Union(Interval that) { if (that.from_ == kNone) return *this; if (from_ == kNone) return that; return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); } static Interval Empty() { return Interval(); } bool Contains(int value) const { return (from_ <= value) && (value <= to_); } bool is_empty() const { return from_ == kNone; } int from() const { return from_; } int to() const { return to_; } int size() const { return to_ - from_ + 1; } static constexpr int kNone = -1; private: int from_; int to_; }; // Named standard character sets. enum class StandardCharacterSet : char { kWhitespace = 's', // Like /\s/. kNotWhitespace = 'S', // Like /\S/. kWord = 'w', // Like /\w/. kNotWord = 'W', // Like /\W/. kDigit = 'd', // Like /\d/. kNotDigit = 'D', // Like /\D/. kLineTerminator = 'n', // The inverse of /./. kNotLineTerminator = '.', // Like /./. kEverything = '*', // Matches every character, like /./s. }; // Represents code points (with values up to 0x10FFFF) in the range from from_ // to to_, both ends are inclusive. class CharacterRange { public: CharacterRange() = default; // For compatibility with the CHECK_OK macro. CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT static inline CharacterRange Singleton(base::uc32 value) { return CharacterRange(value, value); } static inline CharacterRange Range(base::uc32 from, base::uc32 to) { DCHECK(0 <= from && to <= kMaxCodePoint); DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); return CharacterRange(from, to); } static inline CharacterRange Everything() { return CharacterRange(0, kMaxCodePoint); } static inline ZoneList<CharacterRange>* List(Zone* zone, CharacterRange range) { ZoneList<CharacterRange>* list = zone->New<ZoneList<CharacterRange>>(1, zone); list->Add(range, zone); return list; } V8_EXPORT_PRIVATE static void AddClassEscape( StandardCharacterSet standard_character_set, ZoneList<CharacterRange>* ranges, Zone* zone); // Add class escapes. Add case equivalent closure for \w and \W if necessary. V8_EXPORT_PRIVATE static void AddClassEscape( StandardCharacterSet standard_character_set, ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents, Zone* zone); V8_EXPORT_PRIVATE static void AddCaseEquivalents( Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, bool is_one_byte); bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } base::uc32 from() const { return from_; } base::uc32 to() const { return to_; } bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } bool IsSingleton() const { return from_ == to_; } // Whether a range list is in canonical form: Ranges ordered by from value, // and ranges non-overlapping and non-adjacent. V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges); // Convert range list to canonical form. The characters covered by the ranges // will still be the same, but no character is in more than one range, and // adjacent ranges are merged. The resulting list may be shorter than the // original, but cannot be longer. static void Canonicalize(ZoneList<CharacterRange>* ranges); // Negate the contents of a character range in canonical form. static void Negate(ZoneList<CharacterRange>* src, ZoneList<CharacterRange>* dst, Zone* zone); private: CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} static constexpr int kMaxCodePoint = 0x10ffff; base::uc32 from_ = 0; base::uc32 to_ = 0; }; #define DECL_BOILERPLATE(Name) \ void* Accept(RegExpVisitor* visitor, void* data) override; \ RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \ override; \ RegExp##Name* As##Name() override; \ bool Is##Name() override class RegExpTree : public ZoneObject { public: static const int kInfinity = kMaxInt; virtual ~RegExpTree() = default; virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) = 0; virtual bool IsTextElement() { return false; } virtual bool IsAnchoredAtStart() { return false; } virtual bool IsAnchoredAtEnd() { return false; } virtual int min_match() = 0; virtual int max_match() = 0; // Returns the interval of registers used for captures within this // expression. virtual Interval CaptureRegisters() { return Interval::Empty(); } virtual void AppendToText(RegExpText* text, Zone* zone); V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); #define MAKE_ASTYPE(Name) \ virtual RegExp##Name* As##Name(); \ virtual bool Is##Name(); FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) #undef MAKE_ASTYPE }; class RegExpDisjunction final : public RegExpTree { public: explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); DECL_BOILERPLATE(Disjunction); Interval CaptureRegisters() override; bool IsAnchoredAtStart() override; bool IsAnchoredAtEnd() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } ZoneList<RegExpTree*>* alternatives() const { return alternatives_; } private: bool SortConsecutiveAtoms(RegExpCompiler* compiler); void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); ZoneList<RegExpTree*>* alternatives_; int min_match_; int max_match_; }; class RegExpAlternative final : public RegExpTree { public: explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); DECL_BOILERPLATE(Alternative); Interval CaptureRegisters() override; bool IsAnchoredAtStart() override; bool IsAnchoredAtEnd() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } ZoneList<RegExpTree*>* nodes() const { return nodes_; } private: ZoneList<RegExpTree*>* nodes_; int min_match_; int max_match_; }; class RegExpAssertion final : public RegExpTree { public: enum AssertionType { START_OF_LINE = 0, START_OF_INPUT = 1, END_OF_LINE = 2, END_OF_INPUT = 3, BOUNDARY = 4, NON_BOUNDARY = 5, LAST_TYPE = NON_BOUNDARY, }; explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} DECL_BOILERPLATE(Assertion); bool IsAnchoredAtStart() override; bool IsAnchoredAtEnd() override; int min_match() override { return 0; } int max_match() override { return 0; } AssertionType assertion_type() const { return assertion_type_; } private: const AssertionType assertion_type_; }; class CharacterSet final { public: explicit CharacterSet(StandardCharacterSet standard_set_type) : standard_set_type_(standard_set_type) {} explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {} ZoneList<CharacterRange>* ranges(Zone* zone); StandardCharacterSet standard_set_type() const { return standard_set_type_.value(); } void set_standard_set_type(StandardCharacterSet standard_set_type) { standard_set_type_ = standard_set_type; } bool is_standard() const { return standard_set_type_.has_value(); } V8_EXPORT_PRIVATE void Canonicalize(); private: ZoneList<CharacterRange>* ranges_ = nullptr; base::Optional<StandardCharacterSet> standard_set_type_; }; class RegExpCharacterClass final : public RegExpTree { public: // NEGATED: The character class is negated and should match everything but // the specified ranges. // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split // surrogate and should not be unicode-desugared (crbug.com/641091). enum Flag { NEGATED = 1 << 0, CONTAINS_SPLIT_SURROGATE = 1 << 1, }; using CharacterClassFlags = base::Flags<Flag>; RegExpCharacterClass( Zone* zone, ZoneList<CharacterRange>* ranges, CharacterClassFlags character_class_flags = CharacterClassFlags()) : set_(ranges), character_class_flags_(character_class_flags) { // Convert the empty set of ranges to the negated Everything() range. if (ranges->is_empty()) { ranges->Add(CharacterRange::Everything(), zone); character_class_flags_ ^= NEGATED; } } explicit RegExpCharacterClass(StandardCharacterSet standard_set_type) : set_(standard_set_type), character_class_flags_() {} DECL_BOILERPLATE(CharacterClass); bool IsTextElement() override { return true; } int min_match() override { return 1; } // The character class may match two code units for unicode regexps. // TODO(yangguo): we should split this class for usage in TextElement, and // make max_match() dependent on the character class content. int max_match() override { return 2; } void AppendToText(RegExpText* text, Zone* zone) override; // TODO(lrn): Remove need for complex version if is_standard that // recognizes a mangled standard set and just do { return set_.is_special(); } bool is_standard(Zone* zone); // Returns a value representing the standard character set if is_standard() // returns true. StandardCharacterSet standard_type() const { return set_.standard_set_type(); } CharacterSet character_set() const { return set_; } ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } bool contains_split_surrogate() const { return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; } private: CharacterSet set_; CharacterClassFlags character_class_flags_; }; class RegExpAtom final : public RegExpTree { public: explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {} DECL_BOILERPLATE(Atom); bool IsTextElement() override { return true; } int min_match() override { return data_.length(); } int max_match() override { return data_.length(); } void AppendToText(RegExpText* text, Zone* zone) override; base::Vector<const base::uc16> data() const { return data_; } int length() const { return data_.length(); } private: base::Vector<const base::uc16> data_; }; class TextElement final { public: enum TextType { ATOM, CHAR_CLASS }; static TextElement Atom(RegExpAtom* atom); static TextElement CharClass(RegExpCharacterClass* char_class); int cp_offset() const { return cp_offset_; } void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } int length() const; TextType text_type() const { return text_type_; } RegExpTree* tree() const { return tree_; } RegExpAtom* atom() const { DCHECK(text_type() == ATOM); return reinterpret_cast<RegExpAtom*>(tree()); } RegExpCharacterClass* char_class() const { DCHECK(text_type() == CHAR_CLASS); return reinterpret_cast<RegExpCharacterClass*>(tree()); } private: TextElement(TextType text_type, RegExpTree* tree) : cp_offset_(-1), text_type_(text_type), tree_(tree) {} int cp_offset_; TextType text_type_; RegExpTree* tree_; }; class RegExpText final : public RegExpTree { public: explicit RegExpText(Zone* zone) : elements_(2, zone) {} DECL_BOILERPLATE(Text); bool IsTextElement() override { return true; } int min_match() override { return length_; } int max_match() override { return length_; } void AppendToText(RegExpText* text, Zone* zone) override; void AddElement(TextElement elm, Zone* zone) { elements_.Add(elm, zone); length_ += elm.length(); } ZoneList<TextElement>* elements() { return &elements_; } private: ZoneList<TextElement> elements_; int length_ = 0; }; class RegExpQuantifier final : public RegExpTree { public: enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) : body_(body), min_(min), max_(max), quantifier_type_(type) { if (min > 0 && body->min_match() > kInfinity / min) { min_match_ = kInfinity; } else { min_match_ = min * body->min_match(); } if (max > 0 && body->max_match() > kInfinity / max) { max_match_ = kInfinity; } else { max_match_ = max * body->max_match(); } } DECL_BOILERPLATE(Quantifier); static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, RegExpCompiler* compiler, RegExpNode* on_success, bool not_at_start = false); Interval CaptureRegisters() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } int min() const { return min_; } int max() const { return max_; } QuantifierType quantifier_type() const { return quantifier_type_; } bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } bool is_greedy() const { return quantifier_type_ == GREEDY; } RegExpTree* body() const { return body_; } private: RegExpTree* body_; int min_; int max_; int min_match_; int max_match_; QuantifierType quantifier_type_; }; class RegExpCapture final : public RegExpTree { public: explicit RegExpCapture(int index) : body_(nullptr), index_(index), min_match_(0), max_match_(0), name_(nullptr) {} DECL_BOILERPLATE(Capture); static RegExpNode* ToNode(RegExpTree* body, int index, RegExpCompiler* compiler, RegExpNode* on_success); bool IsAnchoredAtStart() override; bool IsAnchoredAtEnd() override; Interval CaptureRegisters() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } RegExpTree* body() { return body_; } void set_body(RegExpTree* body) { body_ = body; min_match_ = body->min_match(); max_match_ = body->max_match(); } int index() const { return index_; } const ZoneVector<base::uc16>* name() const { return name_; } void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } static int StartRegister(int index) { return index * 2; } static int EndRegister(int index) { return index * 2 + 1; } private: RegExpTree* body_ = nullptr; int index_; int min_match_ = 0; int max_match_ = 0; const ZoneVector<base::uc16>* name_ = nullptr; }; class RegExpGroup final : public RegExpTree { public: explicit RegExpGroup(RegExpTree* body) : body_(body), min_match_(body->min_match()), max_match_(body->max_match()) {} DECL_BOILERPLATE(Group); bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } int min_match() override { return min_match_; } int max_match() override { return max_match_; } Interval CaptureRegisters() override { return body_->CaptureRegisters(); } RegExpTree* body() const { return body_; } private: RegExpTree* body_; int min_match_; int max_match_; }; class RegExpLookaround final : public RegExpTree { public: enum Type { LOOKAHEAD, LOOKBEHIND }; RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, int capture_from, Type type) : body_(body), is_positive_(is_positive), capture_count_(capture_count), capture_from_(capture_from), type_(type) {} DECL_BOILERPLATE(Lookaround); Interval CaptureRegisters() override; bool IsAnchoredAtStart() override; int min_match() override { return 0; } int max_match() override { return 0; } RegExpTree* body() const { return body_; } bool is_positive() const { return is_positive_; } int capture_count() const { return capture_count_; } int capture_from() const { return capture_from_; } Type type() const { return type_; } class Builder { public: Builder(bool is_positive, RegExpNode* on_success, int stack_pointer_register, int position_register, int capture_register_count = 0, int capture_register_start = 0); RegExpNode* on_match_success() const { return on_match_success_; } RegExpNode* ForMatch(RegExpNode* match); private: bool is_positive_; RegExpNode* on_match_success_; RegExpNode* on_success_; int stack_pointer_register_; int position_register_; }; private: RegExpTree* body_; bool is_positive_; int capture_count_; int capture_from_; Type type_; }; class RegExpBackReference final : public RegExpTree { public: explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) : capture_(capture), flags_(flags) {} DECL_BOILERPLATE(BackReference); int min_match() override { return 0; } // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite // recursion, we give up. Ignorance is bliss. int max_match() override { return kInfinity; } int index() const { return capture_->index(); } RegExpCapture* capture() const { return capture_; } void set_capture(RegExpCapture* capture) { capture_ = capture; } const ZoneVector<base::uc16>* name() const { return name_; } void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } private: RegExpCapture* capture_ = nullptr; const ZoneVector<base::uc16>* name_ = nullptr; const RegExpFlags flags_; }; class RegExpEmpty final : public RegExpTree { public: DECL_BOILERPLATE(Empty); int min_match() override { return 0; } int max_match() override { return 0; } }; } // namespace internal } // namespace v8 #undef DECL_BOILERPLATE #endif // V8_REGEXP_REGEXP_AST_H_