regexp-parser.h 12.3 KB
Newer Older
1 2 3 4 5 6 7 8
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_

#include "src/objects.h"
9
#include "src/objects/js-regexp.h"
10
#include "src/regexp/regexp-ast.h"
11
#include "src/zone/zone.h"
12 13 14 15 16 17 18 19 20 21 22

namespace v8 {
namespace internal {

struct RegExpCompileData;

// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
23
// Elements must not be nullptr pointers.
24 25 26
template <typename T, int initial_size>
class BufferedZoneList {
 public:
27
  BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28 29 30 31 32

  // Adds element at end of list. This element is buffered and can
  // be read using last() or removed using RemoveLast until a new Add or until
  // RemoveLast or GetList has been called.
  void Add(T* value, Zone* zone) {
33 34
    if (last_ != nullptr) {
      if (list_ == nullptr) {
35 36 37 38 39 40 41 42
        list_ = new (zone) ZoneList<T*>(initial_size, zone);
      }
      list_->Add(last_, zone);
    }
    last_ = value;
  }

  T* last() {
43
    DCHECK(last_ != nullptr);
44 45 46 47
    return last_;
  }

  T* RemoveLast() {
48
    DCHECK(last_ != nullptr);
49
    T* result = last_;
50
    if ((list_ != nullptr) && (list_->length() > 0))
51 52
      last_ = list_->RemoveLast();
    else
53
      last_ = nullptr;
54 55 56 57 58
    return result;
  }

  T* Get(int i) {
    DCHECK((0 <= i) && (i < length()));
59
    if (list_ == nullptr) {
60 61 62 63
      DCHECK_EQ(0, i);
      return last_;
    } else {
      if (i == list_->length()) {
64
        DCHECK(last_ != nullptr);
65 66 67 68 69 70 71 72
        return last_;
      } else {
        return list_->at(i);
      }
    }
  }

  void Clear() {
73 74
    list_ = nullptr;
    last_ = nullptr;
75 76 77
  }

  int length() {
78 79
    int length = (list_ == nullptr) ? 0 : list_->length();
    return length + ((last_ == nullptr) ? 0 : 1);
80 81 82
  }

  ZoneList<T*>* GetList(Zone* zone) {
83
    if (list_ == nullptr) {
84 85
      list_ = new (zone) ZoneList<T*>(initial_size, zone);
    }
86
    if (last_ != nullptr) {
87
      list_->Add(last_, zone);
88
      last_ = nullptr;
89 90 91 92 93 94 95 96 97 98 99 100 101
    }
    return list_;
  }

 private:
  ZoneList<T*>* list_;
  T* last_;
};


// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
 public:
102
  RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103
  void AddCharacter(uc16 character);
104
  void AddUnicodeCharacter(uc32 character);
105
  void AddEscapedUnicodeCharacter(uc32 character);
106 107 108
  // "Adds" an empty expression. Does nothing except consume a
  // following quantifier
  void AddEmpty();
109
  void AddCharacterClass(RegExpCharacterClass* cc);
110
  void AddCharacterClassForDesugaring(uc32 c);
111
  void AddAtom(RegExpTree* tree);
112
  void AddTerm(RegExpTree* tree);
113 114
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
115
  bool AddQuantifierToAtom(int min, int max,
116
                           RegExpQuantifier::QuantifierType type);
117
  void FlushText();
118
  RegExpTree* ToRegExp();
119 120 121 122 123 124
  JSRegExp::Flags flags() const { return flags_; }
  void set_flags(JSRegExp::Flags flags) { flags_ = flags; }

  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
  bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
125 126

 private:
127 128 129 130
  static const uc16 kNoPendingSurrogate = 0;
  void AddLeadSurrogate(uc16 lead_surrogate);
  void AddTrailSurrogate(uc16 trail_surrogate);
  void FlushPendingSurrogate();
131 132
  void FlushCharacters();
  void FlushTerms();
133 134
  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
  bool NeedsDesugaringForIgnoreCase(uc32 c);
135
  Zone* zone() const { return zone_; }
136
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
137 138 139

  Zone* zone_;
  bool pending_empty_;
140
  JSRegExp::Flags flags_;
141
  ZoneList<uc16>* characters_;
142
  uc16 pending_surrogate_;
143 144 145 146 147 148 149 150 151 152 153
  BufferedZoneList<RegExpTree, 2> terms_;
  BufferedZoneList<RegExpTree, 2> text_;
  BufferedZoneList<RegExpTree, 2> alternatives_;
#ifdef DEBUG
  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};

154
class V8_EXPORT_PRIVATE RegExpParser {
155
 public:
156 157
  RegExpParser(FlatStringReader* in, Handle<String>* error,
               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
158 159

  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
160
                          JSRegExp::Flags flags, RegExpCompileData* result);
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178

  RegExpTree* ParsePattern();
  RegExpTree* ParseDisjunction();
  RegExpTree* ParseGroup();

  // Parses a {...,...} quantifier and stores the range in the given
  // out parameters.
  bool ParseIntervalQuantifier(int* min_out, int* max_out);

  // Parses and returns a single escaped character.  The character
  // must not be 'b' or 'B' since they are usually handle specially.
  uc32 ParseClassCharacterEscape();

  // Checks whether the following is a length-digit hexadecimal number,
  // and sets the value if it is.
  bool ParseHexEscape(int length, uc32* value);
  bool ParseUnicodeEscape(uc32* value);
  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
179 180 181 182 183 184 185 186

  bool ParsePropertyClassName(std::vector<char>* name_1,
                              std::vector<char>* name_2);
  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
                             const std::vector<char>& name_1,
                             const std::vector<char>& name_2);

  RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
187
  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
188 189 190 191 192 193 194 195 196

  uc32 ParseOctalLiteral();

  // Tries to parse the input as a back reference.  If successful it
  // stores the result in the output parameter and returns true.  If
  // it fails it will push back the characters read so the same characters
  // can be reparsed.
  bool ParseBackReferenceIndex(int* index_out);

197 198 199 200 201 202 203 204
  // Parse inside a class. Either add escaped class to the range, or return
  // false and pass parsed single character through |char_out|.
  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
                        bool add_unicode_case_equivalents, uc32* char_out,
                        bool* is_class_escape);

  char ParseClassEscape();

205
  RegExpTree* ReportError(Vector<const char> message);
206 207
  void Advance();
  void Advance(int dist);
208 209 210 211 212 213 214 215 216 217
  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.
  // Only use if the result of the parse is a single atom node.
  bool simple();
  bool contains_anchor() { return contains_anchor_; }
  void set_contains_anchor() { contains_anchor_ = true; }
  int captures_started() { return captures_started_; }
  int position() { return next_pos_ - 1; }
  bool failed() { return failed_; }
218 219 220
  // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
  // just read the initial flag value here.
  bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
221

222
  static bool IsSyntaxCharacterOrSlash(uc32 c);
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237

  static const int kMaxCaptures = 1 << 16;
  static const uc32 kEndMarker = (1 << 21);

 private:
  enum SubexpressionType {
    INITIAL,
    CAPTURE,  // All positive values represent captures.
    POSITIVE_LOOKAROUND,
    NEGATIVE_LOOKAROUND,
    GROUPING
  };

  class RegExpParserState : public ZoneObject {
   public:
238
    // Push a state on the stack.
239 240 241
    RegExpParserState(RegExpParserState* previous_state,
                      SubexpressionType group_type,
                      RegExpLookaround::Type lookaround_type,
242
                      int disjunction_capture_index,
243 244
                      const ZoneVector<uc16>* capture_name,
                      JSRegExp::Flags flags, Zone* zone)
245
        : previous_state_(previous_state),
246
          builder_(new (zone) RegExpBuilder(zone, flags)),
247 248
          group_type_(group_type),
          lookaround_type_(lookaround_type),
249 250
          disjunction_capture_index_(disjunction_capture_index),
          capture_name_(capture_name) {}
251
    // Parser state of containing expression, if any.
252
    RegExpParserState* previous_state() const { return previous_state_; }
253
    bool IsSubexpression() { return previous_state_ != nullptr; }
254
    // RegExpBuilder building this regexp's AST.
255
    RegExpBuilder* builder() const { return builder_; }
256
    // Type of regexp being parsed (parenthesized group or entire regexp).
257
    SubexpressionType group_type() const { return group_type_; }
258
    // Lookahead or Lookbehind.
259
    RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
260 261 262
    // Index in captures array of first capture in this sub-expression, if any.
    // Also the capture index of this sub-expression itself, if group_type
    // is CAPTURE.
263
    int capture_index() const { return disjunction_capture_index_; }
264 265
    // The name of the current sub-expression, if group_type is CAPTURE. Only
    // used for named captures.
266
    const ZoneVector<uc16>* capture_name() const { return capture_name_; }
267 268

    bool IsNamedCapture() const { return capture_name_ != nullptr; }
269 270 271

    // Check whether the parser is inside a capture group with the given index.
    bool IsInsideCaptureGroup(int index);
272 273
    // Check whether the parser is inside a capture group with the given name.
    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
274 275 276

   private:
    // Linked list implementation of stack of states.
277
    RegExpParserState* const previous_state_;
278
    // Builder for the stored disjunction.
279
    RegExpBuilder* const builder_;
280
    // Stored disjunction type (capture, look-ahead or grouping), if any.
281
    const SubexpressionType group_type_;
282
    // Stored read direction.
283
    const RegExpLookaround::Type lookaround_type_;
284
    // Stored disjunction's capture index (if any).
285
    const int disjunction_capture_index_;
286
    // Stored capture name (if any).
287
    const ZoneVector<uc16>* const capture_name_;
288 289 290 291 292
  };

  // Return the 1-indexed RegExpCapture object, allocate if necessary.
  RegExpCapture* GetCapture(int index);

293 294 295 296 297 298 299 300 301 302 303
  // Creates a new named capture at the specified index. Must be called exactly
  // once for each named capture. Fails if a capture with the same name is
  // encountered.
  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);

  // Parses the name of a capture group (?<name>pattern). The name must adhere
  // to IdentifierName in the ECMAScript standard.
  const ZoneVector<uc16>* ParseCaptureGroupName();

  bool ParseNamedBackReference(RegExpBuilder* builder,
                               RegExpParserState* state);
304
  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
305 306 307 308 309 310 311 312

  // After the initial parsing pass, patch corresponding RegExpCapture objects
  // into all RegExpBackReferences. This is done after initial parsing in order
  // to avoid complicating cases in which references comes before the capture.
  void PatchNamedBackReferences();

  Handle<FixedArray> CreateCaptureNameMap();

313 314 315 316
  // Returns true iff the pattern contains named captures. May call
  // ScanForCaptures to look ahead at the remaining pattern.
  bool HasNamedCaptures();

317 318 319 320 321 322 323
  Isolate* isolate() { return isolate_; }
  Zone* zone() const { return zone_; }

  uc32 current() { return current_; }
  bool has_more() { return has_more_; }
  bool has_next() { return next_pos_ < in()->length(); }
  uc32 Next();
324 325
  template <bool update_position>
  uc32 ReadNext();
326 327 328 329 330 331 332
  FlatStringReader* in() { return in_; }
  void ScanForCaptures();

  Isolate* isolate_;
  Zone* zone_;
  Handle<String>* error_;
  ZoneList<RegExpCapture*>* captures_;
333 334
  ZoneList<RegExpCapture*>* named_captures_;
  ZoneList<RegExpBackReference*>* named_back_references_;
335 336
  FlatStringReader* in_;
  uc32 current_;
337 338 339 340
  // These are the flags specified outside the regexp syntax ie after the
  // terminating '/' or in the second argument to the constructor.  The current
  // flags are stored on the RegExpBuilder.
  JSRegExp::Flags top_level_flags_;
341 342
  int next_pos_;
  int captures_started_;
343
  int capture_count_;  // Only valid after we have scanned for captures.
344 345 346 347
  bool has_more_;
  bool simple_;
  bool contains_anchor_;
  bool is_scanned_for_captures_;
348
  bool has_named_captures_;  // Only valid after we have scanned for captures.
349 350 351 352 353 354 355
  bool failed_;
};

}  // namespace internal
}  // namespace v8

#endif  // V8_REGEXP_REGEXP_PARSER_H_