jsregexp.h 56.2 KB
Newer Older
1
// Copyright 2012 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
#ifndef V8_REGEXP_JSREGEXP_H_
#define V8_REGEXP_JSREGEXP_H_
7

8 9
#include "src/allocation.h"
#include "src/assembler.h"
10
#include "src/objects/js-regexp.h"
11
#include "src/regexp/regexp-ast.h"
12
#include "src/regexp/regexp-macro-assembler.h"
13

14 15
namespace v8 {
namespace internal {
16

17 18
class NodeVisitor;
class RegExpCompiler;
19
class RegExpMacroAssembler;
20 21
class RegExpNode;
class RegExpTree;
22
class BoyerMooreLookahead;
23

24 25
class RegExpImpl {
 public:
26 27
  // Whether V8 is compiled with native regexp support or not.
  static bool UsesNativeRegExp() {
28
#ifdef V8_INTERPRETED_REGEXP
29
    return false;
30 31
#else
    return true;
32 33
#endif
  }
34

35 36 37 38 39
  // Returns a string representation of a regular expression.
  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
  // This function calls the garbage collector if necessary.
  static Handle<String> ToString(Handle<Object> value);

40 41 42
  // Parses the RegExp pattern and prepares the JSRegExp object with
  // generic data and choice of implementation - as well as what
  // the implementation wants to store in the data field.
43
  // Returns false if compilation fails.
44 45 46
  MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
                                                     Handle<String> pattern,
                                                     JSRegExp::Flags flags);
47 48 49

  // See ECMA-262 section 15.10.6.2.
  // This function calls the garbage collector if necessary.
50
  V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
51
      Handle<JSRegExp> regexp, Handle<String> subject, int index,
52
      Handle<RegExpMatchInfo> last_match_info);
53

54
  // Prepares a JSRegExp object with Irregexp-specific data.
55 56 57 58
  static void IrregexpInitialize(Handle<JSRegExp> re,
                                 Handle<String> pattern,
                                 JSRegExp::Flags flags,
                                 int capture_register_count);
59 60


61 62 63 64 65
  static void AtomCompile(Handle<JSRegExp> re,
                          Handle<String> pattern,
                          JSRegExp::Flags flags,
                          Handle<String> match_pattern);

66 67 68 69 70 71 72

  static int AtomExecRaw(Handle<JSRegExp> regexp,
                         Handle<String> subject,
                         int index,
                         int32_t* output,
                         int output_size);

73
  static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
74
                                 Handle<String> subject, int index,
75
                                 Handle<RegExpMatchInfo> last_match_info);
76

77 78 79 80 81 82 83
  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };

  // Prepare a RegExp for being executed one or more times (using
  // IrregexpExecOnce) on the subject.
  // This ensures that the regexp is compiled for the subject, and that
  // the subject is flat.
  // Returns the number of integer spaces required by IrregexpExecOnce
84
  // as its "registers" argument.  If the regexp cannot be compiled,
85 86
  // an exception is set as pending, and this function returns negative.
  static int IrregexpPrepare(Handle<JSRegExp> regexp,
87
                             Handle<String> subject);
88

89 90 91 92
  // Execute a regular expression on the subject, starting from index.
  // If matching succeeds, return the number of matches.  This can be larger
  // than one in the case of global regular expressions.
  // The captures and subcaptures are stored into the registers vector.
93 94
  // If matching fails, returns RE_FAILURE.
  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
95 96 97
  static int IrregexpExecRaw(Handle<JSRegExp> regexp,
                             Handle<String> subject,
                             int index,
98 99
                             int32_t* output,
                             int output_size);
100

101
  // Execute an Irregexp bytecode pattern.
102
  // On a successful match, the result is a JSArray containing
103
  // captured positions.  On a failure, the result is the null value.
104
  // Returns an empty handle in case of an exception.
105
  MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
106
      Handle<JSRegExp> regexp, Handle<String> subject, int index,
107
      Handle<RegExpMatchInfo> last_match_info);
108

109 110
  // Set last match info.  If match is nullptr, then setting captures is
  // omitted.
111 112 113
  static Handle<RegExpMatchInfo> SetLastMatchInfo(
      Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
      int capture_count, int32_t* match);
114 115 116 117 118 119 120

  class GlobalCache {
   public:
    GlobalCache(Handle<JSRegExp> regexp,
                Handle<String> subject,
                Isolate* isolate);

121
    INLINE(~GlobalCache());
122 123

    // Fetch the next entry in the cache for global regexp match results.
124 125 126
    // This does not set the last match info.  Upon failure, nullptr is
    // returned. The cause can be checked with Result().  The previous result is
    // still in available in memory when a failure happens.
127
    INLINE(int32_t* FetchNext());
128

129
    INLINE(int32_t* LastSuccessfulMatch());
130

131
    INLINE(bool HasException()) { return num_matches_ < 0; }
132 133

   private:
134 135
    int AdvanceZeroLength(int last_index);

136 137 138 139 140 141 142 143 144 145 146
    int num_matches_;
    int max_matches_;
    int current_match_index_;
    int registers_per_match_;
    // Pointer to the last set of captures.
    int32_t* register_array_;
    int register_array_size_;
    Handle<JSRegExp> regexp_;
    Handle<String> subject_;
  };

147
  // For acting on the JSRegExp data FixedArray.
148 149
  static int IrregexpMaxRegisterCount(FixedArray* re);
  static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
150 151
  static void SetIrregexpCaptureNameMap(FixedArray* re,
                                        Handle<FixedArray> value);
152 153
  static int IrregexpNumberOfCaptures(FixedArray* re);
  static int IrregexpNumberOfRegisters(FixedArray* re);
154 155
  static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
  static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
156

157 158 159 160 161
  // Limit the space regexps take up on the heap.  In order to limit this we
  // would like to keep track of the amount of regexp code on the heap.  This
  // is not tracked, however.  As a conservative approximation we track the
  // total regexp code compiled including code that has subsequently been freed
  // and the total executable memory at any point.
162
  static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
163
  static const size_t kRegExpCompiledLimit = 1 * MB;
164
  static const int kRegExpTooLargeToOptimize = 20 * KB;
165

166
 private:
167 168 169 170 171
  static bool CompileIrregexp(Handle<JSRegExp> re,
                              Handle<String> sample_subject, bool is_one_byte);
  static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
                                            Handle<String> sample_subject,
                                            bool is_one_byte);
172 173 174
};


175 176 177 178 179 180 181 182 183 184
// Represents the location of one element relative to the intersection of
// two sets. Corresponds to the four areas of a Venn diagram.
enum ElementInSetsRelation {
  kInsideNone = 0,
  kInsideFirst = 1,
  kInsideSecond = 2,
  kInsideBoth = 3
};


185 186 187 188
// A set of unsigned integers that behaves especially well on small
// integers (< 32).  May do zone-allocation.
class OutSet: public ZoneObject {
 public:
189
  OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
190
  OutSet* Extend(unsigned value, Zone* zone);
191
  bool Get(unsigned value) const;
192 193
  static const unsigned kFirstLimit = 32;

194
 private:
195 196 197
  // Destructively set a value in this set.  In most cases you want
  // to use Extend instead to ensure that only one instance exists
  // that contains the same values.
198
  void Set(unsigned value, Zone* zone);
199 200 201 202

  // The successors are a list of sets that contain the same values
  // as this set and the one more value that is not present in this
  // set.
203
  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
204 205

  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
206
      : first_(first), remaining_(remaining), successors_(nullptr) {}
207 208 209
  uint32_t first_;
  ZoneList<unsigned>* remaining_;
  ZoneList<OutSet*>* successors_;
210
  friend class Trace;
211 212 213 214 215
};


// A mapping from integers, specified as ranges, to a set of integers.
// Used for mapping character ranges to choices.
216
class DispatchTable : public ZoneObject {
217
 public:
218 219
  explicit DispatchTable(Zone* zone) : tree_(zone) { }

220 221
  class Entry {
   public:
222
    Entry() : from_(0), to_(0), out_set_(nullptr) {}
223
    Entry(uc32 from, uc32 to, OutSet* out_set)
224 225 226
        : from_(from), to_(to), out_set_(out_set) {
      DCHECK(from <= to);
    }
227 228 229
    uc32 from() { return from_; }
    uc32 to() { return to_; }
    void set_to(uc32 value) { to_ = value; }
230 231 232
    void AddValue(int value, Zone* zone) {
      out_set_ = out_set_->Extend(value, zone);
    }
233 234
    OutSet* out_set() { return out_set_; }
   private:
235 236
    uc32 from_;
    uc32 to_;
237 238 239 240 241
    OutSet* out_set_;
  };

  class Config {
   public:
242
    typedef uc32 Key;
243
    typedef Entry Value;
244
    static const uc32 kNoKey;
245
    static const Entry NoValue() { return Value(); }
246
    static inline int Compare(uc32 a, uc32 b) {
247 248 249 250 251 252 253 254 255
      if (a == b)
        return 0;
      else if (a < b)
        return -1;
      else
        return 1;
    }
  };

256
  void AddRange(CharacterRange range, int value, Zone* zone);
257
  OutSet* Get(uc32 value);
258 259 260
  void Dump();

  template <typename Callback>
261 262 263
  void ForEach(Callback* callback) {
    return tree()->ForEach(callback);
  }
264

265 266 267 268 269 270 271 272 273 274
 private:
  // There can't be a static empty set since it allocates its
  // successors in a zone and caches them.
  OutSet* empty() { return &empty_; }
  OutSet empty_;
  ZoneSplayTree<Config>* tree() { return &tree_; }
  ZoneSplayTree<Config> tree_;
};


275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
class UnicodeRangeSplitter {
 public:
  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
  void Call(uc32 from, DispatchTable::Entry entry);

  ZoneList<CharacterRange>* bmp() { return bmp_; }
  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }

 private:
  static const int kBase = 0;
  // Separate ranges into
  static const int kBmpCodePoints = 1;
  static const int kLeadSurrogates = 2;
  static const int kTrailSurrogates = 3;
  static const int kNonBmpCodePoints = 4;

  Zone* zone_;
  DispatchTable table_;
  ZoneList<CharacterRange>* bmp_;
  ZoneList<CharacterRange>* lead_surrogates_;
  ZoneList<CharacterRange>* trail_surrogates_;
  ZoneList<CharacterRange>* non_bmp_;
};


303 304 305 306 307
#define FOR_EACH_NODE_TYPE(VISIT)                                    \
  VISIT(End)                                                         \
  VISIT(Action)                                                      \
  VISIT(Choice)                                                      \
  VISIT(BackReference)                                               \
308
  VISIT(Assertion)                                                   \
309 310 311
  VISIT(Text)


312
class Trace;
313 314 315
struct PreloadState;
class GreedyLoopState;
class AlternativeGenerationList;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
316

317 318
struct NodeInfo {
  NodeInfo()
319 320 321 322
      : being_analyzed(false),
        been_analyzed(false),
        follows_word_interest(false),
        follows_newline_interest(false),
323 324
        follows_start_interest(false),
        at_end(false),
325 326
        visited(false),
        replacement_calculated(false) { }
327

328 329 330 331 332 333
  // Returns true if the interests and assumptions of this node
  // matches the given one.
  bool Matches(NodeInfo* that) {
    return (at_end == that->at_end) &&
           (follows_word_interest == that->follows_word_interest) &&
           (follows_newline_interest == that->follows_newline_interest) &&
334
           (follows_start_interest == that->follows_start_interest);
335
  }
336 337 338 339 340 341 342 343

  // Updates the interests of this node given the interests of the
  // node preceding it.
  void AddFromPreceding(NodeInfo* that) {
    at_end |= that->at_end;
    follows_word_interest |= that->follows_word_interest;
    follows_newline_interest |= that->follows_newline_interest;
    follows_start_interest |= that->follows_start_interest;
344
  }
345

346 347 348 349 350 351
  bool HasLookbehind() {
    return follows_word_interest ||
           follows_newline_interest ||
           follows_start_interest;
  }

352 353 354 355 356 357
  // Sets the interests of this node to include the interests of the
  // following node.
  void AddFromFollowing(NodeInfo* that) {
    follows_word_interest |= that->follows_word_interest;
    follows_newline_interest |= that->follows_newline_interest;
    follows_start_interest |= that->follows_start_interest;
358
  }
359

360 361 362 363 364
  void ResetCompilationState() {
    being_analyzed = false;
    been_analyzed = false;
  }

365 366
  bool being_analyzed: 1;
  bool been_analyzed: 1;
367

368 369
  // These bits are set of this node has to know what the preceding
  // character was.
370 371 372
  bool follows_word_interest: 1;
  bool follows_newline_interest: 1;
  bool follows_start_interest: 1;
373 374

  bool at_end: 1;
375
  bool visited: 1;
376
  bool replacement_calculated: 1;
377 378 379
};


380 381 382 383 384 385 386
// Details of a quick mask-compare check that can look ahead in the
// input stream.
class QuickCheckDetails {
 public:
  QuickCheckDetails()
      : characters_(0),
        mask_(0),
387 388
        value_(0),
        cannot_match_(false) { }
389 390 391
  explicit QuickCheckDetails(int characters)
      : characters_(characters),
        mask_(0),
392 393
        value_(0),
        cannot_match_(false) { }
394
  bool Rationalize(bool one_byte);
395 396 397
  // Merge in the information from another branch of an alternation.
  void Merge(QuickCheckDetails* other, int from_index);
  // Advance the current position by some amount.
398
  void Advance(int by, bool one_byte);
399
  void Clear();
400 401
  bool cannot_match() { return cannot_match_; }
  void set_cannot_match() { cannot_match_ = true; }
402 403 404 405 406 407 408 409 410
  struct Position {
    Position() : mask(0), value(0), determines_perfectly(false) { }
    uc16 mask;
    uc16 value;
    bool determines_perfectly;
  };
  int characters() { return characters_; }
  void set_characters(int characters) { characters_ = characters; }
  Position* positions(int index) {
411 412
    DCHECK_LE(0, index);
    DCHECK_GT(characters_, index);
413 414 415 416 417 418 419 420 421 422 423 424 425
    return positions_ + index;
  }
  uint32_t mask() { return mask_; }
  uint32_t value() { return value_; }

 private:
  // How many characters do we have quick check information from.  This is
  // the same for all branches of a choice node.
  int characters_;
  Position positions_[4];
  // These values are the condensate of the above array after Rationalize().
  uint32_t mask_;
  uint32_t value_;
426 427 428
  // If set to true, there is no way this quick check can match at all.
  // E.g., if it requires to be at the start of the input, and isn't.
  bool cannot_match_;
429 430 431
};


432 433 434
extern int kUninitializedRegExpNodePlaceHolder;


435 436
class RegExpNode: public ZoneObject {
 public:
437
  explicit RegExpNode(Zone* zone)
438 439 440 441 442
      : replacement_(nullptr),
        on_work_list_(false),
        trace_count_(0),
        zone_(zone) {
    bm_info_[0] = bm_info_[1] = nullptr;
443
  }
444
  virtual ~RegExpNode();
445 446
  virtual void Accept(NodeVisitor* visitor) = 0;
  // Generates a goto to this node or actually generates the code at this point.
447
  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
448
  // How many characters must this node consume at a minimum in order to
449 450
  // succeed.  If we have found at least 'still_to_find' characters that
  // must be consumed there is no need to ask any following nodes whether
451 452 453 454
  // they are sure to eat any more characters.  The not_at_start argument is
  // used to indicate that we know we are not at the start of the input.  In
  // this case anchored branches will always fail and can be ignored when
  // determining how many characters are consumed on success.
455
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
456 457 458 459
  // Emits some quick code that checks whether the preloaded characters match.
  // Falls through on certain failure, jumps to the label on possible success.
  // If the node cannot make a quick check it does nothing and returns false.
  bool EmitQuickCheck(RegExpCompiler* compiler,
460
                      Trace* bounds_check_trace,
461
                      Trace* trace,
462 463 464 465 466 467 468 469 470 471
                      bool preload_has_checked_bounds,
                      Label* on_possible_success,
                      QuickCheckDetails* details_return,
                      bool fall_through_on_failure);
  // For a given number of characters this returns a mask and a value.  The
  // next n characters are anded with the mask and compared with the value.
  // A comparison failure indicates the node cannot match the next n characters.
  // A comparison success indicates the node may match.
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
472 473
                                    int characters_filled_in,
                                    bool not_at_start) = 0;
474
  static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
475
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
476 477 478 479
  // Only returns the successor for a text node of length 1 that matches any
  // character and that has no guards on it.
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler) {
480
    return nullptr;
481 482 483 484 485
  }

  // Collects information on the possible code units (mod 128) that can match if
  // we look forward.  This is used for a Boyer-Moore-like string searching
  // implementation.  TODO(erikcorry):  This should share more code with
486 487
  // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
  // the number of nodes we are willing to look at in order to create this data.
488
  static const int kRecursionBudget = 200;
489
  bool KeepRecursing(RegExpCompiler* compiler);
490
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
491
                            BoyerMooreLookahead* bm, bool not_at_start) {
492 493
    UNREACHABLE();
  }
494

495
  // If we know that the input is one-byte then there are some nodes that can
496
  // never match.  This method returns a node that can be substituted for
497
  // itself, or nullptr if the node can never match.
498
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
499 500 501
    return this;
  }
  // Helper for FilterOneByte.
502
  RegExpNode* replacement() {
503
    DCHECK(info()->replacement_calculated);
504 505 506 507 508 509 510 511
    return replacement_;
  }
  RegExpNode* set_replacement(RegExpNode* replacement) {
    info()->replacement_calculated = true;
    replacement_ =  replacement;
    return replacement;  // For convenience.
  }

512 513 514 515 516 517 518
  // We want to avoid recalculating the lookahead info, so we store it on the
  // node.  Only info that is for this node is stored.  We can tell that the
  // info is for this node when offset == 0, so the information is calculated
  // relative to this node.
  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
    if (offset == 0) set_bm_info(not_at_start, bm);
  }
519

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
520
  Label* label() { return &label_; }
521
  // If non-generic code is generated for a node (i.e. the node is not at the
522 523 524 525 526
  // start of the trace) then it cannot be reused.  This variable sets a limit
  // on how often we allow that to happen before we insist on starting a new
  // trace and generating generic code for a node that can be reused by flushing
  // the deferred actions in the current trace and generating a goto.
  static const int kMaxCopiesCodeGenerated = 10;
527

528 529 530
  bool on_work_list() { return on_work_list_; }
  void set_on_work_list(bool value) { on_work_list_ = value; }

531
  NodeInfo* info() { return &info_; }
532

533 534 535
  BoyerMooreLookahead* bm_info(bool not_at_start) {
    return bm_info_[not_at_start ? 1 : 0];
  }
536

537
  Zone* zone() const { return zone_; }
538

539
 protected:
540
  enum LimitResult { DONE, CONTINUE };
541
  RegExpNode* replacement_;
542

543
  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
544

545 546 547 548
  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
    bm_info_[not_at_start ? 1 : 0] = bm;
  }

549
 private:
550
  static const int kFirstCharBudget = 10;
551
  Label label_;
552
  bool on_work_list_;
553
  NodeInfo info_;
554 555 556 557 558 559
  // This variable keeps track of how many times code has been generated for
  // this node (in different traces).  We don't keep track of where the
  // generated code is located unless the code is generated at the start of
  // a trace, in which case it is generic and can be reused by flushing the
  // deferred operations in the current trace and generating a goto.
  int trace_count_;
560
  BoyerMooreLookahead* bm_info_[2];
561 562

  Zone* zone_;
563 564 565 566 567 568
};


class SeqRegExpNode: public RegExpNode {
 public:
  explicit SeqRegExpNode(RegExpNode* on_success)
569
      : RegExpNode(on_success->zone()), on_success_(on_success) { }
570 571
  RegExpNode* on_success() { return on_success_; }
  void set_on_success(RegExpNode* node) { on_success_ = node; }
572 573
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
574
                            BoyerMooreLookahead* bm, bool not_at_start) {
575
    on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
576
    if (offset == 0) set_bm_info(not_at_start, bm);
577
  }
578 579

 protected:
580
  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
581

582 583 584 585 586 587 588
 private:
  RegExpNode* on_success_;
};


class ActionNode: public SeqRegExpNode {
 public:
589
  enum ActionType {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
590
    SET_REGISTER,
591 592 593
    INCREMENT_REGISTER,
    STORE_POSITION,
    BEGIN_SUBMATCH,
594
    POSITIVE_SUBMATCH_SUCCESS,
595 596
    EMPTY_MATCH_CHECK,
    CLEAR_CAPTURES
597
  };
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
598
  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
599
  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
600 601 602
  static ActionNode* StorePosition(int reg,
                                   bool is_capture,
                                   RegExpNode* on_success);
603 604 605 606 607 608
  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
  static ActionNode* BeginSubmatch(int stack_pointer_reg,
                                   int position_reg,
                                   RegExpNode* on_success);
  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
                                             int restore_reg,
609 610
                                             int clear_capture_count,
                                             int clear_capture_from,
611 612 613 614 615
                                             RegExpNode* on_success);
  static ActionNode* EmptyMatchCheck(int start_register,
                                     int repetition_register,
                                     int repetition_limit,
                                     RegExpNode* on_success);
616
  virtual void Accept(NodeVisitor* visitor);
617
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
618
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
619 620
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
621 622 623 624
                                    int filled_in,
                                    bool not_at_start) {
    return on_success()->GetQuickCheckDetails(
        details, compiler, filled_in, not_at_start);
625
  }
626
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
627
                            BoyerMooreLookahead* bm, bool not_at_start);
628
  ActionType action_type() { return action_type_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
629 630
  // TODO(erikcorry): We should allow some action nodes in greedy loops.
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
631

632 633 634 635 636 637 638 639 640 641 642
 private:
  union {
    struct {
      int reg;
      int value;
    } u_store_register;
    struct {
      int reg;
    } u_increment_register;
    struct {
      int reg;
643
      bool is_capture;
644 645
    } u_position_register;
    struct {
646 647
      int stack_pointer_register;
      int current_position_register;
648 649
      int clear_register_count;
      int clear_register_from;
650
    } u_submatch;
651 652 653 654 655
    struct {
      int start_register;
      int repetition_register;
      int repetition_limit;
    } u_empty_match_check;
656 657 658 659
    struct {
      int range_from;
      int range_to;
    } u_clear_captures;
660
  } data_;
661
  ActionNode(ActionType action_type, RegExpNode* on_success)
662
      : SeqRegExpNode(on_success),
663 664
        action_type_(action_type) { }
  ActionType action_type_;
665 666 667 668 669 670
  friend class DotPrinter;
};


class TextNode: public SeqRegExpNode {
 public:
671
  TextNode(ZoneList<TextElement>* elms, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
672
           RegExpNode* on_success)
673 674
      : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
  TextNode(RegExpCharacterClass* that, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
675
           RegExpNode* on_success)
676
      : SeqRegExpNode(on_success),
677 678
        elms_(new (zone()) ZoneList<TextElement>(1, zone())),
        read_backward_(read_backward) {
679
    elms_->Add(TextElement::CharClass(that), zone());
680
  }
681 682 683 684 685 686 687 688 689 690 691
  // Create TextNode for a single character class for the given ranges.
  static TextNode* CreateForCharacterRanges(Zone* zone,
                                            ZoneList<CharacterRange>* ranges,
                                            bool read_backward,
                                            RegExpNode* on_success);
  // Create TextNode for a surrogate pair with a range given for the
  // lead and the trail surrogate each.
  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
                                          CharacterRange trail,
                                          bool read_backward,
                                          RegExpNode* on_success);
692
  virtual void Accept(NodeVisitor* visitor);
693
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
694
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
695 696
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
697 698
                                    int characters_filled_in,
                                    bool not_at_start);
699
  ZoneList<TextElement>* elements() { return elms_; }
700
  bool read_backward() { return read_backward_; }
701
  void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
702
  virtual int GreedyLoopTextLength();
703 704
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler);
705
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
706
                            BoyerMooreLookahead* bm, bool not_at_start);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
707
  void CalculateOffsets();
708
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
709

710
 private:
711
  enum TextEmitPassType {
712
    NON_LATIN1_MATCH,            // Check for characters that can't match.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
713 714 715 716
    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
    CASE_CHARACTER_MATCH,        // Case-independent single character check.
    CHARACTER_CLASS_MATCH        // Character class.
717
  };
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
718 719 720
  static bool SkipPass(int pass, bool ignore_case);
  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
  static const int kLastPass = CHARACTER_CLASS_MATCH;
721 722 723
  void TextEmitPass(RegExpCompiler* compiler,
                    TextEmitPassType pass,
                    bool preloaded,
724
                    Trace* trace,
725 726 727
                    bool first_element_checked,
                    int* checked_up_to);
  int Length();
728
  ZoneList<TextElement>* elms_;
729
  bool read_backward_;
730 731 732
};


733 734
class AssertionNode: public SeqRegExpNode {
 public:
735
  enum AssertionType {
736 737 738 739
    AT_END,
    AT_START,
    AT_BOUNDARY,
    AT_NON_BOUNDARY,
740
    AFTER_NEWLINE
741 742
  };
  static AssertionNode* AtEnd(RegExpNode* on_success) {
743
    return new(on_success->zone()) AssertionNode(AT_END, on_success);
744 745
  }
  static AssertionNode* AtStart(RegExpNode* on_success) {
746
    return new(on_success->zone()) AssertionNode(AT_START, on_success);
747 748
  }
  static AssertionNode* AtBoundary(RegExpNode* on_success) {
749
    return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
750 751
  }
  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
752
    return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
753 754
  }
  static AssertionNode* AfterNewline(RegExpNode* on_success) {
755
    return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
756 757
  }
  virtual void Accept(NodeVisitor* visitor);
758
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
759
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
760 761
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
762 763
                                    int filled_in,
                                    bool not_at_start);
764
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
765
                            BoyerMooreLookahead* bm, bool not_at_start);
766
  AssertionType assertion_type() { return assertion_type_; }
767

768
 private:
769 770 771 772 773
  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
  enum IfPrevious { kIsNonWord, kIsWord };
  void BacktrackIfPrevious(RegExpCompiler* compiler,
                           Trace* trace,
                           IfPrevious backtrack_if_previous);
774 775 776
  AssertionNode(AssertionType t, RegExpNode* on_success)
      : SeqRegExpNode(on_success), assertion_type_(t) { }
  AssertionType assertion_type_;
777 778 779
};


780 781
class BackReferenceNode: public SeqRegExpNode {
 public:
782
  BackReferenceNode(int start_reg, int end_reg, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
783
                    RegExpNode* on_success)
784 785
      : SeqRegExpNode(on_success),
        start_reg_(start_reg),
786 787
        end_reg_(end_reg),
        read_backward_(read_backward) {}
788 789 790
  virtual void Accept(NodeVisitor* visitor);
  int start_register() { return start_reg_; }
  int end_register() { return end_reg_; }
791
  bool read_backward() { return read_backward_; }
792
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
793 794 795
  virtual int EatsAtLeast(int still_to_find,
                          int recursion_depth,
                          bool not_at_start);
796 797
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
798 799
                                    int characters_filled_in,
                                    bool not_at_start) {
800 801
    return;
  }
802
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
803
                            BoyerMooreLookahead* bm, bool not_at_start);
804

805 806 807
 private:
  int start_reg_;
  int end_reg_;
808
  bool read_backward_;
809 810 811 812 813
};


class EndNode: public RegExpNode {
 public:
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
814
  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
815
  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
816
  virtual void Accept(NodeVisitor* visitor);
817
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
818 819 820
  virtual int EatsAtLeast(int still_to_find,
                          int recursion_depth,
                          bool not_at_start) { return 0; }
821 822
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
823 824
                                    int characters_filled_in,
                                    bool not_at_start) {
825 826 827
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
828
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
829
                            BoyerMooreLookahead* bm, bool not_at_start) {
830 831 832
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
833

834 835 836 837 838
 private:
  Action action_;
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
839 840
class NegativeSubmatchSuccess: public EndNode {
 public:
841 842 843
  NegativeSubmatchSuccess(int stack_pointer_reg,
                          int position_reg,
                          int clear_capture_count,
844 845 846
                          int clear_capture_start,
                          Zone* zone)
      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
847
        stack_pointer_register_(stack_pointer_reg),
848 849 850
        current_position_register_(position_reg),
        clear_capture_count_(clear_capture_count),
        clear_capture_start_(clear_capture_start) { }
851
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
852 853 854 855

 private:
  int stack_pointer_register_;
  int current_position_register_;
856 857
  int clear_capture_count_;
  int clear_capture_start_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
858 859 860
};


861 862 863 864
class Guard: public ZoneObject {
 public:
  enum Relation { LT, GEQ };
  Guard(int reg, Relation op, int value)
865 866 867
      : reg_(reg),
        op_(op),
        value_(value) { }
868 869 870
  int reg() { return reg_; }
  Relation op() { return op_; }
  int value() { return value_; }
871

872 873 874 875 876 877 878 879 880
 private:
  int reg_;
  Relation op_;
  int value_;
};


class GuardedAlternative {
 public:
881 882
  explicit GuardedAlternative(RegExpNode* node)
      : node_(node), guards_(nullptr) {}
883
  void AddGuard(Guard* guard, Zone* zone);
884 885 886
  RegExpNode* node() { return node_; }
  void set_node(RegExpNode* node) { node_ = node; }
  ZoneList<Guard*>* guards() { return guards_; }
887

888 889 890 891 892 893
 private:
  RegExpNode* node_;
  ZoneList<Guard*>* guards_;
};


894 895 896
class AlternativeGeneration;


897 898
class ChoiceNode: public RegExpNode {
 public:
899 900
  explicit ChoiceNode(int expected_size, Zone* zone)
      : RegExpNode(zone),
901 902 903
        alternatives_(new (zone)
                          ZoneList<GuardedAlternative>(expected_size, zone)),
        table_(nullptr),
904
        not_at_start_(false),
905
        being_calculated_(false) {}
906
  virtual void Accept(NodeVisitor* visitor);
907 908 909
  void AddAlternative(GuardedAlternative node) {
    alternatives()->Add(node, zone());
  }
910
  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
911
  DispatchTable* GetTable(bool ignore_case);
912
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
913
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
914
  int EatsAtLeastHelper(int still_to_find,
915
                        int budget,
916 917
                        RegExpNode* ignore_this_node,
                        bool not_at_start);
918 919
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
920 921
                                    int characters_filled_in,
                                    bool not_at_start);
922
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
923
                            BoyerMooreLookahead* bm, bool not_at_start);
924

925
  bool being_calculated() { return being_calculated_; }
926 927
  bool not_at_start() { return not_at_start_; }
  void set_not_at_start() { not_at_start_ = true; }
928
  void set_being_calculated(bool b) { being_calculated_ = b; }
929 930 931
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return true;
  }
932
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
933
  virtual bool read_backward() { return false; }
934

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
935
 protected:
936
  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
937 938
  ZoneList<GuardedAlternative>* alternatives_;

939
 private:
940
  friend class DispatchTableConstructor;
941
  friend class Analysis;
942
  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
943
                     Guard* guard,
944
                     Trace* trace);
945
  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
946
  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
947
                                 Trace* trace,
948 949 950 951
                                 GuardedAlternative alternative,
                                 AlternativeGeneration* alt_gen,
                                 int preload_characters,
                                 bool next_expects_preload);
952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967
  void SetUpPreLoad(RegExpCompiler* compiler,
                    Trace* current_trace,
                    PreloadState* preloads);
  void AssertGuardsMentionRegisters(Trace* trace);
  int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
  Trace* EmitGreedyLoop(RegExpCompiler* compiler,
                        Trace* trace,
                        AlternativeGenerationList* alt_gens,
                        PreloadState* preloads,
                        GreedyLoopState* greedy_loop_state,
                        int text_length);
  void EmitChoices(RegExpCompiler* compiler,
                   AlternativeGenerationList* alt_gens,
                   int first_choice,
                   Trace* trace,
                   PreloadState* preloads);
968
  DispatchTable* table_;
969 970 971
  // If true, this node is never checked at the start of the input.
  // Allows a new trace to start with at_start() set to false.
  bool not_at_start_;
972 973 974 975
  bool being_calculated_;
};


976
class NegativeLookaroundChoiceNode : public ChoiceNode {
977
 public:
978 979 980
  explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
                                        GuardedAlternative then_do_this,
                                        Zone* zone)
981
      : ChoiceNode(2, zone) {
982 983 984
    AddAlternative(this_must_fail);
    AddAlternative(then_do_this);
  }
985
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
986 987
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
988 989
                                    int characters_filled_in,
                                    bool not_at_start);
990
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
991
                            BoyerMooreLookahead* bm, bool not_at_start) {
992
    alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
993
                                              not_at_start);
994
    if (offset == 0) set_bm_info(not_at_start, bm);
995
  }
996 997 998 999 1000
  // For a negative lookahead we don't emit the quick check for the
  // alternative that is expected to fail.  This is because quick check code
  // starts by loading enough characters for the alternative that takes fewest
  // characters, but on a negative lookahead the negative branch did not take
  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1001 1002 1003
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return !is_first;
  }
1004
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1005 1006 1007
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1008 1009
class LoopChoiceNode: public ChoiceNode {
 public:
1010
  LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1011
      : ChoiceNode(2, zone),
1012 1013
        loop_node_(nullptr),
        continue_node_(nullptr),
1014 1015
        body_can_be_zero_length_(body_can_be_zero_length),
        read_backward_(read_backward) {}
1016 1017
  void AddLoopAlternative(GuardedAlternative alt);
  void AddContinueAlternative(GuardedAlternative alt);
1018
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1019
  virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1020 1021
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
1022 1023
                                    int characters_filled_in,
                                    bool not_at_start);
1024
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1025
                            BoyerMooreLookahead* bm, bool not_at_start);
1026 1027
  RegExpNode* loop_node() { return loop_node_; }
  RegExpNode* continue_node() { return continue_node_; }
1028
  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1029
  virtual bool read_backward() { return read_backward_; }
1030
  virtual void Accept(NodeVisitor* visitor);
1031
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042

 private:
  // AddAlternative is made private for loop nodes because alternatives
  // should not be added freely, we need to keep track of which node
  // goes back to the node itself.
  void AddAlternative(GuardedAlternative node) {
    ChoiceNode::AddAlternative(node);
  }

  RegExpNode* loop_node_;
  RegExpNode* continue_node_;
1043
  bool body_can_be_zero_length_;
1044
  bool read_backward_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1045 1046 1047
};


1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time.  For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};


inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  return static_cast<ContainedInLattice>(a | b);
}


ContainedInLattice AddRange(ContainedInLattice a,
                            const int* ranges,
                            int ranges_size,
                            Interval new_range);


class BoyerMoorePositionInfo : public ZoneObject {
 public:
1094
  explicit BoyerMoorePositionInfo(Zone* zone)
1095
      : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1096 1097 1098 1099 1100 1101
        map_count_(0),
        w_(kNotYet),
        s_(kNotYet),
        d_(kNotYet),
        surrogate_(kNotYet) {
     for (int i = 0; i < kMapSize; i++) {
1102
       map_->Add(false, zone);
1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130
     }
  }

  bool& at(int i) { return map_->at(i); }

  static const int kMapSize = 128;
  static const int kMask = kMapSize - 1;

  int map_count() const { return map_count_; }

  void Set(int character);
  void SetInterval(const Interval& interval);
  void SetAll();
  bool is_non_word() { return w_ == kLatticeOut; }
  bool is_word() { return w_ == kLatticeIn; }

 private:
  ZoneList<bool>* map_;
  int map_count_;  // Number of set bits in the map.
  ContainedInLattice w_;  // The \w character class.
  ContainedInLattice s_;  // The \s character class.
  ContainedInLattice d_;  // The \d character class.
  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
};


class BoyerMooreLookahead : public ZoneObject {
 public:
1131
  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165

  int length() { return length_; }
  int max_char() { return max_char_; }
  RegExpCompiler* compiler() { return compiler_; }

  int Count(int map_number) {
    return bitmaps_->at(map_number)->map_count();
  }

  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }

  void Set(int map_number, int character) {
    if (character > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    info->Set(character);
  }

  void SetInterval(int map_number, const Interval& interval) {
    if (interval.from() > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    if (interval.to() > max_char_) {
      info->SetInterval(Interval(interval.from(), max_char_));
    } else {
      info->SetInterval(interval);
    }
  }

  void SetAll(int map_number) {
    bitmaps_->at(map_number)->SetAll();
  }

  void SetRest(int from_map) {
    for (int i = from_map; i < length_; i++) SetAll(i);
  }
1166
  void EmitSkipInstructions(RegExpMacroAssembler* masm);
1167 1168 1169 1170 1171 1172 1173 1174

 private:
  // This is the value obtained by EatsAtLeast.  If we do not have at least this
  // many characters left in the sample string then the match is bound to fail.
  // Therefore it is OK to read a character this far ahead of the current match
  // point.
  int length_;
  RegExpCompiler* compiler_;
1175
  // 0xff for Latin1, 0xffff for UTF-16.
1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
  int max_char_;
  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

  int GetSkipTable(int min_lookahead,
                   int max_lookahead,
                   Handle<ByteArray> boolean_skip_table);
  bool FindWorthwhileInterval(int* from, int* to);
  int FindBestInterval(
    int max_number_of_chars, int old_biggest_points, int* from, int* to);
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1188 1189
// There are many ways to generate code for a node.  This class encapsulates
// the current way we should be generating.  In other words it encapsulates
1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
// the current state of the code generator.  The effect of this is that we
// generate code for paths that the matcher can take through the regular
// expression.  A given node in the regexp can be code-generated several times
// as it can be part of several traces.  For example for the regexp:
// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
// to match foo is generated only once (the traces have a common prefix).  The
// code to store the capture is deferred and generated (twice) after the places
// where baz has been matched.
class Trace {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1200
 public:
1201 1202 1203
  // A value for a property that is either known to be true, know to be false,
  // or not known.
  enum TriBool {
1204
    UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1205 1206
  };

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1207 1208
  class DeferredAction {
   public:
1209
    DeferredAction(ActionNode::ActionType action_type, int reg)
1210
        : action_type_(action_type), reg_(reg), next_(nullptr) {}
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1211
    DeferredAction* next() { return next_; }
1212
    bool Mentions(int reg);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1213
    int reg() { return reg_; }
1214
    ActionNode::ActionType action_type() { return action_type_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1215
   private:
1216
    ActionNode::ActionType action_type_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1217 1218
    int reg_;
    DeferredAction* next_;
1219
    friend class Trace;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1220 1221
  };

1222
  class DeferredCapture : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1223
   public:
1224
    DeferredCapture(int reg, bool is_capture, Trace* trace)
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1225
        : DeferredAction(ActionNode::STORE_POSITION, reg),
1226 1227
          cp_offset_(trace->cp_offset()),
          is_capture_(is_capture) { }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1228
    int cp_offset() { return cp_offset_; }
1229
    bool is_capture() { return is_capture_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1230 1231
   private:
    int cp_offset_;
1232
    bool is_capture_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1233 1234 1235
    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
  };

1236
  class DeferredSetRegister : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1237 1238 1239 1240 1241 1242 1243 1244 1245
   public:
    DeferredSetRegister(int reg, int value)
        : DeferredAction(ActionNode::SET_REGISTER, reg),
          value_(value) { }
    int value() { return value_; }
   private:
    int value_;
  };

1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
  class DeferredClearCaptures : public DeferredAction {
   public:
    explicit DeferredClearCaptures(Interval range)
        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
          range_(range) { }
    Interval range() { return range_; }
   private:
    Interval range_;
  };

1256
  class DeferredIncrementRegister : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1257 1258 1259 1260 1261
   public:
    explicit DeferredIncrementRegister(int reg)
        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
  };

1262
  Trace()
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1263
      : cp_offset_(0),
1264 1265 1266 1267
        actions_(nullptr),
        backtrack_(nullptr),
        stop_node_(nullptr),
        loop_label_(nullptr),
1268
        characters_preloaded_(0),
1269
        bound_checked_up_to_(0),
1270
        flush_budget_(100),
1271
        at_start_(UNKNOWN) {}
1272

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1273 1274 1275 1276
  // End the trace.  This involves flushing the deferred actions in the trace
  // and pushing a backtrack location onto the backtrack stack.  Once this is
  // done we can start a new trace or go to one that has already been
  // generated.
1277
  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1278 1279
  int cp_offset() { return cp_offset_; }
  DeferredAction* actions() { return actions_; }
1280 1281 1282 1283 1284 1285 1286 1287 1288 1289
  // A trivial trace is one that has no deferred actions or other state that
  // affects the assumptions used when generating code.  There is no recorded
  // backtrack location in a trivial trace, so with a trivial trace we will
  // generate code that, on a failure to match, gets the backtrack location
  // from the backtrack stack rather than using a direct jump instruction.  We
  // always start code generation with a trivial trace and non-trivial traces
  // are created as we emit code for nodes or add to the list of deferred
  // actions in the trace.  The location of the code generated for a node using
  // a trivial trace is recorded in a label in the node so that gotos can be
  // generated to that code.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1290
  bool is_trivial() {
1291 1292 1293
    return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
           characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
           quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1294
  }
1295
  TriBool at_start() { return at_start_; }
1296
  void set_at_start(TriBool at_start) { at_start_ = at_start; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1297 1298 1299
  Label* backtrack() { return backtrack_; }
  Label* loop_label() { return loop_label_; }
  RegExpNode* stop_node() { return stop_node_; }
1300
  int characters_preloaded() { return characters_preloaded_; }
1301
  int bound_checked_up_to() { return bound_checked_up_to_; }
1302
  int flush_budget() { return flush_budget_; }
1303 1304
  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
  bool mentions_reg(int reg);
1305 1306 1307 1308
  // Returns true if a deferred position store exists to the specified
  // register and stores the offset in the out-parameter.  Otherwise
  // returns false.
  bool GetStoredPosition(int reg, int* cp_offset);
1309 1310
  // These set methods and AdvanceCurrentPositionInTrace should be used only on
  // new traces - the intention is that traces are immutable after creation.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1311
  void add_action(DeferredAction* new_action) {
1312
    DCHECK(new_action->next_ == nullptr);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1313 1314 1315 1316 1317 1318
    new_action->next_ = actions_;
    actions_ = new_action;
  }
  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
  void set_loop_label(Label* label) { loop_label_ = label; }
1319
  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1320
  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1321
  void set_flush_budget(int to) { flush_budget_ = to; }
1322 1323 1324
  void set_quick_check_performed(QuickCheckDetails* d) {
    quick_check_performed_ = *d;
  }
1325
  void InvalidateCurrentCharacter();
1326
  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1327

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1328
 private:
1329
  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1330
  void PerformDeferredActions(RegExpMacroAssembler* macro,
1331
                              int max_register,
1332
                              const OutSet& affected_registers,
1333 1334 1335
                              OutSet* registers_to_pop,
                              OutSet* registers_to_clear,
                              Zone* zone);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1336 1337
  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
                                int max_register,
1338 1339
                                const OutSet& registers_to_pop,
                                const OutSet& registers_to_clear);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1340 1341 1342 1343 1344
  int cp_offset_;
  DeferredAction* actions_;
  Label* backtrack_;
  RegExpNode* stop_node_;
  Label* loop_label_;
1345
  int characters_preloaded_;
1346
  int bound_checked_up_to_;
1347
  QuickCheckDetails quick_check_performed_;
1348
  int flush_budget_;
1349
  TriBool at_start_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1350
};
1351 1352


1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377
class GreedyLoopState {
 public:
  explicit GreedyLoopState(bool not_at_start);

  Label* label() { return &label_; }
  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }

 private:
  Label label_;
  Trace counter_backtrack_trace_;
};


struct PreloadState {
  static const int kEatsAtLeastNotYetInitialized = -1;
  bool preload_is_current_;
  bool preload_has_checked_bounds_;
  int preload_characters_;
  int eats_at_least_;
  void init() {
    eats_at_least_ = kEatsAtLeastNotYetInitialized;
  }
};


1378 1379 1380 1381 1382 1383 1384
class NodeVisitor {
 public:
  virtual ~NodeVisitor() { }
#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
1385
  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1386 1387 1388 1389 1390 1391 1392
};


// Node visitor used to add the start set of the alternatives to the
// dispatch table of a choice node.
class DispatchTableConstructor: public NodeVisitor {
 public:
1393 1394
  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
                           Zone* zone)
1395
      : table_(table),
1396
        choice_index_(-1),
1397 1398
        ignore_case_(ignore_case),
        zone_(zone) { }
1399 1400 1401 1402

  void BuildTable(ChoiceNode* node);

  void AddRange(CharacterRange range) {
1403
    table()->AddRange(range, choice_index_, zone_);
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
  }

  void AddInverse(ZoneList<CharacterRange>* ranges);

#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT

  DispatchTable* table() { return table_; }
  void set_choice_index(int value) { choice_index_ = value; }

 protected:
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1417
  DispatchTable* table_;
1418
  int choice_index_;
1419
  bool ignore_case_;
1420
  Zone* zone_;
1421 1422 1423
};


1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435
// Assertion propagation moves information about assertions such as
// \b to the affected nodes.  For instance, in /.\b./ information must
// be propagated to the first '.' that whatever follows needs to know
// if it matched a word or a non-word, and to the second '.' that it
// has to check if it succeeds a word or non-word.  In this case the
// result will be something like:
//
//   +-------+        +------------+
//   |   .   |        |      .     |
//   +-------+  --->  +------------+
//   | word? |        | check word |
//   +-------+        +------------+
1436
class Analysis: public NodeVisitor {
1437
 public:
1438
  Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1439
      : isolate_(isolate),
1440
        flags_(flags),
1441
        is_one_byte_(is_one_byte),
1442
        error_message_(nullptr) {}
1443 1444 1445 1446 1447 1448
  void EnsureAnalyzed(RegExpNode* node);

#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
1449
  virtual void VisitLoopChoice(LoopChoiceNode* that);
1450

1451
  bool has_failed() { return error_message_ != nullptr; }
1452
  const char* error_message() {
1453
    DCHECK(error_message_ != nullptr);
1454 1455 1456 1457 1458
    return error_message_;
  }
  void fail(const char* error_message) {
    error_message_ = error_message;
  }
1459

1460 1461
  Isolate* isolate() const { return isolate_; }

1462 1463 1464
  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }

1465
 private:
1466
  Isolate* isolate_;
1467
  JSRegExp::Flags flags_;
1468
  bool is_one_byte_;
1469
  const char* error_message_;
1470

1471
  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1472 1473 1474
};


1475 1476
struct RegExpCompileData {
  RegExpCompileData()
1477 1478 1479 1480 1481
      : tree(nullptr),
        node(nullptr),
        simple(true),
        contains_anchor(false),
        capture_count(0) {}
1482
  RegExpTree* tree;
1483
  RegExpNode* node;
1484
  bool simple;
1485
  bool contains_anchor;
1486
  Handle<FixedArray> capture_name_map;
1487 1488 1489 1490 1491 1492 1493
  Handle<String> error;
  int capture_count;
};


class RegExpEngine: public AllStatic {
 public:
1494
  struct CompilationResult {
1495
    CompilationResult(Isolate* isolate, const char* error_message)
1496
        : error_message(error_message),
1497
          code(isolate->heap()->the_hole_value()),
1498 1499
          num_registers(0) {}
    CompilationResult(Object* code, int registers)
1500
        : error_message(nullptr), code(code), num_registers(registers) {}
1501 1502 1503 1504 1505
    const char* error_message;
    Object* code;
    int num_registers;
  };

1506
  static CompilationResult Compile(Isolate* isolate, Zone* zone,
1507 1508
                                   RegExpCompileData* input,
                                   JSRegExp::Flags flags,
1509
                                   Handle<String> pattern,
1510
                                   Handle<String> sample_subject,
1511
                                   bool is_one_byte);
1512

1513 1514
  static bool TooMuchRegExpCode(Handle<String> pattern);

1515
  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1516 1517 1518
};


1519 1520 1521 1522 1523 1524 1525
class RegExpResultsCache : public AllStatic {
 public:
  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };

  // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
  // On success, the returned result is guaranteed to be a COW-array.
  static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1526
                        FixedArray** last_match_out, ResultsCacheType type);
1527 1528 1529 1530
  // Attempt to add value_array to the cache specified by type.  On success,
  // value_array is turned into a COW-array.
  static void Enter(Isolate* isolate, Handle<String> key_string,
                    Handle<Object> key_pattern, Handle<FixedArray> value_array,
1531
                    Handle<FixedArray> last_match_cache, ResultsCacheType type);
1532 1533 1534 1535 1536 1537 1538 1539
  static void Clear(FixedArray* cache);
  static const int kRegExpResultsCacheSize = 0x100;

 private:
  static const int kArrayEntriesPerCacheEntry = 4;
  static const int kStringOffset = 0;
  static const int kPatternOffset = 1;
  static const int kArrayOffset = 2;
1540
  static const int kLastMatchOffset = 3;
1541 1542 1543 1544
};

}  // namespace internal
}  // namespace v8
1545

1546
#endif  // V8_REGEXP_JSREGEXP_H_