jsregexp.h 56.1 KB
Newer Older
1
// Copyright 2012 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
#ifndef V8_REGEXP_JSREGEXP_H_
#define V8_REGEXP_JSREGEXP_H_
7

8 9
#include "src/allocation.h"
#include "src/assembler.h"
10
#include "src/regexp/regexp-ast.h"
11
#include "src/regexp/regexp-macro-assembler.h"
12

13 14
namespace v8 {
namespace internal {
15

16 17
class NodeVisitor;
class RegExpCompiler;
18
class RegExpMacroAssembler;
19 20
class RegExpNode;
class RegExpTree;
21
class BoyerMooreLookahead;
22

23 24
class RegExpImpl {
 public:
25 26
  // Whether V8 is compiled with native regexp support or not.
  static bool UsesNativeRegExp() {
27
#ifdef V8_INTERPRETED_REGEXP
28
    return false;
29 30
#else
    return true;
31 32
#endif
  }
33

34 35 36 37 38
  // Returns a string representation of a regular expression.
  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
  // This function calls the garbage collector if necessary.
  static Handle<String> ToString(Handle<Object> value);

39 40 41
  // Parses the RegExp pattern and prepares the JSRegExp object with
  // generic data and choice of implementation - as well as what
  // the implementation wants to store in the data field.
42
  // Returns false if compilation fails.
43 44 45
  MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
                                                     Handle<String> pattern,
                                                     JSRegExp::Flags flags);
46 47 48

  // See ECMA-262 section 15.10.6.2.
  // This function calls the garbage collector if necessary.
49
  V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
50
      Handle<JSRegExp> regexp, Handle<String> subject, int index,
51
      Handle<RegExpMatchInfo> last_match_info);
52

53
  // Prepares a JSRegExp object with Irregexp-specific data.
54 55 56 57
  static void IrregexpInitialize(Handle<JSRegExp> re,
                                 Handle<String> pattern,
                                 JSRegExp::Flags flags,
                                 int capture_register_count);
58 59


60 61 62 63 64
  static void AtomCompile(Handle<JSRegExp> re,
                          Handle<String> pattern,
                          JSRegExp::Flags flags,
                          Handle<String> match_pattern);

65 66 67 68 69 70 71

  static int AtomExecRaw(Handle<JSRegExp> regexp,
                         Handle<String> subject,
                         int index,
                         int32_t* output,
                         int output_size);

72
  static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
73
                                 Handle<String> subject, int index,
74
                                 Handle<RegExpMatchInfo> last_match_info);
75

76 77 78 79 80 81 82
  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };

  // Prepare a RegExp for being executed one or more times (using
  // IrregexpExecOnce) on the subject.
  // This ensures that the regexp is compiled for the subject, and that
  // the subject is flat.
  // Returns the number of integer spaces required by IrregexpExecOnce
83
  // as its "registers" argument.  If the regexp cannot be compiled,
84 85
  // an exception is set as pending, and this function returns negative.
  static int IrregexpPrepare(Handle<JSRegExp> regexp,
86
                             Handle<String> subject);
87

88 89 90 91
  // Execute a regular expression on the subject, starting from index.
  // If matching succeeds, return the number of matches.  This can be larger
  // than one in the case of global regular expressions.
  // The captures and subcaptures are stored into the registers vector.
92 93
  // If matching fails, returns RE_FAILURE.
  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
94 95 96
  static int IrregexpExecRaw(Handle<JSRegExp> regexp,
                             Handle<String> subject,
                             int index,
97 98
                             int32_t* output,
                             int output_size);
99

100
  // Execute an Irregexp bytecode pattern.
101
  // On a successful match, the result is a JSArray containing
102
  // captured positions.  On a failure, the result is the null value.
103
  // Returns an empty handle in case of an exception.
104
  MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
105
      Handle<JSRegExp> regexp, Handle<String> subject, int index,
106
      Handle<RegExpMatchInfo> last_match_info);
107

108
  // Set last match info.  If match is NULL, then setting captures is omitted.
109 110 111
  static Handle<RegExpMatchInfo> SetLastMatchInfo(
      Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
      int capture_count, int32_t* match);
112 113 114 115 116 117 118

  class GlobalCache {
   public:
    GlobalCache(Handle<JSRegExp> regexp,
                Handle<String> subject,
                Isolate* isolate);

119
    INLINE(~GlobalCache());
120 121 122 123 124

    // Fetch the next entry in the cache for global regexp match results.
    // This does not set the last match info.  Upon failure, NULL is returned.
    // The cause can be checked with Result().  The previous
    // result is still in available in memory when a failure happens.
125
    INLINE(int32_t* FetchNext());
126

127
    INLINE(int32_t* LastSuccessfulMatch());
128

129
    INLINE(bool HasException()) { return num_matches_ < 0; }
130 131

   private:
132 133
    int AdvanceZeroLength(int last_index);

134 135 136 137 138 139 140 141 142 143 144
    int num_matches_;
    int max_matches_;
    int current_match_index_;
    int registers_per_match_;
    // Pointer to the last set of captures.
    int32_t* register_array_;
    int register_array_size_;
    Handle<JSRegExp> regexp_;
    Handle<String> subject_;
  };

145
  // For acting on the JSRegExp data FixedArray.
146 147
  static int IrregexpMaxRegisterCount(FixedArray* re);
  static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
148 149
  static void SetIrregexpCaptureNameMap(FixedArray* re,
                                        Handle<FixedArray> value);
150 151
  static int IrregexpNumberOfCaptures(FixedArray* re);
  static int IrregexpNumberOfRegisters(FixedArray* re);
152 153
  static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
  static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
154

155 156 157 158 159
  // Limit the space regexps take up on the heap.  In order to limit this we
  // would like to keep track of the amount of regexp code on the heap.  This
  // is not tracked, however.  As a conservative approximation we track the
  // total regexp code compiled including code that has subsequently been freed
  // and the total executable memory at any point.
160
  static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
161
  static const int kRegExpCompiledLimit = 1 * MB;
162
  static const int kRegExpTooLargeToOptimize = 20 * KB;
163

164
 private:
165 166 167 168 169
  static bool CompileIrregexp(Handle<JSRegExp> re,
                              Handle<String> sample_subject, bool is_one_byte);
  static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
                                            Handle<String> sample_subject,
                                            bool is_one_byte);
170 171 172
};


173 174 175 176 177 178 179 180 181 182
// Represents the location of one element relative to the intersection of
// two sets. Corresponds to the four areas of a Venn diagram.
enum ElementInSetsRelation {
  kInsideNone = 0,
  kInsideFirst = 1,
  kInsideSecond = 2,
  kInsideBoth = 3
};


183 184 185 186 187
// A set of unsigned integers that behaves especially well on small
// integers (< 32).  May do zone-allocation.
class OutSet: public ZoneObject {
 public:
  OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
188
  OutSet* Extend(unsigned value, Zone* zone);
189
  bool Get(unsigned value) const;
190 191
  static const unsigned kFirstLimit = 32;

192
 private:
193 194 195
  // Destructively set a value in this set.  In most cases you want
  // to use Extend instead to ensure that only one instance exists
  // that contains the same values.
196
  void Set(unsigned value, Zone* zone);
197 198 199 200

  // The successors are a list of sets that contain the same values
  // as this set and the one more value that is not present in this
  // set.
201
  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
202 203

  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
204
      : first_(first), remaining_(remaining), successors_(NULL) { }
205 206 207
  uint32_t first_;
  ZoneList<unsigned>* remaining_;
  ZoneList<OutSet*>* successors_;
208
  friend class Trace;
209 210 211 212 213
};


// A mapping from integers, specified as ranges, to a set of integers.
// Used for mapping character ranges to choices.
214
class DispatchTable : public ZoneObject {
215
 public:
216 217
  explicit DispatchTable(Zone* zone) : tree_(zone) { }

218 219
  class Entry {
   public:
220
    Entry() : from_(0), to_(0), out_set_(NULL) { }
221
    Entry(uc32 from, uc32 to, OutSet* out_set)
222 223 224
        : from_(from), to_(to), out_set_(out_set) {
      DCHECK(from <= to);
    }
225 226 227
    uc32 from() { return from_; }
    uc32 to() { return to_; }
    void set_to(uc32 value) { to_ = value; }
228 229 230
    void AddValue(int value, Zone* zone) {
      out_set_ = out_set_->Extend(value, zone);
    }
231 232
    OutSet* out_set() { return out_set_; }
   private:
233 234
    uc32 from_;
    uc32 to_;
235 236 237 238 239
    OutSet* out_set_;
  };

  class Config {
   public:
240
    typedef uc32 Key;
241
    typedef Entry Value;
242
    static const uc32 kNoKey;
243
    static const Entry NoValue() { return Value(); }
244
    static inline int Compare(uc32 a, uc32 b) {
245 246 247 248 249 250 251 252 253
      if (a == b)
        return 0;
      else if (a < b)
        return -1;
      else
        return 1;
    }
  };

254
  void AddRange(CharacterRange range, int value, Zone* zone);
255
  OutSet* Get(uc32 value);
256 257 258
  void Dump();

  template <typename Callback>
259 260 261
  void ForEach(Callback* callback) {
    return tree()->ForEach(callback);
  }
262

263 264 265 266 267 268 269 270 271 272
 private:
  // There can't be a static empty set since it allocates its
  // successors in a zone and caches them.
  OutSet* empty() { return &empty_; }
  OutSet empty_;
  ZoneSplayTree<Config>* tree() { return &tree_; }
  ZoneSplayTree<Config> tree_;
};


273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
class UnicodeRangeSplitter {
 public:
  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
  void Call(uc32 from, DispatchTable::Entry entry);

  ZoneList<CharacterRange>* bmp() { return bmp_; }
  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }

 private:
  static const int kBase = 0;
  // Separate ranges into
  static const int kBmpCodePoints = 1;
  static const int kLeadSurrogates = 2;
  static const int kTrailSurrogates = 3;
  static const int kNonBmpCodePoints = 4;

  Zone* zone_;
  DispatchTable table_;
  ZoneList<CharacterRange>* bmp_;
  ZoneList<CharacterRange>* lead_surrogates_;
  ZoneList<CharacterRange>* trail_surrogates_;
  ZoneList<CharacterRange>* non_bmp_;
};


301 302 303 304 305
#define FOR_EACH_NODE_TYPE(VISIT)                                    \
  VISIT(End)                                                         \
  VISIT(Action)                                                      \
  VISIT(Choice)                                                      \
  VISIT(BackReference)                                               \
306
  VISIT(Assertion)                                                   \
307 308 309
  VISIT(Text)


310
class Trace;
311 312 313
struct PreloadState;
class GreedyLoopState;
class AlternativeGenerationList;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
314

315 316
struct NodeInfo {
  NodeInfo()
317 318 319 320
      : being_analyzed(false),
        been_analyzed(false),
        follows_word_interest(false),
        follows_newline_interest(false),
321 322
        follows_start_interest(false),
        at_end(false),
323 324
        visited(false),
        replacement_calculated(false) { }
325

326 327 328 329 330 331
  // Returns true if the interests and assumptions of this node
  // matches the given one.
  bool Matches(NodeInfo* that) {
    return (at_end == that->at_end) &&
           (follows_word_interest == that->follows_word_interest) &&
           (follows_newline_interest == that->follows_newline_interest) &&
332
           (follows_start_interest == that->follows_start_interest);
333
  }
334 335 336 337 338 339 340 341

  // Updates the interests of this node given the interests of the
  // node preceding it.
  void AddFromPreceding(NodeInfo* that) {
    at_end |= that->at_end;
    follows_word_interest |= that->follows_word_interest;
    follows_newline_interest |= that->follows_newline_interest;
    follows_start_interest |= that->follows_start_interest;
342
  }
343

344 345 346 347 348 349
  bool HasLookbehind() {
    return follows_word_interest ||
           follows_newline_interest ||
           follows_start_interest;
  }

350 351 352 353 354 355
  // Sets the interests of this node to include the interests of the
  // following node.
  void AddFromFollowing(NodeInfo* that) {
    follows_word_interest |= that->follows_word_interest;
    follows_newline_interest |= that->follows_newline_interest;
    follows_start_interest |= that->follows_start_interest;
356
  }
357

358 359 360 361 362
  void ResetCompilationState() {
    being_analyzed = false;
    been_analyzed = false;
  }

363 364
  bool being_analyzed: 1;
  bool been_analyzed: 1;
365

366 367
  // These bits are set of this node has to know what the preceding
  // character was.
368 369 370
  bool follows_word_interest: 1;
  bool follows_newline_interest: 1;
  bool follows_start_interest: 1;
371 372

  bool at_end: 1;
373
  bool visited: 1;
374
  bool replacement_calculated: 1;
375 376 377
};


378 379 380 381 382 383 384
// Details of a quick mask-compare check that can look ahead in the
// input stream.
class QuickCheckDetails {
 public:
  QuickCheckDetails()
      : characters_(0),
        mask_(0),
385 386
        value_(0),
        cannot_match_(false) { }
387 388 389
  explicit QuickCheckDetails(int characters)
      : characters_(characters),
        mask_(0),
390 391
        value_(0),
        cannot_match_(false) { }
392
  bool Rationalize(bool one_byte);
393 394 395
  // Merge in the information from another branch of an alternation.
  void Merge(QuickCheckDetails* other, int from_index);
  // Advance the current position by some amount.
396
  void Advance(int by, bool one_byte);
397
  void Clear();
398 399
  bool cannot_match() { return cannot_match_; }
  void set_cannot_match() { cannot_match_ = true; }
400 401 402 403 404 405 406 407 408
  struct Position {
    Position() : mask(0), value(0), determines_perfectly(false) { }
    uc16 mask;
    uc16 value;
    bool determines_perfectly;
  };
  int characters() { return characters_; }
  void set_characters(int characters) { characters_ = characters; }
  Position* positions(int index) {
409 410
    DCHECK(index >= 0);
    DCHECK(index < characters_);
411 412 413 414 415 416 417 418 419 420 421 422 423
    return positions_ + index;
  }
  uint32_t mask() { return mask_; }
  uint32_t value() { return value_; }

 private:
  // How many characters do we have quick check information from.  This is
  // the same for all branches of a choice node.
  int characters_;
  Position positions_[4];
  // These values are the condensate of the above array after Rationalize().
  uint32_t mask_;
  uint32_t value_;
424 425 426
  // If set to true, there is no way this quick check can match at all.
  // E.g., if it requires to be at the start of the input, and isn't.
  bool cannot_match_;
427 428 429
};


430 431 432
extern int kUninitializedRegExpNodePlaceHolder;


433 434
class RegExpNode: public ZoneObject {
 public:
435
  explicit RegExpNode(Zone* zone)
436
      : replacement_(NULL), on_work_list_(false), trace_count_(0), zone_(zone) {
437 438
    bm_info_[0] = bm_info_[1] = NULL;
  }
439
  virtual ~RegExpNode();
440 441
  virtual void Accept(NodeVisitor* visitor) = 0;
  // Generates a goto to this node or actually generates the code at this point.
442
  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
443
  // How many characters must this node consume at a minimum in order to
444 445
  // succeed.  If we have found at least 'still_to_find' characters that
  // must be consumed there is no need to ask any following nodes whether
446 447 448 449
  // they are sure to eat any more characters.  The not_at_start argument is
  // used to indicate that we know we are not at the start of the input.  In
  // this case anchored branches will always fail and can be ignored when
  // determining how many characters are consumed on success.
450
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
451 452 453 454
  // Emits some quick code that checks whether the preloaded characters match.
  // Falls through on certain failure, jumps to the label on possible success.
  // If the node cannot make a quick check it does nothing and returns false.
  bool EmitQuickCheck(RegExpCompiler* compiler,
455
                      Trace* bounds_check_trace,
456
                      Trace* trace,
457 458 459 460 461 462 463 464 465 466
                      bool preload_has_checked_bounds,
                      Label* on_possible_success,
                      QuickCheckDetails* details_return,
                      bool fall_through_on_failure);
  // For a given number of characters this returns a mask and a value.  The
  // next n characters are anded with the mask and compared with the value.
  // A comparison failure indicates the node cannot match the next n characters.
  // A comparison success indicates the node may match.
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
467 468
                                    int characters_filled_in,
                                    bool not_at_start) = 0;
469
  static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
470
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
471 472 473 474 475 476 477 478 479 480
  // Only returns the successor for a text node of length 1 that matches any
  // character and that has no guards on it.
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler) {
    return NULL;
  }

  // Collects information on the possible code units (mod 128) that can match if
  // we look forward.  This is used for a Boyer-Moore-like string searching
  // implementation.  TODO(erikcorry):  This should share more code with
481 482
  // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
  // the number of nodes we are willing to look at in order to create this data.
483
  static const int kRecursionBudget = 200;
484
  bool KeepRecursing(RegExpCompiler* compiler);
485
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
486
                            BoyerMooreLookahead* bm, bool not_at_start) {
487 488
    UNREACHABLE();
  }
489

490
  // If we know that the input is one-byte then there are some nodes that can
491 492
  // never match.  This method returns a node that can be substituted for
  // itself, or NULL if the node can never match.
493
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
494 495 496
    return this;
  }
  // Helper for FilterOneByte.
497
  RegExpNode* replacement() {
498
    DCHECK(info()->replacement_calculated);
499 500 501 502 503 504 505 506
    return replacement_;
  }
  RegExpNode* set_replacement(RegExpNode* replacement) {
    info()->replacement_calculated = true;
    replacement_ =  replacement;
    return replacement;  // For convenience.
  }

507 508 509 510 511 512 513
  // We want to avoid recalculating the lookahead info, so we store it on the
  // node.  Only info that is for this node is stored.  We can tell that the
  // info is for this node when offset == 0, so the information is calculated
  // relative to this node.
  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
    if (offset == 0) set_bm_info(not_at_start, bm);
  }
514

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
515
  Label* label() { return &label_; }
516
  // If non-generic code is generated for a node (i.e. the node is not at the
517 518 519 520 521
  // start of the trace) then it cannot be reused.  This variable sets a limit
  // on how often we allow that to happen before we insist on starting a new
  // trace and generating generic code for a node that can be reused by flushing
  // the deferred actions in the current trace and generating a goto.
  static const int kMaxCopiesCodeGenerated = 10;
522

523 524 525
  bool on_work_list() { return on_work_list_; }
  void set_on_work_list(bool value) { on_work_list_ = value; }

526
  NodeInfo* info() { return &info_; }
527

528 529 530
  BoyerMooreLookahead* bm_info(bool not_at_start) {
    return bm_info_[not_at_start ? 1 : 0];
  }
531

532
  Zone* zone() const { return zone_; }
533

534
 protected:
535
  enum LimitResult { DONE, CONTINUE };
536
  RegExpNode* replacement_;
537

538
  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
539

540 541 542 543
  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
    bm_info_[not_at_start ? 1 : 0] = bm;
  }

544
 private:
545
  static const int kFirstCharBudget = 10;
546
  Label label_;
547
  bool on_work_list_;
548
  NodeInfo info_;
549 550 551 552 553 554
  // This variable keeps track of how many times code has been generated for
  // this node (in different traces).  We don't keep track of where the
  // generated code is located unless the code is generated at the start of
  // a trace, in which case it is generic and can be reused by flushing the
  // deferred operations in the current trace and generating a goto.
  int trace_count_;
555
  BoyerMooreLookahead* bm_info_[2];
556 557

  Zone* zone_;
558 559 560 561 562 563
};


class SeqRegExpNode: public RegExpNode {
 public:
  explicit SeqRegExpNode(RegExpNode* on_success)
564
      : RegExpNode(on_success->zone()), on_success_(on_success) { }
565 566
  RegExpNode* on_success() { return on_success_; }
  void set_on_success(RegExpNode* node) { on_success_ = node; }
567 568
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
569
                            BoyerMooreLookahead* bm, bool not_at_start) {
570
    on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
571
    if (offset == 0) set_bm_info(not_at_start, bm);
572
  }
573 574

 protected:
575
  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
576

577 578 579 580 581 582 583
 private:
  RegExpNode* on_success_;
};


class ActionNode: public SeqRegExpNode {
 public:
584
  enum ActionType {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
585
    SET_REGISTER,
586 587 588
    INCREMENT_REGISTER,
    STORE_POSITION,
    BEGIN_SUBMATCH,
589
    POSITIVE_SUBMATCH_SUCCESS,
590 591
    EMPTY_MATCH_CHECK,
    CLEAR_CAPTURES
592
  };
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
593
  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
594
  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
595 596 597
  static ActionNode* StorePosition(int reg,
                                   bool is_capture,
                                   RegExpNode* on_success);
598 599 600 601 602 603
  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
  static ActionNode* BeginSubmatch(int stack_pointer_reg,
                                   int position_reg,
                                   RegExpNode* on_success);
  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
                                             int restore_reg,
604 605
                                             int clear_capture_count,
                                             int clear_capture_from,
606 607 608 609 610
                                             RegExpNode* on_success);
  static ActionNode* EmptyMatchCheck(int start_register,
                                     int repetition_register,
                                     int repetition_limit,
                                     RegExpNode* on_success);
611
  virtual void Accept(NodeVisitor* visitor);
612
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
613
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
614 615
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
616 617 618 619
                                    int filled_in,
                                    bool not_at_start) {
    return on_success()->GetQuickCheckDetails(
        details, compiler, filled_in, not_at_start);
620
  }
621
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
622
                            BoyerMooreLookahead* bm, bool not_at_start);
623
  ActionType action_type() { return action_type_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
624 625
  // TODO(erikcorry): We should allow some action nodes in greedy loops.
  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
626

627 628 629 630 631 632 633 634 635 636 637
 private:
  union {
    struct {
      int reg;
      int value;
    } u_store_register;
    struct {
      int reg;
    } u_increment_register;
    struct {
      int reg;
638
      bool is_capture;
639 640
    } u_position_register;
    struct {
641 642
      int stack_pointer_register;
      int current_position_register;
643 644
      int clear_register_count;
      int clear_register_from;
645
    } u_submatch;
646 647 648 649 650
    struct {
      int start_register;
      int repetition_register;
      int repetition_limit;
    } u_empty_match_check;
651 652 653 654
    struct {
      int range_from;
      int range_to;
    } u_clear_captures;
655
  } data_;
656
  ActionNode(ActionType action_type, RegExpNode* on_success)
657
      : SeqRegExpNode(on_success),
658 659
        action_type_(action_type) { }
  ActionType action_type_;
660 661 662 663 664 665
  friend class DotPrinter;
};


class TextNode: public SeqRegExpNode {
 public:
666
  TextNode(ZoneList<TextElement>* elms, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
667
           RegExpNode* on_success)
668 669
      : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
  TextNode(RegExpCharacterClass* that, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
670
           RegExpNode* on_success)
671
      : SeqRegExpNode(on_success),
672 673
        elms_(new (zone()) ZoneList<TextElement>(1, zone())),
        read_backward_(read_backward) {
674
    elms_->Add(TextElement::CharClass(that), zone());
675
  }
676 677 678 679 680 681 682 683 684 685 686
  // Create TextNode for a single character class for the given ranges.
  static TextNode* CreateForCharacterRanges(Zone* zone,
                                            ZoneList<CharacterRange>* ranges,
                                            bool read_backward,
                                            RegExpNode* on_success);
  // Create TextNode for a surrogate pair with a range given for the
  // lead and the trail surrogate each.
  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
                                          CharacterRange trail,
                                          bool read_backward,
                                          RegExpNode* on_success);
687
  virtual void Accept(NodeVisitor* visitor);
688
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
689
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
690 691
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
692 693
                                    int characters_filled_in,
                                    bool not_at_start);
694
  ZoneList<TextElement>* elements() { return elms_; }
695
  bool read_backward() { return read_backward_; }
696
  void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
697
  virtual int GreedyLoopTextLength();
698 699
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler);
700
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
701
                            BoyerMooreLookahead* bm, bool not_at_start);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
702
  void CalculateOffsets();
703
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
704

705
 private:
706
  enum TextEmitPassType {
707
    NON_LATIN1_MATCH,            // Check for characters that can't match.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
708 709 710 711
    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
    CASE_CHARACTER_MATCH,        // Case-independent single character check.
    CHARACTER_CLASS_MATCH        // Character class.
712
  };
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
713 714 715
  static bool SkipPass(int pass, bool ignore_case);
  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
  static const int kLastPass = CHARACTER_CLASS_MATCH;
716 717 718
  void TextEmitPass(RegExpCompiler* compiler,
                    TextEmitPassType pass,
                    bool preloaded,
719
                    Trace* trace,
720 721 722
                    bool first_element_checked,
                    int* checked_up_to);
  int Length();
723
  ZoneList<TextElement>* elms_;
724
  bool read_backward_;
725 726 727
};


728 729
class AssertionNode: public SeqRegExpNode {
 public:
730
  enum AssertionType {
731 732 733 734
    AT_END,
    AT_START,
    AT_BOUNDARY,
    AT_NON_BOUNDARY,
735
    AFTER_NEWLINE
736 737
  };
  static AssertionNode* AtEnd(RegExpNode* on_success) {
738
    return new(on_success->zone()) AssertionNode(AT_END, on_success);
739 740
  }
  static AssertionNode* AtStart(RegExpNode* on_success) {
741
    return new(on_success->zone()) AssertionNode(AT_START, on_success);
742 743
  }
  static AssertionNode* AtBoundary(RegExpNode* on_success) {
744
    return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
745 746
  }
  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
747
    return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
748 749
  }
  static AssertionNode* AfterNewline(RegExpNode* on_success) {
750
    return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
751 752
  }
  virtual void Accept(NodeVisitor* visitor);
753
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
754
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
755 756
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
757 758
                                    int filled_in,
                                    bool not_at_start);
759
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
760
                            BoyerMooreLookahead* bm, bool not_at_start);
761
  AssertionType assertion_type() { return assertion_type_; }
762

763
 private:
764 765 766 767 768
  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
  enum IfPrevious { kIsNonWord, kIsWord };
  void BacktrackIfPrevious(RegExpCompiler* compiler,
                           Trace* trace,
                           IfPrevious backtrack_if_previous);
769 770 771
  AssertionNode(AssertionType t, RegExpNode* on_success)
      : SeqRegExpNode(on_success), assertion_type_(t) { }
  AssertionType assertion_type_;
772 773 774
};


775 776
class BackReferenceNode: public SeqRegExpNode {
 public:
777
  BackReferenceNode(int start_reg, int end_reg, bool read_backward,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
778
                    RegExpNode* on_success)
779 780
      : SeqRegExpNode(on_success),
        start_reg_(start_reg),
781 782
        end_reg_(end_reg),
        read_backward_(read_backward) {}
783 784 785
  virtual void Accept(NodeVisitor* visitor);
  int start_register() { return start_reg_; }
  int end_register() { return end_reg_; }
786
  bool read_backward() { return read_backward_; }
787
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
788 789 790
  virtual int EatsAtLeast(int still_to_find,
                          int recursion_depth,
                          bool not_at_start);
791 792
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
793 794
                                    int characters_filled_in,
                                    bool not_at_start) {
795 796
    return;
  }
797
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
798
                            BoyerMooreLookahead* bm, bool not_at_start);
799

800 801 802
 private:
  int start_reg_;
  int end_reg_;
803
  bool read_backward_;
804 805 806 807 808
};


class EndNode: public RegExpNode {
 public:
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
809
  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
810
  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
811
  virtual void Accept(NodeVisitor* visitor);
812
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
813 814 815
  virtual int EatsAtLeast(int still_to_find,
                          int recursion_depth,
                          bool not_at_start) { return 0; }
816 817
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
818 819
                                    int characters_filled_in,
                                    bool not_at_start) {
820 821 822
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
823
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
824
                            BoyerMooreLookahead* bm, bool not_at_start) {
825 826 827
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
828

829 830 831 832 833
 private:
  Action action_;
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
834 835
class NegativeSubmatchSuccess: public EndNode {
 public:
836 837 838
  NegativeSubmatchSuccess(int stack_pointer_reg,
                          int position_reg,
                          int clear_capture_count,
839 840 841
                          int clear_capture_start,
                          Zone* zone)
      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
842
        stack_pointer_register_(stack_pointer_reg),
843 844 845
        current_position_register_(position_reg),
        clear_capture_count_(clear_capture_count),
        clear_capture_start_(clear_capture_start) { }
846
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
847 848 849 850

 private:
  int stack_pointer_register_;
  int current_position_register_;
851 852
  int clear_capture_count_;
  int clear_capture_start_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
853 854 855
};


856 857 858 859
class Guard: public ZoneObject {
 public:
  enum Relation { LT, GEQ };
  Guard(int reg, Relation op, int value)
860 861 862
      : reg_(reg),
        op_(op),
        value_(value) { }
863 864 865
  int reg() { return reg_; }
  Relation op() { return op_; }
  int value() { return value_; }
866

867 868 869 870 871 872 873 874 875 876
 private:
  int reg_;
  Relation op_;
  int value_;
};


class GuardedAlternative {
 public:
  explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
877
  void AddGuard(Guard* guard, Zone* zone);
878 879 880
  RegExpNode* node() { return node_; }
  void set_node(RegExpNode* node) { node_ = node; }
  ZoneList<Guard*>* guards() { return guards_; }
881

882 883 884 885 886 887
 private:
  RegExpNode* node_;
  ZoneList<Guard*>* guards_;
};


888 889 890
class AlternativeGeneration;


891 892
class ChoiceNode: public RegExpNode {
 public:
893 894
  explicit ChoiceNode(int expected_size, Zone* zone)
      : RegExpNode(zone),
895 896
        alternatives_(new(zone)
                      ZoneList<GuardedAlternative>(expected_size, zone)),
897
        table_(NULL),
898
        not_at_start_(false),
899
        being_calculated_(false) { }
900
  virtual void Accept(NodeVisitor* visitor);
901 902 903
  void AddAlternative(GuardedAlternative node) {
    alternatives()->Add(node, zone());
  }
904
  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
905
  DispatchTable* GetTable(bool ignore_case);
906
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
907
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
908
  int EatsAtLeastHelper(int still_to_find,
909
                        int budget,
910 911
                        RegExpNode* ignore_this_node,
                        bool not_at_start);
912 913
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
914 915
                                    int characters_filled_in,
                                    bool not_at_start);
916
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
917
                            BoyerMooreLookahead* bm, bool not_at_start);
918

919
  bool being_calculated() { return being_calculated_; }
920 921
  bool not_at_start() { return not_at_start_; }
  void set_not_at_start() { not_at_start_ = true; }
922
  void set_being_calculated(bool b) { being_calculated_ = b; }
923 924 925
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return true;
  }
926
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
927
  virtual bool read_backward() { return false; }
928

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
929
 protected:
930
  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
931 932
  ZoneList<GuardedAlternative>* alternatives_;

933
 private:
934
  friend class DispatchTableConstructor;
935
  friend class Analysis;
936
  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
937
                     Guard* guard,
938
                     Trace* trace);
939
  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
940
  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
941
                                 Trace* trace,
942 943 944 945
                                 GuardedAlternative alternative,
                                 AlternativeGeneration* alt_gen,
                                 int preload_characters,
                                 bool next_expects_preload);
946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961
  void SetUpPreLoad(RegExpCompiler* compiler,
                    Trace* current_trace,
                    PreloadState* preloads);
  void AssertGuardsMentionRegisters(Trace* trace);
  int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
  Trace* EmitGreedyLoop(RegExpCompiler* compiler,
                        Trace* trace,
                        AlternativeGenerationList* alt_gens,
                        PreloadState* preloads,
                        GreedyLoopState* greedy_loop_state,
                        int text_length);
  void EmitChoices(RegExpCompiler* compiler,
                   AlternativeGenerationList* alt_gens,
                   int first_choice,
                   Trace* trace,
                   PreloadState* preloads);
962
  DispatchTable* table_;
963 964 965
  // If true, this node is never checked at the start of the input.
  // Allows a new trace to start with at_start() set to false.
  bool not_at_start_;
966 967 968 969
  bool being_calculated_;
};


970
class NegativeLookaroundChoiceNode : public ChoiceNode {
971
 public:
972 973 974
  explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
                                        GuardedAlternative then_do_this,
                                        Zone* zone)
975
      : ChoiceNode(2, zone) {
976 977 978
    AddAlternative(this_must_fail);
    AddAlternative(then_do_this);
  }
979
  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
980 981
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
982 983
                                    int characters_filled_in,
                                    bool not_at_start);
984
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
985
                            BoyerMooreLookahead* bm, bool not_at_start) {
986
    alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
987
                                              not_at_start);
988
    if (offset == 0) set_bm_info(not_at_start, bm);
989
  }
990 991 992 993 994
  // For a negative lookahead we don't emit the quick check for the
  // alternative that is expected to fail.  This is because quick check code
  // starts by loading enough characters for the alternative that takes fewest
  // characters, but on a negative lookahead the negative branch did not take
  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
995 996 997
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return !is_first;
  }
998
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
999 1000 1001
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1002 1003
class LoopChoiceNode: public ChoiceNode {
 public:
1004
  LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1005
      : ChoiceNode(2, zone),
1006
        loop_node_(NULL),
1007
        continue_node_(NULL),
1008 1009
        body_can_be_zero_length_(body_can_be_zero_length),
        read_backward_(read_backward) {}
1010 1011
  void AddLoopAlternative(GuardedAlternative alt);
  void AddContinueAlternative(GuardedAlternative alt);
1012
  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1013
  virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1014 1015
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
1016 1017
                                    int characters_filled_in,
                                    bool not_at_start);
1018
  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1019
                            BoyerMooreLookahead* bm, bool not_at_start);
1020 1021
  RegExpNode* loop_node() { return loop_node_; }
  RegExpNode* continue_node() { return continue_node_; }
1022
  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1023
  virtual bool read_backward() { return read_backward_; }
1024
  virtual void Accept(NodeVisitor* visitor);
1025
  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036

 private:
  // AddAlternative is made private for loop nodes because alternatives
  // should not be added freely, we need to keep track of which node
  // goes back to the node itself.
  void AddAlternative(GuardedAlternative node) {
    ChoiceNode::AddAlternative(node);
  }

  RegExpNode* loop_node_;
  RegExpNode* continue_node_;
1037
  bool body_can_be_zero_length_;
1038
  bool read_backward_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1039 1040 1041
};


1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087
// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time.  For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};


inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  return static_cast<ContainedInLattice>(a | b);
}


ContainedInLattice AddRange(ContainedInLattice a,
                            const int* ranges,
                            int ranges_size,
                            Interval new_range);


class BoyerMoorePositionInfo : public ZoneObject {
 public:
1088
  explicit BoyerMoorePositionInfo(Zone* zone)
1089
      : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1090 1091 1092 1093 1094 1095
        map_count_(0),
        w_(kNotYet),
        s_(kNotYet),
        d_(kNotYet),
        surrogate_(kNotYet) {
     for (int i = 0; i < kMapSize; i++) {
1096
       map_->Add(false, zone);
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
     }
  }

  bool& at(int i) { return map_->at(i); }

  static const int kMapSize = 128;
  static const int kMask = kMapSize - 1;

  int map_count() const { return map_count_; }

  void Set(int character);
  void SetInterval(const Interval& interval);
  void SetAll();
  bool is_non_word() { return w_ == kLatticeOut; }
  bool is_word() { return w_ == kLatticeIn; }

 private:
  ZoneList<bool>* map_;
  int map_count_;  // Number of set bits in the map.
  ContainedInLattice w_;  // The \w character class.
  ContainedInLattice s_;  // The \s character class.
  ContainedInLattice d_;  // The \d character class.
  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
};


class BoyerMooreLookahead : public ZoneObject {
 public:
1125
  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159

  int length() { return length_; }
  int max_char() { return max_char_; }
  RegExpCompiler* compiler() { return compiler_; }

  int Count(int map_number) {
    return bitmaps_->at(map_number)->map_count();
  }

  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }

  void Set(int map_number, int character) {
    if (character > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    info->Set(character);
  }

  void SetInterval(int map_number, const Interval& interval) {
    if (interval.from() > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    if (interval.to() > max_char_) {
      info->SetInterval(Interval(interval.from(), max_char_));
    } else {
      info->SetInterval(interval);
    }
  }

  void SetAll(int map_number) {
    bitmaps_->at(map_number)->SetAll();
  }

  void SetRest(int from_map) {
    for (int i = from_map; i < length_; i++) SetAll(i);
  }
1160
  void EmitSkipInstructions(RegExpMacroAssembler* masm);
1161 1162 1163 1164 1165 1166 1167 1168

 private:
  // This is the value obtained by EatsAtLeast.  If we do not have at least this
  // many characters left in the sample string then the match is bound to fail.
  // Therefore it is OK to read a character this far ahead of the current match
  // point.
  int length_;
  RegExpCompiler* compiler_;
1169
  // 0xff for Latin1, 0xffff for UTF-16.
1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
  int max_char_;
  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

  int GetSkipTable(int min_lookahead,
                   int max_lookahead,
                   Handle<ByteArray> boolean_skip_table);
  bool FindWorthwhileInterval(int* from, int* to);
  int FindBestInterval(
    int max_number_of_chars, int old_biggest_points, int* from, int* to);
};


erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1182 1183
// There are many ways to generate code for a node.  This class encapsulates
// the current way we should be generating.  In other words it encapsulates
1184 1185 1186 1187 1188 1189 1190 1191 1192 1193
// the current state of the code generator.  The effect of this is that we
// generate code for paths that the matcher can take through the regular
// expression.  A given node in the regexp can be code-generated several times
// as it can be part of several traces.  For example for the regexp:
// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
// to match foo is generated only once (the traces have a common prefix).  The
// code to store the capture is deferred and generated (twice) after the places
// where baz has been matched.
class Trace {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1194
 public:
1195 1196 1197
  // A value for a property that is either known to be true, know to be false,
  // or not known.
  enum TriBool {
1198
    UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1199 1200
  };

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1201 1202
  class DeferredAction {
   public:
1203 1204
    DeferredAction(ActionNode::ActionType action_type, int reg)
        : action_type_(action_type), reg_(reg), next_(NULL) { }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1205
    DeferredAction* next() { return next_; }
1206
    bool Mentions(int reg);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1207
    int reg() { return reg_; }
1208
    ActionNode::ActionType action_type() { return action_type_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1209
   private:
1210
    ActionNode::ActionType action_type_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1211 1212
    int reg_;
    DeferredAction* next_;
1213
    friend class Trace;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1214 1215
  };

1216
  class DeferredCapture : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1217
   public:
1218
    DeferredCapture(int reg, bool is_capture, Trace* trace)
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1219
        : DeferredAction(ActionNode::STORE_POSITION, reg),
1220 1221
          cp_offset_(trace->cp_offset()),
          is_capture_(is_capture) { }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1222
    int cp_offset() { return cp_offset_; }
1223
    bool is_capture() { return is_capture_; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1224 1225
   private:
    int cp_offset_;
1226
    bool is_capture_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1227 1228 1229
    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
  };

1230
  class DeferredSetRegister : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1231 1232 1233 1234 1235 1236 1237 1238 1239
   public:
    DeferredSetRegister(int reg, int value)
        : DeferredAction(ActionNode::SET_REGISTER, reg),
          value_(value) { }
    int value() { return value_; }
   private:
    int value_;
  };

1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
  class DeferredClearCaptures : public DeferredAction {
   public:
    explicit DeferredClearCaptures(Interval range)
        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
          range_(range) { }
    Interval range() { return range_; }
   private:
    Interval range_;
  };

1250
  class DeferredIncrementRegister : public DeferredAction {
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1251 1252 1253 1254 1255
   public:
    explicit DeferredIncrementRegister(int reg)
        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
  };

1256
  Trace()
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1257 1258 1259 1260
      : cp_offset_(0),
        actions_(NULL),
        backtrack_(NULL),
        stop_node_(NULL),
1261
        loop_label_(NULL),
1262
        characters_preloaded_(0),
1263
        bound_checked_up_to_(0),
1264
        flush_budget_(100),
1265 1266
        at_start_(UNKNOWN) { }

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1267 1268 1269 1270
  // End the trace.  This involves flushing the deferred actions in the trace
  // and pushing a backtrack location onto the backtrack stack.  Once this is
  // done we can start a new trace or go to one that has already been
  // generated.
1271
  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1272 1273
  int cp_offset() { return cp_offset_; }
  DeferredAction* actions() { return actions_; }
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283
  // A trivial trace is one that has no deferred actions or other state that
  // affects the assumptions used when generating code.  There is no recorded
  // backtrack location in a trivial trace, so with a trivial trace we will
  // generate code that, on a failure to match, gets the backtrack location
  // from the backtrack stack rather than using a direct jump instruction.  We
  // always start code generation with a trivial trace and non-trivial traces
  // are created as we emit code for nodes or add to the list of deferred
  // actions in the trace.  The location of the code generated for a node using
  // a trivial trace is recorded in a label in the node so that gotos can be
  // generated to that code.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1284
  bool is_trivial() {
1285 1286 1287 1288
    return backtrack_ == NULL &&
           actions_ == NULL &&
           cp_offset_ == 0 &&
           characters_preloaded_ == 0 &&
1289
           bound_checked_up_to_ == 0 &&
1290 1291
           quick_check_performed_.characters() == 0 &&
           at_start_ == UNKNOWN;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1292
  }
1293
  TriBool at_start() { return at_start_; }
1294
  void set_at_start(TriBool at_start) { at_start_ = at_start; }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1295 1296 1297
  Label* backtrack() { return backtrack_; }
  Label* loop_label() { return loop_label_; }
  RegExpNode* stop_node() { return stop_node_; }
1298
  int characters_preloaded() { return characters_preloaded_; }
1299
  int bound_checked_up_to() { return bound_checked_up_to_; }
1300
  int flush_budget() { return flush_budget_; }
1301 1302
  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
  bool mentions_reg(int reg);
1303 1304 1305 1306
  // Returns true if a deferred position store exists to the specified
  // register and stores the offset in the out-parameter.  Otherwise
  // returns false.
  bool GetStoredPosition(int reg, int* cp_offset);
1307 1308
  // These set methods and AdvanceCurrentPositionInTrace should be used only on
  // new traces - the intention is that traces are immutable after creation.
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1309
  void add_action(DeferredAction* new_action) {
1310
    DCHECK(new_action->next_ == NULL);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1311 1312 1313 1314 1315 1316
    new_action->next_ = actions_;
    actions_ = new_action;
  }
  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
  void set_loop_label(Label* label) { loop_label_ = label; }
1317
  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1318
  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1319
  void set_flush_budget(int to) { flush_budget_ = to; }
1320 1321 1322
  void set_quick_check_performed(QuickCheckDetails* d) {
    quick_check_performed_ = *d;
  }
1323
  void InvalidateCurrentCharacter();
1324
  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1325

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1326
 private:
1327
  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1328
  void PerformDeferredActions(RegExpMacroAssembler* macro,
1329
                              int max_register,
1330
                              const OutSet& affected_registers,
1331 1332 1333
                              OutSet* registers_to_pop,
                              OutSet* registers_to_clear,
                              Zone* zone);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1334 1335
  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
                                int max_register,
1336 1337
                                const OutSet& registers_to_pop,
                                const OutSet& registers_to_clear);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1338 1339 1340 1341 1342
  int cp_offset_;
  DeferredAction* actions_;
  Label* backtrack_;
  RegExpNode* stop_node_;
  Label* loop_label_;
1343
  int characters_preloaded_;
1344
  int bound_checked_up_to_;
1345
  QuickCheckDetails quick_check_performed_;
1346
  int flush_budget_;
1347
  TriBool at_start_;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1348
};
1349 1350


1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375
class GreedyLoopState {
 public:
  explicit GreedyLoopState(bool not_at_start);

  Label* label() { return &label_; }
  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }

 private:
  Label label_;
  Trace counter_backtrack_trace_;
};


struct PreloadState {
  static const int kEatsAtLeastNotYetInitialized = -1;
  bool preload_is_current_;
  bool preload_has_checked_bounds_;
  int preload_characters_;
  int eats_at_least_;
  void init() {
    eats_at_least_ = kEatsAtLeastNotYetInitialized;
  }
};


1376 1377 1378 1379 1380 1381 1382
class NodeVisitor {
 public:
  virtual ~NodeVisitor() { }
#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
1383
  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1384 1385 1386 1387 1388 1389 1390
};


// Node visitor used to add the start set of the alternatives to the
// dispatch table of a choice node.
class DispatchTableConstructor: public NodeVisitor {
 public:
1391 1392
  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
                           Zone* zone)
1393
      : table_(table),
1394
        choice_index_(-1),
1395 1396
        ignore_case_(ignore_case),
        zone_(zone) { }
1397 1398 1399 1400

  void BuildTable(ChoiceNode* node);

  void AddRange(CharacterRange range) {
1401
    table()->AddRange(range, choice_index_, zone_);
1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414
  }

  void AddInverse(ZoneList<CharacterRange>* ranges);

#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT

  DispatchTable* table() { return table_; }
  void set_choice_index(int value) { choice_index_ = value; }

 protected:
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1415
  DispatchTable* table_;
1416
  int choice_index_;
1417
  bool ignore_case_;
1418
  Zone* zone_;
1419 1420 1421
};


1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
// Assertion propagation moves information about assertions such as
// \b to the affected nodes.  For instance, in /.\b./ information must
// be propagated to the first '.' that whatever follows needs to know
// if it matched a word or a non-word, and to the second '.' that it
// has to check if it succeeds a word or non-word.  In this case the
// result will be something like:
//
//   +-------+        +------------+
//   |   .   |        |      .     |
//   +-------+  --->  +------------+
//   | word? |        | check word |
//   +-------+        +------------+
1434
class Analysis: public NodeVisitor {
1435
 public:
1436
  Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1437
      : isolate_(isolate),
1438
        flags_(flags),
1439 1440
        is_one_byte_(is_one_byte),
        error_message_(NULL) {}
1441 1442 1443 1444 1445 1446
  void EnsureAnalyzed(RegExpNode* node);

#define DECLARE_VISIT(Type)                                          \
  virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
1447
  virtual void VisitLoopChoice(LoopChoiceNode* that);
1448

1449 1450
  bool has_failed() { return error_message_ != NULL; }
  const char* error_message() {
1451
    DCHECK(error_message_ != NULL);
1452 1453 1454 1455 1456
    return error_message_;
  }
  void fail(const char* error_message) {
    error_message_ = error_message;
  }
1457

1458 1459
  Isolate* isolate() const { return isolate_; }

1460 1461 1462
  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }

1463
 private:
1464
  Isolate* isolate_;
1465
  JSRegExp::Flags flags_;
1466
  bool is_one_byte_;
1467
  const char* error_message_;
1468

1469
  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1470 1471 1472
};


1473 1474 1475 1476
struct RegExpCompileData {
  RegExpCompileData()
    : tree(NULL),
      node(NULL),
1477
      simple(true),
1478
      contains_anchor(false),
1479
      capture_count(0) { }
1480
  RegExpTree* tree;
1481
  RegExpNode* node;
1482
  bool simple;
1483
  bool contains_anchor;
1484
  Handle<FixedArray> capture_name_map;
1485 1486 1487 1488 1489 1490 1491
  Handle<String> error;
  int capture_count;
};


class RegExpEngine: public AllStatic {
 public:
1492
  struct CompilationResult {
1493
    CompilationResult(Isolate* isolate, const char* error_message)
1494
        : error_message(error_message),
1495
          code(isolate->heap()->the_hole_value()),
1496 1497
          num_registers(0) {}
    CompilationResult(Object* code, int registers)
1498
        : error_message(NULL), code(code), num_registers(registers) {}
1499 1500 1501 1502 1503
    const char* error_message;
    Object* code;
    int num_registers;
  };

1504
  static CompilationResult Compile(Isolate* isolate, Zone* zone,
1505 1506
                                   RegExpCompileData* input,
                                   JSRegExp::Flags flags,
1507
                                   Handle<String> pattern,
1508
                                   Handle<String> sample_subject,
1509
                                   bool is_one_byte);
1510

1511 1512
  static bool TooMuchRegExpCode(Handle<String> pattern);

1513
  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1514 1515 1516
};


1517 1518 1519 1520 1521 1522 1523
class RegExpResultsCache : public AllStatic {
 public:
  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };

  // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
  // On success, the returned result is guaranteed to be a COW-array.
  static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1524
                        FixedArray** last_match_out, ResultsCacheType type);
1525 1526 1527 1528
  // Attempt to add value_array to the cache specified by type.  On success,
  // value_array is turned into a COW-array.
  static void Enter(Isolate* isolate, Handle<String> key_string,
                    Handle<Object> key_pattern, Handle<FixedArray> value_array,
1529
                    Handle<FixedArray> last_match_cache, ResultsCacheType type);
1530 1531 1532 1533 1534 1535 1536 1537
  static void Clear(FixedArray* cache);
  static const int kRegExpResultsCacheSize = 0x100;

 private:
  static const int kArrayEntriesPerCacheEntry = 4;
  static const int kStringOffset = 0;
  static const int kPatternOffset = 1;
  static const int kArrayOffset = 2;
1538
  static const int kLastMatchOffset = 3;
1539 1540 1541 1542
};

}  // namespace internal
}  // namespace v8
1543

1544
#endif  // V8_REGEXP_JSREGEXP_H_