regexp-parser.cc 77.6 KB
Newer Older
1 2 3 4 5 6
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/regexp-parser.h"

7
#include "src/execution/isolate.h"
8
#include "src/regexp/property-sequences.h"
9
#include "src/regexp/regexp-ast.h"
10
#include "src/regexp/regexp-macro-assembler.h"
11
#include "src/regexp/regexp.h"
12
#include "src/strings/char-predicates-inl.h"
13 14
#include "src/utils/ostreams.h"
#include "src/utils/utils.h"
15
#include "src/zone/zone-list-inl.h"
16

17
#ifdef V8_INTL_SUPPORT
18
#include "unicode/uniset.h"
19
#endif  // V8_INTL_SUPPORT
20

21 22 23
namespace v8 {
namespace internal {

24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
namespace {

// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be nullptr pointers.
template <typename T, int initial_size>
class BufferedZoneList {
 public:
  BufferedZoneList() : list_(nullptr), last_(nullptr) {}

  // Adds element at end of list. This element is buffered and can
  // be read using last() or removed using RemoveLast until a new Add or until
  // RemoveLast or GetList has been called.
  void Add(T* value, Zone* zone) {
    if (last_ != nullptr) {
      if (list_ == nullptr) {
        list_ = zone->New<ZoneList<T*>>(initial_size, zone);
      }
      list_->Add(last_, zone);
    }
    last_ = value;
  }

  T* last() {
    DCHECK(last_ != nullptr);
    return last_;
  }

  T* RemoveLast() {
    DCHECK(last_ != nullptr);
    T* result = last_;
    if ((list_ != nullptr) && (list_->length() > 0))
      last_ = list_->RemoveLast();
    else
      last_ = nullptr;
    return result;
  }

  T* Get(int i) {
    DCHECK((0 <= i) && (i < length()));
    if (list_ == nullptr) {
      DCHECK_EQ(0, i);
      return last_;
    } else {
      if (i == list_->length()) {
        DCHECK(last_ != nullptr);
        return last_;
      } else {
        return list_->at(i);
      }
    }
  }

  void Clear() {
    list_ = nullptr;
    last_ = nullptr;
  }

  int length() {
    int length = (list_ == nullptr) ? 0 : list_->length();
    return length + ((last_ == nullptr) ? 0 : 1);
  }

  ZoneList<T*>* GetList(Zone* zone) {
    if (list_ == nullptr) {
      list_ = zone->New<ZoneList<T*>>(initial_size, zone);
    }
    if (last_ != nullptr) {
      list_->Add(last_, zone);
      last_ = nullptr;
    }
    return list_;
  }

 private:
  ZoneList<T*>* list_;
  T* last_;
};

// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
 public:
109
  RegExpBuilder(Zone* zone, RegExpFlags flags);
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
  void AddCharacter(base::uc16 character);
  void AddUnicodeCharacter(base::uc32 character);
  void AddEscapedUnicodeCharacter(base::uc32 character);
  // "Adds" an empty expression. Does nothing except consume a
  // following quantifier
  void AddEmpty();
  void AddCharacterClass(RegExpCharacterClass* cc);
  void AddCharacterClassForDesugaring(base::uc32 c);
  void AddAtom(RegExpTree* tree);
  void AddTerm(RegExpTree* tree);
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
  bool AddQuantifierToAtom(int min, int max,
                           RegExpQuantifier::QuantifierType type);
  void FlushText();
  RegExpTree* ToRegExp();
126
  RegExpFlags flags() const { return flags_; }
127

128 129 130
  bool ignore_case() const { return IsIgnoreCase(flags_); }
  bool multiline() const { return IsMultiline(flags_); }
  bool dotall() const { return IsDotAll(flags_); }
131 132 133 134 135 136 137 138 139 140 141

 private:
  static const base::uc16 kNoPendingSurrogate = 0;
  void AddLeadSurrogate(base::uc16 lead_surrogate);
  void AddTrailSurrogate(base::uc16 trail_surrogate);
  void FlushPendingSurrogate();
  void FlushCharacters();
  void FlushTerms();
  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
  bool NeedsDesugaringForIgnoreCase(base::uc32 c);
  Zone* zone() const { return zone_; }
142
  bool unicode() const { return IsUnicode(flags_); }
143

144
  Zone* const zone_;
145
  bool pending_empty_;
146
  const RegExpFlags flags_;
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
  ZoneList<base::uc16>* characters_;
  base::uc16 pending_surrogate_;
  BufferedZoneList<RegExpTree, 2> terms_;
  BufferedZoneList<RegExpTree, 2> text_;
  BufferedZoneList<RegExpTree, 2> alternatives_;
#ifdef DEBUG
  enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};

enum SubexpressionType {
  INITIAL,
  CAPTURE,  // All positive values represent captures.
  POSITIVE_LOOKAROUND,
  NEGATIVE_LOOKAROUND,
  GROUPING
};

class RegExpParserState : public ZoneObject {
 public:
  // Push a state on the stack.
  RegExpParserState(RegExpParserState* previous_state,
                    SubexpressionType group_type,
                    RegExpLookaround::Type lookaround_type,
                    int disjunction_capture_index,
                    const ZoneVector<base::uc16>* capture_name,
176
                    RegExpFlags flags, Zone* zone)
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
      : previous_state_(previous_state),
        builder_(zone->New<RegExpBuilder>(zone, flags)),
        group_type_(group_type),
        lookaround_type_(lookaround_type),
        disjunction_capture_index_(disjunction_capture_index),
        capture_name_(capture_name) {}
  // Parser state of containing expression, if any.
  RegExpParserState* previous_state() const { return previous_state_; }
  bool IsSubexpression() { return previous_state_ != nullptr; }
  // RegExpBuilder building this regexp's AST.
  RegExpBuilder* builder() const { return builder_; }
  // Type of regexp being parsed (parenthesized group or entire regexp).
  SubexpressionType group_type() const { return group_type_; }
  // Lookahead or Lookbehind.
  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
  // Index in captures array of first capture in this sub-expression, if any.
  // Also the capture index of this sub-expression itself, if group_type
  // is CAPTURE.
  int capture_index() const { return disjunction_capture_index_; }
  // The name of the current sub-expression, if group_type is CAPTURE. Only
  // used for named captures.
  const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }

  bool IsNamedCapture() const { return capture_name_ != nullptr; }

  // Check whether the parser is inside a capture group with the given index.
  bool IsInsideCaptureGroup(int index) const {
    for (const RegExpParserState* s = this; s != nullptr;
         s = s->previous_state()) {
      if (s->group_type() != CAPTURE) continue;
      // Return true if we found the matching capture index.
      if (index == s->capture_index()) return true;
      // Abort if index is larger than what has been parsed up till this state.
      if (index > s->capture_index()) return false;
    }
    return false;
  }

  // Check whether the parser is inside a capture group with the given name.
  bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
    DCHECK_NOT_NULL(name);
    for (const RegExpParserState* s = this; s != nullptr;
         s = s->previous_state()) {
      if (s->capture_name() == nullptr) continue;
      if (*s->capture_name() == *name) return true;
    }
    return false;
  }

 private:
  // Linked list implementation of stack of states.
  RegExpParserState* const previous_state_;
  // Builder for the stored disjunction.
  RegExpBuilder* const builder_;
  // Stored disjunction type (capture, look-ahead or grouping), if any.
  const SubexpressionType group_type_;
  // Stored read direction.
  const RegExpLookaround::Type lookaround_type_;
  // Stored disjunction's capture index (if any).
  const int disjunction_capture_index_;
  // Stored capture name (if any).
  const ZoneVector<base::uc16>* const capture_name_;
};

template <class CharT>
class RegExpParserImpl final {
 private:
244
  RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
245
                   uintptr_t stack_limit, Zone* zone,
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
                   const DisallowGarbageCollection& no_gc);

  bool Parse(RegExpCompileData* result);

  RegExpTree* ParsePattern();
  RegExpTree* ParseDisjunction();
  RegExpTree* ParseGroup();

  // Parses a {...,...} quantifier and stores the range in the given
  // out parameters.
  bool ParseIntervalQuantifier(int* min_out, int* max_out);

  // Parses and returns a single escaped character.  The character
  // must not be 'b' or 'B' since they are usually handle specially.
  base::uc32 ParseClassCharacterEscape();

  // Checks whether the following is a length-digit hexadecimal number,
  // and sets the value if it is.
  bool ParseHexEscape(int length, base::uc32* value);
  bool ParseUnicodeEscape(base::uc32* value);
  bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);

  bool ParsePropertyClassName(ZoneVector<char>* name_1,
                              ZoneVector<char>* name_2);
  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
                             const ZoneVector<char>& name_1,
                             const ZoneVector<char>& name_2);

  RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);

  base::uc32 ParseOctalLiteral();

  // Tries to parse the input as a back reference.  If successful it
  // stores the result in the output parameter and returns true.  If
  // it fails it will push back the characters read so the same characters
  // can be reparsed.
  bool ParseBackReferenceIndex(int* index_out);

  // Parse inside a class. Either add escaped class to the range, or return
  // false and pass parsed single character through |char_out|.
  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
                        bool add_unicode_case_equivalents, base::uc32* char_out,
                        bool* is_class_escape);

  char ParseClassEscape();

  RegExpTree* ReportError(RegExpError error);
  void Advance();
  void Advance(int dist);
  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.
  // Only use if the result of the parse is a single atom node.
  bool simple();
  bool contains_anchor() { return contains_anchor_; }
  void set_contains_anchor() { contains_anchor_ = true; }
  int captures_started() { return captures_started_; }
  int position() { return next_pos_ - 1; }
  bool failed() { return failed_; }
306
  bool unicode() const { return IsUnicode(top_level_flags_); }
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372

  static bool IsSyntaxCharacterOrSlash(base::uc32 c);

  static const base::uc32 kEndMarker = (1 << 21);

 private:
  // Return the 1-indexed RegExpCapture object, allocate if necessary.
  RegExpCapture* GetCapture(int index);

  // Creates a new named capture at the specified index. Must be called exactly
  // once for each named capture. Fails if a capture with the same name is
  // encountered.
  bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);

  // Parses the name of a capture group (?<name>pattern). The name must adhere
  // to IdentifierName in the ECMAScript standard.
  const ZoneVector<base::uc16>* ParseCaptureGroupName();

  bool ParseNamedBackReference(RegExpBuilder* builder,
                               RegExpParserState* state);
  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);

  // After the initial parsing pass, patch corresponding RegExpCapture objects
  // into all RegExpBackReferences. This is done after initial parsing in order
  // to avoid complicating cases in which references comes before the capture.
  void PatchNamedBackReferences();

  ZoneVector<RegExpCapture*>* GetNamedCaptures() const;

  // Returns true iff the pattern contains named captures. May call
  // ScanForCaptures to look ahead at the remaining pattern.
  bool HasNamedCaptures();

  Zone* zone() const { return zone_; }

  base::uc32 current() { return current_; }
  bool has_more() { return has_more_; }
  bool has_next() { return next_pos_ < input_length(); }
  base::uc32 Next();
  template <bool update_position>
  base::uc32 ReadNext();
  CharT InputAt(int index) const {
    DCHECK(0 <= index && index < input_length());
    return input_[index];
  }
  int input_length() const { return input_length_; }
  void ScanForCaptures();

  struct RegExpCaptureNameLess {
    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
      DCHECK_NOT_NULL(lhs);
      DCHECK_NOT_NULL(rhs);
      return *lhs->name() < *rhs->name();
    }
  };

  const DisallowGarbageCollection no_gc_;
  Zone* const zone_;
  RegExpError error_ = RegExpError::kNone;
  int error_pos_ = 0;
  ZoneList<RegExpCapture*>* captures_;
  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
  ZoneList<RegExpBackReference*>* named_back_references_;
  const CharT* const input_;
  const int input_length_;
  base::uc32 current_;
373
  const RegExpFlags top_level_flags_;
374 375 376 377 378 379 380 381 382
  int next_pos_;
  int captures_started_;
  int capture_count_;  // Only valid after we have scanned for captures.
  bool has_more_;
  bool simple_;
  bool contains_anchor_;
  bool is_scanned_for_captures_;
  bool has_named_captures_;  // Only valid after we have scanned for captures.
  bool failed_;
383
  const uintptr_t stack_limit_;
384 385 386

  friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*,
                                                      Handle<String>,
387
                                                      RegExpFlags,
388
                                                      RegExpCompileData*);
389 390 391
  friend bool RegExpParser::VerifyRegExpSyntax<CharT>(
      Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*,
      const DisallowGarbageCollection&);
392 393 394 395
};

template <class CharT>
RegExpParserImpl<CharT>::RegExpParserImpl(
396 397 398
    const CharT* input, int input_length, RegExpFlags flags,
    uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
    : zone_(zone),
399 400 401
      captures_(nullptr),
      named_captures_(nullptr),
      named_back_references_(nullptr),
402 403
      input_(input),
      input_length_(input_length),
404
      current_(kEndMarker),
405
      top_level_flags_(flags),
406 407 408 409 410 411 412
      next_pos_(0),
      captures_started_(0),
      capture_count_(0),
      has_more_(true),
      simple_(false),
      contains_anchor_(false),
      is_scanned_for_captures_(false),
413
      has_named_captures_(false),
414 415
      failed_(false),
      stack_limit_(stack_limit) {
416 417 418
  Advance();
}

419 420 421 422 423 424 425 426 427 428 429 430
template <>
template <bool update_position>
inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
  int position = next_pos_;
  base::uc16 c0 = InputAt(position);
  position++;
  DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
  if (update_position) next_pos_ = position;
  return c0;
}

template <>
431
template <bool update_position>
432
inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
433
  int position = next_pos_;
434 435
  base::uc16 c0 = InputAt(position);
  base::uc32 result = c0;
436
  position++;
437
  // Read the whole surrogate pair in case of unicode flag, if possible.
438 439 440
  if (unicode() && position < input_length() &&
      unibrow::Utf16::IsLeadSurrogate(c0)) {
    base::uc16 c1 = InputAt(position);
441
    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
442
      result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
443 444 445 446
      position++;
    }
  }
  if (update_position) next_pos_ = position;
447
  return result;
448 449
}

450 451
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::Next() {
452
  if (has_next()) {
453
    return ReadNext<false>();
454 455 456 457 458
  } else {
    return kEndMarker;
  }
}

459 460
template <class CharT>
void RegExpParserImpl<CharT>::Advance() {
461
  if (has_next()) {
462
    if (GetCurrentStackPosition() < stack_limit_) {
463
      if (FLAG_correctness_fuzzer_suppressions) {
464 465
        FATAL("Aborting on stack overflow");
      }
466
      ReportError(RegExpError::kStackOverflow);
467
    } else if (zone()->excess_allocation()) {
468 469 470
      if (FLAG_correctness_fuzzer_suppressions) {
        FATAL("Aborting on excess zone allocation");
      }
471
      ReportError(RegExpError::kTooLarge);
472
    } else {
473
      current_ = ReadNext<true>();
474 475 476 477 478
    }
  } else {
    current_ = kEndMarker;
    // Advance so that position() points to 1-after-the-last-character. This is
    // important so that Reset() to this position works correctly.
479
    next_pos_ = input_length() + 1;
480 481 482 483
    has_more_ = false;
  }
}

484 485
template <class CharT>
void RegExpParserImpl<CharT>::Reset(int pos) {
486
  next_pos_ = pos;
487
  has_more_ = (pos < input_length());
488 489 490
  Advance();
}

491 492
template <class CharT>
void RegExpParserImpl<CharT>::Advance(int dist) {
493
  next_pos_ += dist - 1;
494
  Advance();
495 496
}

497 498 499 500
template <class CharT>
bool RegExpParserImpl<CharT>::simple() {
  return simple_;
}
501

502 503
template <class CharT>
bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
  switch (c) {
    case '^':
    case '$':
    case '\\':
    case '.':
    case '*':
    case '+':
    case '?':
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
    case '|':
    case '/':
      return true;
    default:
      break;
  }
  return false;
525 526
}

527 528
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
529
  if (failed_) return nullptr;  // Do not overwrite any existing error.
530
  failed_ = true;
531 532 533
  error_ = error;
  error_pos_ = position();
  // Zip to the end to make sure no more input is read.
534
  current_ = kEndMarker;
535
  next_pos_ = input_length();
536
  return nullptr;
537 538
}

539 540
#define CHECK_FAILED /**/);    \
  if (failed_) return nullptr; \
541 542 543 544
  ((void)0

// Pattern ::
//   Disjunction
545 546
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
547
  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
548
  PatchNamedBackReferences(CHECK_FAILED);
549 550 551
  DCHECK(!has_more());
  // If the result of parsing is a literal string atom, and it has the
  // same length as the input, then the atom is identical to the input.
552
  if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
553 554 555 556 557 558 559 560 561 562 563 564 565 566 567
    simple_ = true;
  }
  return result;
}

// Disjunction ::
//   Alternative
//   Alternative | Disjunction
// Alternative ::
//   [empty]
//   Term Alternative
// Term ::
//   Assertion
//   Atom
//   Atom Quantifier
568 569
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
570
  // Used to store current state while parsing subexpressions.
571
  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
572
                                  0, nullptr, top_level_flags_, zone());
573 574 575 576 577 578 579 580
  RegExpParserState* state = &initial_state;
  // Cache the builder in a local variable for quick access.
  RegExpBuilder* builder = initial_state.builder();
  while (true) {
    switch (current()) {
      case kEndMarker:
        if (state->IsSubexpression()) {
          // Inside a parenthesized group when hitting end of input.
581
          return ReportError(RegExpError::kUnterminatedGroup);
582 583 584 585 586 587
        }
        DCHECK_EQ(INITIAL, state->group_type());
        // Parsing completed successfully.
        return builder->ToRegExp();
      case ')': {
        if (!state->IsSubexpression()) {
588
          return ReportError(RegExpError::kUnmatchedParen);
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
        }
        DCHECK_NE(INITIAL, state->group_type());

        Advance();
        // End disjunction parsing and convert builder content to new single
        // regexp atom.
        RegExpTree* body = builder->ToRegExp();

        int end_capture_index = captures_started();

        int capture_index = state->capture_index();
        SubexpressionType group_type = state->group_type();

        // Build result of subexpression.
        if (group_type == CAPTURE) {
604 605 606 607
          if (state->IsNamedCapture()) {
            CreateNamedCaptureAtIndex(state->capture_name(),
                                      capture_index CHECK_FAILED);
          }
608 609 610
          RegExpCapture* capture = GetCapture(capture_index);
          capture->set_body(body);
          body = capture;
611
        } else if (group_type == GROUPING) {
612
          body = zone()->template New<RegExpGroup>(body);
613
        } else {
614 615 616
          DCHECK(group_type == POSITIVE_LOOKAROUND ||
                 group_type == NEGATIVE_LOOKAROUND);
          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
617
          body = zone()->template New<RegExpLookaround>(
618 619 620 621 622 623 624 625 626
              body, is_positive, end_capture_index - capture_index,
              capture_index, state->lookaround_type());
        }

        // Restore previous state.
        state = state->previous_state();
        builder = state->builder();

        builder->AddAtom(body);
627
        // For compatibility with JSC and ES3, we allow quantifiers after
628 629 630 631 632 633 634 635 636 637 638
        // lookaheads, and break in all cases.
        break;
      }
      case '|': {
        Advance();
        builder->NewAlternative();
        continue;
      }
      case '*':
      case '+':
      case '?':
639
        return ReportError(RegExpError::kNothingToRepeat);
640 641
      case '^': {
        Advance();
642
        builder->AddAssertion(zone()->template New<RegExpAssertion>(
643 644 645
            builder->multiline() ? RegExpAssertion::START_OF_LINE
                                 : RegExpAssertion::START_OF_INPUT));
        set_contains_anchor();
646 647 648 649 650
        continue;
      }
      case '$': {
        Advance();
        RegExpAssertion::AssertionType assertion_type =
651 652
            builder->multiline() ? RegExpAssertion::END_OF_LINE
                                 : RegExpAssertion::END_OF_INPUT;
653 654
        builder->AddAssertion(
            zone()->template New<RegExpAssertion>(assertion_type));
655 656 657 658 659
        continue;
      }
      case '.': {
        Advance();
        ZoneList<CharacterRange>* ranges =
660
            zone()->template New<ZoneList<CharacterRange>>(2, zone());
661

662
        if (builder->dotall()) {
663 664 665
          // Everything.
          CharacterRange::AddClassEscape('*', ranges, false, zone());
        } else {
666
          // Everything except \x0A, \x0D, \u2028 and \u2029
667 668 669
          CharacterRange::AddClassEscape('.', ranges, false, zone());
        }

670
        RegExpCharacterClass* cc =
671
            zone()->template New<RegExpCharacterClass>(zone(), ranges);
672
        builder->AddCharacterClass(cc);
673 674 675
        break;
      }
      case '(': {
676
        state = ParseOpenParenthesis(state CHECK_FAILED);
677 678 679 680
        builder = state->builder();
        continue;
      }
      case '[': {
681
        RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
682
        builder->AddCharacterClass(cc->AsCharacterClass());
683 684 685 686 687 688 689
        break;
      }
      // Atom ::
      //   \ AtomEscape
      case '\\':
        switch (Next()) {
          case kEndMarker:
690
            return ReportError(RegExpError::kEscapeAtEndOfPattern);
691 692
          case 'b':
            Advance(2);
693 694
            builder->AddAssertion(zone()->template New<RegExpAssertion>(
                RegExpAssertion::BOUNDARY));
695 696 697
            continue;
          case 'B':
            Advance(2);
698 699
            builder->AddAssertion(zone()->template New<RegExpAssertion>(
                RegExpAssertion::NON_BOUNDARY));
700 701 702 703 704 705 706 707 708 709 710 711
            continue;
          // AtomEscape ::
          //   CharacterClassEscape
          //
          // CharacterClassEscape :: one of
          //   d D s S w W
          case 'd':
          case 'D':
          case 's':
          case 'S':
          case 'w':
          case 'W': {
712
            base::uc32 c = Next();
713 714
            Advance(2);
            ZoneList<CharacterRange>* ranges =
715
                zone()->template New<ZoneList<CharacterRange>>(2, zone());
716 717
            CharacterRange::AddClassEscape(
                c, ranges, unicode() && builder->ignore_case(), zone());
718
            RegExpCharacterClass* cc =
719
                zone()->template New<RegExpCharacterClass>(zone(), ranges);
720
            builder->AddCharacterClass(cc);
721 722
            break;
          }
723 724
          case 'p':
          case 'P': {
725
            base::uc32 p = Next();
726 727
            Advance(2);
            if (unicode()) {
728
              ZoneList<CharacterRange>* ranges =
729
                  zone()->template New<ZoneList<CharacterRange>>(2, zone());
730 731
              ZoneVector<char> name_1(zone());
              ZoneVector<char> name_2(zone());
732 733
              if (ParsePropertyClassName(&name_1, &name_2)) {
                if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
734
                  RegExpCharacterClass* cc =
735 736
                      zone()->template New<RegExpCharacterClass>(zone(),
                                                                 ranges);
737 738 739 740 741 742 743 744 745 746
                  builder->AddCharacterClass(cc);
                  break;
                }
                if (p == 'p' && name_2.empty()) {
                  RegExpTree* sequence = GetPropertySequence(name_1);
                  if (sequence != nullptr) {
                    builder->AddAtom(sequence);
                    break;
                  }
                }
747
              }
748
              return ReportError(RegExpError::kInvalidPropertyName);
749 750 751 752 753
            } else {
              builder->AddCharacter(p);
            }
            break;
          }
754 755 756 757 758 759 760 761 762 763
          case '1':
          case '2':
          case '3':
          case '4':
          case '5':
          case '6':
          case '7':
          case '8':
          case '9': {
            int index = 0;
764 765
            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
            if (is_backref) {
766 767 768 769 770 771 772 773 774
              if (state->IsInsideCaptureGroup(index)) {
                // The back reference is inside the capture group it refers to.
                // Nothing can possibly have been captured yet, so we use empty
                // instead. This ensures that, when checking a back reference,
                // the capture registers of the referenced capture are either
                // both set or both cleared.
                builder->AddEmpty();
              } else {
                RegExpCapture* capture = GetCapture(index);
775
                RegExpTree* atom = zone()->template New<RegExpBackReference>(
776
                    capture, builder->flags());
777 778 779 780
                builder->AddAtom(atom);
              }
              break;
            }
781 782 783
            // With /u, no identity escapes except for syntax characters
            // are allowed. Otherwise, all identity escapes are allowed.
            if (unicode()) {
784
              return ReportError(RegExpError::kInvalidEscape);
785
            }
786
            base::uc32 first_digit = Next();
787
            if (first_digit == '8' || first_digit == '9') {
788 789
              builder->AddCharacter(first_digit);
              Advance(2);
790 791
              break;
            }
792
            V8_FALLTHROUGH;
793 794 795
          }
          case '0': {
            Advance();
796 797
            if (unicode() && Next() >= '0' && Next() <= '9') {
              // With /u, decimal escape with leading 0 are not parsed as octal.
798
              return ReportError(RegExpError::kInvalidDecimalEscape);
799
            }
800
            base::uc32 octal = ParseOctalLiteral();
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827
            builder->AddCharacter(octal);
            break;
          }
          // ControlEscape :: one of
          //   f n r t v
          case 'f':
            Advance(2);
            builder->AddCharacter('\f');
            break;
          case 'n':
            Advance(2);
            builder->AddCharacter('\n');
            break;
          case 'r':
            Advance(2);
            builder->AddCharacter('\r');
            break;
          case 't':
            Advance(2);
            builder->AddCharacter('\t');
            break;
          case 'v':
            Advance(2);
            builder->AddCharacter('\v');
            break;
          case 'c': {
            Advance();
828
            base::uc32 controlLetter = Next();
829 830
            // Special case if it is an ASCII letter.
            // Convert lower case letters to uppercase.
831
            base::uc32 letter = controlLetter & ~('a' ^ 'A');
832 833
            if (letter < 'A' || 'Z' < letter) {
              // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
834 835 836
              // Read the backslash as a literal character instead of as
              // starting an escape.
              // ES#prod-annexB-ExtendedPatternCharacter
837 838
              if (unicode()) {
                // With /u, invalid escapes are not treated as identity escapes.
839
                return ReportError(RegExpError::kInvalidUnicodeEscape);
840
              }
841 842 843
              builder->AddCharacter('\\');
            } else {
              Advance(2);
844
              builder->AddCharacter(controlLetter & 0x1F);
845 846 847 848 849
            }
            break;
          }
          case 'x': {
            Advance(2);
850
            base::uc32 value;
851 852
            if (ParseHexEscape(2, &value)) {
              builder->AddCharacter(value);
853
            } else if (!unicode()) {
854 855
              builder->AddCharacter('x');
            } else {
856
              // With /u, invalid escapes are not treated as identity escapes.
857
              return ReportError(RegExpError::kInvalidEscape);
858 859 860 861 862
            }
            break;
          }
          case 'u': {
            Advance(2);
863
            base::uc32 value;
864
            if (ParseUnicodeEscape(&value)) {
865
              builder->AddEscapedUnicodeCharacter(value);
866
            } else if (!unicode()) {
867 868
              builder->AddCharacter('u');
            } else {
869
              // With /u, invalid escapes are not treated as identity escapes.
870
              return ReportError(RegExpError::kInvalidUnicodeEscape);
871 872 873
            }
            break;
          }
874
          case 'k':
875 876
            // Either an identity escape or a named back-reference.  The two
            // interpretations are mutually exclusive: '\k' is interpreted as
877
            // an identity escape for non-Unicode patterns without named
878 879
            // capture groups, and as the beginning of a named back-reference
            // in all other cases.
880
            if (unicode() || HasNamedCaptures()) {
881 882 883 884
              Advance(2);
              ParseNamedBackReference(builder, state CHECK_FAILED);
              break;
            }
885
            V8_FALLTHROUGH;
886 887
          default:
            Advance();
888 889 890
            // With /u, no identity escapes except for syntax characters
            // are allowed. Otherwise, all identity escapes are allowed.
            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
891 892 893
              builder->AddCharacter(current());
              Advance();
            } else {
894
              return ReportError(RegExpError::kInvalidEscape);
895 896 897 898 899 900
            }
            break;
        }
        break;
      case '{': {
        int dummy;
901
        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
902
        if (parsed) return ReportError(RegExpError::kNothingToRepeat);
903
        V8_FALLTHROUGH;
904
      }
905 906 907
      case '}':
      case ']':
        if (unicode()) {
908
          return ReportError(RegExpError::kLoneQuantifierBrackets);
909
        }
910
        V8_FALLTHROUGH;
911
      default:
912
        builder->AddUnicodeCharacter(current());
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942
        Advance();
        break;
    }  // end switch(current())

    int min;
    int max;
    switch (current()) {
      // QuantifierPrefix ::
      //   *
      //   +
      //   ?
      //   {
      case '*':
        min = 0;
        max = RegExpTree::kInfinity;
        Advance();
        break;
      case '+':
        min = 1;
        max = RegExpTree::kInfinity;
        Advance();
        break;
      case '?':
        min = 0;
        max = 1;
        Advance();
        break;
      case '{':
        if (ParseIntervalQuantifier(&min, &max)) {
          if (max < min) {
943
            return ReportError(RegExpError::kRangeOutOfOrder);
944 945
          }
          break;
946 947
        } else if (unicode()) {
          // With /u, incomplete quantifiers are not allowed.
948
          return ReportError(RegExpError::kIncompleteQuantifier);
949
        }
950
        continue;
951 952 953 954 955 956 957 958 959 960 961 962
      default:
        continue;
    }
    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
    if (current() == '?') {
      quantifier_type = RegExpQuantifier::NON_GREEDY;
      Advance();
    } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
      // FLAG_regexp_possessive_quantifier is a debug-only flag.
      quantifier_type = RegExpQuantifier::POSSESSIVE;
      Advance();
    }
963
    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
964
      return ReportError(RegExpError::kInvalidQuantifier);
965
    }
966 967 968
  }
}

969 970
template <class CharT>
RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
971 972 973
    RegExpParserState* state) {
  RegExpLookaround::Type lookaround_type = state->lookaround_type();
  bool is_named_capture = false;
974
  const ZoneVector<base::uc16>* capture_name = nullptr;
975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
  SubexpressionType subexpr_type = CAPTURE;
  Advance();
  if (current() == '?') {
    switch (Next()) {
      case ':':
        Advance(2);
        subexpr_type = GROUPING;
        break;
      case '=':
        Advance(2);
        lookaround_type = RegExpLookaround::LOOKAHEAD;
        subexpr_type = POSITIVE_LOOKAROUND;
        break;
      case '!':
        Advance(2);
        lookaround_type = RegExpLookaround::LOOKAHEAD;
        subexpr_type = NEGATIVE_LOOKAROUND;
        break;
      case '<':
        Advance();
995 996 997 998 999 1000 1001 1002 1003 1004
        if (Next() == '=') {
          Advance(2);
          lookaround_type = RegExpLookaround::LOOKBEHIND;
          subexpr_type = POSITIVE_LOOKAROUND;
          break;
        } else if (Next() == '!') {
          Advance(2);
          lookaround_type = RegExpLookaround::LOOKBEHIND;
          subexpr_type = NEGATIVE_LOOKAROUND;
          break;
1005
        }
1006 1007 1008 1009
        is_named_capture = true;
        has_named_captures_ = true;
        Advance();
        break;
1010
      default:
1011
        ReportError(RegExpError::kInvalidGroup);
1012 1013 1014 1015
        return nullptr;
    }
  }
  if (subexpr_type == CAPTURE) {
1016
    if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) {
1017
      ReportError(RegExpError::kTooManyCaptures);
1018 1019 1020 1021 1022 1023 1024 1025 1026
      return nullptr;
    }
    captures_started_++;

    if (is_named_capture) {
      capture_name = ParseCaptureGroupName(CHECK_FAILED);
    }
  }
  // Store current state and begin new disjunction parsing.
1027 1028
  return zone()->template New<RegExpParserState>(
      state, subexpr_type, lookaround_type, captures_started_, capture_name,
1029
      state->builder()->flags(), zone());
1030
}
1031 1032 1033

#ifdef DEBUG
// Currently only used in an DCHECK.
1034
static bool IsSpecialClassEscape(base::uc32 c) {
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054
  switch (c) {
    case 'd':
    case 'D':
    case 's':
    case 'S':
    case 'w':
    case 'W':
      return true;
    default:
      return false;
  }
}
#endif

// In order to know whether an escape is a backreference or not we have to scan
// the entire regexp and find the number of capturing parentheses.  However we
// don't want to scan the regexp twice unless it is necessary.  This mini-parser
// is called when needed.  It can see the difference between capturing and
// noncapturing parentheses and can skip character classes and backslash-escaped
// characters.
1055 1056
template <class CharT>
void RegExpParserImpl<CharT>::ScanForCaptures() {
1057 1058
  DCHECK(!is_scanned_for_captures_);
  const int saved_position = position();
1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
  // Start with captures started previous to current position
  int capture_count = captures_started();
  // Add count of captures after this position.
  int n;
  while ((n = current()) != kEndMarker) {
    Advance();
    switch (n) {
      case '\\':
        Advance();
        break;
      case '[': {
        int c;
        while ((c = current()) != kEndMarker) {
          Advance();
          if (c == '\\') {
            Advance();
          } else {
            if (c == ']') break;
          }
        }
        break;
      }
      case '(':
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092
        if (current() == '?') {
          // At this point we could be in
          // * a non-capturing group '(:',
          // * a lookbehind assertion '(?<=' '(?<!'
          // * or a named capture '(?<'.
          //
          // Of these, only named captures are capturing groups.

          Advance();
          if (current() != '<') break;

1093 1094
          Advance();
          if (current() == '=' || current() == '!') break;
1095 1096 1097 1098 1099

          // Found a possible named capture. It could turn out to be a syntax
          // error (e.g. an unterminated or invalid name), but that distinction
          // does not matter for our purposes.
          has_named_captures_ = true;
1100 1101
        }
        capture_count++;
1102 1103 1104 1105 1106
        break;
    }
  }
  capture_count_ = capture_count;
  is_scanned_for_captures_ = true;
1107
  Reset(saved_position);
1108 1109
}

1110 1111
template <class CharT>
bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
1112 1113 1114 1115 1116 1117 1118 1119
  DCHECK_EQ('\\', current());
  DCHECK('1' <= Next() && Next() <= '9');
  // Try to parse a decimal literal that is no greater than the total number
  // of left capturing parentheses in the input.
  int start = position();
  int value = Next() - '0';
  Advance(2);
  while (true) {
1120
    base::uc32 c = current();
1121 1122
    if (IsDecimalDigit(c)) {
      value = 10 * value + (c - '0');
1123
      if (value > RegExpMacroAssembler::kMaxRegisterCount) {
1124 1125 1126 1127 1128 1129 1130 1131 1132
        Reset(start);
        return false;
      }
      Advance();
    } else {
      break;
    }
  }
  if (value > captures_started()) {
1133
    if (!is_scanned_for_captures_) ScanForCaptures();
1134 1135 1136 1137 1138 1139 1140 1141 1142
    if (value > capture_count_) {
      Reset(start);
      return false;
    }
  }
  *index_out = value;
  return true;
}

1143 1144 1145
namespace {

void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
1146 1147 1148 1149 1150 1151 1152 1153
  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
    v->push_back(code_unit);
  } else {
    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
  }
}

1154 1155 1156 1157 1158 1159
}  // namespace

template <class CharT>
const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
  ZoneVector<base::uc16>* name =
      zone()->template New<ZoneVector<base::uc16>>(zone());
1160 1161 1162

  bool at_start = true;
  while (true) {
1163
    base::uc32 c = current();
1164
    Advance();
1165 1166 1167

    // Convert unicode escapes.
    if (c == '\\' && current() == 'u') {
1168
      Advance();
1169
      if (!ParseUnicodeEscape(&c)) {
1170
        ReportError(RegExpError::kInvalidUnicodeEscape);
1171 1172 1173 1174
        return nullptr;
      }
    }

1175 1176
    // The backslash char is misclassified as both ID_Start and ID_Continue.
    if (c == '\\') {
1177
      ReportError(RegExpError::kInvalidCaptureGroupName);
1178 1179 1180
      return nullptr;
    }

1181
    if (at_start) {
1182
      if (!IsIdentifierStart(c)) {
1183
        ReportError(RegExpError::kInvalidCaptureGroupName);
1184 1185 1186 1187 1188 1189 1190
        return nullptr;
      }
      push_code_unit(name, c);
      at_start = false;
    } else {
      if (c == '>') {
        break;
1191
      } else if (IsIdentifierPart(c)) {
1192 1193
        push_code_unit(name, c);
      } else {
1194
        ReportError(RegExpError::kInvalidCaptureGroupName);
1195 1196 1197 1198 1199 1200 1201 1202
        return nullptr;
      }
    }
  }

  return name;
}

1203 1204 1205
template <class CharT>
bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
    const ZoneVector<base::uc16>* name, int index) {
1206 1207 1208
  DCHECK(0 < index && index <= captures_started_);
  DCHECK_NOT_NULL(name);

1209 1210 1211 1212 1213
  RegExpCapture* capture = GetCapture(index);
  DCHECK_NULL(capture->name());

  capture->set_name(name);

1214
  if (named_captures_ == nullptr) {
1215
    named_captures_ =
1216 1217
        zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
            zone());
1218 1219
  } else {
    // Check for duplicates and bail if we find any.
1220 1221 1222

    const auto& named_capture_it = named_captures_->find(capture);
    if (named_capture_it != named_captures_->end()) {
1223
      ReportError(RegExpError::kDuplicateCaptureGroupName);
1224
      return false;
1225 1226 1227
    }
  }

1228
  named_captures_->emplace(capture);
1229 1230 1231 1232

  return true;
}

1233 1234 1235
template <class CharT>
bool RegExpParserImpl<CharT>::ParseNamedBackReference(
    RegExpBuilder* builder, RegExpParserState* state) {
1236 1237
  // The parser is assumed to be on the '<' in \k<name>.
  if (current() != '<') {
1238
    ReportError(RegExpError::kInvalidNamedReference);
1239 1240 1241
    return false;
  }

1242
  Advance();
1243
  const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1244 1245 1246 1247 1248 1249 1250
  if (name == nullptr) {
    return false;
  }

  if (state->IsInsideCaptureGroup(name)) {
    builder->AddEmpty();
  } else {
1251 1252
    RegExpBackReference* atom =
        zone()->template New<RegExpBackReference>(builder->flags());
1253 1254 1255 1256 1257 1258
    atom->set_name(name);

    builder->AddAtom(atom);

    if (named_back_references_ == nullptr) {
      named_back_references_ =
1259
          zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1260 1261 1262 1263 1264 1265 1266
    }
    named_back_references_->Add(atom, zone());
  }

  return true;
}

1267 1268
template <class CharT>
void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1269 1270 1271
  if (named_back_references_ == nullptr) return;

  if (named_captures_ == nullptr) {
1272
    ReportError(RegExpError::kInvalidNamedCaptureReference);
1273 1274 1275 1276 1277 1278 1279 1280
    return;
  }

  // Look up and patch the actual capture for each named back reference.

  for (int i = 0; i < named_back_references_->length(); i++) {
    RegExpBackReference* ref = named_back_references_->at(i);

1281 1282 1283
    // Capture used to search the named_captures_ by name, index of the
    // capture is never used.
    static const int kInvalidIndex = 0;
1284 1285
    RegExpCapture* search_capture =
        zone()->template New<RegExpCapture>(kInvalidIndex);
1286 1287
    DCHECK_NULL(search_capture->name());
    search_capture->set_name(ref->name());
1288

1289 1290 1291 1292 1293
    int index = -1;
    const auto& capture_it = named_captures_->find(search_capture);
    if (capture_it != named_captures_->end()) {
      index = (*capture_it)->index();
    } else {
1294
      ReportError(RegExpError::kInvalidNamedCaptureReference);
1295 1296 1297 1298 1299 1300
      return;
    }

    ref->set_capture(GetCapture(index));
  }
}
1301

1302 1303
template <class CharT>
RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
1304 1305 1306 1307 1308
  // The index for the capture groups are one-based. Its index in the list is
  // zero-based.
  int know_captures =
      is_scanned_for_captures_ ? capture_count_ : captures_started_;
  DCHECK(index <= know_captures);
1309
  if (captures_ == nullptr) {
1310 1311
    captures_ =
        zone()->template New<ZoneList<RegExpCapture*>>(know_captures, zone());
1312 1313
  }
  while (captures_->length() < know_captures) {
1314 1315
    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
                   zone());
1316 1317 1318 1319
  }
  return captures_->at(index - 1);
}

1320 1321
template <class CharT>
ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
1322
  if (named_captures_ == nullptr || named_captures_->empty()) {
1323
    return nullptr;
1324
  }
1325

1326
  return zone()->template New<ZoneVector<RegExpCapture*>>(
1327
      named_captures_->begin(), named_captures_->end(), zone());
1328
}
1329

1330 1331
template <class CharT>
bool RegExpParserImpl<CharT>::HasNamedCaptures() {
1332 1333 1334 1335 1336 1337 1338 1339 1340
  if (has_named_captures_ || is_scanned_for_captures_) {
    return has_named_captures_;
  }

  ScanForCaptures();
  DCHECK(is_scanned_for_captures_);
  return has_named_captures_;
}

1341 1342 1343 1344 1345 1346 1347
// QuantifierPrefix ::
//   { DecimalDigits }
//   { DecimalDigits , }
//   { DecimalDigits , DecimalDigits }
//
// Returns true if parsing succeeds, and set the min_out and max_out
// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1348 1349 1350
template <class CharT>
bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
                                                      int* max_out) {
1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408
  DCHECK_EQ(current(), '{');
  int start = position();
  Advance();
  int min = 0;
  if (!IsDecimalDigit(current())) {
    Reset(start);
    return false;
  }
  while (IsDecimalDigit(current())) {
    int next = current() - '0';
    if (min > (RegExpTree::kInfinity - next) / 10) {
      // Overflow. Skip past remaining decimal digits and return -1.
      do {
        Advance();
      } while (IsDecimalDigit(current()));
      min = RegExpTree::kInfinity;
      break;
    }
    min = 10 * min + next;
    Advance();
  }
  int max = 0;
  if (current() == '}') {
    max = min;
    Advance();
  } else if (current() == ',') {
    Advance();
    if (current() == '}') {
      max = RegExpTree::kInfinity;
      Advance();
    } else {
      while (IsDecimalDigit(current())) {
        int next = current() - '0';
        if (max > (RegExpTree::kInfinity - next) / 10) {
          do {
            Advance();
          } while (IsDecimalDigit(current()));
          max = RegExpTree::kInfinity;
          break;
        }
        max = 10 * max + next;
        Advance();
      }
      if (current() != '}') {
        Reset(start);
        return false;
      }
      Advance();
    }
  } else {
    Reset(start);
    return false;
  }
  *min_out = min;
  *max_out = max;
  return true;
}

1409 1410
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1411 1412 1413
  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
  // For compatibility with some other browsers (not all), we parse
  // up to three octal digits with a value below 256.
1414
  // ES#prod-annexB-LegacyOctalEscapeSequence
1415
  base::uc32 value = current() - '0';
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427
  Advance();
  if ('0' <= current() && current() <= '7') {
    value = value * 8 + current() - '0';
    Advance();
    if (value < 32 && '0' <= current() && current() <= '7') {
      value = value * 8 + current() - '0';
      Advance();
    }
  }
  return value;
}

1428 1429
template <class CharT>
bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
1430
  int start = position();
1431
  base::uc32 val = 0;
1432
  for (int i = 0; i < length; ++i) {
1433 1434
    base::uc32 c = current();
    int d = base::HexValue(c);
1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445
    if (d < 0) {
      Reset(start);
      return false;
    }
    val = val * 16 + d;
    Advance();
  }
  *value = val;
  return true;
}

1446
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1447 1448
template <class CharT>
bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1449 1450 1451
  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
  // allowed). In the latter case, the number of hex digits between { } is
  // arbitrary. \ and u have already been read.
1452
  if (current() == '{' && unicode()) {
1453 1454
    int start = position();
    Advance();
1455
    if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1456 1457 1458 1459 1460 1461 1462 1463 1464
      if (current() == '}') {
        Advance();
        return true;
      }
    }
    Reset(start);
    return false;
  }
  // \u but no {, or \u{...} escapes not allowed.
1465 1466 1467 1468 1469 1470 1471
  bool result = ParseHexEscape(4, value);
  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
      current() == '\\') {
    // Attempt to read trail surrogate.
    int start = position();
    if (Next() == 'u') {
      Advance(2);
1472
      base::uc32 trail;
1473 1474
      if (ParseHexEscape(4, &trail) &&
          unibrow::Utf16::IsTrailSurrogate(trail)) {
1475 1476
        *value = unibrow::Utf16::CombineSurrogatePair(
            static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
1477 1478 1479 1480 1481 1482
        return true;
      }
    }
    Reset(start);
  }
  return result;
1483 1484
}

1485
#ifdef V8_INTL_SUPPORT
1486 1487 1488

namespace {

1489 1490
bool IsExactPropertyAlias(const char* property_name, UProperty property) {
  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1491 1492
  if (short_name != nullptr && strcmp(property_name, short_name) == 0)
    return true;
1493 1494 1495
  for (int i = 0;; i++) {
    const char* long_name = u_getPropertyName(
        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1496
    if (long_name == nullptr) break;
1497 1498 1499 1500 1501 1502 1503
    if (strcmp(property_name, long_name) == 0) return true;
  }
  return false;
}

bool IsExactPropertyValueAlias(const char* property_value_name,
                               UProperty property, int32_t property_value) {
1504 1505
  const char* short_name =
      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1506
  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1507 1508
    return true;
  }
1509 1510 1511 1512
  for (int i = 0;; i++) {
    const char* long_name = u_getPropertyValueName(
        property, property_value,
        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1513
    if (long_name == nullptr) break;
1514
    if (strcmp(property_value_name, long_name) == 0) return true;
1515 1516 1517 1518
  }
  return false;
}

1519 1520 1521
bool LookupPropertyValueName(UProperty property,
                             const char* property_value_name, bool negate,
                             ZoneList<CharacterRange>* result, Zone* zone) {
1522 1523 1524 1525 1526 1527
  UProperty property_for_lookup = property;
  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
    // For the property Script_Extensions, we have to do the property value
    // name lookup as if the property is Script.
    property_for_lookup = UCHAR_SCRIPT;
  }
1528
  int32_t property_value =
1529
      u_getPropertyValueEnum(property_for_lookup, property_value_name);
1530 1531
  if (property_value == UCHAR_INVALID_CODE) return false;

1532 1533
  // We require the property name to match exactly to one of the property value
  // aliases. However, u_getPropertyValueEnum uses loose matching.
1534
  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1535
                                 property_value)) {
1536 1537 1538
    return false;
  }

1539
  UErrorCode ec = U_ZERO_ERROR;
1540 1541 1542
  icu::UnicodeSet set;
  set.applyIntPropertyValue(property, property_value, ec);
  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1543 1544

  if (success) {
1545 1546 1547 1548 1549 1550
    set.removeAllStrings();
    if (negate) set.complement();
    for (int i = 0; i < set.getRangeCount(); i++) {
      result->Add(
          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
          zone);
1551 1552 1553 1554 1555
    }
  }
  return success;
}

1556 1557 1558 1559 1560
template <size_t N>
inline bool NameEquals(const char* name, const char (&literal)[N]) {
  return strncmp(name, literal, N + 1) == 0;
}

1561 1562 1563
bool LookupSpecialPropertyValueName(const char* name,
                                    ZoneList<CharacterRange>* result,
                                    bool negate, Zone* zone) {
1564
  if (NameEquals(name, "Any")) {
1565 1566 1567 1568 1569 1570
    if (negate) {
      // Leave the list of character ranges empty, since the negation of 'Any'
      // is the empty set.
    } else {
      result->Add(CharacterRange::Everything(), zone);
    }
1571 1572
  } else if (NameEquals(name, "ASCII")) {
    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1573
                       : CharacterRange::Range(0x0, 0x7F),
1574 1575
                zone);
  } else if (NameEquals(name, "Assigned")) {
1576 1577
    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
                                   !negate, result, zone);
1578 1579 1580 1581 1582 1583
  } else {
    return false;
  }
  return true;
}

Dan Elphick's avatar
Dan Elphick committed
1584
// Explicitly allowlist supported binary properties. The spec forbids supporting
1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607
// properties outside of this set to ensure interoperability.
bool IsSupportedBinaryProperty(UProperty property) {
  switch (property) {
    case UCHAR_ALPHABETIC:
    // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
    // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
    case UCHAR_ASCII_HEX_DIGIT:
    // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
    case UCHAR_BIDI_CONTROL:
    case UCHAR_BIDI_MIRRORED:
    case UCHAR_CASE_IGNORABLE:
    case UCHAR_CASED:
    case UCHAR_CHANGES_WHEN_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
    case UCHAR_CHANGES_WHEN_LOWERCASED:
    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    case UCHAR_CHANGES_WHEN_TITLECASED:
    case UCHAR_CHANGES_WHEN_UPPERCASED:
    case UCHAR_DASH:
    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    case UCHAR_DEPRECATED:
    case UCHAR_DIACRITIC:
    case UCHAR_EMOJI:
1608
    case UCHAR_EMOJI_COMPONENT:
1609 1610 1611
    case UCHAR_EMOJI_MODIFIER_BASE:
    case UCHAR_EMOJI_MODIFIER:
    case UCHAR_EMOJI_PRESENTATION:
Jungshik Shin's avatar
Jungshik Shin committed
1612
    case UCHAR_EXTENDED_PICTOGRAPHIC:
1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630
    case UCHAR_EXTENDER:
    case UCHAR_GRAPHEME_BASE:
    case UCHAR_GRAPHEME_EXTEND:
    case UCHAR_HEX_DIGIT:
    case UCHAR_ID_CONTINUE:
    case UCHAR_ID_START:
    case UCHAR_IDEOGRAPHIC:
    case UCHAR_IDS_BINARY_OPERATOR:
    case UCHAR_IDS_TRINARY_OPERATOR:
    case UCHAR_JOIN_CONTROL:
    case UCHAR_LOGICAL_ORDER_EXCEPTION:
    case UCHAR_LOWERCASE:
    case UCHAR_MATH:
    case UCHAR_NONCHARACTER_CODE_POINT:
    case UCHAR_PATTERN_SYNTAX:
    case UCHAR_PATTERN_WHITE_SPACE:
    case UCHAR_QUOTATION_MARK:
    case UCHAR_RADICAL:
1631
    case UCHAR_REGIONAL_INDICATOR:
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647
    case UCHAR_S_TERM:
    case UCHAR_SOFT_DOTTED:
    case UCHAR_TERMINAL_PUNCTUATION:
    case UCHAR_UNIFIED_IDEOGRAPH:
    case UCHAR_UPPERCASE:
    case UCHAR_VARIATION_SELECTOR:
    case UCHAR_WHITE_SPACE:
    case UCHAR_XID_CONTINUE:
    case UCHAR_XID_START:
      return true;
    default:
      break;
  }
  return false;
}

1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660
bool IsUnicodePropertyValueCharacter(char c) {
  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
  //
  // Note that using this to validate each parsed char is quite conservative.
  // A possible alternative solution would be to only ensure the parsed
  // property name/value candidate string does not contain '\0' characters and
  // let ICU lookups trigger the final failure.
  if ('a' <= c && c <= 'z') return true;
  if ('A' <= c && c <= 'Z') return true;
  if ('0' <= c && c <= '9') return true;
  return (c == '_');
}

1661
}  // namespace
1662

1663 1664 1665
template <class CharT>
bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
                                                     ZoneVector<char>* name_2) {
1666 1667
  DCHECK(name_1->empty());
  DCHECK(name_2->empty());
1668 1669 1670 1671 1672 1673 1674 1675
  // Parse the property class as follows:
  // - In \p{name}, 'name' is interpreted
  //   - either as a general category property value name.
  //   - or as a binary property name.
  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
  //   and 'value' is interpreted as one of the available property value names.
  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
  // - Loose matching is not applied.
1676
  if (current() == '{') {
1677 1678
    // Parse \p{[PropertyName=]PropertyNameValue}
    for (Advance(); current() != '}' && current() != '='; Advance()) {
1679
      if (!IsUnicodePropertyValueCharacter(current())) return false;
1680
      if (!has_next()) return false;
1681
      name_1->push_back(static_cast<char>(current()));
1682 1683 1684
    }
    if (current() == '=') {
      for (Advance(); current() != '}'; Advance()) {
1685
        if (!IsUnicodePropertyValueCharacter(current())) return false;
1686
        if (!has_next()) return false;
1687
        name_2->push_back(static_cast<char>(current()));
1688
      }
1689
      name_2->push_back(0);  // null-terminate string.
1690 1691
    }
  } else {
1692
    return false;
1693 1694
  }
  Advance();
1695
  name_1->push_back(0);  // null-terminate string.
1696

1697 1698 1699 1700
  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
  return true;
}
1701

1702 1703 1704 1705
template <class CharT>
bool RegExpParserImpl<CharT>::AddPropertyClassRange(
    ZoneList<CharacterRange>* add_to, bool negate,
    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1706
  if (name_2.empty()) {
1707
    // First attempt to interpret as general category property value name.
1708
    const char* name = name_1.data();
1709
    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1710
                                add_to, zone())) {
1711 1712 1713
      return true;
    }
    // Interpret "Any", "ASCII", and "Assigned".
1714
    if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1715 1716 1717 1718
      return true;
    }
    // Then attempt to interpret as binary property name with value name 'Y'.
    UProperty property = u_getPropertyEnum(name);
1719
    if (!IsSupportedBinaryProperty(property)) return false;
1720
    if (!IsExactPropertyAlias(name, property)) return false;
1721
    return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1722
                                   zone());
1723 1724 1725
  } else {
    // Both property name and value name are specified. Attempt to interpret
    // the property name as enumerated property.
1726 1727
    const char* property_name = name_1.data();
    const char* value_name = name_2.data();
1728 1729
    UProperty property = u_getPropertyEnum(property_name);
    if (!IsExactPropertyAlias(property_name, property)) return false;
1730 1731 1732 1733 1734 1735 1736
    if (property == UCHAR_GENERAL_CATEGORY) {
      // We want to allow aggregate value names such as "Letter".
      property = UCHAR_GENERAL_CATEGORY_MASK;
    } else if (property != UCHAR_SCRIPT &&
               property != UCHAR_SCRIPT_EXTENSIONS) {
      return false;
    }
1737
    return LookupPropertyValueName(property, value_name, negate, add_to,
1738
                                   zone());
1739
  }
1740 1741
}

1742 1743 1744
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::GetPropertySequence(
    const ZoneVector<char>& name_1) {
1745 1746
  if (!FLAG_harmony_regexp_sequence) return nullptr;
  const char* name = name_1.data();
1747
  const base::uc32* sequence_list = nullptr;
1748
  RegExpFlags flags = RegExpFlag::kUnicode;
1749 1750 1751 1752 1753 1754
  if (NameEquals(name, "Emoji_Flag_Sequence")) {
    sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
  } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
    sequence_list = UnicodePropertySequences::kEmojiTagSequences;
  } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
    sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770
  }
  if (sequence_list != nullptr) {
    // TODO(yangguo): this creates huge regexp code. Alternative to this is
    // to create a new operator that checks for these sequences at runtime.
    RegExpBuilder builder(zone(), flags);
    while (true) {                   // Iterate through list of sequences.
      while (*sequence_list != 0) {  // Iterate through sequence.
        builder.AddUnicodeCharacter(*sequence_list);
        sequence_list++;
      }
      sequence_list++;
      if (*sequence_list == 0) break;
      builder.NewAlternative();
    }
    return builder.ToRegExp();
  }
1771 1772 1773 1774 1775 1776

  if (NameEquals(name, "Emoji_Keycap_Sequence")) {
    // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
    // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
    RegExpBuilder builder(zone(), flags);
    ZoneList<CharacterRange>* prefix_ranges =
1777
        zone()->template New<ZoneList<CharacterRange>>(2, zone());
1778 1779 1780 1781
    prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
    prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
    prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
    builder.AddCharacterClass(
1782
        zone()->template New<RegExpCharacterClass>(zone(), prefix_ranges));
1783 1784 1785 1786 1787 1788 1789 1790
    builder.AddCharacter(0xFE0F);
    builder.AddCharacter(0x20E3);
    return builder.ToRegExp();
  } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
    // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    RegExpBuilder builder(zone(), flags);
    ZoneList<CharacterRange>* modifier_base_ranges =
1791
        zone()->template New<ZoneList<CharacterRange>>(2, zone());
1792 1793
    LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
                            modifier_base_ranges, zone());
1794 1795
    builder.AddCharacterClass(zone()->template New<RegExpCharacterClass>(
        zone(), modifier_base_ranges));
1796
    ZoneList<CharacterRange>* modifier_ranges =
1797
        zone()->template New<ZoneList<CharacterRange>>(2, zone());
1798 1799 1800
    LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
                            zone());
    builder.AddCharacterClass(
1801
        zone()->template New<RegExpCharacterClass>(zone(), modifier_ranges));
1802 1803 1804
    return builder.ToRegExp();
  }

1805 1806 1807
  return nullptr;
}

1808
#else  // V8_INTL_SUPPORT
1809

1810 1811 1812
template <class CharT>
bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
                                                     ZoneVector<char>* name_2) {
1813
  return false;
1814
}
1815

1816 1817 1818 1819
template <class CharT>
bool RegExpParserImpl<CharT>::AddPropertyClassRange(
    ZoneList<CharacterRange>* add_to, bool negate,
    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1820 1821 1822
  return false;
}

1823 1824 1825
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::GetPropertySequence(
    const ZoneVector<char>& name) {
1826 1827 1828
  return nullptr;
}

1829
#endif  // V8_INTL_SUPPORT
1830

1831 1832 1833
template <class CharT>
bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
                                                            base::uc32* value) {
1834 1835
  base::uc32 x = 0;
  int d = base::HexValue(current());
1836 1837 1838 1839 1840
  if (d < 0) {
    return false;
  }
  while (d >= 0) {
    x = x * 16 + d;
1841
    if (x > static_cast<base::uc32>(max_value)) {
1842 1843 1844
      return false;
    }
    Advance();
1845
    d = base::HexValue(current());
1846 1847 1848 1849 1850
  }
  *value = x;
  return true;
}

1851 1852
template <class CharT>
base::uc32 RegExpParserImpl<CharT>::ParseClassCharacterEscape() {
1853
  DCHECK_EQ('\\', current());
1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877
  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
  Advance();
  switch (current()) {
    case 'b':
      Advance();
      return '\b';
    // ControlEscape :: one of
    //   f n r t v
    case 'f':
      Advance();
      return '\f';
    case 'n':
      Advance();
      return '\n';
    case 'r':
      Advance();
      return '\r';
    case 't':
      Advance();
      return '\t';
    case 'v':
      Advance();
      return '\v';
    case 'c': {
1878 1879
      base::uc32 controlLetter = Next();
      base::uc32 letter = controlLetter & ~('A' ^ 'a');
1880 1881 1882
      // Inside a character class, we also accept digits and underscore as
      // control characters, unless with /u. See Annex B:
      // ES#prod-annexB-ClassControlLetter
1883
      if (letter >= 'A' && letter <= 'Z') {
1884 1885
        Advance(2);
        // Control letters mapped to ASCII control characters in the range
1886 1887
        // 0x00-0x1F.
        return controlLetter & 0x1F;
1888
      }
1889 1890
      if (unicode()) {
        // With /u, invalid escapes are not treated as identity escapes.
1891
        ReportError(RegExpError::kInvalidClassEscape);
1892 1893 1894 1895 1896
        return 0;
      }
      if ((controlLetter >= '0' && controlLetter <= '9') ||
          controlLetter == '_') {
        Advance(2);
1897
        return controlLetter & 0x1F;
1898
      }
1899 1900
      // We match JSC in reading the backslash as a literal
      // character instead of as starting an escape.
1901
      // TODO(v8:6201): Not yet covered by the spec.
1902 1903 1904
      return '\\';
    }
    case '0':
1905 1906 1907 1908 1909
      // With /u, \0 is interpreted as NUL if not followed by another digit.
      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
        Advance();
        return 0;
      }
1910
      V8_FALLTHROUGH;
1911 1912 1913 1914 1915 1916 1917 1918 1919 1920
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
      // For compatibility, we interpret a decimal escape that isn't
      // a back reference (and therefore either \0 or not valid according
      // to the specification) as a 1..3 digit octal character code.
1921
      // ES#prod-annexB-LegacyOctalEscapeSequence
1922 1923
      if (unicode()) {
        // With /u, decimal escape is not interpreted as octal character code.
1924
        ReportError(RegExpError::kInvalidClassEscape);
1925 1926
        return 0;
      }
1927 1928 1929
      return ParseOctalLiteral();
    case 'x': {
      Advance();
1930
      base::uc32 value;
1931 1932 1933
      if (ParseHexEscape(2, &value)) return value;
      if (unicode()) {
        // With /u, invalid escapes are not treated as identity escapes.
1934
        ReportError(RegExpError::kInvalidEscape);
1935
        return 0;
1936
      }
1937 1938 1939
      // If \x is not followed by a two-digit hexadecimal, treat it
      // as an identity escape.
      return 'x';
1940 1941 1942
    }
    case 'u': {
      Advance();
1943
      base::uc32 value;
1944 1945 1946
      if (ParseUnicodeEscape(&value)) return value;
      if (unicode()) {
        // With /u, invalid escapes are not treated as identity escapes.
1947
        ReportError(RegExpError::kInvalidUnicodeEscape);
1948
        return 0;
1949
      }
1950 1951 1952
      // If \u is not followed by a two-digit hexadecimal, treat it
      // as an identity escape.
      return 'u';
1953 1954
    }
    default: {
1955
      base::uc32 result = current();
1956
      // With /u, no identity escapes except for syntax characters and '-' are
1957
      // allowed. Otherwise, all identity escapes are allowed.
1958
      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1959 1960 1961
        Advance();
        return result;
      }
1962
      ReportError(RegExpError::kInvalidEscape);
1963 1964 1965
      return 0;
    }
  }
1966
  UNREACHABLE();
1967 1968
}

1969 1970 1971 1972 1973
template <class CharT>
void RegExpParserImpl<CharT>::ParseClassEscape(
    ZoneList<CharacterRange>* ranges, Zone* zone,
    bool add_unicode_case_equivalents, base::uc32* char_out,
    bool* is_class_escape) {
1974
  base::uc32 current_char = current();
1975
  if (current_char == '\\') {
1976 1977 1978 1979 1980 1981 1982
    switch (Next()) {
      case 'w':
      case 'W':
      case 'd':
      case 'D':
      case 's':
      case 'S': {
1983 1984
        CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
                                       add_unicode_case_equivalents, zone);
1985
        Advance(2);
1986 1987
        *is_class_escape = true;
        return;
1988 1989
      }
      case kEndMarker:
1990
        ReportError(RegExpError::kEscapeAtEndOfPattern);
1991 1992 1993
        return;
      case 'p':
      case 'P':
1994
        if (unicode()) {
1995 1996
          bool negate = Next() == 'P';
          Advance(2);
1997 1998
          ZoneVector<char> name_1(zone);
          ZoneVector<char> name_2(zone);
1999 2000
          if (!ParsePropertyClassName(&name_1, &name_2) ||
              !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
2001
            ReportError(RegExpError::kInvalidClassPropertyName);
2002 2003 2004 2005 2006
          }
          *is_class_escape = true;
          return;
        }
        break;
2007
      default:
2008
        break;
2009
    }
2010 2011
    *char_out = ParseClassCharacterEscape();
    *is_class_escape = false;
2012 2013
  } else {
    Advance();
2014 2015
    *char_out = current_char;
    *is_class_escape = false;
2016
  }
2017
}
2018

2019 2020 2021
template <class CharT>
RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
    const RegExpBuilder* builder) {
2022 2023 2024 2025 2026 2027 2028 2029
  DCHECK_EQ(current(), '[');
  Advance();
  bool is_negated = false;
  if (current() == '^') {
    is_negated = true;
    Advance();
  }
  ZoneList<CharacterRange>* ranges =
2030
      zone()->template New<ZoneList<CharacterRange>>(2, zone());
2031
  bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
2032
  while (has_more() && current() != ']') {
2033
    base::uc32 char_1, char_2;
2034 2035 2036
    bool is_class_1, is_class_2;
    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
                     &is_class_1 CHECK_FAILED);
2037 2038 2039 2040 2041 2042 2043
    if (current() == '-') {
      Advance();
      if (current() == kEndMarker) {
        // If we reach the end we break out of the loop and let the
        // following code report an error.
        break;
      } else if (current() == ']') {
2044
        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2045 2046 2047
        ranges->Add(CharacterRange::Singleton('-'), zone());
        break;
      }
2048 2049 2050
      ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
                       &is_class_2 CHECK_FAILED);
      if (is_class_1 || is_class_2) {
2051
        // Either end is an escaped character class. Treat the '-' verbatim.
2052 2053
        if (unicode()) {
          // ES2015 21.2.2.15.1 step 1.
2054
          return ReportError(RegExpError::kInvalidCharacterClass);
2055
        }
2056
        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2057
        ranges->Add(CharacterRange::Singleton('-'), zone());
2058
        if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2059 2060
        continue;
      }
2061
      // ES2015 21.2.2.15.1 step 6.
2062
      if (char_1 > char_2) {
2063
        return ReportError(RegExpError::kOutOfOrderCharacterClass);
2064
      }
2065
      ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2066
    } else {
2067
      if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2068 2069 2070
    }
  }
  if (!has_more()) {
2071
    return ReportError(RegExpError::kUnterminatedCharacterClass);
2072 2073
  }
  Advance();
2074 2075
  RegExpCharacterClass::CharacterClassFlags character_class_flags;
  if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
2076 2077
  return zone()->template New<RegExpCharacterClass>(zone(), ranges,
                                                    character_class_flags);
2078 2079 2080 2081
}

#undef CHECK_FAILED

2082 2083
template <class CharT>
bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
2084
  DCHECK(result != nullptr);
2085 2086
  RegExpTree* tree = ParsePattern();
  if (failed()) {
2087
    DCHECK(tree == nullptr);
2088 2089 2090
    DCHECK(error_ != RegExpError::kNone);
    result->error = error_;
    result->error_pos = error_pos_;
2091
  } else {
2092
    DCHECK(tree != nullptr);
2093
    DCHECK(error_ == RegExpError::kNone);
2094
    if (FLAG_trace_regexp_parser) {
2095
      StdoutStream os;
2096
      tree->Print(os, zone());
2097 2098
      os << "\n";
    }
2099
    result->tree = tree;
2100 2101 2102
    int capture_count = captures_started();
    result->simple = tree->IsAtom() && simple() && capture_count == 0;
    result->contains_anchor = contains_anchor();
2103
    result->capture_count = capture_count;
2104
    result->named_captures = GetNamedCaptures();
2105
  }
2106 2107 2108
  return !failed();
}

2109
RegExpBuilder::RegExpBuilder(Zone* zone, RegExpFlags flags)
2110 2111
    : zone_(zone),
      pending_empty_(false),
2112
      flags_(flags),
2113
      characters_(nullptr),
2114
      pending_surrogate_(kNoPendingSurrogate),
2115 2116 2117 2118 2119 2120 2121 2122 2123
      terms_(),
      alternatives_()
#ifdef DEBUG
      ,
      last_added_(ADD_NONE)
#endif
{
}

2124
void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
2125 2126 2127 2128 2129 2130
  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
  FlushPendingSurrogate();
  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
  pending_surrogate_ = lead_surrogate;
}

2131
void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
2132 2133
  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
  if (pending_surrogate_ != kNoPendingSurrogate) {
2134
    base::uc16 lead_surrogate = pending_surrogate_;
2135
    pending_surrogate_ = kNoPendingSurrogate;
2136
    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2137
    base::uc32 combined =
2138 2139
        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
    if (NeedsDesugaringForIgnoreCase(combined)) {
2140
      AddCharacterClassForDesugaring(combined);
2141
    } else {
2142
      ZoneList<base::uc16> surrogate_pair(2, zone());
2143 2144 2145
      surrogate_pair.Add(lead_surrogate, zone());
      surrogate_pair.Add(trail_surrogate, zone());
      RegExpAtom* atom =
2146
          zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
2147 2148
      AddAtom(atom);
    }
2149 2150 2151 2152 2153 2154 2155 2156 2157
  } else {
    pending_surrogate_ = trail_surrogate;
    FlushPendingSurrogate();
  }
}

void RegExpBuilder::FlushPendingSurrogate() {
  if (pending_surrogate_ != kNoPendingSurrogate) {
    DCHECK(unicode());
2158
    base::uc32 c = pending_surrogate_;
2159
    pending_surrogate_ = kNoPendingSurrogate;
2160
    AddCharacterClassForDesugaring(c);
2161 2162 2163 2164
  }
}


2165
void RegExpBuilder::FlushCharacters() {
2166
  FlushPendingSurrogate();
2167
  pending_empty_ = false;
2168
  if (characters_ != nullptr) {
2169
    RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
2170
    characters_ = nullptr;
2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184
    text_.Add(atom, zone());
    LAST(ADD_ATOM);
  }
}


void RegExpBuilder::FlushText() {
  FlushCharacters();
  int num_text = text_.length();
  if (num_text == 0) {
    return;
  } else if (num_text == 1) {
    terms_.Add(text_.last(), zone());
  } else {
2185
    RegExpText* text = zone()->New<RegExpText>(zone());
2186 2187 2188 2189 2190 2191
    for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
    terms_.Add(text, zone());
  }
  text_.Clear();
}

2192
void RegExpBuilder::AddCharacter(base::uc16 c) {
2193
  FlushPendingSurrogate();
2194
  pending_empty_ = false;
2195
  if (NeedsDesugaringForIgnoreCase(c)) {
2196
    AddCharacterClassForDesugaring(c);
2197
  } else {
2198
    if (characters_ == nullptr) {
2199
      characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
2200 2201 2202
    }
    characters_->Add(c, zone());
    LAST(ADD_CHAR);
2203 2204 2205
  }
}

2206 2207
void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
  if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
2208 2209 2210 2211 2212 2213 2214
    DCHECK(unicode());
    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    AddLeadSurrogate(c);
  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    AddTrailSurrogate(c);
2215
  } else {
2216
    AddCharacter(static_cast<base::uc16>(c));
2217 2218 2219
  }
}

2220
void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
2221 2222 2223 2224 2225 2226
  // A lead or trail surrogate parsed via escape sequence will not
  // pair up with any preceding lead or following trail surrogate.
  FlushPendingSurrogate();
  AddUnicodeCharacter(character);
  FlushPendingSurrogate();
}
2227

2228 2229 2230
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }


2231
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
2232
  if (NeedsDesugaringForUnicode(cc)) {
2233
    // With /u, character class needs to be desugared, so it
2234 2235 2236 2237 2238 2239 2240
    // must be a standalone term instead of being part of a RegExpText.
    AddTerm(cc);
  } else {
    AddAtom(cc);
  }
}

2241
void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) {
2242
  AddTerm(zone()->New<RegExpCharacterClass>(
2243
      zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
2244 2245
}

2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261
void RegExpBuilder::AddAtom(RegExpTree* term) {
  if (term->IsEmpty()) {
    AddEmpty();
    return;
  }
  if (term->IsTextElement()) {
    FlushCharacters();
    text_.Add(term, zone());
  } else {
    FlushText();
    terms_.Add(term, zone());
  }
  LAST(ADD_ATOM);
}


2262 2263 2264 2265 2266 2267 2268
void RegExpBuilder::AddTerm(RegExpTree* term) {
  FlushText();
  terms_.Add(term, zone());
  LAST(ADD_ATOM);
}


2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
  FlushText();
  terms_.Add(assert, zone());
  LAST(ADD_ASSERT);
}


void RegExpBuilder::NewAlternative() { FlushTerms(); }


void RegExpBuilder::FlushTerms() {
  FlushText();
  int num_terms = terms_.length();
  RegExpTree* alternative;
  if (num_terms == 0) {
2284
    alternative = zone()->New<RegExpEmpty>();
2285 2286 2287
  } else if (num_terms == 1) {
    alternative = terms_.last();
  } else {
2288
    alternative = zone()->New<RegExpAlternative>(terms_.GetList(zone()));
2289 2290 2291 2292 2293 2294 2295
  }
  alternatives_.Add(alternative, zone());
  terms_.Clear();
  LAST(ADD_NONE);
}


2296 2297
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
  if (!unicode()) return false;
2298 2299 2300 2301
  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
  // necessarily mean that we need to desugar. It's probably nicer to have a
  // separate pass to figure out unicode desugarings.
  if (ignore_case()) return true;
2302 2303 2304
  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
  CharacterRange::Canonicalize(ranges);
  for (int i = ranges->length() - 1; i >= 0; i--) {
2305 2306
    base::uc32 from = ranges->at(i).from();
    base::uc32 to = ranges->at(i).to();
2307 2308 2309 2310 2311 2312 2313 2314
    // Check for non-BMP characters.
    if (to >= kNonBmpStart) return true;
    // Check for lone surrogates.
    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
  }
  return false;
}

2315
bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
2316
#ifdef V8_INTL_SUPPORT
2317
  if (unicode() && ignore_case()) {
2318 2319 2320 2321
    icu::UnicodeSet set(c, c);
    set.closeOver(USET_CASE_INSENSITIVE);
    set.removeAllStrings();
    return set.size() > 1;
2322 2323 2324
  }
  // In the case where ICU is not included, we act as if the unicode flag is
  // not set, and do not desugar.
2325
#endif  // V8_INTL_SUPPORT
2326 2327 2328
  return false;
}

2329 2330 2331
RegExpTree* RegExpBuilder::ToRegExp() {
  FlushTerms();
  int num_alternatives = alternatives_.length();
2332
  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2333
  if (num_alternatives == 1) return alternatives_.last();
2334
  return zone()->New<RegExpDisjunction>(alternatives_.GetList(zone()));
2335 2336
}

2337
bool RegExpBuilder::AddQuantifierToAtom(
2338
    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2339
  FlushPendingSurrogate();
2340 2341
  if (pending_empty_) {
    pending_empty_ = false;
2342
    return true;
2343 2344
  }
  RegExpTree* atom;
2345
  if (characters_ != nullptr) {
2346 2347
    DCHECK(last_added_ == ADD_CHAR);
    // Last atom was character.
2348
    base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
2349 2350
    int num_chars = char_vector.length();
    if (num_chars > 1) {
2351 2352
      base::Vector<const base::uc16> prefix =
          char_vector.SubVector(0, num_chars - 1);
2353
      text_.Add(zone()->New<RegExpAtom>(prefix), zone());
2354 2355
      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    }
2356
    characters_ = nullptr;
2357
    atom = zone()->New<RegExpAtom>(char_vector);
2358 2359 2360 2361 2362 2363 2364 2365
    FlushText();
  } else if (text_.length() > 0) {
    DCHECK(last_added_ == ADD_ATOM);
    atom = text_.RemoveLast();
    FlushText();
  } else if (terms_.length() > 0) {
    DCHECK(last_added_ == ADD_ATOM);
    atom = terms_.RemoveLast();
2366 2367 2368 2369 2370 2371 2372 2373
    if (atom->IsLookaround()) {
      // With /u, lookarounds are not quantifiable.
      if (unicode()) return false;
      // Lookbehinds are not quantifiable.
      if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
        return false;
      }
    }
2374 2375 2376 2377
    if (atom->max_match() == 0) {
      // Guaranteed to only match an empty string.
      LAST(ADD_TERM);
      if (min == 0) {
2378
        return true;
2379 2380
      }
      terms_.Add(atom, zone());
2381
      return true;
2382 2383 2384 2385 2386
    }
  } else {
    // Only call immediately after adding an atom or character!
    UNREACHABLE();
  }
2387
  terms_.Add(zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom),
2388 2389
             zone());
  LAST(ADD_TERM);
2390
  return true;
2391 2392
}

2393 2394 2395 2396 2397 2398 2399 2400
template class RegExpParserImpl<uint8_t>;
template class RegExpParserImpl<base::uc16>;

}  // namespace

// static
bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
                                             Handle<String> input,
2401
                                             RegExpFlags flags,
2402 2403
                                             RegExpCompileData* result) {
  DisallowGarbageCollection no_gc;
2404
  uintptr_t stack_limit = isolate->stack_guard()->real_climit();
2405 2406 2407
  String::FlatContent content = input->GetFlatContent(no_gc);
  if (content.IsOneByte()) {
    base::Vector<const uint8_t> v = content.ToOneByteVector();
2408 2409
    return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,
                                     stack_limit, zone,       no_gc}
2410 2411 2412
        .Parse(result);
  } else {
    base::Vector<const base::uc16> v = content.ToUC16Vector();
2413 2414
    return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,
                                        stack_limit, zone,       no_gc}
2415 2416 2417 2418
        .Parse(result);
  }
}

2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437
// static
template <class CharT>
bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
                                      const CharT* input, int input_length,
                                      RegExpFlags flags,
                                      RegExpCompileData* result,
                                      const DisallowGarbageCollection& no_gc) {
  return RegExpParserImpl<CharT>{input,       input_length, flags,
                                 stack_limit, zone,         no_gc}
      .Parse(result);
}

template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
    Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
    const DisallowGarbageCollection&);
template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
    const DisallowGarbageCollection&);

2438 2439
// static
bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
2440
                                      Handle<String> input, RegExpFlags flags,
2441 2442 2443 2444 2445
                                      RegExpCompileData* result,
                                      const DisallowGarbageCollection&) {
  return ParseRegExpFromHeapString(isolate, zone, input, flags, result);
}

2446 2447
}  // namespace internal
}  // namespace v8