regexp.cc 49.4 KB
Newer Older
1
// Copyright 2012 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5
#include "src/regexp/regexp.h"
6

7
#include "src/codegen/compilation-cache.h"
8
#include "src/diagnostics/code-tracer.h"
9
#include "src/execution/interrupts-scope.h"
10
#include "src/heap/heap-inl.h"
11
#include "src/objects/js-regexp-inl.h"
12
#include "src/regexp/experimental/experimental.h"
13
#include "src/regexp/regexp-bytecode-generator.h"
14
#include "src/regexp/regexp-bytecodes.h"
15
#include "src/regexp/regexp-compiler.h"
16
#include "src/regexp/regexp-dotprinter.h"
17
#include "src/regexp/regexp-interpreter.h"
18
#include "src/regexp/regexp-macro-assembler-arch.h"
19
#include "src/regexp/regexp-macro-assembler-tracer.h"
20
#include "src/regexp/regexp-parser.h"
21
#include "src/regexp/regexp-utils.h"
22
#include "src/strings/string-search.h"
23
#include "src/utils/ostreams.h"
24

25 26
namespace v8 {
namespace internal {
27

28 29
using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)

30 31 32 33 34 35 36 37 38 39
class RegExpImpl final : public AllStatic {
 public:
  // Returns a string representation of a regular expression.
  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
  // This function calls the garbage collector if necessary.
  static Handle<String> ToString(Handle<Object> value);

  // Prepares a JSRegExp object with Irregexp-specific data.
  static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
                                 Handle<String> pattern, JSRegExp::Flags flags,
40
                                 int capture_count, uint32_t backtrack_limit);
41

42 43 44 45 46 47 48 49 50 51
  // Prepare a RegExp for being executed one or more times (using
  // IrregexpExecOnce) on the subject.
  // This ensures that the regexp is compiled for the subject, and that
  // the subject is flat.
  // Returns the number of integer spaces required by IrregexpExecOnce
  // as its "registers" argument.  If the regexp cannot be compiled,
  // an exception is set as pending, and this function returns negative.
  static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
                             Handle<String> subject);

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
  static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
                          Handle<String> pattern, JSRegExp::Flags flags,
                          Handle<String> match_pattern);

  static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
                         Handle<String> subject, int index, int32_t* output,
                         int output_size);

  static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
                                 Handle<String> subject, int index,
                                 Handle<RegExpMatchInfo> last_match_info);

  // Execute a regular expression on the subject, starting from index.
  // If matching succeeds, return the number of matches.  This can be larger
  // than one in the case of global regular expressions.
  // The captures and subcaptures are stored into the registers vector.
  // If matching fails, returns RE_FAILURE.
  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
  static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
                             Handle<String> subject, int index, int32_t* output,
                             int output_size);

  // Execute an Irregexp bytecode pattern.
  // On a successful match, the result is a JSArray containing
  // captured positions.  On a failure, the result is the null value.
  // Returns an empty handle in case of an exception.
  V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
      Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
80 81
      int index, Handle<RegExpMatchInfo> last_match_info,
      RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
82 83 84 85 86 87 88 89 90 91 92

  static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
                              Handle<String> sample_subject, bool is_one_byte);
  static inline bool EnsureCompiledIrregexp(Isolate* isolate,
                                            Handle<JSRegExp> re,
                                            Handle<String> sample_subject,
                                            bool is_one_byte);

  // Returns true on success, false on failure.
  static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
                      JSRegExp::Flags flags, Handle<String> pattern,
93
                      Handle<String> sample_subject, bool is_one_byte,
94
                      uint32_t& backtrack_limit);
95 96 97 98 99 100 101 102 103

  // For acting on the JSRegExp data FixedArray.
  static int IrregexpMaxRegisterCount(FixedArray re);
  static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
  static int IrregexpNumberOfCaptures(FixedArray re);
  static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
  static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
};

104 105 106 107
MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
                                                 Handle<JSRegExp> re,
                                                 Handle<String> pattern,
                                                 RegExpError error) {
108 109 110 111 112
  Vector<const char> error_data = CStrVector(RegExpErrorString(error));
  Handle<String> error_text =
      isolate->factory()
          ->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
          .ToHandleChecked();
113 114 115 116
  THROW_NEW_ERROR(
      isolate,
      NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
      Object);
117 118
}

119 120
void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
                                  RegExpError error_text) {
121 122
  USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
                           error_text));
123
}
124

125 126 127 128
bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
  return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
}

129 130 131
// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
132
  int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length());
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
  if (length <= kPatternTooShortForBoyerMoore) return false;
  const int kMod = 128;
  bool character_found[kMod];
  int different = 0;
  memset(&character_found[0], 0, sizeof(character_found));
  for (int i = 0; i < length; i++) {
    int ch = (pattern->Get(i) & (kMod - 1));
    if (!character_found[ch]) {
      character_found[ch] = true;
      different++;
      // We declare a regexp low-alphabet if it has at least 3 times as many
      // characters as it has different characters.
      if (different * 3 > length) return false;
    }
  }
  return true;
}

// Generic RegExp methods. Dispatches to implementation specific methods.

153 154 155
// static
MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
                                    Handle<String> pattern,
156 157
                                    JSRegExp::Flags flags,
                                    uint32_t backtrack_limit) {
158 159
  DCHECK(pattern->IsFlat());

160 161 162 163 164 165
  // Caching is based only on the pattern and flags, but code also differs when
  // a backtrack limit is set. A present backtrack limit is very much *not* the
  // common case, so just skip the cache for these.
  const bool is_compilation_cache_enabled =
      (backtrack_limit == JSRegExp::kNoBacktrackLimit);

166
  Zone zone(isolate->allocator(), ZONE_NAME);
167 168 169 170 171 172 173 174 175 176
  CompilationCache* compilation_cache = nullptr;
  if (is_compilation_cache_enabled) {
    compilation_cache = isolate->compilation_cache();
    MaybeHandle<FixedArray> maybe_cached =
        compilation_cache->LookupRegExp(pattern, flags);
    Handle<FixedArray> cached;
    if (maybe_cached.ToHandle(&cached)) {
      re->set_data(*cached);
      return re;
    }
177
  }
178

179
  PostponeInterruptsScope postpone(isolate);
180
  RegExpCompileData parse_result;
181
  FlatStringReader reader(isolate, pattern);
182 183
  DCHECK(!isolate->has_pending_exception());
  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
184
                                 &parse_result)) {
185
    // Throw an exception if we fail to parse the pattern.
186 187
    return RegExp::ThrowRegExpException(isolate, re, pattern,
                                        parse_result.error);
188
  }
189

190 191
  bool has_been_compiled = false;

192
  if (FLAG_default_to_experimental_regexp_engine &&
193
      ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
194
                                       parse_result.capture_count)) {
195 196 197 198 199 200 201 202 203 204 205 206 207
    DCHECK(FLAG_enable_experimental_regexp_engine);
    ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
                                   parse_result.capture_count);
    has_been_compiled = true;
  } else if (flags & JSRegExp::kLinear) {
    DCHECK(FLAG_enable_experimental_regexp_engine);
    if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
                                          parse_result.capture_count)) {
      // TODO(mbid): The error could provide a reason for why the regexp can't
      // be executed in linear time (e.g. due to back references).
      return RegExp::ThrowRegExpException(isolate, re, pattern,
                                          RegExpError::kNotLinear);
    }
208 209
    ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
                                   parse_result.capture_count);
210 211 212
    has_been_compiled = true;
  } else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
             !HasFewDifferentCharacters(pattern)) {
213
    // Parse-tree is a single atom that is equal to the pattern.
214
    RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
215
    has_been_compiled = true;
216 217
  } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
             parse_result.capture_count == 0) {
218
    RegExpAtom* atom = parse_result.tree->AsAtom();
219 220
    // The pattern source might (?) contain escape sequences, but they're
    // resolved in atom_string.
221
    Vector<const uc16> atom_pattern = atom->data();
222 223 224 225 226
    Handle<String> atom_string;
    ASSIGN_RETURN_ON_EXCEPTION(
        isolate, atom_string,
        isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
    if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
227
      RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
228 229 230 231
      has_been_compiled = true;
    }
  }
  if (!has_been_compiled) {
232
    RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
233
                                   parse_result.capture_count, backtrack_limit);
234
  }
235
  DCHECK(re->data().IsFixedArray());
236 237
  // Compilation succeeded so the data is set on the regexp
  // and we can store it in the cache.
238
  Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
239 240 241
  if (is_compilation_cache_enabled) {
    compilation_cache->PutRegExp(pattern, flags, data);
  }
242

243
  return re;
244 245
}

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
// static
bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
                                 Handle<String> subject) {
  switch (re->TypeTag()) {
    case JSRegExp::NOT_COMPILED:
      UNREACHABLE();
    case JSRegExp::ATOM:
      return true;
    case JSRegExp::IRREGEXP:
      if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
        DCHECK(isolate->has_pending_exception());
        return false;
      }
      return true;
    case JSRegExp::EXPERIMENTAL:
      if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
          !ExperimentalRegExp::Compile(isolate, re)) {
        DCHECK(isolate->has_pending_exception());
        return false;
      }
      return true;
  }
}

270 271 272
// static
MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
273 274
    int index, Handle<RegExpMatchInfo> last_match_info,
    RegExp::ExecQuirks exec_quirks) {
275
  return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
276
                                         last_match_info, exec_quirks);
277 278
}

279 280 281
// static
MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
                                 Handle<String> subject, int index,
282 283
                                 Handle<RegExpMatchInfo> last_match_info,
                                 ExecQuirks exec_quirks) {
284
  switch (regexp->TypeTag()) {
285 286
    case JSRegExp::NOT_COMPILED:
      UNREACHABLE();
287
    case JSRegExp::ATOM:
288 289
      return RegExpImpl::AtomExec(isolate, regexp, subject, index,
                                  last_match_info);
290
    case JSRegExp::IRREGEXP:
291
      return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
292
                                      last_match_info, exec_quirks);
293 294
    case JSRegExp::EXPERIMENTAL:
      return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
295
                                      last_match_info, exec_quirks);
296 297 298
  }
}

299 300
// RegExp Atom implementation: Simple string search using indexOf.

301 302
void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
                             Handle<String> pattern, JSRegExp::Flags flags,
303
                             Handle<String> match_pattern) {
304
  isolate->factory()->SetRegExpAtomData(re, pattern, flags, match_pattern);
305 306
}

307 308
static void SetAtomLastCapture(Isolate* isolate,
                               Handle<RegExpMatchInfo> last_match_info,
309
                               String subject, int from, int to) {
310
  SealHandleScope shs(isolate);
311 312 313 314 315
  last_match_info->SetNumberOfCaptureRegisters(2);
  last_match_info->SetLastSubject(subject);
  last_match_info->SetLastInput(subject);
  last_match_info->SetCapture(0, from);
  last_match_info->SetCapture(1, to);
316 317
}

318 319
int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
                            Handle<String> subject, int index, int32_t* output,
320
                            int output_size) {
321 322
  DCHECK_LE(0, index);
  DCHECK_LE(index, subject->length());
323

324
  subject = String::Flatten(isolate, subject);
325
  DisallowGarbageCollection no_gc;  // ensure vectors stay valid
326

327
  String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
328 329
  int needle_len = needle.length();
  DCHECK(needle.IsFlat());
330
  DCHECK_LT(0, needle_len);
331

332
  if (index + needle_len > subject->length()) {
333
    return RegExp::RE_FAILURE;
334
  }
335

336
  for (int i = 0; i < output_size; i += 2) {
337
    String::FlatContent needle_content = needle.GetFlatContent(no_gc);
338
    String::FlatContent subject_content = subject->GetFlatContent(no_gc);
339 340
    DCHECK(needle_content.IsFlat());
    DCHECK(subject_content.IsFlat());
341
    // dispatch on type of strings
342 343 344 345 346 347 348 349 350 351 352 353
    index =
        (needle_content.IsOneByte()
             ? (subject_content.IsOneByte()
                    ? SearchString(isolate, subject_content.ToOneByteVector(),
                                   needle_content.ToOneByteVector(), index)
                    : SearchString(isolate, subject_content.ToUC16Vector(),
                                   needle_content.ToOneByteVector(), index))
             : (subject_content.IsOneByte()
                    ? SearchString(isolate, subject_content.ToOneByteVector(),
                                   needle_content.ToUC16Vector(), index)
                    : SearchString(isolate, subject_content.ToUC16Vector(),
                                   needle_content.ToUC16Vector(), index)));
354 355 356 357
    if (index == -1) {
      return i / 2;  // Return number of matches.
    } else {
      output[i] = index;
358
      output[i + 1] = index + needle_len;
359 360
      index += needle_len;
    }
361
  }
362 363
  return output_size / 2;
}
364

365 366
Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
                                    Handle<String> subject, int index,
367
                                    Handle<RegExpMatchInfo> last_match_info) {
368 369 370 371
  static const int kNumRegisters = 2;
  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();

372 373
  int res =
      AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
374

375
  if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
376

377
  DCHECK_EQ(res, RegExp::RE_SUCCESS);
378
  SealHandleScope shs(isolate);
379
  SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
380
                     output_registers[1]);
381
  return last_match_info;
382 383
}

384 385
// Irregexp implementation.

386
// Ensures that the regexp object contains a compiled version of the
387
// source for either one-byte or two-byte subject strings.
388
// If the compiled version doesn't already exist, it is compiled
389
// from the source pattern.
390 391
// If compilation fails, an exception is thrown and this function
// returns false.
392
bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
393 394
                                        Handle<String> sample_subject,
                                        bool is_one_byte) {
Ana Peško's avatar
Ana Peško committed
395
  Object compiled_code = re->Code(is_one_byte);
396
  Object bytecode = re->Bytecode(is_one_byte);
Ana Peško's avatar
Ana Peško committed
397 398 399 400 401 402
  bool needs_initial_compilation =
      compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
  // Recompile is needed when we're dealing with the first execution of the
  // regexp after the decision to tier up has been made. If the tiering up
  // strategy is not in use, this value is always false.
  bool needs_tier_up_compilation =
403
      re->MarkedForTierUp() && bytecode.IsByteArray();
Ana Peško's avatar
Ana Peško committed
404 405 406 407 408 409 410

  if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
    PrintF("JSRegExp object %p needs tier-up compilation\n",
           reinterpret_cast<void*>(re->ptr()));
  }

  if (!needs_initial_compilation && !needs_tier_up_compilation) {
411 412
    DCHECK(compiled_code.IsCode());
    DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
413 414
    return true;
  }
Ana Peško's avatar
Ana Peško committed
415

416
  DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
Ana Peško's avatar
Ana Peško committed
417

418
  return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
419 420
}

Ana Peško's avatar
Ana Peško committed
421 422 423 424 425
#ifdef DEBUG
namespace {

bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
  Object entry = re->Code(is_one_byte);
426
  Object bytecode = re->Bytecode(is_one_byte);
Ana Peško's avatar
Ana Peško committed
427 428 429
  // If we're not using the tier-up strategy, entry can only be a smi
  // representing an uncompiled regexp here. If we're using the tier-up
  // strategy, entry can still be a smi representing an uncompiled regexp, when
430 431 432
  // compiling the regexp before the tier-up, or it can contain a trampoline to
  // the regexp interpreter, in which case the bytecode field contains compiled
  // bytecode, when recompiling the regexp after the tier-up. If the
Ana Peško's avatar
Ana Peško committed
433 434 435 436 437
  // tier-up was forced, which happens for global replaces, entry is a smi
  // representing an uncompiled regexp, even though we're "recompiling" after
  // the tier-up.
  if (re->ShouldProduceBytecode()) {
    DCHECK(entry.IsSmi());
438
    DCHECK(bytecode.IsSmi());
Ana Peško's avatar
Ana Peško committed
439
    int entry_value = Smi::ToInt(entry);
440
    int bytecode_value = Smi::ToInt(bytecode);
Ana Peško's avatar
Ana Peško committed
441
    DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
442
    DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
Ana Peško's avatar
Ana Peško committed
443
  } else {
444
    DCHECK(entry.IsSmi() || (entry.IsCode() && bytecode.IsByteArray()));
Ana Peško's avatar
Ana Peško committed
445 446 447 448 449 450 451 452
  }

  return true;
}

}  // namespace
#endif

453
bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
454
                                 Handle<String> sample_subject,
455
                                 bool is_one_byte) {
456
  // Compile the RegExp.
457
  Zone zone(isolate->allocator(), ZONE_NAME);
458
  PostponeInterruptsScope postpone(isolate);
Ana Peško's avatar
Ana Peško committed
459 460

  DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
461 462 463

  JSRegExp::Flags flags = re->GetFlags();

464
  Handle<String> pattern(re->Pattern(), isolate);
465
  pattern = String::Flatten(isolate, pattern);
466
  RegExpCompileData compile_data;
467
  FlatStringReader reader(isolate, pattern);
468 469
  if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
                                 &compile_data)) {
470
    // Throw an exception if we fail to parse the pattern.
471
    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
472
    USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
473
    return false;
474
  }
Ana Peško's avatar
Ana Peško committed
475 476 477 478 479 480 481 482
  // The compilation target is a kBytecode if we're interpreting all regexp
  // objects, or if we're using the tier-up strategy but the tier-up hasn't
  // happened yet. The compilation target is a kNative if we're using the
  // tier-up strategy and we need to recompile to tier-up, or if we're producing
  // native code for all regexp objects.
  compile_data.compilation_target = re->ShouldProduceBytecode()
                                        ? RegExpCompilationTarget::kBytecode
                                        : RegExpCompilationTarget::kNative;
483
  uint32_t backtrack_limit = re->BacktrackLimit();
484 485
  const bool compilation_succeeded =
      Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
486
              is_one_byte, backtrack_limit);
487
  if (!compilation_succeeded) {
488
    DCHECK(compile_data.error != RegExpError::kNone);
489
    RegExp::ThrowRegExpException(isolate, re, compile_data.error);
490 491 492
    return false;
  }

493 494
  Handle<FixedArray> data =
      Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
495
  if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
496
    data->set(JSRegExp::code_index(is_one_byte), *compile_data.code);
497 498 499 500 501 502 503 504 505
    // Reset bytecode to uninitialized. In case we use tier-up we know that
    // tier-up has happened this way.
    data->set(JSRegExp::bytecode_index(is_one_byte),
              Smi::FromInt(JSRegExp::kUninitializedValue));
  } else {
    DCHECK_EQ(compile_data.compilation_target,
              RegExpCompilationTarget::kBytecode);
    // Store code generated by compiler in bytecode and trampoline to
    // interpreter in code.
506
    data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
507 508 509 510
    Handle<Code> trampoline =
        BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
    data->set(JSRegExp::code_index(is_one_byte), *trampoline);
  }
511
  re->SetCaptureNameMap(compile_data.capture_name_map);
512
  int register_max = IrregexpMaxRegisterCount(*data);
513 514
  if (compile_data.register_count > register_max) {
    SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
515
  }
516
  data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
517

Ana Peško's avatar
Ana Peško committed
518 519 520 521 522 523 524 525 526
  if (FLAG_trace_regexp_tier_up) {
    PrintF("JSRegExp object %p %s size: %d\n",
           reinterpret_cast<void*>(re->ptr()),
           re->ShouldProduceBytecode() ? "bytecode" : "native code",
           re->ShouldProduceBytecode()
               ? IrregexpByteCode(*data, is_one_byte).Size()
               : IrregexpNativeCode(*data, is_one_byte).Size());
  }

527
  return true;
528 529
}

530
int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
531
  return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
532 533
}

534
void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
535
  re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
536 537
}

538
int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
539
  return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
540 541
}

542
ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
543
  return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
544 545
}

546
Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
547
  return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
548 549
}

550
void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
551
                                    Handle<String> pattern,
552 553
                                    JSRegExp::Flags flags, int capture_count,
                                    uint32_t backtrack_limit) {
554
  // Initialize compiled code entries to null.
555 556
  isolate->factory()->SetRegExpIrregexpData(re, pattern, flags, capture_count,
                                            backtrack_limit);
557 558
}

559
// static
560 561
int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
                                Handle<String> subject) {
562
  DCHECK(subject->IsFlat());
563

564
  // Check representation of the underlying storage.
565
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
566 567 568 569
  if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
                                          is_one_byte)) {
    return -1;
  }
570

571 572 573
  // Only reserve room for output captures. Internal registers are allocated by
  // the engine.
  return JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
574
}
575

576 577 578
int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
                                Handle<String> subject, int index,
                                int32_t* output, int output_size) {
579 580
  DCHECK_LE(0, index);
  DCHECK_LE(index, subject->length());
581
  DCHECK(subject->IsFlat());
582 583
  DCHECK_GE(output_size,
            JSRegExp::RegistersForCaptureCount(regexp->CaptureCount()));
584

585
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
586

Ana Peško's avatar
Ana Peško committed
587
  if (!regexp->ShouldProduceBytecode()) {
588 589 590 591 592 593
    do {
      EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
      // The stack is used to allocate registers for the compiled regexp code.
      // This means that in case of failure, the output registers array is left
      // untouched and contains the capture results from the previous successful
      // match.  We can use that to set the last match info lazily.
594
      int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
595
                                                  output_size, index, isolate);
596 597 598 599
      if (res != NativeRegExpMacroAssembler::RETRY) {
        DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
               isolate->has_pending_exception());
        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
600
                      RegExp::RE_SUCCESS);
601
        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
602
                      RegExp::RE_FAILURE);
603
        STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
604
                      RegExp::RE_EXCEPTION);
605
        return res;
606 607 608 609 610 611 612 613 614 615 616
      }
      // If result is RETRY, the string has changed representation, and we
      // must restart from scratch.
      // In this case, it means we must make sure we are prepared to handle
      // the, potentially, different subject (the string can switch between
      // being internal and external, and even between being Latin1 and UC16,
      // but the characters are always the same).
      is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
    } while (true);
    UNREACHABLE();
  } else {
Ana Peško's avatar
Ana Peško committed
617
    DCHECK(regexp->ShouldProduceBytecode());
618 619

    do {
620 621
      IrregexpInterpreter::Result result =
          IrregexpInterpreter::MatchForCallFromRuntime(
622
              isolate, regexp, subject, output, output_size, index);
623 624 625 626 627 628 629
      DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
                     isolate->has_pending_exception());

      switch (result) {
        case IrregexpInterpreter::SUCCESS:
        case IrregexpInterpreter::EXCEPTION:
        case IrregexpInterpreter::FAILURE:
630
        case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
631 632 633 634
          return result;
        case IrregexpInterpreter::RETRY:
          // The string has changed representation, and we must restart the
          // match.
Ana Peško's avatar
Ana Peško committed
635
          // We need to reset the tier up to start over with compilation.
636
          if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
637 638 639 640 641 642
          is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
          EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
          break;
      }
    } while (true);
    UNREACHABLE();
643
  }
644
}
645

646
MaybeHandle<Object> RegExpImpl::IrregexpExec(
647
    Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
648 649
    int previous_index, Handle<RegExpMatchInfo> last_match_info,
    RegExp::ExecQuirks exec_quirks) {
650
  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
651

652
  subject = String::Flatten(isolate, subject);
653

654
#ifdef DEBUG
655
  if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
656
    String pattern = regexp->Pattern();
657
    PrintF("\n\nRegexp match:   /%s/\n\n", pattern.ToCString().get());
658
    PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
659 660
  }
#endif
661

662 663 664 665 666 667 668 669 670 671 672 673 674 675
  // For very long subject strings, the regexp interpreter is currently much
  // slower than the jitted code execution. If the tier-up strategy is turned
  // on, we want to avoid this performance penalty so we eagerly tier-up if the
  // subject string length is equal or greater than the given heuristic value.
  if (FLAG_regexp_tier_up &&
      subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
    regexp->MarkTierUpForNextExec();
    if (FLAG_trace_regexp_tier_up) {
      PrintF(
          "Forcing tier-up for very long strings in "
          "RegExpImpl::IrregexpExec\n");
    }
  }

676
  // Prepare space for the return values.
677 678
  int required_registers =
      RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
679 680
  if (required_registers < 0) {
    // Compiling failed with an exception.
681
    DCHECK(isolate->has_pending_exception());
682
    return MaybeHandle<Object>();
683
  }
684

685
  int32_t* output_registers = nullptr;
686 687 688
  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    output_registers = NewArray<int32_t>(required_registers);
  }
689
  std::unique_ptr<int32_t[]> auto_release(output_registers);
690
  if (output_registers == nullptr) {
691 692
    output_registers = isolate->jsregexp_static_offsets_vector();
  }
693

694 695 696
  int res =
      RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
                                  output_registers, required_registers);
Ana Peško's avatar
Ana Peško committed
697

698
  if (res == RegExp::RE_SUCCESS) {
699 700 701 702 703
    if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
      if (output_registers[0] >= subject->length()) {
        return isolate->factory()->null_value();
      }
    }
704
    int capture_count = regexp->CaptureCount();
705 706
    return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
                                    capture_count, output_registers);
707 708 709 710
  } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
    return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
                                           previous_index, last_match_info);
  } else if (res == RegExp::RE_EXCEPTION) {
711
    DCHECK(isolate->has_pending_exception());
712
    return MaybeHandle<Object>();
713 714 715
  } else {
    DCHECK(res == RegExp::RE_FAILURE);
    return isolate->factory()->null_value();
716
  }
717 718
}

719 720
// static
Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
721 722
    Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
    Handle<String> subject, int capture_count, int32_t* match) {
723 724 725 726
  // This is the only place where match infos can grow. If, after executing the
  // regexp, RegExpExecStub finds that the match info is too small, it restarts
  // execution in RegExpImpl::Exec, which finally grows the match info right
  // here.
727 728
  Handle<RegExpMatchInfo> result =
      RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
729
  if (*result != *last_match_info) {
730 731 732 733 734 735 736
    if (*last_match_info == *isolate->regexp_last_match_info()) {
      // This inner condition is only needed for special situations like the
      // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
      // RegExpImpl::Exec; there actually want to bypass the Isolate's match
      // info and execute the regexp without side effects.
      isolate->native_context()->set_regexp_last_match_info(*result);
    }
737 738
  }

739 740
  int capture_register_count =
      JSRegExp::RegistersForCaptureCount(capture_count);
741
  DisallowGarbageCollection no_gc;
742
  if (match != nullptr) {
743
    for (int i = 0; i < capture_register_count; i += 2) {
744 745
      result->SetCapture(i, match[i]);
      result->SetCapture(i + 1, match[i + 1]);
746 747
    }
  }
748 749 750
  result->SetLastSubject(*subject);
  result->SetLastInput(*subject);
  return result;
751 752
}

753
// static
754 755
void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
  DotPrinter::DotPrint(label, node);
756
}
757

758
namespace {
759

760 761 762 763 764 765 766 767 768 769
// Returns true if we've either generated too much irregex code within this
// isolate, or the pattern string is too long.
bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
  // Limit the space regexps take up on the heap.  In order to limit this we
  // would like to keep track of the amount of regexp code on the heap.  This
  // is not tracked, however.  As a conservative approximation we track the
  // total regexp code compiled including code that has subsequently been freed
  // and the total executable memory at any point.
  static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
  static constexpr size_t kRegExpCompiledLimit = 1 * MB;
770

771
  Heap* heap = isolate->heap();
772
  if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
773 774
  return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
          heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
775 776
}

777
}  // namespace
778

779 780 781 782 783 784
// static
bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
                               RegExpCompileData* data, JSRegExp::Flags flags,
                               Handle<String> pattern,
                               Handle<String> sample_subject,
                               bool is_one_byte) {
785
  uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
786
  return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
787
                             sample_subject, is_one_byte, backtrack_limit);
788 789
}

790 791
bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
                         JSRegExp::Flags flags, Handle<String> pattern,
792
                         Handle<String> sample_subject, bool is_one_byte,
793
                         uint32_t& backtrack_limit) {
794 795
  if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
      RegExpMacroAssembler::kMaxRegisterCount) {
796
    data->error = RegExpError::kTooLarge;
797
    return false;
798
  }
799

800
  RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
801

802
  if (compiler.optimize()) {
803
    compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
804
  }
805

806 807
  // Sample some characters from the middle of the string.
  static const int kSampleSize = 128;
808

809 810 811
  sample_subject = String::Flatten(isolate, sample_subject);
  int chars_sampled = 0;
  int half_way = (sample_subject->length() - kSampleSize) / 2;
812
  for (int i = std::max(0, half_way);
813 814 815
       i < sample_subject->length() && chars_sampled < kSampleSize;
       i++, chars_sampled++) {
    compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
816
  }
817

818 819
  data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
  data->error = AnalyzeRegExp(isolate, is_one_byte, data->node);
820
  if (data->error != RegExpError::kNone) {
821
    return false;
822
  }
823

824 825
  if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);

826 827
  // Create the correct assembler for the architecture.
  std::unique_ptr<RegExpMacroAssembler> macro_assembler;
Ana Peško's avatar
Ana Peško committed
828
  if (data->compilation_target == RegExpCompilationTarget::kNative) {
829 830
    // Native regexp implementation.
    DCHECK(!FLAG_jitless);
831

832 833 834
    NativeRegExpMacroAssembler::Mode mode =
        is_one_byte ? NativeRegExpMacroAssembler::LATIN1
                    : NativeRegExpMacroAssembler::UC16;
835

836 837
    const int output_register_count =
        JSRegExp::RegistersForCaptureCount(data->capture_count);
838
#if V8_TARGET_ARCH_IA32
839 840
    macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
                                                       output_register_count));
841
#elif V8_TARGET_ARCH_X64
842 843
    macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
                                                      output_register_count));
844
#elif V8_TARGET_ARCH_ARM
845 846
    macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
                                                      output_register_count));
847
#elif V8_TARGET_ARCH_ARM64
848 849
    macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
                                                        output_register_count));
850
#elif V8_TARGET_ARCH_S390
851 852
    macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
                                                       output_register_count));
853
#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
854 855
    macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
                                                      output_register_count));
856
#elif V8_TARGET_ARCH_MIPS
857 858
    macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
                                                       output_register_count));
859
#elif V8_TARGET_ARCH_MIPS64
860 861
    macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
                                                       output_register_count));
Brice Dobry's avatar
Brice Dobry committed
862 863 864
#elif V8_TARGET_ARCH_RISCV64
    macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
                                                        output_register_count));
865 866 867 868
#else
#error "Unsupported architecture"
#endif
  } else {
Ana Peško's avatar
Ana Peško committed
869
    DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
870
    // Interpreted regexp implementation.
871
    macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
872
  }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
873

874
  macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
  if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
      ExperimentalRegExp::CanBeHandled(data->tree, flags,
                                       data->capture_count)) {
    if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
      backtrack_limit = FLAG_regexp_backtracks_before_fallback;
    } else {
      backtrack_limit =
          std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
    }
    macro_assembler->set_backtrack_limit(backtrack_limit);
    macro_assembler->set_can_fallback(true);
  } else {
    macro_assembler->set_backtrack_limit(backtrack_limit);
    macro_assembler->set_can_fallback(false);
  }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
890

891 892
  // Inserted here, instead of in Assembler, because it depends on information
  // in the AST that isn't replicated in the Node structure.
893 894 895
  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
  bool is_start_anchored = data->tree->IsAnchoredAtStart();
  int max_length = data->tree->max_match();
896
  static const int kMaxBacksearchLimit = 1024;
897
  if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
898 899
      max_length < kMaxBacksearchLimit) {
    macro_assembler->SetCurrentPositionFromEnd(max_length);
900 901
  }

902
  if (IsGlobal(flags)) {
903 904 905
    RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
    if (data->tree->min_match() > 0) {
      mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
906
    } else if (IsUnicode(flags)) {
907
      mode = RegExpMacroAssembler::GLOBAL_UNICODE;
908
    }
909
    macro_assembler->set_global_mode(mode);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
910 911
  }

912 913 914 915 916 917 918 919 920 921
  RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
#ifdef DEBUG
  std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
  if (FLAG_trace_regexp_assembler) {
    tracer_macro_assembler.reset(
        new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
    macro_assembler_ptr = tracer_macro_assembler.get();
  }
#endif

922
  RegExpCompiler::CompilationResult result = compiler.Assemble(
923
      isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
924

925 926 927 928 929 930 931
  // Code / bytecode printing.
  {
#ifdef ENABLE_DISASSEMBLER
    if (FLAG_print_regexp_code &&
        data->compilation_target == RegExpCompilationTarget::kNative) {
      CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
      OFStream os(trace_scope.file());
932
      Handle<Code> c = Handle<Code>::cast(result.code);
933
      auto pattern_cstring = pattern->ToCString();
934
      c->Disassemble(pattern_cstring.get(), os, isolate);
935 936 937 938
    }
#endif
    if (FLAG_print_regexp_bytecode &&
        data->compilation_target == RegExpCompilationTarget::kBytecode) {
939
      Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
940
      auto pattern_cstring = pattern->ToCString();
941 942
      RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
                                bytecode->length(), pattern_cstring.get());
943 944 945
    }
  }

946
  if (result.error != RegExpError::kNone) {
947
    if (FLAG_correctness_fuzzer_suppressions &&
948
        result.error == RegExpError::kStackOverflow) {
949 950
      FATAL("Aborting on stack overflow");
    }
951
    data->error = result.error;
952
  }
953

954 955 956 957
  data->code = result.code;
  data->register_count = result.num_registers;

  return result.Succeeded();
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
958 959
}

960 961 962 963 964 965 966
RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
                                     Handle<String> subject, Isolate* isolate)
    : register_array_(nullptr),
      register_array_size_(0),
      regexp_(regexp),
      subject_(subject),
      isolate_(isolate) {
967
  DCHECK(IsGlobal(regexp->GetFlags()));
968

969 970 971 972
  switch (regexp_->TypeTag()) {
    case JSRegExp::NOT_COMPILED:
      UNREACHABLE();
    case JSRegExp::ATOM: {
973 974
      // ATOM regexps do not have a global loop, so we search for one match at
      // a time.
975 976
      static const int kAtomRegistersPerMatch = 2;
      registers_per_match_ = kAtomRegistersPerMatch;
977
      register_array_size_ = registers_per_match_;
978
      break;
979
    }
980
    case JSRegExp::IRREGEXP: {
981
      registers_per_match_ =
982
          RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
983 984 985 986
      if (registers_per_match_ < 0) {
        num_matches_ = -1;  // Signal exception.
        return;
      }
987 988 989 990 991 992
      if (regexp->ShouldProduceBytecode()) {
        // Global loop in interpreted regexp is not implemented.  We choose the
        // size of the offsets vector so that it can only store one match.
        register_array_size_ = registers_per_match_;
        max_matches_ = 1;
      } else {
993 994
        register_array_size_ = std::max(
            {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
995
      }
996
      break;
997 998 999 1000 1001 1002 1003 1004 1005 1006
    }
    case JSRegExp::EXPERIMENTAL: {
      if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
          !ExperimentalRegExp::Compile(isolate_, regexp)) {
        DCHECK(isolate->has_pending_exception());
        num_matches_ = -1;  // Signal exception.
        return;
      }
      registers_per_match_ =
          JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
1007 1008
      register_array_size_ = std::max(
          {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1009 1010
      break;
    }
1011 1012
  }

1013
  max_matches_ = register_array_size_ / registers_per_match_;
1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052

  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    register_array_ = NewArray<int32_t>(register_array_size_);
  } else {
    register_array_ = isolate->jsregexp_static_offsets_vector();
  }

  // Set state so that fetching the results the first time triggers a call
  // to the compiled regexp.
  current_match_index_ = max_matches_ - 1;
  num_matches_ = max_matches_;
  DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
  DCHECK_GE(register_array_size_, registers_per_match_);
  int32_t* last_match =
      &register_array_[current_match_index_ * registers_per_match_];
  last_match[0] = -1;
  last_match[1] = 0;
}

RegExpGlobalCache::~RegExpGlobalCache() {
  // Deallocate the register array if we allocated it in the constructor
  // (as opposed to using the existing jsregexp_static_offsets_vector).
  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    DeleteArray(register_array_);
  }
}

int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
  if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
      unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
      unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
    // Advance over the surrogate pair.
    return last_index + 2;
  }
  return last_index + 1;
}

int32_t* RegExpGlobalCache::FetchNext() {
  current_match_index_++;
Ana Peško's avatar
Ana Peško committed
1053

1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
  if (current_match_index_ >= num_matches_) {
    // Current batch of results exhausted.
    // Fail if last batch was not even fully filled.
    if (num_matches_ < max_matches_) {
      num_matches_ = 0;  // Signal failed match.
      return nullptr;
    }

    int32_t* last_match =
        &register_array_[(current_match_index_ - 1) * registers_per_match_];
    int last_end_index = last_match[1];

1066 1067 1068 1069 1070 1071 1072 1073 1074
    switch (regexp_->TypeTag()) {
      case JSRegExp::NOT_COMPILED:
        UNREACHABLE();
      case JSRegExp::ATOM:
        num_matches_ =
            RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
                                    register_array_, register_array_size_);
        break;
      case JSRegExp::EXPERIMENTAL: {
1075
        DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1076
        DisallowGarbageCollection no_gc;
1077
        num_matches_ = ExperimentalRegExp::ExecRaw(
1078 1079
            isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
            register_array_, register_array_size_, last_end_index);
1080
        break;
1081
      }
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
      case JSRegExp::IRREGEXP: {
        int last_start_index = last_match[0];
        if (last_start_index == last_end_index) {
          // Zero-length match. Advance by one code point.
          last_end_index = AdvanceZeroLength(last_end_index);
        }
        if (last_end_index > subject_->length()) {
          num_matches_ = 0;  // Signal failed match.
          return nullptr;
        }
        num_matches_ = RegExpImpl::IrregexpExecRaw(
            isolate_, regexp_, subject_, last_end_index, register_array_,
            register_array_size_);
        break;
1096 1097 1098
      }
    }

1099 1100 1101
    // Fall back to experimental engine if needed and possible.
    if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
      num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1102
          isolate_, regexp_, subject_, register_array_, register_array_size_,
1103 1104 1105 1106 1107 1108
          last_end_index);
    }

    if (num_matches_ <= 0) {
      return nullptr;
    }
1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122
    current_match_index_ = 0;
    return register_array_;
  } else {
    return &register_array_[current_match_index_ * registers_per_match_];
  }
}

int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
  int index = current_match_index_ * registers_per_match_;
  if (num_matches_ == 0) {
    // After a failed match we shift back by one result.
    index -= registers_per_match_;
  }
  return &register_array_[index];
1123 1124
}

1125 1126 1127 1128 1129
Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
                                  Object key_pattern,
                                  FixedArray* last_match_cache,
                                  ResultsCacheType type) {
  FixedArray cache;
1130
  if (!key_string.IsInternalizedString()) return Smi::zero();
1131 1132
  if (type == STRING_SPLIT_SUBSTRINGS) {
    DCHECK(key_pattern.IsString());
1133
    if (!key_pattern.IsInternalizedString()) return Smi::zero();
1134
    cache = heap->string_split_cache();
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1135
  } else {
1136 1137 1138
    DCHECK(type == REGEXP_MULTIPLE_INDICES);
    DCHECK(key_pattern.IsFixedArray());
    cache = heap->regexp_multiple_cache();
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1139
  }
1140

1141
  uint32_t hash = key_string.hash();
1142 1143 1144 1145 1146 1147 1148 1149
  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
                    ~(kArrayEntriesPerCacheEntry - 1));
  if (cache.get(index + kStringOffset) != key_string ||
      cache.get(index + kPatternOffset) != key_pattern) {
    index =
        ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
    if (cache.get(index + kStringOffset) != key_string ||
        cache.get(index + kPatternOffset) != key_pattern) {
1150
      return Smi::zero();
1151
    }
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1152
  }
1153

1154 1155
  *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
  return cache.get(index + kArrayOffset);
1156 1157
}

1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
                               Handle<Object> key_pattern,
                               Handle<FixedArray> value_array,
                               Handle<FixedArray> last_match_cache,
                               ResultsCacheType type) {
  Factory* factory = isolate->factory();
  Handle<FixedArray> cache;
  if (!key_string->IsInternalizedString()) return;
  if (type == STRING_SPLIT_SUBSTRINGS) {
    DCHECK(key_pattern->IsString());
    if (!key_pattern->IsInternalizedString()) return;
    cache = factory->string_split_cache();
  } else {
    DCHECK(type == REGEXP_MULTIPLE_INDICES);
    DCHECK(key_pattern->IsFixedArray());
    cache = factory->regexp_multiple_cache();
1174 1175
  }

1176
  uint32_t hash = key_string->hash();
1177 1178
  uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
                    ~(kArrayEntriesPerCacheEntry - 1));
1179
  if (cache->get(index + kStringOffset) == Smi::zero()) {
1180 1181 1182
    cache->set(index + kStringOffset, *key_string);
    cache->set(index + kPatternOffset, *key_pattern);
    cache->set(index + kArrayOffset, *value_array);
1183
    cache->set(index + kLastMatchOffset, *last_match_cache);
1184 1185 1186
  } else {
    uint32_t index2 =
        ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1187
    if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1188 1189 1190
      cache->set(index2 + kStringOffset, *key_string);
      cache->set(index2 + kPatternOffset, *key_pattern);
      cache->set(index2 + kArrayOffset, *value_array);
1191
      cache->set(index2 + kLastMatchOffset, *last_match_cache);
1192
    } else {
1193 1194 1195 1196
      cache->set(index2 + kStringOffset, Smi::zero());
      cache->set(index2 + kPatternOffset, Smi::zero());
      cache->set(index2 + kArrayOffset, Smi::zero());
      cache->set(index2 + kLastMatchOffset, Smi::zero());
1197 1198 1199
      cache->set(index + kStringOffset, *key_string);
      cache->set(index + kPatternOffset, *key_pattern);
      cache->set(index + kArrayOffset, *value_array);
1200
      cache->set(index + kLastMatchOffset, *last_match_cache);
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212
    }
  }
  // If the array is a reasonably short list of substrings, convert it into a
  // list of internalized strings.
  if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
    for (int i = 0; i < value_array->length(); i++) {
      Handle<String> str(String::cast(value_array->get(i)), isolate);
      Handle<String> internalized_str = factory->InternalizeString(str);
      value_array->set(i, *internalized_str);
    }
  }
  // Convert backing store to a copy-on-write array.
1213 1214
  value_array->set_map_no_write_barrier(
      ReadOnlyRoots(isolate).fixed_cow_array_map());
1215 1216
}

1217
void RegExpResultsCache::Clear(FixedArray cache) {
1218
  for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1219
    cache.set(i, Smi::zero());
1220 1221 1222
  }
}

1223 1224
}  // namespace internal
}  // namespace v8