// Copyright 2012 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/regexp/regexp.h" #include "src/base/strings.h" #include "src/codegen/compilation-cache.h" #include "src/diagnostics/code-tracer.h" #include "src/execution/interrupts-scope.h" #include "src/heap/heap-inl.h" #include "src/objects/js-regexp-inl.h" #include "src/regexp/experimental/experimental.h" #include "src/regexp/regexp-bytecode-generator.h" #include "src/regexp/regexp-bytecodes.h" #include "src/regexp/regexp-compiler.h" #include "src/regexp/regexp-dotprinter.h" #include "src/regexp/regexp-interpreter.h" #include "src/regexp/regexp-macro-assembler-arch.h" #include "src/regexp/regexp-macro-assembler-tracer.h" #include "src/regexp/regexp-parser.h" #include "src/regexp/regexp-utils.h" #include "src/strings/string-search.h" #include "src/utils/ostreams.h" namespace v8 { namespace internal { using namespace regexp_compiler_constants; // NOLINT(build/namespaces) class RegExpImpl final : public AllStatic { public: // Returns a string representation of a regular expression. // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. // This function calls the garbage collector if necessary. static Handle<String> ToString(Handle<Object> value); // Prepares a JSRegExp object with Irregexp-specific data. static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, int capture_count, uint32_t backtrack_limit); // Prepare a RegExp for being executed one or more times (using // IrregexpExecOnce) on the subject. // This ensures that the regexp is compiled for the subject, and that // the subject is flat. // Returns the number of integer spaces required by IrregexpExecOnce // as its "registers" argument. If the regexp cannot be compiled, // an exception is set as pending, and this function returns negative. static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject); static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, Handle<String> match_pattern); static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, int32_t* output, int output_size); static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info); // Execute a regular expression on the subject, starting from index. // If matching succeeds, return the number of matches. This can be larger // than one in the case of global regular expressions. // The captures and subcaptures are stored into the registers vector. // If matching fails, returns RE_FAILURE. // If execution fails, sets a pending exception and returns RE_EXCEPTION. static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, int32_t* output, int output_size); // Execute an Irregexp bytecode pattern. // On a successful match, the result is a JSArray containing // captured positions. On a failure, the result is the null value. // Returns an empty handle in case of an exception. V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info, RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone); static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, Handle<String> sample_subject, bool is_one_byte); static inline bool EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re, Handle<String> sample_subject, bool is_one_byte); // Returns true on success, false on failure. static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags, Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte, uint32_t& backtrack_limit); // For acting on the JSRegExp data FixedArray. static int IrregexpMaxRegisterCount(FixedArray re); static void SetIrregexpMaxRegisterCount(FixedArray re, int value); static int IrregexpNumberOfCaptures(FixedArray re); static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte); static Code IrregexpNativeCode(FixedArray re, bool is_one_byte); }; // static bool RegExp::CanGenerateBytecode() { return FLAG_regexp_interpret_all || FLAG_regexp_tier_up; } // static template <class CharT> bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input, int input_length, RegExpFlags flags, RegExpError* regexp_error_out, const DisallowGarbageCollection& no_gc) { RegExpCompileData data; bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax( zone, stack_limit, input, input_length, flags, &data, no_gc); *regexp_error_out = data.error; return pattern_is_valid; } template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpError* regexp_error_out, const DisallowGarbageCollection&); template bool RegExp::VerifySyntax<base::uc16>( Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpError* regexp_error_out, const DisallowGarbageCollection&); MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpError error) { base::Vector<const char> error_data = base::CStrVector(RegExpErrorString(error)); Handle<String> error_text = isolate->factory() ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data)) .ToHandleChecked(); THROW_NEW_ERROR( isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text), Object); } void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, RegExpError error_text) { USE(ThrowRegExpException(isolate, re, Handle<String>(re->source(), isolate), error_text)); } bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) { return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp); } namespace { // Identifies the sort of regexps where the regexp engine is faster // than the code used for atom matches. bool HasFewDifferentCharacters(Handle<String> pattern) { int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length()); if (length <= kPatternTooShortForBoyerMoore) return false; const int kMod = 128; bool character_found[kMod]; int different = 0; memset(&character_found[0], 0, sizeof(character_found)); for (int i = 0; i < length; i++) { int ch = (pattern->Get(i) & (kMod - 1)); if (!character_found[ch]) { character_found[ch] = true; different++; // We declare a regexp low-alphabet if it has at least 3 times as many // characters as it has different characters. if (different * 3 > length) return false; } } return true; } } // namespace // Generic RegExp methods. Dispatches to implementation specific methods. // static MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, uint32_t backtrack_limit) { DCHECK(pattern->IsFlat()); // Caching is based only on the pattern and flags, but code also differs when // a backtrack limit is set. A present backtrack limit is very much *not* the // common case, so just skip the cache for these. const bool is_compilation_cache_enabled = (backtrack_limit == JSRegExp::kNoBacktrackLimit); Zone zone(isolate->allocator(), ZONE_NAME); CompilationCache* compilation_cache = nullptr; if (is_compilation_cache_enabled) { compilation_cache = isolate->compilation_cache(); MaybeHandle<FixedArray> maybe_cached = compilation_cache->LookupRegExp( pattern, JSRegExp::AsJSRegExpFlags(flags)); Handle<FixedArray> cached; if (maybe_cached.ToHandle(&cached)) { re->set_data(*cached); return re; } } PostponeInterruptsScope postpone(isolate); RegExpCompileData parse_result; DCHECK(!isolate->has_pending_exception()); if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags, &parse_result)) { // Throw an exception if we fail to parse the pattern. return RegExp::ThrowRegExpException(isolate, re, pattern, parse_result.error); } bool has_been_compiled = false; if (FLAG_default_to_experimental_regexp_engine && ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, parse_result.capture_count)) { DCHECK(FLAG_enable_experimental_regexp_engine); ExperimentalRegExp::Initialize(isolate, re, pattern, flags, parse_result.capture_count); has_been_compiled = true; } else if (flags & JSRegExp::kLinear) { DCHECK(FLAG_enable_experimental_regexp_engine); if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags, parse_result.capture_count)) { // TODO(mbid): The error could provide a reason for why the regexp can't // be executed in linear time (e.g. due to back references). return RegExp::ThrowRegExpException(isolate, re, pattern, RegExpError::kNotLinear); } ExperimentalRegExp::Initialize(isolate, re, pattern, flags, parse_result.capture_count); has_been_compiled = true; } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) && !HasFewDifferentCharacters(pattern)) { // Parse-tree is a single atom that is equal to the pattern. RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern); has_been_compiled = true; } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && parse_result.capture_count == 0) { RegExpAtom* atom = parse_result.tree->AsAtom(); // The pattern source might (?) contain escape sequences, but they're // resolved in atom_string. base::Vector<const base::uc16> atom_pattern = atom->data(); Handle<String> atom_string; ASSIGN_RETURN_ON_EXCEPTION( isolate, atom_string, isolate->factory()->NewStringFromTwoByte(atom_pattern), Object); if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) { RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string); has_been_compiled = true; } } if (!has_been_compiled) { RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count, backtrack_limit); } DCHECK(re->data().IsFixedArray()); // Compilation succeeded so the data is set on the regexp // and we can store it in the cache. Handle<FixedArray> data(FixedArray::cast(re->data()), isolate); if (is_compilation_cache_enabled) { compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags), data); } return re; } // static bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re, Handle<String> subject) { switch (re->type_tag()) { case JSRegExp::NOT_COMPILED: UNREACHABLE(); case JSRegExp::ATOM: return true; case JSRegExp::IRREGEXP: if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) { DCHECK(isolate->has_pending_exception()); return false; } return true; case JSRegExp::EXPERIMENTAL: if (!ExperimentalRegExp::IsCompiled(re, isolate) && !ExperimentalRegExp::Compile(isolate, re)) { DCHECK(isolate->has_pending_exception()); return false; } return true; } } // static MaybeHandle<Object> RegExp::ExperimentalOneshotExec( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info, RegExp::ExecQuirks exec_quirks) { return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index, last_match_info, exec_quirks); } // static MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info, ExecQuirks exec_quirks) { switch (regexp->type_tag()) { case JSRegExp::NOT_COMPILED: UNREACHABLE(); case JSRegExp::ATOM: return RegExpImpl::AtomExec(isolate, regexp, subject, index, last_match_info); case JSRegExp::IRREGEXP: return RegExpImpl::IrregexpExec(isolate, regexp, subject, index, last_match_info, exec_quirks); case JSRegExp::EXPERIMENTAL: return ExperimentalRegExp::Exec(isolate, regexp, subject, index, last_match_info, exec_quirks); } } // RegExp Atom implementation: Simple string search using indexOf. void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, Handle<String> match_pattern) { isolate->factory()->SetRegExpAtomData( re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern); } namespace { void SetAtomLastCapture(Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, String subject, int from, int to) { SealHandleScope shs(isolate); last_match_info->SetNumberOfCaptureRegisters(2); last_match_info->SetLastSubject(subject); last_match_info->SetLastInput(subject); last_match_info->SetCapture(0, from); last_match_info->SetCapture(1, to); } } // namespace int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, int32_t* output, int output_size) { DCHECK_LE(0, index); DCHECK_LE(index, subject->length()); subject = String::Flatten(isolate, subject); DisallowGarbageCollection no_gc; // ensure vectors stay valid String needle = regexp->atom_pattern(); int needle_len = needle.length(); DCHECK(needle.IsFlat()); DCHECK_LT(0, needle_len); if (index + needle_len > subject->length()) { return RegExp::RE_FAILURE; } for (int i = 0; i < output_size; i += 2) { String::FlatContent needle_content = needle.GetFlatContent(no_gc); String::FlatContent subject_content = subject->GetFlatContent(no_gc); DCHECK(needle_content.IsFlat()); DCHECK(subject_content.IsFlat()); // dispatch on type of strings index = (needle_content.IsOneByte() ? (subject_content.IsOneByte() ? SearchString(isolate, subject_content.ToOneByteVector(), needle_content.ToOneByteVector(), index) : SearchString(isolate, subject_content.ToUC16Vector(), needle_content.ToOneByteVector(), index)) : (subject_content.IsOneByte() ? SearchString(isolate, subject_content.ToOneByteVector(), needle_content.ToUC16Vector(), index) : SearchString(isolate, subject_content.ToUC16Vector(), needle_content.ToUC16Vector(), index))); if (index == -1) { return i / 2; // Return number of matches. } else { output[i] = index; output[i + 1] = index + needle_len; index += needle_len; } } return output_size / 2; } Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info) { static const int kNumRegisters = 2; STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); int res = AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters); if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value(); DCHECK_EQ(res, RegExp::RE_SUCCESS); SealHandleScope shs(isolate); SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0], output_registers[1]); return last_match_info; } // Irregexp implementation. // Ensures that the regexp object contains a compiled version of the // source for either one-byte or two-byte subject strings. // If the compiled version doesn't already exist, it is compiled // from the source pattern. // If compilation fails, an exception is thrown and this function // returns false. bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re, Handle<String> sample_subject, bool is_one_byte) { Object compiled_code = re->code(is_one_byte); Object bytecode = re->bytecode(is_one_byte); bool needs_initial_compilation = compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue); // Recompile is needed when we're dealing with the first execution of the // regexp after the decision to tier up has been made. If the tiering up // strategy is not in use, this value is always false. bool needs_tier_up_compilation = re->MarkedForTierUp() && bytecode.IsByteArray(); if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) { PrintF("JSRegExp object %p needs tier-up compilation\n", reinterpret_cast<void*>(re->ptr())); } if (!needs_initial_compilation && !needs_tier_up_compilation) { DCHECK(compiled_code.IsCodeT()); DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray()); return true; } DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray()); return CompileIrregexp(isolate, re, sample_subject, is_one_byte); } namespace { #ifdef DEBUG bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) { Object entry = re->code(is_one_byte); Object bytecode = re->bytecode(is_one_byte); // If we're not using the tier-up strategy, entry can only be a smi // representing an uncompiled regexp here. If we're using the tier-up // strategy, entry can still be a smi representing an uncompiled regexp, when // compiling the regexp before the tier-up, or it can contain a trampoline to // the regexp interpreter, in which case the bytecode field contains compiled // bytecode, when recompiling the regexp after the tier-up. If the // tier-up was forced, which happens for global replaces, entry is a smi // representing an uncompiled regexp, even though we're "recompiling" after // the tier-up. if (re->ShouldProduceBytecode()) { DCHECK(entry.IsSmi()); DCHECK(bytecode.IsSmi()); int entry_value = Smi::ToInt(entry); int bytecode_value = Smi::ToInt(bytecode); DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value); DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value); } else { DCHECK(entry.IsSmi() || (entry.IsCodeT() && bytecode.IsByteArray())); } return true; } #endif struct RegExpCaptureIndexLess { bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { DCHECK_NOT_NULL(lhs); DCHECK_NOT_NULL(rhs); return lhs->index() < rhs->index(); } }; } // namespace // static Handle<FixedArray> RegExp::CreateCaptureNameMap( Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) { if (named_captures == nullptr) return Handle<FixedArray>(); DCHECK(!named_captures->empty()); // Named captures are sorted by name (because the set is used to ensure // name uniqueness). But the capture name map must to be sorted by index. std::sort(named_captures->begin(), named_captures->end(), RegExpCaptureIndexLess{}); int len = static_cast<int>(named_captures->size()) * 2; Handle<FixedArray> array = isolate->factory()->NewFixedArray(len); int i = 0; for (const RegExpCapture* capture : *named_captures) { base::Vector<const base::uc16> capture_name(capture->name()->data(), capture->name()->size()); // CSA code in ConstructNewResultFromMatchInfo requires these strings to be // internalized so they can be used as property names in the 'exec' results. Handle<String> name = isolate->factory()->InternalizeString(capture_name); array->set(i * 2, *name); array->set(i * 2 + 1, Smi::FromInt(capture->index())); i++; } DCHECK_EQ(i * 2, len); return array; } bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, Handle<String> sample_subject, bool is_one_byte) { // Compile the RegExp. Zone zone(isolate->allocator(), ZONE_NAME); PostponeInterruptsScope postpone(isolate); DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte)); RegExpFlags flags = JSRegExp::AsRegExpFlags(re->flags()); Handle<String> pattern(re->source(), isolate); pattern = String::Flatten(isolate, pattern); RegExpCompileData compile_data; if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags, &compile_data)) { // Throw an exception if we fail to parse the pattern. // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error)); return false; } // The compilation target is a kBytecode if we're interpreting all regexp // objects, or if we're using the tier-up strategy but the tier-up hasn't // happened yet. The compilation target is a kNative if we're using the // tier-up strategy and we need to recompile to tier-up, or if we're producing // native code for all regexp objects. compile_data.compilation_target = re->ShouldProduceBytecode() ? RegExpCompilationTarget::kBytecode : RegExpCompilationTarget::kNative; uint32_t backtrack_limit = re->backtrack_limit(); const bool compilation_succeeded = Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject, is_one_byte, backtrack_limit); if (!compilation_succeeded) { DCHECK(compile_data.error != RegExpError::kNone); RegExp::ThrowRegExpException(isolate, re, compile_data.error); return false; } Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()), isolate); if (compile_data.compilation_target == RegExpCompilationTarget::kNative) { Code code = Code::cast(*compile_data.code); data->set(JSRegExp::code_index(is_one_byte), ToCodeT(code)); // Reset bytecode to uninitialized. In case we use tier-up we know that // tier-up has happened this way. data->set(JSRegExp::bytecode_index(is_one_byte), Smi::FromInt(JSRegExp::kUninitializedValue)); } else { DCHECK_EQ(compile_data.compilation_target, RegExpCompilationTarget::kBytecode); // Store code generated by compiler in bytecode and trampoline to // interpreter in code. data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code); Handle<CodeT> trampoline = BUILTIN_CODE(isolate, RegExpInterpreterTrampoline); data->set(JSRegExp::code_index(is_one_byte), *trampoline); } Handle<FixedArray> capture_name_map = RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures); re->set_capture_name_map(capture_name_map); int register_max = IrregexpMaxRegisterCount(*data); if (compile_data.register_count > register_max) { SetIrregexpMaxRegisterCount(*data, compile_data.register_count); } data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit)); if (FLAG_trace_regexp_tier_up) { PrintF("JSRegExp object %p %s size: %d\n", reinterpret_cast<void*>(re->ptr()), re->ShouldProduceBytecode() ? "bytecode" : "native code", re->ShouldProduceBytecode() ? IrregexpByteCode(*data, is_one_byte).Size() : IrregexpNativeCode(*data, is_one_byte).Size()); } return true; } int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) { return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex)); } void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) { re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); } int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) { return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex)); } ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) { return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte))); } Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) { return Code::cast(re.get(JSRegExp::code_index(is_one_byte))); } void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, int capture_count, uint32_t backtrack_limit) { // Initialize compiled code entries to null. isolate->factory()->SetRegExpIrregexpData(re, pattern, JSRegExp::AsJSRegExpFlags(flags), capture_count, backtrack_limit); } // static int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject) { DCHECK(subject->IsFlat()); // Check representation of the underlying storage. bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) { return -1; } // Only reserve room for output captures. Internal registers are allocated by // the engine. return JSRegExp::RegistersForCaptureCount(regexp->capture_count()); } int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, int32_t* output, int output_size) { DCHECK_LE(0, index); DCHECK_LE(index, subject->length()); DCHECK(subject->IsFlat()); DCHECK_GE(output_size, JSRegExp::RegistersForCaptureCount(regexp->capture_count())); bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); if (!regexp->ShouldProduceBytecode()) { do { EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); // The stack is used to allocate registers for the compiled regexp code. // This means that in case of failure, the output registers array is left // untouched and contains the capture results from the previous successful // match. We can use that to set the last match info lazily. int res = NativeRegExpMacroAssembler::Match(regexp, subject, output, output_size, index, isolate); if (res != NativeRegExpMacroAssembler::RETRY) { DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || isolate->has_pending_exception()); STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RegExp::RE_SUCCESS); STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RegExp::RE_FAILURE); STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) == RegExp::RE_EXCEPTION); return res; } // If result is RETRY, the string has changed representation, and we // must restart from scratch. // In this case, it means we must make sure we are prepared to handle // the, potentially, different subject (the string can switch between // being internal and external, and even between being Latin1 and // UC16, but the characters are always the same). is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); } while (true); UNREACHABLE(); } else { DCHECK(regexp->ShouldProduceBytecode()); do { IrregexpInterpreter::Result result = IrregexpInterpreter::MatchForCallFromRuntime( isolate, regexp, subject, output, output_size, index); DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION, isolate->has_pending_exception()); switch (result) { case IrregexpInterpreter::SUCCESS: case IrregexpInterpreter::EXCEPTION: case IrregexpInterpreter::FAILURE: case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL: return result; case IrregexpInterpreter::RETRY: // The string has changed representation, and we must restart the // match. // We need to reset the tier up to start over with compilation. if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick(); is_one_byte = String::IsOneByteRepresentationUnderneath(*subject); EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); break; } } while (true); UNREACHABLE(); } } MaybeHandle<Object> RegExpImpl::IrregexpExec( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int previous_index, Handle<RegExpMatchInfo> last_match_info, RegExp::ExecQuirks exec_quirks) { DCHECK_EQ(regexp->type_tag(), JSRegExp::IRREGEXP); subject = String::Flatten(isolate, subject); #ifdef DEBUG if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) { PrintF("\n\nRegexp match: /%s/\n\n", regexp->source().ToCString().get()); PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); } #endif // For very long subject strings, the regexp interpreter is currently much // slower than the jitted code execution. If the tier-up strategy is turned // on, we want to avoid this performance penalty so we eagerly tier-up if the // subject string length is equal or greater than the given heuristic value. if (FLAG_regexp_tier_up && subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) { regexp->MarkTierUpForNextExec(); if (FLAG_trace_regexp_tier_up) { PrintF( "Forcing tier-up for very long strings in " "RegExpImpl::IrregexpExec\n"); } } // Prepare space for the return values. int required_registers = RegExpImpl::IrregexpPrepare(isolate, regexp, subject); if (required_registers < 0) { // Compiling failed with an exception. DCHECK(isolate->has_pending_exception()); return MaybeHandle<Object>(); } int32_t* output_registers = nullptr; if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { output_registers = NewArray<int32_t>(required_registers); } std::unique_ptr<int32_t[]> auto_release(output_registers); if (output_registers == nullptr) { output_registers = isolate->jsregexp_static_offsets_vector(); } int res = RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index, output_registers, required_registers); if (res == RegExp::RE_SUCCESS) { if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) { if (output_registers[0] >= subject->length()) { return isolate->factory()->null_value(); } } int capture_count = regexp->capture_count(); return RegExp::SetLastMatchInfo(isolate, last_match_info, subject, capture_count, output_registers); } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) { return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, previous_index, last_match_info); } else if (res == RegExp::RE_EXCEPTION) { DCHECK(isolate->has_pending_exception()); return MaybeHandle<Object>(); } else { DCHECK(res == RegExp::RE_FAILURE); return isolate->factory()->null_value(); } } // static Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo( Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, Handle<String> subject, int capture_count, int32_t* match) { // This is the only place where match infos can grow. If, after executing the // regexp, RegExpExecStub finds that the match info is too small, it restarts // execution in RegExpImpl::Exec, which finally grows the match info right // here. Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count); if (*result != *last_match_info) { if (*last_match_info == *isolate->regexp_last_match_info()) { // This inner condition is only needed for special situations like the // regexp fuzzer, where we pass our own custom RegExpMatchInfo to // RegExpImpl::Exec; there actually want to bypass the Isolate's match // info and execute the regexp without side effects. isolate->native_context()->set_regexp_last_match_info(*result); } } int capture_register_count = JSRegExp::RegistersForCaptureCount(capture_count); DisallowGarbageCollection no_gc; if (match != nullptr) { for (int i = 0; i < capture_register_count; i += 2) { result->SetCapture(i, match[i]); result->SetCapture(i + 1, match[i + 1]); } } result->SetLastSubject(*subject); result->SetLastInput(*subject); return result; } // static void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) { DotPrinter::DotPrint(label, node); } namespace { // Returns true if we've either generated too much irregex code within this // isolate, or the pattern string is too long. bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) { // Limit the space regexps take up on the heap. In order to limit this we // would like to keep track of the amount of regexp code on the heap. This // is not tracked, however. As a conservative approximation we track the // total regexp code compiled including code that has subsequently been freed // and the total executable memory at any point. static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB; static constexpr size_t kRegExpCompiledLimit = 1 * MB; Heap* heap = isolate->heap(); if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true; return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit && heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit); } } // namespace // static bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone, RegExpCompileData* data, RegExpFlags flags, Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte) { uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit; return RegExpImpl::Compile(isolate, zone, data, flags, pattern, sample_subject, is_one_byte, backtrack_limit); } bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, RegExpFlags flags, Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte, uint32_t& backtrack_limit) { if (JSRegExp::RegistersForCaptureCount(data->capture_count) > RegExpMacroAssembler::kMaxRegisterCount) { data->error = RegExpError::kTooLarge; return false; } RegExpCompiler compiler(isolate, zone, data->capture_count, flags, is_one_byte); if (compiler.optimize()) { compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern)); } // Sample some characters from the middle of the string. static const int kSampleSize = 128; sample_subject = String::Flatten(isolate, sample_subject); int chars_sampled = 0; int half_way = (sample_subject->length() - kSampleSize) / 2; for (int i = std::max(0, half_way); i < sample_subject->length() && chars_sampled < kSampleSize; i++, chars_sampled++) { compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); } data->node = compiler.PreprocessRegExp(data, flags, is_one_byte); data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node); if (data->error != RegExpError::kNone) { return false; } if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node); // Create the correct assembler for the architecture. std::unique_ptr<RegExpMacroAssembler> macro_assembler; if (data->compilation_target == RegExpCompilationTarget::kNative) { // Native regexp implementation. DCHECK(!FLAG_jitless); NativeRegExpMacroAssembler::Mode mode = is_one_byte ? NativeRegExpMacroAssembler::LATIN1 : NativeRegExpMacroAssembler::UC16; const int output_register_count = JSRegExp::RegistersForCaptureCount(data->capture_count); #if V8_TARGET_ARCH_IA32 macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_X64 macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_ARM macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_ARM64 macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_S390 macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_MIPS macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_MIPS64 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_RISCV64 macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode, output_register_count)); #elif V8_TARGET_ARCH_LOONG64 macro_assembler.reset(new RegExpMacroAssemblerLOONG64( isolate, zone, mode, output_register_count)); #else #error "Unsupported architecture" #endif } else { DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode); // Interpreted regexp implementation. macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone)); } macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern)); if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks && ExperimentalRegExp::CanBeHandled(data->tree, flags, data->capture_count)) { if (backtrack_limit == JSRegExp::kNoBacktrackLimit) { backtrack_limit = FLAG_regexp_backtracks_before_fallback; } else { backtrack_limit = std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback); } macro_assembler->set_backtrack_limit(backtrack_limit); macro_assembler->set_can_fallback(true); } else { macro_assembler->set_backtrack_limit(backtrack_limit); macro_assembler->set_can_fallback(false); } // Inserted here, instead of in Assembler, because it depends on information // in the AST that isn't replicated in the Node structure. bool is_end_anchored = data->tree->IsAnchoredAtEnd(); bool is_start_anchored = data->tree->IsAnchoredAtStart(); int max_length = data->tree->max_match(); static const int kMaxBacksearchLimit = 1024; if (is_end_anchored && !is_start_anchored && !IsSticky(flags) && max_length < kMaxBacksearchLimit) { macro_assembler->SetCurrentPositionFromEnd(max_length); } if (IsGlobal(flags)) { RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; if (data->tree->min_match() > 0) { mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; } else if (IsUnicode(flags)) { mode = RegExpMacroAssembler::GLOBAL_UNICODE; } macro_assembler->set_global_mode(mode); } RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get(); #ifdef DEBUG std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler; if (FLAG_trace_regexp_assembler) { tracer_macro_assembler.reset( new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr)); macro_assembler_ptr = tracer_macro_assembler.get(); } #endif RegExpCompiler::CompilationResult result = compiler.Assemble( isolate, macro_assembler_ptr, data->node, data->capture_count, pattern); // Code / bytecode printing. { #ifdef ENABLE_DISASSEMBLER if (FLAG_print_regexp_code && data->compilation_target == RegExpCompilationTarget::kNative) { CodeTracer::Scope trace_scope(isolate->GetCodeTracer()); OFStream os(trace_scope.file()); Handle<Code> c = Handle<Code>::cast(result.code); auto pattern_cstring = pattern->ToCString(); c->Disassemble(pattern_cstring.get(), os, isolate); } #endif if (FLAG_print_regexp_bytecode && data->compilation_target == RegExpCompilationTarget::kBytecode) { Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code); auto pattern_cstring = pattern->ToCString(); RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(), bytecode->length(), pattern_cstring.get()); } } if (result.error != RegExpError::kNone) { if (FLAG_correctness_fuzzer_suppressions && result.error == RegExpError::kStackOverflow) { FATAL("Aborting on stack overflow"); } data->error = result.error; } data->code = result.code; data->register_count = result.num_registers; return result.Succeeded(); } RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject, Isolate* isolate) : register_array_(nullptr), register_array_size_(0), regexp_(regexp), subject_(subject), isolate_(isolate) { DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp->flags()))); switch (regexp_->type_tag()) { case JSRegExp::NOT_COMPILED: UNREACHABLE(); case JSRegExp::ATOM: { // ATOM regexps do not have a global loop, so we search for one match at // a time. static const int kAtomRegistersPerMatch = 2; registers_per_match_ = kAtomRegistersPerMatch; register_array_size_ = registers_per_match_; break; } case JSRegExp::IRREGEXP: { registers_per_match_ = RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_); if (registers_per_match_ < 0) { num_matches_ = -1; // Signal exception. return; } if (regexp->ShouldProduceBytecode()) { // Global loop in interpreted regexp is not implemented. We choose the // size of the offsets vector so that it can only store one match. register_array_size_ = registers_per_match_; max_matches_ = 1; } else { register_array_size_ = std::max( {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize}); } break; } case JSRegExp::EXPERIMENTAL: { if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) && !ExperimentalRegExp::Compile(isolate_, regexp)) { DCHECK(isolate->has_pending_exception()); num_matches_ = -1; // Signal exception. return; } registers_per_match_ = JSRegExp::RegistersForCaptureCount(regexp->capture_count()); register_array_size_ = std::max( {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize}); break; } } max_matches_ = register_array_size_ / registers_per_match_; if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { register_array_ = NewArray<int32_t>(register_array_size_); } else { register_array_ = isolate->jsregexp_static_offsets_vector(); } // Set state so that fetching the results the first time triggers a call // to the compiled regexp. current_match_index_ = max_matches_ - 1; num_matches_ = max_matches_; DCHECK_LE(2, registers_per_match_); // Each match has at least one capture. DCHECK_GE(register_array_size_, registers_per_match_); int32_t* last_match = ®ister_array_[current_match_index_ * registers_per_match_]; last_match[0] = -1; last_match[1] = 0; } RegExpGlobalCache::~RegExpGlobalCache() { // Deallocate the register array if we allocated it in the constructor // (as opposed to using the existing jsregexp_static_offsets_vector). if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { DeleteArray(register_array_); } } int RegExpGlobalCache::AdvanceZeroLength(int last_index) { if (IsUnicode(JSRegExp::AsRegExpFlags(regexp_->flags())) && last_index + 1 < subject_->length() && unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { // Advance over the surrogate pair. return last_index + 2; } return last_index + 1; } int32_t* RegExpGlobalCache::FetchNext() { current_match_index_++; if (current_match_index_ >= num_matches_) { // Current batch of results exhausted. // Fail if last batch was not even fully filled. if (num_matches_ < max_matches_) { num_matches_ = 0; // Signal failed match. return nullptr; } int32_t* last_match = ®ister_array_[(current_match_index_ - 1) * registers_per_match_]; int last_end_index = last_match[1]; switch (regexp_->type_tag()) { case JSRegExp::NOT_COMPILED: UNREACHABLE(); case JSRegExp::ATOM: num_matches_ = RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index, register_array_, register_array_size_); break; case JSRegExp::EXPERIMENTAL: { DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_)); DisallowGarbageCollection no_gc; num_matches_ = ExperimentalRegExp::ExecRaw( isolate_, RegExp::kFromRuntime, *regexp_, *subject_, register_array_, register_array_size_, last_end_index); break; } case JSRegExp::IRREGEXP: { int last_start_index = last_match[0]; if (last_start_index == last_end_index) { // Zero-length match. Advance by one code point. last_end_index = AdvanceZeroLength(last_end_index); } if (last_end_index > subject_->length()) { num_matches_ = 0; // Signal failed match. return nullptr; } num_matches_ = RegExpImpl::IrregexpExecRaw( isolate_, regexp_, subject_, last_end_index, register_array_, register_array_size_); break; } } // Fall back to experimental engine if needed and possible. if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) { num_matches_ = ExperimentalRegExp::OneshotExecRaw( isolate_, regexp_, subject_, register_array_, register_array_size_, last_end_index); } if (num_matches_ <= 0) { return nullptr; } current_match_index_ = 0; return register_array_; } else { return ®ister_array_[current_match_index_ * registers_per_match_]; } } int32_t* RegExpGlobalCache::LastSuccessfulMatch() { int index = current_match_index_ * registers_per_match_; if (num_matches_ == 0) { // After a failed match we shift back by one result. index -= registers_per_match_; } return ®ister_array_[index]; } Object RegExpResultsCache::Lookup(Heap* heap, String key_string, Object key_pattern, FixedArray* last_match_cache, ResultsCacheType type) { FixedArray cache; if (!key_string.IsInternalizedString()) return Smi::zero(); if (type == STRING_SPLIT_SUBSTRINGS) { DCHECK(key_pattern.IsString()); if (!key_pattern.IsInternalizedString()) return Smi::zero(); cache = heap->string_split_cache(); } else { DCHECK(type == REGEXP_MULTIPLE_INDICES); DCHECK(key_pattern.IsFixedArray()); cache = heap->regexp_multiple_cache(); } uint32_t hash = key_string.hash(); uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & ~(kArrayEntriesPerCacheEntry - 1)); if (cache.get(index + kStringOffset) != key_string || cache.get(index + kPatternOffset) != key_pattern) { index = ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); if (cache.get(index + kStringOffset) != key_string || cache.get(index + kPatternOffset) != key_pattern) { return Smi::zero(); } } *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset)); return cache.get(index + kArrayOffset); } void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, Handle<Object> key_pattern, Handle<FixedArray> value_array, Handle<FixedArray> last_match_cache, ResultsCacheType type) { Factory* factory = isolate->factory(); Handle<FixedArray> cache; if (!key_string->IsInternalizedString()) return; if (type == STRING_SPLIT_SUBSTRINGS) { DCHECK(key_pattern->IsString()); if (!key_pattern->IsInternalizedString()) return; cache = factory->string_split_cache(); } else { DCHECK(type == REGEXP_MULTIPLE_INDICES); DCHECK(key_pattern->IsFixedArray()); cache = factory->regexp_multiple_cache(); } uint32_t hash = key_string->hash(); uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & ~(kArrayEntriesPerCacheEntry - 1)); if (cache->get(index + kStringOffset) == Smi::zero()) { cache->set(index + kStringOffset, *key_string); cache->set(index + kPatternOffset, *key_pattern); cache->set(index + kArrayOffset, *value_array); cache->set(index + kLastMatchOffset, *last_match_cache); } else { uint32_t index2 = ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); if (cache->get(index2 + kStringOffset) == Smi::zero()) { cache->set(index2 + kStringOffset, *key_string); cache->set(index2 + kPatternOffset, *key_pattern); cache->set(index2 + kArrayOffset, *value_array); cache->set(index2 + kLastMatchOffset, *last_match_cache); } else { cache->set(index2 + kStringOffset, Smi::zero()); cache->set(index2 + kPatternOffset, Smi::zero()); cache->set(index2 + kArrayOffset, Smi::zero()); cache->set(index2 + kLastMatchOffset, Smi::zero()); cache->set(index + kStringOffset, *key_string); cache->set(index + kPatternOffset, *key_pattern); cache->set(index + kArrayOffset, *value_array); cache->set(index + kLastMatchOffset, *last_match_cache); } } // If the array is a reasonably short list of substrings, convert it into a // list of internalized strings. if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { for (int i = 0; i < value_array->length(); i++) { Handle<String> str(String::cast(value_array->get(i)), isolate); Handle<String> internalized_str = factory->InternalizeString(str); value_array->set(i, *internalized_str); } } // Convert backing store to a copy-on-write array. value_array->set_map_no_write_barrier( ReadOnlyRoots(isolate).fixed_cow_array_map()); } void RegExpResultsCache::Clear(FixedArray cache) { for (int i = 0; i < kRegExpResultsCacheSize; i++) { cache.set(i, Smi::zero()); } } } // namespace internal } // namespace v8