// Copyright 2012 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_REGEXP_REGEXP_H_ #define V8_REGEXP_REGEXP_H_ #include "src/common/assert-scope.h" #include "src/handles/handles.h" #include "src/regexp/regexp-error.h" #include "src/regexp/regexp-flags.h" #include "src/zone/zone-containers.h" namespace v8 { namespace internal { class JSRegExp; class RegExpCapture; class RegExpMatchInfo; class RegExpNode; class RegExpTree; enum class RegExpCompilationTarget : int { kBytecode, kNative }; // TODO(jgruber): Do not expose in regexp.h. // TODO(jgruber): Consider splitting between ParseData and CompileData. struct RegExpCompileData { // The parsed AST as produced by the RegExpParser. RegExpTree* tree = nullptr; // The compiled Node graph as produced by RegExpTree::ToNode methods. RegExpNode* node = nullptr; // Either the generated code as produced by the compiler or a trampoline // to the interpreter. Handle<Object> code; // True, iff the pattern is a 'simple' atom with zero captures. In other // words, the pattern consists of a string with no metacharacters and special // regexp features, and can be implemented as a standard string search. bool simple = true; // True, iff the pattern is anchored at the start of the string with '^'. bool contains_anchor = false; // Only set if the pattern contains named captures. // Note: the lifetime equals that of the parse/compile zone. ZoneVector<RegExpCapture*>* named_captures = nullptr; // The error message. Only used if an error occurred during parsing or // compilation. RegExpError error = RegExpError::kNone; // The position at which the error was detected. Only used if an // error occurred. int error_pos = 0; // The number of capture groups, without the global capture \0. int capture_count = 0; // The number of registers used by the generated code. int register_count = 0; // The compilation target (bytecode or native code). RegExpCompilationTarget compilation_target; }; class RegExp final : public AllStatic { public: // Whether the irregexp engine generates interpreter bytecode. static bool CanGenerateBytecode(); // Verify the given pattern, i.e. check that parsing succeeds. If // verification fails, `regexp_error_out` is set. template <class CharT> static bool VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input, int input_length, RegExpFlags flags, RegExpError* regexp_error_out, const DisallowGarbageCollection& no_gc); // Parses the RegExp pattern and prepares the JSRegExp object with // generic data and choice of implementation - as well as what // the implementation wants to store in the data field. // Returns false if compilation fails. V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile( Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpFlags flags, uint32_t backtrack_limit); // Ensures that a regexp is fully compiled and ready to be executed on a // subject string. Returns true on success. Return false on failure, and // then an exception will be pending. V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re, Handle<String> subject); enum CallOrigin : int { kFromRuntime = 0, kFromJs = 1, }; enum class ExecQuirks { kNone, // Used to work around an issue in the RegExpPrototypeSplit fast path, // which diverges from the spec by not creating a sticky copy of the RegExp // instance and calling `exec` in a loop. If called in this context, we // must not update the last_match_info on a successful match at the subject // string end. See crbug.com/1075514 for more information. kTreatMatchAtEndAsFailure, }; // See ECMA-262 section 15.10.6.2. // This function calls the garbage collector if necessary. V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info, ExecQuirks exec_quirks = ExecQuirks::kNone); V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<RegExpMatchInfo> last_match_info, ExecQuirks exec_quirks = ExecQuirks::kNone); // Integral return values used throughout regexp code layers. static constexpr int kInternalRegExpFailure = 0; static constexpr int kInternalRegExpSuccess = 1; static constexpr int kInternalRegExpException = -1; static constexpr int kInternalRegExpRetry = -2; static constexpr int kInternalRegExpFallbackToExperimental = -3; static constexpr int kInternalRegExpSmallestResult = -3; enum IrregexpResult : int32_t { RE_FAILURE = kInternalRegExpFailure, RE_SUCCESS = kInternalRegExpSuccess, RE_EXCEPTION = kInternalRegExpException, RE_RETRY = kInternalRegExpRetry, RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental, }; // Set last match info. If match is nullptr, then setting captures is // omitted. static Handle<RegExpMatchInfo> SetLastMatchInfo( Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, Handle<String> subject, int capture_count, int32_t* match); V8_EXPORT_PRIVATE static bool CompileForTesting( Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags, Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte); V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label, RegExpNode* node); static const int kRegExpTooLargeToOptimize = 20 * KB; V8_WARN_UNUSED_RESULT static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, RegExpError error); static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, RegExpError error_text); static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp); static Handle<FixedArray> CreateCaptureNameMap( Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures); }; // Uses a special global mode of irregexp-generated code to perform a global // search and return multiple results at once. As such, this is essentially an // iterator over multiple results (retrieved batch-wise in advance). class RegExpGlobalCache final { public: RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject, Isolate* isolate); ~RegExpGlobalCache(); // Fetch the next entry in the cache for global regexp match results. // This does not set the last match info. Upon failure, nullptr is // returned. The cause can be checked with Result(). The previous result is // still in available in memory when a failure happens. int32_t* FetchNext(); int32_t* LastSuccessfulMatch(); bool HasException() { return num_matches_ < 0; } private: int AdvanceZeroLength(int last_index); int num_matches_; int max_matches_; int current_match_index_; int registers_per_match_; // Pointer to the last set of captures. int32_t* register_array_; int register_array_size_; Handle<JSRegExp> regexp_; Handle<String> subject_; Isolate* isolate_; }; // Caches results for specific regexp queries on the isolate. At the time of // writing, this is used during global calls to RegExp.prototype.exec and // @@split. class RegExpResultsCache final : public AllStatic { public: enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi. // On success, the returned result is guaranteed to be a COW-array. static Object Lookup(Heap* heap, String key_string, Object key_pattern, FixedArray* last_match_out, ResultsCacheType type); // Attempt to add value_array to the cache specified by type. On success, // value_array is turned into a COW-array. static void Enter(Isolate* isolate, Handle<String> key_string, Handle<Object> key_pattern, Handle<FixedArray> value_array, Handle<FixedArray> last_match_cache, ResultsCacheType type); static void Clear(FixedArray cache); static constexpr int kRegExpResultsCacheSize = 0x100; private: static constexpr int kStringOffset = 0; static constexpr int kPatternOffset = 1; static constexpr int kArrayOffset = 2; static constexpr int kLastMatchOffset = 3; static constexpr int kArrayEntriesPerCacheEntry = 4; }; } // namespace internal } // namespace v8 #endif // V8_REGEXP_REGEXP_H_