Commit c7d57dd3, authored by Jakob Gruber, committed by Commit Bot

[regexp] Reduce public API surface

This further reduces the number of things declared in the public
regexp API file, currently still named jsregexp.h.

* Move JSRegExp::Flags convenience functions to regexp-compiler.h.
* Set RegExpImpl methods private if possible (these will later be
  moved to a new hidden impl class).
* Merge RegExpEngine::CompilationResult into RegExpCompileData.
* Move remaining RegExpEngine methods to RegExpImpl and delete
  RegExpEngine.
* Extract RegExpGlobalCache.
* Document a few data structures.

Upcoming CLs will rename RegExpImpl to RegExp and jsregexp.h to
regexp.h. This should then be the only header included from other
directories.

Bug: v8:9359
Change-Id: I78c8f4cca495a2b95735a48b6181583bc3310bdf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1662294
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62218}
parent 8e53e4b2
...@@ -2677,7 +2677,6 @@ v8_source_set("v8_base_without_compiler") { ...@@ -2677,7 +2677,6 @@ v8_source_set("v8_base_without_compiler") {
"src/profiler/tick-sample.h", "src/profiler/tick-sample.h",
"src/profiler/tracing-cpu-profiler.cc", "src/profiler/tracing-cpu-profiler.cc",
"src/profiler/tracing-cpu-profiler.h", "src/profiler/tracing-cpu-profiler.h",
"src/regexp/jsregexp-inl.h",
"src/regexp/jsregexp.cc", "src/regexp/jsregexp.cc",
"src/regexp/jsregexp.h", "src/regexp/jsregexp.h",
"src/regexp/property-sequences.cc", "src/regexp/property-sequences.cc",
......
// Copyright 2013 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_JSREGEXP_INL_H_
#define V8_REGEXP_JSREGEXP_INL_H_
#include "src/objects/js-regexp-inl.h"
#include "src/objects/objects.h"
#include "src/regexp/jsregexp.h"
#include "src/utils/allocation.h"
namespace v8 {
namespace internal {
// Frees the register array, but only when it is larger than the isolate's
// static offsets vector, i.e. when the constructor had to allocate it rather
// than borrow the existing jsregexp_static_offsets_vector.
RegExpImpl::GlobalCache::~GlobalCache() {
  const bool borrowed_static_vector =
      register_array_size_ <= Isolate::kJSRegexpStaticOffsetsVectorSize;
  if (borrowed_static_vector) return;
  DeleteArray(register_array_);
}
// Advances to the next match and returns a pointer to its registers, or
// nullptr when no further match is available. Matches are served from the
// batch currently held in register_array_; once that batch is used up the
// regexp is executed again, resuming at the end index of the last match.
int32_t* RegExpImpl::GlobalCache::FetchNext() {
  ++current_match_index_;

  // Fast path: the current batch still contains an unread match.
  if (current_match_index_ < num_matches_) {
    return &register_array_[current_match_index_ * registers_per_match_];
  }

  // Current batch of results exhausted.
  // Fail if last batch was not even fully filled.
  if (num_matches_ < max_matches_) {
    num_matches_ = 0;  // Signal failed match.
    return nullptr;
  }

  const int32_t* const previous_match =
      &register_array_[(current_match_index_ - 1) * registers_per_match_];
  int resume_index = previous_match[1];

  const bool is_atom = regexp_->TypeTag() == JSRegExp::ATOM;
  if (!is_atom) {
    const int previous_start = previous_match[0];
    if (previous_start == resume_index) {
      // Zero-length match. Advance by one code point.
      resume_index = AdvanceZeroLength(resume_index);
    }
    if (resume_index > subject_->length()) {
      num_matches_ = 0;  // Signal failed match.
      return nullptr;
    }
  }

  // Refill the batch with the engine that fits the regexp type.
  if (is_atom) {
    num_matches_ =
        RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, resume_index,
                                register_array_, register_array_size_);
  } else {
    num_matches_ = RegExpImpl::IrregexpExecRaw(
        isolate_, regexp_, subject_, resume_index, register_array_,
        register_array_size_);
  }

  if (num_matches_ <= 0) return nullptr;
  current_match_index_ = 0;
  return register_array_;
}
// Returns a pointer to the registers of the match at current_match_index_,
// or of the one before it when the last fetch failed (num_matches_ == 0).
int32_t* RegExpImpl::GlobalCache::LastSuccessfulMatch() {
  // After a failed match we shift back by one result.
  const int step_back = (num_matches_ == 0) ? registers_per_match_ : 0;
  const int offset = current_match_index_ * registers_per_match_ - step_back;
  return &register_array_[offset];
}
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_JSREGEXP_INL_H_
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "src/codegen/compilation-cache.h" #include "src/codegen/compilation-cache.h"
#include "src/heap/heap-inl.h" #include "src/heap/heap-inl.h"
#include "src/regexp/jsregexp-inl.h" #include "src/objects/js-regexp-inl.h"
#include "src/regexp/regexp-compiler.h" #include "src/regexp/regexp-compiler.h"
#include "src/regexp/regexp-dotprinter.h" #include "src/regexp/regexp-dotprinter.h"
#include "src/regexp/regexp-interpreter.h" #include "src/regexp/regexp-interpreter.h"
...@@ -290,28 +290,22 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, ...@@ -290,28 +290,22 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
USE(ThrowRegExpException(isolate, re, pattern, compile_data.error)); USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
return false; return false;
} }
RegExpEngine::CompilationResult result = const bool compilation_succeeded =
RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern, Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
sample_subject, is_one_byte); is_one_byte);
if (result.error_message != nullptr) { if (!compilation_succeeded) {
// Unable to compile regexp. DCHECK(!compile_data.error.is_null());
if (FLAG_correctness_fuzzer_suppressions && ThrowRegExpException(isolate, re, compile_data.error);
strncmp(result.error_message, "Stack overflow", 15) == 0) {
FATAL("Aborting on stack overflow");
}
Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
CStrVector(result.error_message)).ToHandleChecked();
ThrowRegExpException(isolate, re, error_message);
return false; return false;
} }
Handle<FixedArray> data = Handle<FixedArray> data =
Handle<FixedArray>(FixedArray::cast(re->data()), isolate); Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
data->set(JSRegExp::code_index(is_one_byte), result.code); data->set(JSRegExp::code_index(is_one_byte), compile_data.code);
SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
int register_max = IrregexpMaxRegisterCount(*data); int register_max = IrregexpMaxRegisterCount(*data);
if (result.num_registers > register_max) { if (compile_data.register_count > register_max) {
SetIrregexpMaxRegisterCount(*data, result.num_registers); SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
} }
return true; return true;
...@@ -552,88 +546,59 @@ Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo( ...@@ -552,88 +546,59 @@ Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
return result; return result;
} }
RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, void RegExpImpl::DotPrintForTesting(const char* label, RegExpNode* node,
Handle<String> subject, Isolate* isolate) bool ignore_case) {
: register_array_(nullptr), DotPrinter::DotPrint(label, node, ignore_case);
register_array_size_(0), }
regexp_(regexp),
subject_(subject),
isolate_(isolate) {
bool interpreted = FLAG_regexp_interpret_all;
if (regexp_->TypeTag() == JSRegExp::ATOM) { namespace {
static const int kAtomRegistersPerMatch = 2;
registers_per_match_ = kAtomRegistersPerMatch;
// There is no distinction between interpreted and native for atom regexps.
interpreted = false;
} else {
registers_per_match_ =
RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
if (registers_per_match_ < 0) {
num_matches_ = -1; // Signal exception.
return;
}
}
DCHECK(IsGlobal(regexp->GetFlags())); // Returns true if we've either generated too much irregex code within this
if (!interpreted) { // isolate, or the pattern string is too long.
register_array_size_ = bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); // Limit the space regexps take up on the heap. In order to limit this we
max_matches_ = register_array_size_ / registers_per_match_; // would like to keep track of the amount of regexp code on the heap. This
} else { // is not tracked, however. As a conservative approximation we track the
// Global loop in interpreted regexp is not implemented. We choose // total regexp code compiled including code that has subsequently been freed
// the size of the offsets vector so that it can only store one match. // and the total executable memory at any point.
register_array_size_ = registers_per_match_; static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
max_matches_ = 1; static constexpr size_t kRegExpCompiledLimit = 1 * MB;
}
if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { Heap* heap = isolate->heap();
register_array_ = NewArray<int32_t>(register_array_size_); if (pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize) return true;
} else { return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
register_array_ = isolate->jsregexp_static_offsets_vector(); heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
}
// Set state so that fetching the results the first time triggers a call
// to the compiled regexp.
current_match_index_ = max_matches_ - 1;
num_matches_ = max_matches_;
DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
DCHECK_GE(register_array_size_, registers_per_match_);
int32_t* last_match =
&register_array_[current_match_index_ * registers_per_match_];
last_match[0] = -1;
last_match[1] = 0;
} }
int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { } // namespace
if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
// Advance over the surrogate pair.
return last_index + 2;
}
return last_index + 1;
}
void RegExpEngine::DotPrint(const char* label, RegExpNode* node, bool RegExpImpl::CompileForTesting(Isolate* isolate, Zone* zone,
bool ignore_case) { RegExpCompileData* data,
DotPrinter::DotPrint(label, node, ignore_case); JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte) {
return Compile(isolate, zone, data, flags, pattern, sample_subject,
is_one_byte);
} }
RegExpEngine::CompilationResult RegExpEngine::Compile( bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
Isolate* isolate, Zone* zone, RegExpCompileData* data, JSRegExp::Flags flags, Handle<String> pattern,
JSRegExp::Flags flags, Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte) {
Handle<String> sample_subject, bool is_one_byte) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
return CompilationResult::RegExpTooBig(); data->error =
isolate->factory()->NewStringFromAsciiChecked("RegExp too big");
return false;
} }
bool is_sticky = IsSticky(flags); bool is_sticky = IsSticky(flags);
bool is_global = IsGlobal(flags); bool is_global = IsGlobal(flags);
bool is_unicode = IsUnicode(flags); bool is_unicode = IsUnicode(flags);
RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte); RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
if (compiler.optimize()) if (compiler.optimize()) {
compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern)); compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
}
// Sample some characters from the middle of the string. // Sample some characters from the middle of the string.
static const int kSampleSize = 128; static const int kSampleSize = 128;
...@@ -693,7 +658,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ...@@ -693,7 +658,9 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
Analysis analysis(isolate, is_one_byte); Analysis analysis(isolate, is_one_byte);
analysis.EnsureAnalyzed(node); analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) { if (analysis.has_failed()) {
return CompilationResult(analysis.error_message()); data->error =
isolate->factory()->NewStringFromAsciiChecked(analysis.error_message());
return false;
} }
// Create the correct assembler for the architecture. // Create the correct assembler for the architecture.
...@@ -763,20 +730,140 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( ...@@ -763,20 +730,140 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
RegExpCompiler::CompilationResult result = compiler.Assemble( RegExpCompiler::CompilationResult result = compiler.Assemble(
isolate, macro_assembler.get(), node, data->capture_count, pattern); isolate, macro_assembler.get(), node, data->capture_count, pattern);
return RegExpEngine::CompilationResult(result.error_message, result.code, if (FLAG_correctness_fuzzer_suppressions &&
result.num_registers); strncmp(result.error_message, "Stack overflow", 15) == 0) {
FATAL("Aborting on stack overflow");
}
if (result.error_message != nullptr) {
data->error =
isolate->factory()->NewStringFromAsciiChecked(result.error_message);
}
data->code = result.code;
data->register_count = result.num_registers;
return result.Succeeded();
} }
bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) { RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
Heap* heap = isolate->heap(); Handle<String> subject, Isolate* isolate)
bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; : register_array_(nullptr),
if (isolate->total_regexp_code_generated() > register_array_size_(0),
RegExpImpl::kRegExpCompiledLimit && regexp_(regexp),
heap->CommittedMemoryExecutable() > subject_(subject),
RegExpImpl::kRegExpExecutableMemoryLimit) { isolate_(isolate) {
too_much = true; bool interpreted = FLAG_regexp_interpret_all;
}
return too_much; if (regexp_->TypeTag() == JSRegExp::ATOM) {
static const int kAtomRegistersPerMatch = 2;
registers_per_match_ = kAtomRegistersPerMatch;
// There is no distinction between interpreted and native for atom regexps.
interpreted = false;
} else {
registers_per_match_ =
RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
if (registers_per_match_ < 0) {
num_matches_ = -1; // Signal exception.
return;
}
}
DCHECK(IsGlobal(regexp->GetFlags()));
if (!interpreted) {
register_array_size_ =
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
max_matches_ = register_array_size_ / registers_per_match_;
} else {
// Global loop in interpreted regexp is not implemented. We choose
// the size of the offsets vector so that it can only store one match.
register_array_size_ = registers_per_match_;
max_matches_ = 1;
}
if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
register_array_ = NewArray<int32_t>(register_array_size_);
} else {
register_array_ = isolate->jsregexp_static_offsets_vector();
}
// Set state so that fetching the results the first time triggers a call
// to the compiled regexp.
current_match_index_ = max_matches_ - 1;
num_matches_ = max_matches_;
DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
DCHECK_GE(register_array_size_, registers_per_match_);
int32_t* last_match =
&register_array_[current_match_index_ * registers_per_match_];
last_match[0] = -1;
last_match[1] = 0;
}
// Frees the register array, but only when it is larger than the isolate's
// static offsets vector, i.e. when the constructor had to allocate it rather
// than borrow the existing jsregexp_static_offsets_vector.
RegExpGlobalCache::~RegExpGlobalCache() {
  const bool borrowed_static_vector =
      register_array_size_ <= Isolate::kJSRegexpStaticOffsetsVectorSize;
  if (borrowed_static_vector) return;
  DeleteArray(register_array_);
}
// Computes the index at which to resume matching after a zero-length match at
// last_index. Normally this is last_index + 1; for unicode regexps it is
// last_index + 2 when a lead/trail surrogate pair starts at last_index.
int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
  const int next_index = last_index + 1;

  if (!IsUnicode(regexp_->GetFlags())) return next_index;
  // A pair needs two code units; bail out if only one is left.
  if (next_index >= subject_->length()) return next_index;

  const bool at_surrogate_pair =
      unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
      unibrow::Utf16::IsTrailSurrogate(subject_->Get(next_index));
  // Advance over the surrogate pair.
  return at_surrogate_pair ? last_index + 2 : next_index;
}
// Advances to the next match and returns a pointer to its registers, or
// nullptr when no further match is available. Matches are served from the
// batch currently held in register_array_; once that batch is used up the
// regexp is executed again, resuming at the end index of the last match.
int32_t* RegExpGlobalCache::FetchNext() {
  ++current_match_index_;

  // Fast path: the current batch still contains an unread match.
  if (current_match_index_ < num_matches_) {
    return &register_array_[current_match_index_ * registers_per_match_];
  }

  // Current batch of results exhausted.
  // Fail if last batch was not even fully filled.
  if (num_matches_ < max_matches_) {
    num_matches_ = 0;  // Signal failed match.
    return nullptr;
  }

  const int32_t* const previous_match =
      &register_array_[(current_match_index_ - 1) * registers_per_match_];
  int resume_index = previous_match[1];

  const bool is_atom = regexp_->TypeTag() == JSRegExp::ATOM;
  if (!is_atom) {
    const int previous_start = previous_match[0];
    if (previous_start == resume_index) {
      // Zero-length match. Advance by one code point.
      resume_index = AdvanceZeroLength(resume_index);
    }
    if (resume_index > subject_->length()) {
      num_matches_ = 0;  // Signal failed match.
      return nullptr;
    }
  }

  // Refill the batch with the engine that fits the regexp type.
  if (is_atom) {
    num_matches_ =
        RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, resume_index,
                                register_array_, register_array_size_);
  } else {
    num_matches_ = RegExpImpl::IrregexpExecRaw(
        isolate_, regexp_, subject_, resume_index, register_array_,
        register_array_size_);
  }

  if (num_matches_ <= 0) return nullptr;
  current_match_index_ = 0;
  return register_array_;
}
// Returns a pointer into register_array_ at the registers of the match at
// current_match_index_, or of the match before it when num_matches_ is 0
// (the value FetchNext stores to signal a failed match).
int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
// Offset of the current match: each match occupies registers_per_match_ slots.
int index = current_match_index_ * registers_per_match_;
if (num_matches_ == 0) {
// After a failed match we shift back by one result.
index -= registers_per_match_;
}
return &register_array_[index];
} }
Object RegExpResultsCache::Lookup(Heap* heap, String key_string, Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
......
...@@ -13,46 +13,46 @@ namespace internal { ...@@ -13,46 +13,46 @@ namespace internal {
class RegExpNode; class RegExpNode;
class RegExpTree; class RegExpTree;
inline bool IgnoreCase(JSRegExp::Flags flags) { // TODO(jgruber): Consider splitting between ParseData and CompileData.
return (flags & JSRegExp::kIgnoreCase) != 0; struct RegExpCompileData {
} // The parsed AST as produced by the RegExpParser.
RegExpTree* tree = nullptr;
// The compiled Node graph as produced by RegExpTree::ToNode methods.
RegExpNode* node = nullptr;
// The generated code as produced by the compiler. Either a Code object (for
// irregexp native code) or a ByteArray (for irregexp bytecode).
Object code;
inline bool IsUnicode(JSRegExp::Flags flags) { // True, iff the pattern is a 'simple' atom with zero captures. In other
return (flags & JSRegExp::kUnicode) != 0; // words, the pattern consists of a string with no metacharacters and special
} // regexp features, and can be implemented as a standard string search.
bool simple = true;
inline bool IsSticky(JSRegExp::Flags flags) { // True, iff the pattern is anchored at the start of the string with '^'.
return (flags & JSRegExp::kSticky) != 0; bool contains_anchor = false;
}
inline bool IsGlobal(JSRegExp::Flags flags) { // Only use if the pattern contains named captures. If so, this contains a
return (flags & JSRegExp::kGlobal) != 0; // mapping of capture names to capture indices.
} Handle<FixedArray> capture_name_map;
inline bool DotAll(JSRegExp::Flags flags) { // The error message. Only used if an error occurred during parsing or
return (flags & JSRegExp::kDotAll) != 0; // compilation.
} Handle<String> error;
inline bool Multiline(JSRegExp::Flags flags) { // The number of capture groups, without the global capture \0.
return (flags & JSRegExp::kMultiline) != 0; int capture_count = 0;
}
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { // The number of registers used by the generated code.
// Both unicode and ignore_case flags are set. We need to use ICU to find int register_count = 0;
// the closure over case equivalents. };
return IsUnicode(flags) && IgnoreCase(flags);
}
class RegExpImpl final { class RegExpImpl final : public AllStatic {
public: public:
// Whether the irregexp engine generates native code or interpreter bytecode. // Whether the irregexp engine generates native code or interpreter bytecode.
static bool UsesNativeRegExp() { return !FLAG_regexp_interpret_all; } static bool UsesNativeRegExp() { return !FLAG_regexp_interpret_all; }
// Returns a string representation of a regular expression.
// Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
// This function calls the garbage collector if necessary.
static Handle<String> ToString(Handle<Object> value);
// Parses the RegExp pattern and prepares the JSRegExp object with // Parses the RegExp pattern and prepares the JSRegExp object with
// generic data and choice of implementation - as well as what // generic data and choice of implementation - as well as what
// the implementation wants to store in the data field. // the implementation wants to store in the data field.
...@@ -67,6 +67,43 @@ class RegExpImpl final { ...@@ -67,6 +67,43 @@ class RegExpImpl final {
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info); int index, Handle<RegExpMatchInfo> last_match_info);
enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject);
// Set last match info. If match is nullptr, then setting captures is
// omitted.
static Handle<RegExpMatchInfo> SetLastMatchInfo(
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
Handle<String> subject, int capture_count, int32_t* match);
V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
RegExpCompileData* input,
JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte);
V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
RegExpNode* node,
bool ignore_case);
static const int kRegExpTooLargeToOptimize = 20 * KB;
private:
// Returns a string representation of a regular expression.
// Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
// This function calls the garbage collector if necessary.
static Handle<String> ToString(Handle<Object> value);
// Prepares a JSRegExp object with Irregexp-specific data. // Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags, Handle<String> pattern, JSRegExp::Flags flags,
...@@ -84,18 +121,6 @@ class RegExpImpl final { ...@@ -84,18 +121,6 @@ class RegExpImpl final {
Handle<String> subject, int index, Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info); Handle<RegExpMatchInfo> last_match_info);
enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject);
// Execute a regular expression on the subject, starting from index. // Execute a regular expression on the subject, starting from index.
// If matching succeeds, return the number of matches. This can be larger // If matching succeeds, return the number of matches. This can be larger
// than one in the case of global regular expressions. // than one in the case of global regular expressions.
...@@ -114,44 +139,17 @@ class RegExpImpl final { ...@@ -114,44 +139,17 @@ class RegExpImpl final {
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info); int index, Handle<RegExpMatchInfo> last_match_info);
// Set last match info. If match is nullptr, then setting captures is static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
// omitted. Handle<String> sample_subject, bool is_one_byte);
static Handle<RegExpMatchInfo> SetLastMatchInfo( static inline bool EnsureCompiledIrregexp(Isolate* isolate,
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, Handle<JSRegExp> re,
Handle<String> subject, int capture_count, int32_t* match); Handle<String> sample_subject,
bool is_one_byte);
class GlobalCache {
public:
GlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject,
Isolate* isolate);
V8_INLINE ~GlobalCache();
// Fetch the next entry in the cache for global regexp match results.
// This does not set the last match info. Upon failure, nullptr is
// returned. The cause can be checked with Result(). The previous result is
// still in available in memory when a failure happens.
V8_INLINE int32_t* FetchNext();
V8_INLINE int32_t* LastSuccessfulMatch();
V8_INLINE bool HasException() { return num_matches_ < 0; }
private:
int AdvanceZeroLength(int last_index);
int num_matches_; // Returns true on success, false on failure.
int max_matches_; static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
int current_match_index_; JSRegExp::Flags flags, Handle<String> pattern,
int registers_per_match_; Handle<String> sample_subject, bool is_one_byte);
// Pointer to the last set of captures.
int32_t* register_array_;
int register_array_size_;
Handle<JSRegExp> regexp_;
Handle<String> subject_;
Isolate* isolate_;
};
// For acting on the JSRegExp data FixedArray. // For acting on the JSRegExp data FixedArray.
static int IrregexpMaxRegisterCount(FixedArray re); static int IrregexpMaxRegisterCount(FixedArray re);
...@@ -163,68 +161,47 @@ class RegExpImpl final { ...@@ -163,68 +161,47 @@ class RegExpImpl final {
static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte); static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
static Code IrregexpNativeCode(FixedArray re, bool is_one_byte); static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
// Limit the space regexps take up on the heap. In order to limit this we friend class RegExpGlobalCache;
// would like to keep track of the amount of regexp code on the heap. This
// is not tracked, however. As a conservative approximation we track the
// total regexp code compiled including code that has subsequently been freed
// and the total executable memory at any point.
static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
static const size_t kRegExpCompiledLimit = 1 * MB;
static const int kRegExpTooLargeToOptimize = 20 * KB;
private:
static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject, bool is_one_byte);
static inline bool EnsureCompiledIrregexp(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> sample_subject,
bool is_one_byte);
};
struct RegExpCompileData {
RegExpCompileData()
: tree(nullptr),
node(nullptr),
simple(true),
contains_anchor(false),
capture_count(0) {}
RegExpTree* tree;
RegExpNode* node;
bool simple;
bool contains_anchor;
Handle<FixedArray> capture_name_map;
Handle<String> error;
int capture_count;
}; };
class RegExpEngine final : public AllStatic { // Uses a special global mode of irregexp-generated code to perform a global
// search and return multiple results at once. As such, this is essentially an
// iterator over multiple results (retrieved batch-wise in advance).
class RegExpGlobalCache final {
public: public:
struct CompilationResult { RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
explicit CompilationResult(const char* error_message) Isolate* isolate);
: error_message(error_message) {}
CompilationResult(const char* error_message, Object code, int registers) ~RegExpGlobalCache();
: error_message(error_message), code(code), num_registers(registers) {}
// Fetch the next entry in the cache for global regexp match results.
static CompilationResult RegExpTooBig() { // This does not set the last match info. Upon failure, nullptr is
return CompilationResult("RegExp too big"); // returned. The cause can be checked with Result(). The previous result is
} // still in available in memory when a failure happens.
int32_t* FetchNext();
const char* const error_message = nullptr;
Object const code; int32_t* LastSuccessfulMatch();
int const num_registers = 0;
}; bool HasException() { return num_matches_ < 0; }
V8_EXPORT_PRIVATE static CompilationResult Compile( private:
Isolate* isolate, Zone* zone, RegExpCompileData* input, int AdvanceZeroLength(int last_index);
JSRegExp::Flags flags, Handle<String> pattern,
Handle<String> sample_subject, bool is_one_byte); int num_matches_;
int max_matches_;
static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern); int current_match_index_;
int registers_per_match_;
V8_EXPORT_PRIVATE static void DotPrint(const char* label, RegExpNode* node, // Pointer to the last set of captures.
bool ignore_case); int32_t* register_array_;
int register_array_size_;
Handle<JSRegExp> regexp_;
Handle<String> subject_;
Isolate* isolate_;
}; };
// Caches results for specific regexp queries on the isolate. At the time of
// writing, this is used during global calls to RegExp.prototype.exec and
// @@split.
class RegExpResultsCache final : public AllStatic { class RegExpResultsCache final : public AllStatic {
public: public:
enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS }; enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
...@@ -239,14 +216,15 @@ class RegExpResultsCache final : public AllStatic { ...@@ -239,14 +216,15 @@ class RegExpResultsCache final : public AllStatic {
Handle<Object> key_pattern, Handle<FixedArray> value_array, Handle<Object> key_pattern, Handle<FixedArray> value_array,
Handle<FixedArray> last_match_cache, ResultsCacheType type); Handle<FixedArray> last_match_cache, ResultsCacheType type);
static void Clear(FixedArray cache); static void Clear(FixedArray cache);
static const int kRegExpResultsCacheSize = 0x100;
static constexpr int kRegExpResultsCacheSize = 0x100;
private: private:
static const int kArrayEntriesPerCacheEntry = 4; static constexpr int kStringOffset = 0;
static const int kStringOffset = 0; static constexpr int kPatternOffset = 1;
static const int kPatternOffset = 1; static constexpr int kArrayOffset = 2;
static const int kArrayOffset = 2; static constexpr int kLastMatchOffset = 3;
static const int kLastMatchOffset = 3; static constexpr int kArrayEntriesPerCacheEntry = 4;
}; };
} // namespace internal } // namespace internal
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
#include "src/diagnostics/code-tracer.h" #include "src/diagnostics/code-tracer.h"
#include "src/execution/isolate.h" #include "src/execution/isolate.h"
#include "src/objects/objects-inl.h" #include "src/objects/objects-inl.h"
#include "src/regexp/jsregexp.h" // TODO(jgruber): Only needed for IgnoreCase.
#include "src/regexp/regexp-macro-assembler-arch.h" #include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-macro-assembler-tracer.h" #include "src/regexp/regexp-macro-assembler-tracer.h"
#include "src/strings/unicode-inl.h" #include "src/strings/unicode-inl.h"
......
...@@ -45,6 +45,36 @@ constexpr int kPatternTooShortForBoyerMoore = 2; ...@@ -45,6 +45,36 @@ constexpr int kPatternTooShortForBoyerMoore = 2;
} // namespace regexp_compiler_constants } // namespace regexp_compiler_constants
// True iff the kIgnoreCase bit is set in |flags|.
inline bool IgnoreCase(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kIgnoreCase;
  return masked != 0;
}
// True iff the kUnicode bit is set in |flags|.
inline bool IsUnicode(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kUnicode;
  return masked != 0;
}
// True iff the kSticky bit is set in |flags|.
inline bool IsSticky(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kSticky;
  return masked != 0;
}
// True iff the kGlobal bit is set in |flags|.
inline bool IsGlobal(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kGlobal;
  return masked != 0;
}
// True iff the kDotAll bit is set in |flags|.
inline bool DotAll(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kDotAll;
  return masked != 0;
}
// True iff the kMultiline bit is set in |flags|.
inline bool Multiline(JSRegExp::Flags flags) {
  const auto masked = flags & JSRegExp::kMultiline;
  return masked != 0;
}
// True iff both the unicode and ignore_case flags are set. In that case we
// need to use ICU to find the closure over case equivalents.
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
  if (!IsUnicode(flags)) return false;
  return IgnoreCase(flags);
}
// A set of unsigned integers that behaves especially well on small // A set of unsigned integers that behaves especially well on small
// integers (< 32). May do zone-allocation. // integers (< 32). May do zone-allocation.
class OutSet : public ZoneObject { class OutSet : public ZoneObject {
...@@ -645,6 +675,8 @@ class RegExpCompiler { ...@@ -645,6 +675,8 @@ class RegExpCompiler {
return CompilationResult("RegExp too big"); return CompilationResult("RegExp too big");
} }
// Returns true iff no error message is set, i.e. error_message is null.
bool Succeeded() const {
  const bool has_error = error_message != nullptr;
  return !has_error;
}
const char* const error_message = nullptr; const char* const error_message = nullptr;
Object code; Object code;
int num_registers = 0; int num_registers = 0;
......
...@@ -11,7 +11,8 @@ ...@@ -11,7 +11,8 @@
#include "src/logging/counters.h" #include "src/logging/counters.h"
#include "src/numbers/conversions-inl.h" #include "src/numbers/conversions-inl.h"
#include "src/objects/js-array-inl.h" #include "src/objects/js-array-inl.h"
#include "src/regexp/jsregexp-inl.h" #include "src/objects/js-regexp-inl.h"
#include "src/regexp/jsregexp.h"
#include "src/regexp/regexp-utils.h" #include "src/regexp/regexp-utils.h"
#include "src/runtime/runtime-utils.h" #include "src/runtime/runtime-utils.h"
#include "src/strings/string-builder-inl.h" #include "src/strings/string-builder-inl.h"
...@@ -638,7 +639,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString( ...@@ -638,7 +639,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithString(
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, isolate); RegExpGlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception(); if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception();
int32_t* current_match = global_cache.FetchNext(); int32_t* current_match = global_cache.FetchNext();
...@@ -703,7 +704,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithEmptyString( ...@@ -703,7 +704,7 @@ V8_WARN_UNUSED_RESULT static Object StringReplaceGlobalRegExpWithEmptyString(
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, isolate); RegExpGlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception(); if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception();
int32_t* current_match = global_cache.FetchNext(); int32_t* current_match = global_cache.FetchNext();
...@@ -1115,7 +1116,7 @@ static Object SearchRegExpMultiple(Isolate* isolate, Handle<String> subject, ...@@ -1115,7 +1116,7 @@ static Object SearchRegExpMultiple(Isolate* isolate, Handle<String> subject,
} }
} }
RegExpImpl::GlobalCache global_cache(regexp, subject, isolate); RegExpGlobalCache global_cache(regexp, subject, isolate);
if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception(); if (global_cache.HasException()) return ReadOnlyRoots(isolate).exception();
// Ensured in Runtime_RegExpExecMultiple. // Ensured in Runtime_RegExpExecMultiple.
......
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
#include "src/objects/objects-inl.h" #include "src/objects/objects-inl.h"
#include "src/objects/slots.h" #include "src/objects/slots.h"
#include "src/objects/smi.h" #include "src/objects/smi.h"
#include "src/regexp/jsregexp-inl.h"
#include "src/regexp/regexp-utils.h" #include "src/regexp/regexp-utils.h"
#include "src/runtime/runtime-utils.h" #include "src/runtime/runtime-utils.h"
#include "src/strings/string-builder-inl.h" #include "src/strings/string-builder-inl.h"
......
...@@ -547,8 +547,8 @@ static RegExpNode* Compile(const char* input, bool multiline, bool unicode, ...@@ -547,8 +547,8 @@ static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
.ToHandleChecked(); .ToHandleChecked();
Handle<String> sample_subject = Handle<String> sample_subject =
isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked(); isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
RegExpEngine::Compile(isolate, zone, &compile_data, flags, pattern, RegExpImpl::CompileForTesting(isolate, zone, &compile_data, flags, pattern,
sample_subject, is_one_byte); sample_subject, is_one_byte);
return compile_data.node; return compile_data.node;
} }
...@@ -561,7 +561,7 @@ static void Execute(const char* input, bool multiline, bool unicode, ...@@ -561,7 +561,7 @@ static void Execute(const char* input, bool multiline, bool unicode,
USE(node); USE(node);
#ifdef DEBUG #ifdef DEBUG
if (dot_output) { if (dot_output) {
RegExpEngine::DotPrint(input, node, false); RegExpImpl::DotPrintForTesting(input, node, false);
} }
#endif // DEBUG #endif // DEBUG
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment