Commit f46dec2e authored by Jakob Gruber, committed by V8 LUCI CQ

[regexp] Avoid heap allocations in the regexp parser

In follow-up work, the parser will be refactored to take the input as
raw char arrays instead of a FlatStringReader s.t. it can be reused by
the V8 parser (which has AstRawStrings instead of Strings).

Bug: v8:896
Change-Id: I0e0bda4b34bc23b8bc427ddf3f9516081c42bb8a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3099947
Reviewed-by: Patrick Thier <pthier@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76359}
parent 5cca428a
......@@ -92,7 +92,8 @@ base::Optional<CompilationResult> CompileImpl(Isolate* isolate,
CompilationResult result;
result.bytecode = VectorToByteArray(isolate, bytecode.ToVector());
result.capture_name_map = parse_result.capture_name_map;
result.capture_name_map =
RegExp::CreateCaptureNameMap(isolate, parse_result.named_captures);
return result;
}
......
......@@ -25,7 +25,8 @@ namespace v8 {
namespace internal {
RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
Isolate* isolate, Zone* zone)
Isolate* isolate, Zone* zone,
const DisallowGarbageCollection& no_gc)
: isolate_(isolate),
zone_(zone),
captures_(nullptr),
......@@ -914,52 +915,13 @@ RegExpCapture* RegExpParser::GetCapture(int index) {
return captures_->at(index - 1);
}
namespace {
// Comparison functor for RegExpCapture pointers: yields true iff the capture
// on the left has a smaller index() than the capture on the right. Both
// arguments are DCHECK'ed to be non-null. Used as the ordering predicate for
// std::sort over named captures.
struct RegExpCaptureIndexLess {
  bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
    DCHECK_NOT_NULL(lhs);
    DCHECK_NOT_NULL(rhs);
    const auto lhs_index = lhs->index();
    const auto rhs_index = rhs->index();
    return lhs_index < rhs_index;
  }
};
} // namespace
Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
ZoneVector<RegExpCapture*>* RegExpParser::GetNamedCaptures() const {
if (named_captures_ == nullptr || named_captures_->empty()) {
return Handle<FixedArray>();
return nullptr;
}
// Named captures are sorted by name (because the set is used to ensure
// name uniqueness). But the capture name map must be sorted by index.
ZoneVector<RegExpCapture*> sorted_named_captures(
return zone()->template New<ZoneVector<RegExpCapture*>>(
named_captures_->begin(), named_captures_->end(), zone());
std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
RegExpCaptureIndexLess{});
DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
Factory* factory = isolate()->factory();
int len = static_cast<int>(sorted_named_captures.size()) * 2;
Handle<FixedArray> array = factory->NewFixedArray(len);
int i = 0;
for (const auto& capture : sorted_named_captures) {
base::Vector<const base::uc16> capture_name(capture->name()->data(),
capture->name()->size());
// CSA code in ConstructNewResultFromMatchInfo requires these strings to be
// internalized so they can be used as property names in the 'exec' results.
Handle<String> name = factory->InternalizeString(capture_name);
array->set(i * 2, *name);
array->set(i * 2 + 1, Smi::FromInt(capture->index()));
i++;
}
DCHECK_EQ(i * 2, len);
return array;
}
bool RegExpParser::HasNamedCaptures() {
......@@ -1720,8 +1682,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
#undef CHECK_FAILED
bool RegExpParser::Parse(RegExpCompileData* result,
const DisallowGarbageCollection&) {
bool RegExpParser::Parse(RegExpCompileData* result) {
DCHECK(result != nullptr);
RegExpTree* tree = ParsePattern();
if (failed()) {
......@@ -1742,6 +1703,7 @@ bool RegExpParser::Parse(RegExpCompileData* result,
result->simple = tree->IsAtom() && simple() && capture_count == 0;
result->contains_anchor = contains_anchor();
result->capture_count = capture_count;
result->named_captures = GetNamedCaptures();
}
return !failed();
}
......@@ -1749,16 +1711,8 @@ bool RegExpParser::Parse(RegExpCompileData* result,
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
FlatStringReader* input, JSRegExp::Flags flags,
RegExpCompileData* result) {
RegExpParser parser(input, flags, isolate, zone);
bool success;
{
DisallowGarbageCollection no_gc;
success = parser.Parse(result, no_gc);
}
if (success) {
result->capture_name_map = parser.CreateCaptureNameMap();
}
return success;
DisallowGarbageCollection no_gc;
return RegExpParser{input, flags, isolate, zone, no_gc}.Parse(result);
}
bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
......@@ -1766,8 +1720,7 @@ bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
JSRegExp::Flags flags,
RegExpCompileData* result,
const DisallowGarbageCollection& no_gc) {
RegExpParser parser(input, flags, isolate, zone);
return parser.Parse(result, no_gc);
return RegExpParser{input, flags, isolate, zone, no_gc}.Parse(result);
}
RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
......
......@@ -155,9 +155,6 @@ class RegExpBuilder : public ZoneObject {
class V8_EXPORT_PRIVATE RegExpParser {
public:
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
......@@ -165,10 +162,13 @@ class V8_EXPORT_PRIVATE RegExpParser {
static bool VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
FlatStringReader* input, JSRegExp::Flags flags,
RegExpCompileData* result,
const DisallowGarbageCollection& nogc);
const DisallowGarbageCollection& no_gc);
private:
bool Parse(RegExpCompileData* result, const DisallowGarbageCollection&);
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
Zone* zone, const DisallowGarbageCollection& no_gc);
bool Parse(RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
......@@ -318,7 +318,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
// to avoid complicating cases in which references come before the capture.
void PatchNamedBackReferences();
Handle<FixedArray> CreateCaptureNameMap();
ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
......@@ -344,8 +344,9 @@ class V8_EXPORT_PRIVATE RegExpParser {
}
};
Isolate* isolate_;
Zone* zone_;
const DisallowGarbageCollection no_gc_;
Isolate* const isolate_;
Zone* const zone_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
......@@ -356,7 +357,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
// These are the flags specified outside the regexp syntax, i.e. after the
// terminating '/' or in the second argument to the constructor. The current
// flags are stored on the RegExpBuilder.
JSRegExp::Flags top_level_flags_;
const JSRegExp::Flags top_level_flags_;
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
......
......@@ -420,9 +420,9 @@ bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
}
#ifdef DEBUG
namespace {
#ifdef DEBUG
bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
Object entry = re->Code(is_one_byte);
Object bytecode = re->Bytecode(is_one_byte);
......@@ -448,9 +448,50 @@ bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
return true;
}
#endif
// Comparison functor for RegExpCapture pointers: yields true iff the capture
// on the left has a smaller index() than the capture on the right. Both
// arguments are DCHECK'ed to be non-null. Used as the ordering predicate for
// std::sort over named captures.
struct RegExpCaptureIndexLess {
  bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
    DCHECK_NOT_NULL(lhs);
    DCHECK_NOT_NULL(rhs);
    const auto lhs_index = lhs->index();
    const auto rhs_index = rhs->index();
    return lhs_index < rhs_index;
  }
};
} // namespace
#endif
// static
// Builds the capture name map for a regexp from its parsed named captures.
//
// |named_captures| may be nullptr, in which case a default-constructed
// Handle<FixedArray> is returned. Otherwise the vector is expected to be
// non-empty (DCHECK'ed) and is sorted IN PLACE by capture index, so the
// caller's vector is reordered as a side effect.
//
// The returned array has two slots per capture, laid out as
// [name_0, index_0, name_1, index_1, ...] in ascending capture-index order,
// where each name is an internalized string and each index is a Smi.
Handle<FixedArray> RegExp::CreateCaptureNameMap(
    Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) {
  if (named_captures == nullptr) return Handle<FixedArray>();

  DCHECK(!named_captures->empty());

  // The parser keeps named captures ordered by name (the set is used to
  // ensure name uniqueness), whereas the capture name map has to be ordered
  // by capture index.
  std::sort(named_captures->begin(), named_captures->end(),
            RegExpCaptureIndexLess{});

  auto* const factory = isolate->factory();
  const int len = static_cast<int>(named_captures->size()) * 2;
  Handle<FixedArray> array = factory->NewFixedArray(len);

  // |slot| walks the flat array: even slots hold names, odd slots hold the
  // matching capture indices.
  int slot = 0;
  for (const RegExpCapture* capture : *named_captures) {
    const auto& name_chars = *capture->name();
    base::Vector<const base::uc16> capture_name(name_chars.data(),
                                                name_chars.size());
    // CSA code in ConstructNewResultFromMatchInfo requires these strings to
    // be internalized so they can be used as property names in the 'exec'
    // results.
    Handle<String> name = factory->InternalizeString(capture_name);
    array->set(slot++, *name);
    array->set(slot++, Smi::FromInt(capture->index()));
  }
  DCHECK_EQ(slot, len);

  return array;
}
bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject,
......@@ -513,7 +554,9 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
data->set(JSRegExp::code_index(is_one_byte), ToCodeT(*trampoline));
}
re->SetCaptureNameMap(compile_data.capture_name_map);
Handle<FixedArray> capture_name_map =
RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures);
re->SetCaptureNameMap(capture_name_map);
int register_max = IrregexpMaxRegisterCount(*data);
if (compile_data.register_count > register_max) {
SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
......
......@@ -7,10 +7,12 @@
#include "src/objects/js-regexp.h"
#include "src/regexp/regexp-error.h"
#include "src/zone/zone-containers.h"
namespace v8 {
namespace internal {
class RegExpCapture;
class RegExpNode;
class RegExpTree;
......@@ -37,9 +39,9 @@ struct RegExpCompileData {
// True, iff the pattern is anchored at the start of the string with '^'.
bool contains_anchor = false;
// Only use if the pattern contains named captures. If so, this contains a
// mapping of capture names to capture indices.
Handle<FixedArray> capture_name_map;
// Only set if the pattern contains named captures.
// Note: the lifetime equals that of the parse/compile zone.
ZoneVector<RegExpCapture*>* named_captures = nullptr;
// The error message. Only used if an error occurred during parsing or
// compilation.
......@@ -152,6 +154,9 @@ class RegExp final : public AllStatic {
RegExpError error_text);
static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
static Handle<FixedArray> CreateCaptureNameMap(
Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
};
// Uses a special global mode of irregexp-generated code to perform a global
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment