Commit 58ac66b7, authored by Iain Ireland, committed by Commit Bot

[regexp] Factor out PreprocessRegExp

RegExpImpl::Compile does a number of transformations that require
directly manipulating the internal representation of the regexp. For
example, when matching a (non-sticky, non-anchored) regular
expression, the pattern must be wrapped in .* so that it can match
anywhere in the input.

In the interest of moving towards a cleaner division between irregexp
and the outside world, it makes sense to move this code into
RegExpCompiler.

R=jgruber@chromium.org

Bug: v8:10406
Change-Id: I6da251c91c0016914a51480f80bb46c337fd0b23
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2140246
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67262}
parent 6d392516
......@@ -3804,26 +3804,24 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
if (initial_offset == 0) set_bm_info(not_at_start, bm);
}
// static
RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
RegExpCompiler* compiler, RegExpNode* on_success, JSRegExp::Flags flags) {
DCHECK(!compiler->read_backward());
Zone* zone = compiler->zone();
RegExpNode* on_success, JSRegExp::Flags flags) {
DCHECK(!read_backward());
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
ChoiceNode* optional_step_back = new (zone()) ChoiceNode(2, zone());
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
int stack_register = UnicodeLookaroundStackRegister();
int position_register = UnicodeLookaroundPositionRegister();
RegExpNode* step_back = TextNode::CreateForCharacterRanges(
zone, lead_surrogates, true, on_success, flags);
zone(), lead_surrogates, true, on_success, flags);
RegExpLookaround::Builder builder(true, step_back, stack_register,
position_register);
RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
zone, trail_surrogates, false, builder.on_match_success(), flags);
zone(), trail_surrogates, false, builder.on_match_success(), flags);
optional_step_back->AddAlternative(
GuardedAlternative(builder.ForMatch(match_trail)));
......@@ -3832,5 +3830,49 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
return optional_step_back;
}
// Produces the node graph that is handed on to analysis and assembly.
//
// |data| supplies the parsed tree (data->tree) and the contains_anchor bit,
// |flags| are the regexp flags, and |is_one_byte| selects the one-byte
// filtering path. The returned node is never null: if filtering leaves
// nothing behind, a backtracking EndNode is returned instead.
RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
                                             JSRegExp::Flags flags,
                                             bool is_one_byte) {
  // Capture #0 encloses the whole body of the regexp.
  RegExpNode* const body =
      RegExpCapture::ToNode(data->tree, 0, this, accept());
  RegExpNode* result = body;

  // A pattern that is neither anchored at the start nor sticky gets a .*?
  // placed in front of it, outside of the body capture.
  const bool needs_leading_loop =
      !(data->tree->IsAnchoredAtStart() || IsSticky(flags));
  if (needs_leading_loop) {
    JSRegExp::Flags no_flags = JSRegExp::Flags();
    RegExpCharacterClass* loop_body =
        new (zone()) RegExpCharacterClass('*', no_flags);
    RegExpNode* const skip_loop = RegExpQuantifier::ToNode(
        0, RegExpTree::kInfinity, false, loop_body, this, body,
        data->contains_anchor);
    result = skip_loop;

    if (data->contains_anchor) {
      // Peel off one iteration of the loop so that a match beginning at the
      // start of input is handled: try the body directly, otherwise consume
      // one character and fall into the loop.
      ChoiceNode* const unrolled = new (zone()) ChoiceNode(2, zone());
      unrolled->AddAlternative(GuardedAlternative(body));
      unrolled->AddAlternative(GuardedAlternative(new (zone()) TextNode(
          new (zone()) RegExpCharacterClass('*', no_flags), false,
          skip_loop)));
      result = unrolled;
    }
  }

  if (is_one_byte) {
    result = result->FilterOneByte(RegExpCompiler::kMaxRecursion);
    // A second pass propagates the replacement nodes to spots that could not
    // receive them the first time because they had not been computed yet.
    if (result != nullptr) {
      result = result->FilterOneByte(RegExpCompiler::kMaxRecursion);
    }
  } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
    result = OptionallyStepBackToLeadSurrogate(result, flags);
  }

  // Nothing survived: hand back a node that simply backtracks.
  if (result == nullptr) {
    return new (zone()) EndNode(EndNode::BACKTRACK, zone());
  }
  return result;
}
} // namespace internal
} // namespace v8
......@@ -519,10 +519,18 @@ class RegExpCompiler {
RegExpNode* start, int capture_count,
Handle<String> pattern);
// Preprocessing is the final step of node creation before analysis
// and assembly. It includes:
// - Wrapping the body of the regexp in capture 0.
// - Inserting the implicit .*? before the regexp if necessary.
// - If the input is a one-byte string, filtering out nodes that can't match.
// - Fixing up regexp matches that start within a surrogate pair.
RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags,
bool is_one_byte);
// If the regexp matching starts within a surrogate pair, step back to the
// lead surrogate and start matching from there.
static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success,
JSRegExp::Flags flags);
inline void AddWork(RegExpNode* node) {
......
......@@ -751,9 +751,6 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
return false;
}
bool is_sticky = IsSticky(flags);
bool is_global = IsGlobal(flags);
bool is_unicode = IsUnicode(flags);
RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
if (compiler.optimize()) {
......@@ -772,50 +769,8 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
}
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body =
RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
RegExpNode* node = captured_body;
bool is_end_anchored = data->tree->IsAnchoredAtEnd();
bool is_start_anchored = data->tree->IsAnchoredAtStart();
int max_length = data->tree->max_match();
if (!is_start_anchored && !is_sticky) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
JSRegExp::Flags default_flags = JSRegExp::Flags();
RegExpNode* loop_node = RegExpQuantifier::ToNode(
0, RegExpTree::kInfinity, false,
new (zone) RegExpCharacterClass('*', default_flags), &compiler,
captured_body, data->contains_anchor);
if (data->contains_anchor) {
// Unroll loop once, to take care of the case that might start
// at the start of input.
ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
new (zone) RegExpCharacterClass('*', default_flags), false,
loop_node)));
node = first_step_node;
} else {
node = loop_node;
}
}
if (is_one_byte) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
}
} else if (is_unicode && (is_global || is_sticky)) {
node = RegExpCompiler::OptionallyStepBackToLeadSurrogate(&compiler, node,
flags);
}
if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
data->error = AnalyzeRegExp(isolate, is_one_byte, node);
data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
data->error = AnalyzeRegExp(isolate, is_one_byte, data->node);
if (data->error != RegExpError::kNone) {
return false;
}
......@@ -868,17 +823,20 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
// Inserted here, instead of in Assembler, because it depends on information
// in the AST that isn't replicated in the Node structure.
bool is_end_anchored = data->tree->IsAnchoredAtEnd();
bool is_start_anchored = data->tree->IsAnchoredAtStart();
int max_length = data->tree->max_match();
static const int kMaxBacksearchLimit = 1024;
if (is_end_anchored && !is_start_anchored && !is_sticky &&
if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
max_length < kMaxBacksearchLimit) {
macro_assembler->SetCurrentPositionFromEnd(max_length);
}
if (is_global) {
if (IsGlobal(flags)) {
RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
if (data->tree->min_match() > 0) {
mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
} else if (is_unicode) {
} else if (IsUnicode(flags)) {
mode = RegExpMacroAssembler::GLOBAL_UNICODE;
}
macro_assembler->set_global_mode(mode);
......@@ -895,7 +853,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
#endif
RegExpCompiler::CompilationResult result = compiler.Assemble(
isolate, macro_assembler_ptr, node, data->capture_count, pattern);
isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
// Code / bytecode printing.
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment