Commit 10817205 authored by jgruber's avatar jgruber Committed by Commit Bot

[regexp] Limit ATOM regexps to patterns length <= 2

This is a modified reland of 062bb7d4

There's an inherent trade-off when deciding between ATOM and IRREGEXP
regexps: IRREGEXP is faster at runtime for all but trivial short
patterns, while ATOM regexps have a lower memory overhead.

This CL is intended to help investigate impact on benchmarks and real-world
code - if something tanks, it's easy to revert, otherwise it can be a first
step towards a possible removal of ATOM regexps.

Bug: v8:6633
Change-Id: I8d946a7cbb398d4987b47ecba24c9faa88788d0d
Reviewed-on: https://chromium-review.googlesource.com/599910Reviewed-by: 's avatarYang Guo <yangguo@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/master@{#47164}
parent ea0e1e21
......@@ -96,40 +96,12 @@ ContainedInLattice AddRange(ContainedInLattice containment,
return containment;
}
// Generic RegExp methods. Dispatches to implementation specific methods.
// More makes code generation slower, less makes V8 benchmark score lower.
const int kMaxLookaheadForBoyerMoore = 8;
// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
const int kPatternTooShortForBoyerMoore = 2;
// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
if (length <= kPatternTooShortForBoyerMoore) return false;
const int kMod = 128;
bool character_found[kMod];
int different = 0;
memset(&character_found[0], 0, sizeof(character_found));
for (int i = 0; i < length; i++) {
int ch = (pattern->Get(i) & (kMod - 1));
if (!character_found[ch]) {
character_found[ch] = true;
different++;
// We declare a regexp low-alphabet if it has at least 3 times as many
// characters as it has different characters.
if (different * 3 > length) return false;
}
}
return true;
}
// Generic RegExp methods. Dispatches to implementation specific methods.
MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags) {
......@@ -158,7 +130,8 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
bool has_been_compiled = false;
if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
!(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
!(flags & JSRegExp::kSticky) &&
pattern->length() <= kPatternTooShortForBoyerMoore) {
// Parse-tree is a single atom that is equal to the pattern.
AtomCompile(re, pattern, flags, pattern);
has_been_compiled = true;
......@@ -166,12 +139,11 @@ MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
!(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
RegExpAtom* atom = parse_result.tree->AsAtom();
Vector<const uc16> atom_pattern = atom->data();
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
isolate->factory()->NewStringFromTwoByte(atom_pattern),
Object);
if (!HasFewDifferentCharacters(atom_string)) {
if (atom_pattern.length() <= kPatternTooShortForBoyerMoore) {
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
AtomCompile(re, pattern, flags, atom_string);
has_been_compiled = true;
}
......@@ -3029,6 +3001,8 @@ static void EmitHat(RegExpCompiler* compiler,
on_success->Emit(compiler, &new_trace);
}
// More makes code generation slower, less makes V8 benchmark score lower.
const int kMaxLookaheadForBoyerMoore = 8;
// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment