Commit f14b93a5 authored by erikcorry's avatar erikcorry

Regexp: Improve the speed that we scan for an initial point where a non-anchored

regexp can match by using a Boyer-Moore-like table.  This is done by identifying
non-greedy non-capturing loops in the nodes that eat any character one at a time.
For example in the middle of the regexp /foo[\s\S]*?bar/ we find such a loop.
There is also such a loop implicitly inserted at the start of any non-anchored
regexp.

When we have found such a loop we look ahead in the nodes to find the set of
characters that can come at given distances.  For example for the regexp
/.?foo/ we know that there are at least 3 characters ahead of us, and the sets
of characters that can occur are [any, [f, o], [o]].  We find a range in the
lookahead info where the set of characters is reasonably constrained.  In our
example this is from index 1 to 2 (0 is not constrained).  We can now look 3
characters ahead and if we don't find one of [f, o] (the union of [f, o] and
[o]) then we can skip forwards by the range size (in this case 2).

For Unicode input strings we do the same, but modulo 128.

We also look at the first string fed to the regexp and use that to get a hint
of the character frequencies in the inputs.  This affects the assessment of
whether the set of characters is 'reasonably constrained'.

We still have the old lookahead mechanism, which uses a wide load of multiple
characters followed by a mask and compare to determine whether a match is
possible at this point.
Review URL: http://codereview.chromium.org/9965010

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11204 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent f00631b7
......@@ -452,8 +452,12 @@ void RegExpMacroAssemblerARM::CheckNotCharacter(unsigned c,
void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
if (c == 0) {
__ tst(current_character(), Operand(mask));
} else {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
}
BranchOrBacktrack(eq, on_equal);
}
......@@ -461,8 +465,12 @@ void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
void RegExpMacroAssemblerARM::CheckNotCharacterAfterAnd(unsigned c,
unsigned mask,
Label* on_not_equal) {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
if (c == 0) {
__ tst(current_character(), Operand(mask));
} else {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
}
BranchOrBacktrack(ne, on_not_equal);
}
......
......@@ -504,8 +504,8 @@ void RegExpMacroAssemblerIA32::CheckCharacterAfterAnd(uint32_t c,
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
__ mov(eax, current_character());
__ and_(eax, mask);
__ mov(eax, mask);
__ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(equal, on_equal);
......@@ -518,8 +518,8 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterAnd(uint32_t c,
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
__ mov(eax, current_character());
__ and_(eax, mask);
__ mov(eax, mask);
__ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(not_equal, on_not_equal);
......@@ -569,8 +569,8 @@ void RegExpMacroAssemblerIA32::CheckBitInTable(
__ mov(eax, Immediate(table));
Register index = current_character();
if (mode_ != ASCII || kTableMask != String::kMaxAsciiCharCode) {
__ mov(ebx, current_character());
__ and_(ebx, kTableSize - 1);
__ mov(ebx, kTableSize - 1);
__ and_(ebx, current_character());
index = ebx;
}
__ cmpb(FieldOperand(eax, index, times_1, ByteArray::kHeaderSize), 0);
......
......@@ -280,7 +280,8 @@ Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
bool RegExpImpl::EnsureCompiledIrregexp(
Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
#ifdef V8_INTERPRETED_REGEXP
if (compiled_code->IsByteArray()) return true;
......@@ -296,7 +297,7 @@ bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
ASSERT(compiled_code->IsSmi());
return true;
}
return CompileIrregexp(re, is_ascii);
return CompileIrregexp(re, sample_subject, is_ascii);
}
......@@ -316,7 +317,9 @@ static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
}
bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
Handle<String> sample_subject,
bool is_ascii) {
// Compile the RegExp.
Isolate* isolate = re->GetIsolate();
ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
......@@ -365,6 +368,7 @@ bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
flags.is_ignore_case(),
flags.is_multiline(),
pattern,
sample_subject,
is_ascii);
if (result.error_message != NULL) {
// Unable to compile regexp.
......@@ -435,7 +439,7 @@ int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
// Check the asciiness of the underlying storage.
bool is_ascii = subject->IsAsciiRepresentationUnderneath();
if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
#ifdef V8_INTERPRETED_REGEXP
// Byte-code regexp needs space allocated for all its registers.
......@@ -466,7 +470,7 @@ RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
#ifndef V8_INTERPRETED_REGEXP
ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
do {
EnsureCompiledIrregexp(regexp, is_ascii);
EnsureCompiledIrregexp(regexp, subject, is_ascii);
Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
NativeRegExpMacroAssembler::Result res =
NativeRegExpMacroAssembler::Match(code,
......@@ -784,6 +788,53 @@ DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
}
class FrequencyCollator {
public:
FrequencyCollator() : total_samples_(0) {
for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
frequencies_[i] = CharacterFrequency(i);
}
}
void CountCharacter(int character) {
int index = (character & RegExpMacroAssembler::kTableMask);
frequencies_[index].Increment();
total_samples_++;
}
// Does not measure in percent, but rather per-128 (the table size from the
// regexp macro assembler).
int Frequency(int in_character) {
ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
if (total_samples_ < 1) return 1; // Division by zero.
int freq_in_per128 =
(frequencies_[in_character].counter() * 128) / total_samples_;
return freq_in_per128;
}
private:
class CharacterFrequency {
public:
CharacterFrequency() : counter_(0), character_(-1) { }
explicit CharacterFrequency(int character)
: counter_(0), character_(character) { }
void Increment() { counter_++; }
int counter() { return counter_; }
int character() { return character_; }
private:
int counter_;
int character_;
};
private:
CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
int total_samples_;
};
class RegExpCompiler {
public:
RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
......@@ -819,6 +870,7 @@ class RegExpCompiler {
inline bool ignore_case() { return ignore_case_; }
inline bool ascii() { return ascii_; }
FrequencyCollator* frequency_collator() { return &frequency_collator_; }
int current_expansion_factor() { return current_expansion_factor_; }
void set_current_expansion_factor(int value) {
......@@ -837,6 +889,7 @@ class RegExpCompiler {
bool ascii_;
bool reg_exp_too_big_;
int current_expansion_factor_;
FrequencyCollator frequency_collator_;
};
......@@ -865,7 +918,8 @@ RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ignore_case_(ignore_case),
ascii_(ascii),
reg_exp_too_big_(false),
current_expansion_factor_(1) {
current_expansion_factor_(1),
frequency_collator_() {
accept_ = new EndNode(EndNode::ACCEPT);
ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
}
......@@ -2056,6 +2110,17 @@ int ActionNode::EatsAtLeast(int still_to_find,
}
void ActionNode::FillInBMInfo(int offset,
BoyerMooreLookahead* bm,
bool not_at_start) {
if (type_ == BEGIN_SUBMATCH) {
bm->SetRest(offset);
} else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
on_success()->FillInBMInfo(offset, bm, not_at_start);
}
}
int AssertionNode::EatsAtLeast(int still_to_find,
int recursion_depth,
bool not_at_start) {
......@@ -2072,6 +2137,14 @@ int AssertionNode::EatsAtLeast(int still_to_find,
}
void AssertionNode::FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
// Match the behaviour of EatsAtLeast on this node.
if (type() == AT_START && not_at_start) return;
on_success()->FillInBMInfo(offset, bm, not_at_start);
}
int BackReferenceNode::EatsAtLeast(int still_to_find,
int recursion_depth,
bool not_at_start) {
......@@ -2504,6 +2577,16 @@ void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
}
void LoopChoiceNode::FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool nas) {
if (body_can_be_zero_length_) {
bm->SetRest(offset);
return;
}
ChoiceNode::FillInBMInfo(offset, bm, nas);
}
void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
......@@ -3000,6 +3083,30 @@ int TextNode::GreedyLoopTextLength() {
}
RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) {
if (elms_->length() != 1) return NULL;
TextElement elm = elms_->at(0);
if (elm.type != TextElement::CHAR_CLASS) return NULL;
RegExpCharacterClass* node = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = node->ranges();
if (!CharacterRange::IsCanonical(ranges)) {
CharacterRange::Canonicalize(ranges);
}
if (node->is_negated()) {
return ranges->length() == 0 ? on_success() : NULL;
}
if (ranges->length() != 1) return NULL;
uint32_t max_char;
if (compiler->ascii()) {
max_char = String::kMaxAsciiCharCode;
} else {
max_char = String::kMaxUtf16CodeUnit;
}
return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
}
// Finds the fixed match length of a sequence of nodes that goes from
// this alternative and back to this choice node. If there are variable
// length nodes or other complications in the way then return a sentinel
......@@ -3064,8 +3171,8 @@ void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
bool not_at_start) {
int preload_characters = EatsAtLeast(4, 0, not_at_start);
int eats_at_least) {
int preload_characters = Min(4, eats_at_least);
if (compiler->macro_assembler()->CanReadUnaligned()) {
bool ascii = compiler->ascii();
if (ascii) {
......@@ -3131,6 +3238,197 @@ class AlternativeGenerationList {
};
BoyerMooreLookahead::BoyerMooreLookahead(
int length, int map_length, RegExpCompiler* compiler)
: length_(length),
map_length_(map_length),
compiler_(compiler) {
ASSERT(IsPowerOf2(map_length));
if (compiler->ascii()) {
max_char_ = String::kMaxAsciiCharCode;
} else {
max_char_ = String::kMaxUtf16CodeUnit;
}
bitmaps_ = new ZoneList<ZoneList<bool>*>(length);
for (int i = 0; i < length; i++) {
bitmaps_->Add(new ZoneList<bool>(map_length));
ZoneList<bool>* map = bitmaps_->at(i);
for (int i = 0; i < map_length; i++) {
map->Add(false);
}
}
}
// Find the longest range of lookahead that has the fewest number of different
// characters that can occur at a given position. Since we are optimizing two
// different parameters at once this is a tradeoff.
bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
int biggest_points = 0;
for (int max_number_of_chars = 4;
max_number_of_chars < kTooManyCharacters;
max_number_of_chars *= 2) {
biggest_points =
FindBestInterval(max_number_of_chars, biggest_points, from, to);
}
if (biggest_points == 0) return false;
return true;
}
// Find the highest-points range between 0 and length_ where the character
// information is not too vague. 'Too vague' means that there are more than
// max_number_of_chars that can occur at this position. Calculates the number
// of points as the product of width-of-the-range and
// probability-of-finding-one-of-the-characters, where the probability is
// calculated using the frequency distribution of the sample subject string.
int BoyerMooreLookahead::FindBestInterval(
int max_number_of_chars, int old_biggest_points, int* from, int* to) {
int biggest_points = old_biggest_points;
static const int kSize = RegExpMacroAssembler::kTableSize;
for (int i = 0; i < length_; ) {
while (i < length_ && Count(i) > max_number_of_chars) i++;
if (i == length_) break;
int remembered_from = i;
bool union_map[kSize];
for (int j = 0; j < kSize; j++) union_map[j] = false;
while (i < length_ && Count(i) <= max_number_of_chars) {
ZoneList<bool>* map = bitmaps_->at(i);
for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
i++;
}
int frequency = 0;
for (int j = 0; j < kSize; j++) {
if (union_map[j]) {
// Add 1 to the frequency to give a small per-character boost for
// the cases where our sampling is not good enough and many
// characters have a frequency of zero. This means the frequency
// can theoretically be up to 2*kSize though we treat it mostly as
// a fraction of kSize.
frequency += compiler_->frequency_collator()->Frequency(j) + 1;
}
}
// We use the probability of skipping times the distance we are skipping to
// judge the effectiveness of this. Actually we have a cut-off: By
// dividing by 2 we switch off the skipping if the probability of skipping
// is less than 50%. This is because the multibyte mask-and-compare
// skipping in quickcheck is more likely to do well on this case.
bool in_quickcheck_range = ((i - remembered_from < 4) ||
(compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
// Called 'probability' but it is only a rough estimate and can actually
// be outside the 0-kSize range.
int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
int points = (i - remembered_from) * probability;
if (points > biggest_points) {
*from = remembered_from;
*to = i - 1;
biggest_points = points;
}
}
return biggest_points;
}
// Take all the characters that will not prevent a successful match if they
// occur in the subject string in the range between min_lookahead and
// max_lookahead (inclusive) measured from the current position. If the
// character at max_lookahead offset is not one of these characters, then we
// can safely skip forwards by the number of characters in the range.
int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
int max_lookahead,
Handle<ByteArray> boolean_skip_table) {
const int kSize = RegExpMacroAssembler::kTableSize;
const int kSkipArrayEntry = 0;
const int kDontSkipArrayEntry = 1;
for (int i = 0; i < kSize; i++) {
boolean_skip_table->set(i, kSkipArrayEntry);
}
int skip = max_lookahead + 1 - min_lookahead;
for (int i = max_lookahead; i >= min_lookahead; i--) {
ZoneList<bool>* map = bitmaps_->at(i);
for (int j = 0; j < map_length_; j++) {
if (map->at(j)) {
boolean_skip_table->set(j, kDontSkipArrayEntry);
}
}
}
return skip;
}
// See comment above on the implementation of GetSkipTable.
bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
int min_lookahead = 0;
int max_lookahead = 0;
if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
bool found_single_character = false;
bool abandoned_search_for_single_character = false;
int single_character = 0;
for (int i = max_lookahead; i >= min_lookahead; i--) {
ZoneList<bool>* map = bitmaps_->at(i);
for (int j = 0; j < map_length_; j++) {
if (map->at(j)) {
if (found_single_character) {
found_single_character = false; // Found two.
abandoned_search_for_single_character = true;
break;
} else {
found_single_character = true;
single_character = j;
}
}
}
if (abandoned_search_for_single_character) break;
}
int lookahead_width = max_lookahead + 1 - min_lookahead;
if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
// The mask-compare can probably handle this better.
return false;
}
if (found_single_character) {
Label cont, again;
masm->Bind(&again);
masm->LoadCurrentCharacter(max_lookahead, &cont, true);
if (max_char_ > map_length_) {
ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
masm->CheckCharacterAfterAnd(single_character,
RegExpMacroAssembler::kTableMask,
&cont);
} else {
masm->CheckCharacter(single_character, &cont);
}
masm->AdvanceCurrentPosition(lookahead_width);
masm->GoTo(&again);
masm->Bind(&cont);
return true;
}
Handle<ByteArray> boolean_skip_table =
FACTORY->NewByteArray(map_length_, TENURED);
int skip_distance = GetSkipTable(
min_lookahead, max_lookahead, boolean_skip_table);
ASSERT(skip_distance != 0);
Label cont, again;
masm->Bind(&again);
masm->LoadCurrentCharacter(max_lookahead, &cont, true);
masm->CheckBitInTable(boolean_skip_table, &cont);
masm->AdvanceCurrentPosition(skip_distance);
masm->GoTo(&again);
masm->Bind(&cont);
return true;
}
/* Code generation for choice nodes.
*
* We generate quick checks that do a mask and compare to eliminate a
......@@ -3274,10 +3572,51 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
int first_normal_choice = greedy_loop ? 1 : 0;
int preload_characters =
CalculatePreloadCharacters(compiler,
current_trace->at_start() == Trace::FALSE);
bool preload_is_current =
bool not_at_start = current_trace->at_start() == Trace::FALSE;
const int kEatsAtLeastNotYetInitialized = -1;
int eats_at_least = kEatsAtLeastNotYetInitialized;
bool skip_was_emitted = false;
// More makes code generation slower, less makes V8 benchmark score lower.
const int kMaxLookaheadForBoyerMoore = 8;
if (!greedy_loop && choice_count == 2) {
GuardedAlternative alt1 = alternatives_->at(1);
if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
RegExpNode* eats_anything_node = alt1.node();
if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
this) {
// At this point we know that we are at a non-greedy loop that will eat
// any character one at a time. Any non-anchored regexp has such a
// loop prepended to it in order to find where it starts. We look for
// a pattern of the form ...abc... where we can look 6 characters ahead
// and step forwards 3 if the character is not one of abc. Abc need
// not be atoms, they can be any reasonably limited character class or
// small alternation.
ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
eats_at_least =
Min(kMaxLookaheadForBoyerMoore,
EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
if (eats_at_least >= 1) {
BoyerMooreLookahead bm(eats_at_least,
RegExpMacroAssembler::kTableSize,
compiler);
GuardedAlternative alt0 = alternatives_->at(0);
alt0.node()->FillInBMInfo(0, &bm, not_at_start);
skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);
}
}
}
}
if (eats_at_least == kEatsAtLeastNotYetInitialized) {
// Save some time by looking at most one machine word ahead.
eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
}
int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
bool preload_is_current = !skip_was_emitted &&
(current_trace->characters_preloaded() == preload_characters);
bool preload_has_checked_bounds = preload_is_current;
......@@ -5432,6 +5771,76 @@ int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
}
void ChoiceNode::FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
ZoneList<GuardedAlternative>* alts = alternatives();
for (int i = 0; i < alts->length(); i++) {
GuardedAlternative& alt = alts->at(i);
if (alt.guards() != NULL && alt.guards()->length() != 0) {
bm->SetRest(offset); // Give up trying to fill in info.
return;
}
alt.node()->FillInBMInfo(offset, bm, not_at_start);
}
}
void TextNode::FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
if (offset >= bm->length()) return;
int max_char = bm->max_char();
for (int i = 0; i < elements()->length(); i++) {
if (offset >= bm->length()) return;
TextElement text = elements()->at(i);
if (text.type == TextElement::ATOM) {
RegExpAtom* atom = text.data.u_atom;
for (int j = 0; j < atom->length(); j++, offset++) {
if (offset >= bm->length()) return;
uc16 character = atom->data()[j];
if (bm->compiler()->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(
ISOLATE,
character,
bm->max_char() == String::kMaxAsciiCharCode,
chars);
for (int j = 0; j < length; j++) {
bm->Set(offset, chars[j]);
}
} else {
if (character <= max_char) bm->Set(offset, character);
}
}
} else {
ASSERT(text.type == TextElement::CHAR_CLASS);
RegExpCharacterClass* char_class = text.data.u_char_class;
ZoneList<CharacterRange>* ranges = char_class->ranges();
if (char_class->is_negated()) {
bm->SetAll(offset);
} else {
for (int k = 0; k < ranges->length(); k++) {
CharacterRange& range = ranges->at(k);
if (range.from() > max_char) continue;
int to = Min(max_char, static_cast<int>(range.to()));
if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {
bm->SetAll(offset);
break;
}
for (int m = range.from(); m <= to; m++) {
bm->Set(offset, m);
}
}
}
offset++;
}
}
if (offset >= bm->length()) return;
on_success()->FillInBMInfo(offset,
bm,
true); // Not at start after a text node.
}
int TextNode::ComputeFirstCharacterSet(int budget) {
budget--;
if (budget >= 0) {
......@@ -5589,15 +5998,30 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
}
RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
bool ignore_case,
bool is_multiline,
Handle<String> pattern,
bool is_ascii) {
RegExpEngine::CompilationResult RegExpEngine::Compile(
RegExpCompileData* data,
bool ignore_case,
bool is_multiline,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_ascii) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
return IrregexpRegExpTooBig();
}
RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
// Sample some characters from the middle of the string.
static const int kSampleSize = 128;
FlattenString(sample_subject);
int chars_sampled = 0;
int half_way = (sample_subject->length() - kSampleSize) / 2;
for (int i = Max(0, half_way);
i < sample_subject->length() && chars_sampled < kSampleSize;
i++, chars_sampled++) {
compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
}
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
0,
......
......@@ -190,8 +190,10 @@ class RegExpImpl {
static String* last_ascii_string_;
static String* two_byte_cached_string_;
static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii);
static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii);
static bool CompileIrregexp(
Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
static inline bool EnsureCompiledIrregexp(
Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
// Set the subject cache. The previous string buffer is not deleted, so the
......@@ -419,6 +421,90 @@ class DispatchTable : public ZoneObject {
};
// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time. For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
class BoyerMooreLookahead {
public:
BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler);
int length() { return length_; }
int max_char() { return max_char_; }
RegExpCompiler* compiler() { return compiler_; }
static const int kTooManyCharacters = 32;
int Count(int map_number) {
ZoneList<bool>* map = bitmaps_->at(map_number);
if (map == NULL) return map_length_;
int count = 0;
for (int i = 0; i < map_length_; i++) {
if (map->at(i)) count++;
}
return count;
}
void Set(int map_number, int character) {
if (character > max_char_) return;
ZoneList<bool>* map = bitmaps_->at(map_number);
if (map == NULL) return;
map->at(character & (map_length_ - 1)) = true;
}
void SetAll(int map_number) {
bitmaps_->at(map_number) = NULL;
}
void SetRest(int from_map) {
for (int i = from_map; i < length_; i++) SetAll(i);
}
bool EmitSkipInstructions(RegExpMacroAssembler* masm);
private:
// This is the value obtained by EatsAtLeast. If we do not have at least this
// many characters left in the sample string then the match is bound to fail.
// Therefore it is OK to read a character this far ahead of the current match
// point.
int length_;
// We conservatively consider all character values modulo this length. For
// ASCII there is no loss of precision, since this has a value of 128.
int map_length_;
RegExpCompiler* compiler_;
// 0x7f for ASCII, 0xffff for UTF-16.
int max_char_;
ZoneList<ZoneList<bool>*>* bitmaps_;
int GetSkipTable(int min_lookahead,
int max_lookahead,
Handle<ByteArray> boolean_skip_table);
bool FindWorthwhileInterval(int* from, int* to);
int FindBestInterval(
int max_number_of_chars, int old_biggest_points, int* from, int* to);
};
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
......@@ -635,6 +721,22 @@ class RegExpNode: public ZoneObject {
bool not_at_start) = 0;
static const int kNodeIsTooComplexForGreedyLoops = -1;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
// Only returns the successor for a text node of length 1 that matches any
// character and that has no guards on it.
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) {
return NULL;
}
// Collects information on the possible code units (mod 128) that can match if
// we look forward. This is used for a Boyer-Moore-like string searching
// implementation. TODO(erikcorry): This should share more code with
// EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet.
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
UNREACHABLE();
}
Label* label() { return &label_; }
// If non-generic code is generated for a node (i.e. the node is not at the
// start of the trace) then it cannot be reused. This variable sets a limit
......@@ -747,6 +849,10 @@ class SeqRegExpNode: public RegExpNode {
: on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
on_success_->FillInBMInfo(offset, bm, not_at_start);
}
private:
RegExpNode* on_success_;
};
......@@ -793,6 +899,8 @@ class ActionNode: public SeqRegExpNode {
return on_success()->GetQuickCheckDetails(
details, compiler, filled_in, not_at_start);
}
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
......@@ -860,6 +968,10 @@ class TextNode: public SeqRegExpNode {
ZoneList<TextElement>* elements() { return elms_; }
void MakeCaseIndependent(bool is_ascii);
virtual int GreedyLoopTextLength();
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual TextNode* Clone() {
TextNode* result = new TextNode(*this);
result->CalculateOffsets();
......@@ -928,6 +1040,8 @@ class AssertionNode: public SeqRegExpNode {
RegExpCompiler* compiler,
int filled_in,
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
......@@ -961,6 +1075,12 @@ class BackReferenceNode: public SeqRegExpNode {
bool not_at_start) {
return;
}
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
// Working out the set of characters that a backreference can match is too
// hard, so we just say that any character can match.
bm->SetRest(offset);
}
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
......@@ -986,6 +1106,11 @@ class EndNode: public RegExpNode {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
virtual EndNode* Clone() { return new EndNode(*this); }
private:
Action action_;
......@@ -1071,6 +1196,8 @@ class ChoiceNode: public RegExpNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
......@@ -1089,7 +1216,7 @@ class ChoiceNode: public RegExpNode {
void GenerateGuard(RegExpMacroAssembler* macro_assembler,
Guard* guard,
Trace* trace);
int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start);
int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
void EmitOutOfLineContinuation(RegExpCompiler* compiler,
Trace* trace,
GuardedAlternative alternative,
......@@ -1119,6 +1246,10 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);
}
// For a negative lookahead we don't emit the quick check for the
// alternative that is expected to fail. This is because quick check code
// starts by loading enough characters for the alternative that takes fewest
......@@ -1146,6 +1277,8 @@ class LoopChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
......@@ -1458,6 +1591,7 @@ class RegExpEngine: public AllStatic {
bool ignore_case,
bool multiline,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_ascii);
static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
......
......@@ -542,9 +542,13 @@ void RegExpMacroAssemblerX64::CheckNotCharacter(uint32_t c,
void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
__ movl(rax, current_character());
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));
if (c == 0) {
__ testl(current_character(), Immediate(mask));
} else {
__ movl(rax, Immediate(mask));
__ and_(rax, current_character());
__ cmpl(rax, Immediate(c));
}
BranchOrBacktrack(equal, on_equal);
}
......@@ -552,9 +556,13 @@ void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
void RegExpMacroAssemblerX64::CheckNotCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_not_equal) {
__ movl(rax, current_character());
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));
if (c == 0) {
__ testl(current_character(), Immediate(mask));
} else {
__ movl(rax, Immediate(mask));
__ and_(rax, current_character());
__ cmpl(rax, Immediate(c));
}
BranchOrBacktrack(not_equal, on_not_equal);
}
......
......@@ -504,7 +504,10 @@ static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
return NULL;
Handle<String> pattern = isolate->factory()->
NewStringFromUtf8(CStrVector(input));
RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
Handle<String> sample_subject =
isolate->factory()->NewStringFromUtf8(CStrVector(""));
RegExpEngine::Compile(
&compile_data, false, multiline, pattern, sample_subject, is_ascii);
return compile_data.node;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment