Commit f14b93a5 authored by erikcorry

Regexp: Improve the speed at which we scan for an initial point where a non-anchored

regexp can match by using a Boyer-Moore-like table.  This is done by identifying
non-greedy non-capturing loops in the nodes that eat any character one at a time.
For example in the middle of the regexp /foo[\s\S]*?bar/ we find such a loop.
There is also such a loop implicitly inserted at the start of any non-anchored
regexp.

When we have found such a loop, we look ahead in the nodes to find the set of
characters that can occur at each distance.  For example, for the regexp
/.?foo/ we know that there are at least 3 characters ahead of us, and the sets
of characters that can occur are [any, [f, o], [o]].  We find a range in the
lookahead info where the set of characters is reasonably constrained.  In our
example this is from index 1 to 2 (index 0 is not constrained).  We can now look 3
characters ahead and, if we don't find one of [f, o] (the union of [f, o] and
[o]), then we can skip forwards by the range size (in this case 2).

For Unicode input strings we do the same, but with the character codes taken
modulo 128 when indexing the table.

We also look at the first string fed to the regexp and use that to get a hint
of the character frequencies in the inputs.  This affects the assessment of
whether the set of characters is 'reasonably constrained'.

We still have the old lookahead mechanism, which uses a wide load of multiple
characters followed by a mask-and-compare to determine whether a match is
possible at this point.

Review URL: http://codereview.chromium.org/9965010

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11204 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent f00631b7
......@@ -452,8 +452,12 @@ void RegExpMacroAssemblerARM::CheckNotCharacter(unsigned c,
void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
if (c == 0) {
__ tst(current_character(), Operand(mask));
} else {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
}
BranchOrBacktrack(eq, on_equal);
}
......@@ -461,8 +465,12 @@ void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
void RegExpMacroAssemblerARM::CheckNotCharacterAfterAnd(unsigned c,
unsigned mask,
Label* on_not_equal) {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
if (c == 0) {
__ tst(current_character(), Operand(mask));
} else {
__ and_(r0, current_character(), Operand(mask));
__ cmp(r0, Operand(c));
}
BranchOrBacktrack(ne, on_not_equal);
}
......
......@@ -504,8 +504,8 @@ void RegExpMacroAssemblerIA32::CheckCharacterAfterAnd(uint32_t c,
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
__ mov(eax, current_character());
__ and_(eax, mask);
__ mov(eax, mask);
__ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(equal, on_equal);
......@@ -518,8 +518,8 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterAnd(uint32_t c,
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
__ mov(eax, current_character());
__ and_(eax, mask);
__ mov(eax, mask);
__ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(not_equal, on_not_equal);
......@@ -569,8 +569,8 @@ void RegExpMacroAssemblerIA32::CheckBitInTable(
__ mov(eax, Immediate(table));
Register index = current_character();
if (mode_ != ASCII || kTableMask != String::kMaxAsciiCharCode) {
__ mov(ebx, current_character());
__ and_(ebx, kTableSize - 1);
__ mov(ebx, kTableSize - 1);
__ and_(ebx, current_character());
index = ebx;
}
__ cmpb(FieldOperand(eax, index, times_1, ByteArray::kHeaderSize), 0);
......
This diff is collapsed.
This diff is collapsed.
......@@ -542,9 +542,13 @@ void RegExpMacroAssemblerX64::CheckNotCharacter(uint32_t c,
void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
__ movl(rax, current_character());
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));
if (c == 0) {
__ testl(current_character(), Immediate(mask));
} else {
__ movl(rax, Immediate(mask));
__ and_(rax, current_character());
__ cmpl(rax, Immediate(c));
}
BranchOrBacktrack(equal, on_equal);
}
......@@ -552,9 +556,13 @@ void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
void RegExpMacroAssemblerX64::CheckNotCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_not_equal) {
__ movl(rax, current_character());
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));
if (c == 0) {
__ testl(current_character(), Immediate(mask));
} else {
__ movl(rax, Immediate(mask));
__ and_(rax, current_character());
__ cmpl(rax, Immediate(c));
}
BranchOrBacktrack(not_equal, on_not_equal);
}
......
......@@ -504,7 +504,10 @@ static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
return NULL;
Handle<String> pattern = isolate->factory()->
NewStringFromUtf8(CStrVector(input));
RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
Handle<String> sample_subject =
isolate->factory()->NewStringFromUtf8(CStrVector(""));
RegExpEngine::Compile(
&compile_data, false, multiline, pattern, sample_subject, is_ascii);
return compile_data.node;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment