Commit 46504c15 authored by lrn@chromium.org's avatar lrn@chromium.org

Attempt to make \b\w+ faster. Slight performance increase on, e.g., string unpacking.

Review URL: http://codereview.chromium.org/507051


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3563 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 40d6cbca
......@@ -465,8 +465,6 @@ void RegExpMacroAssemblerARM::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
......@@ -475,11 +473,6 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmp(current_character(), Operand(' '));
__ b(eq, &success);
......@@ -493,11 +486,6 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmp(current_character(), Operand(' '));
......@@ -510,33 +498,18 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
return false;
case 'd':
// Match ASCII digits ('0'..'9')
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(r0, current_character(), Operand('0'));
__ cmp(current_character(), Operand('9' - '0'));
BranchOrBacktrack(hi, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(ls, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(r0, r0, Operand(0x0b));
......@@ -552,13 +525,71 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
}
return true;
}
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(r0, r0, Operand(0x0b));
__ cmp(r0, Operand(0x0c - 0x0b));
if (mode_ == ASCII) {
BranchOrBacktrack(hi, on_no_match);
} else {
Label done;
__ b(ls, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(r0, r0, Operand(0x2028 - 0x0b));
__ cmp(r0, Operand(1));
BranchOrBacktrack(hi, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
// Match word character (0-9, A-Z, a-z and _).
Label digits, done;
__ cmp(current_character(), Operand('9'));
__ b(ls, &digits);
__ cmp(current_character(), Operand('_'));
__ b(eq, &done);
__ orr(r0, current_character(), Operand(0x20));
__ sub(r0, r0, Operand('a'));
__ cmp(r0, Operand('z' - 'a'));
BranchOrBacktrack(hi, on_no_match);
__ jmp(&done);
__ bind(&digits);
__ cmp(current_character(), Operand('0'));
BranchOrBacktrack(lo, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
// Match non-word character (not 0-9, A-Z, a-z and _).
Label digits, done;
__ cmp(current_character(), Operand('9'));
__ b(ls, &digits);
__ cmp(current_character(), Operand('_'));
BranchOrBacktrack(eq, on_no_match);
__ orr(r0, current_character(), Operand(0x20));
__ sub(r0, r0, Operand('a'));
__ cmp(r0, Operand('z' - 'a'));
BranchOrBacktrack(ls, on_no_match);
__ jmp(&done);
__ bind(&digits);
__ cmp(current_character(), Operand('0'));
BranchOrBacktrack(hs, on_no_match);
__ bind(&done);
return true;
}
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -80,8 +80,6 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler {
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -433,7 +433,7 @@ void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
} else {
stream()->Add("%i ", that->max());
}
stream()->Add(that->is_greedy() ? "g " : "n ");
stream()->Add(that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n ");
that->body()->Accept(this, data);
stream()->Add(")");
return NULL;
......
......@@ -1526,6 +1526,7 @@ class CharacterSet BASE_EMBEDDED {
standard_set_type_ = special_set_type;
}
bool is_standard() { return standard_set_type_ != 0; }
void Canonicalize();
private:
ZoneList<CharacterRange>* ranges_;
// If non-zero, the value represents a standard set (e.g., all whitespace
......@@ -1619,12 +1620,13 @@ class RegExpText: public RegExpTree {
class RegExpQuantifier: public RegExpTree {
public:
RegExpQuantifier(int min, int max, bool is_greedy, RegExpTree* body)
: min_(min),
enum Type { GREEDY, NON_GREEDY, POSSESSIVE };
RegExpQuantifier(int min, int max, Type type, RegExpTree* body)
: body_(body),
min_(min),
max_(max),
is_greedy_(is_greedy),
body_(body),
min_match_(min * body->min_match()) {
min_match_(min * body->min_match()),
type_(type) {
if (max > 0 && body->max_match() > kInfinity / max) {
max_match_ = kInfinity;
} else {
......@@ -1648,15 +1650,17 @@ class RegExpQuantifier: public RegExpTree {
virtual int max_match() { return max_match_; }
int min() { return min_; }
int max() { return max_; }
bool is_greedy() { return is_greedy_; }
bool is_possessive() { return type_ == POSSESSIVE; }
bool is_non_greedy() { return type_ == NON_GREEDY; }
bool is_greedy() { return type_ == GREEDY; }
RegExpTree* body() { return body_; }
private:
RegExpTree* body_;
int min_;
int max_;
bool is_greedy_;
RegExpTree* body_;
int min_match_;
int max_match_;
Type type_;
};
......
......@@ -329,6 +329,9 @@ DEFINE_bool(collect_heap_spill_statistics, false,
"(requires heap_stats)")
// Regexp
DEFINE_bool(regexp_possessive_quantifier,
false,
"enable possessive quantifier syntax for testing")
DEFINE_bool(trace_regexp_bytecodes, false, "trace regexp bytecode execution")
DEFINE_bool(trace_regexp_assembler,
false,
......
......@@ -477,8 +477,6 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
......@@ -487,17 +485,12 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmp(current_character(), ' ');
__ j(equal, &success);
// Check range 0x09..0x0d
__ sub(Operand(current_character()), Immediate('\t'));
__ cmp(current_character(), '\r' - '\t');
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
BranchOrBacktrack(above, on_no_match);
__ bind(&success);
return true;
......@@ -505,72 +498,118 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmp(current_character(), ' ');
BranchOrBacktrack(equal, on_no_match);
__ sub(Operand(current_character()), Immediate('\t'));
__ cmp(current_character(), '\r' - '\t');
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
BranchOrBacktrack(below_equal, on_no_match);
return true;
}
return false;
case 'd':
// Match ASCII digits ('0'..'9')
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(Operand(current_character()), Immediate('0'));
__ cmp(current_character(), '9' - '0');
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(Operand(current_character()), Immediate('0'));
__ cmp(current_character(), '9' - '0');
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ xor_(Operand(current_character()), Immediate(0x01));
__ mov(Operand(eax), current_character());
__ xor_(Operand(eax), Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(Operand(current_character()), Immediate(0x0b));
__ cmp(current_character(), 0x0c - 0x0b);
__ sub(Operand(eax), Immediate(0x0b));
__ cmp(eax, 0x0c - 0x0b);
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(Operand(current_character()), Immediate(0x2028 - 0x0b));
__ cmp(current_character(), 1);
__ sub(Operand(eax), Immediate(0x2028 - 0x0b));
__ cmp(eax, 0x2029 - 0x2028);
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
case 'w': {
Label done, check_digits;
__ cmp(Operand(current_character()), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmp(Operand(current_character()), Immediate('_'));
__ j(equal, &done);
// Convert to lower case if letter.
__ mov(Operand(eax), current_character());
__ or_(eax, 0x20);
// check current character in range ['a'..'z'], nondestructively.
__ sub(Operand(eax), Immediate('a'));
__ cmp(Operand(eax), Immediate('z' - 'a'));
BranchOrBacktrack(above, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmp(Operand(current_character()), Immediate('0'));
BranchOrBacktrack(below, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
Label done, check_digits;
__ cmp(Operand(current_character()), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmp(Operand(current_character()), Immediate('_'));
BranchOrBacktrack(equal, on_no_match);
// Convert to lower case if letter.
__ mov(Operand(eax), current_character());
__ or_(eax, 0x20);
// check current character in range ['a'..'z'], nondestructively.
__ sub(Operand(eax), Immediate('a'));
__ cmp(Operand(eax), Immediate('z' - 'a'));
BranchOrBacktrack(below_equal, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmp(Operand(current_character()), Immediate('0'));
BranchOrBacktrack(above_equal, on_no_match);
__ bind(&done);
return true;
}
// Non-standard classes (with no syntactic shorthand) used internally.
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
return true;
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 or 0x2029).
// The opposite of '.'.
__ mov(Operand(eax), current_character());
__ xor_(Operand(eax), Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(Operand(eax), Immediate(0x0b));
__ cmp(eax, 0x0c - 0x0b);
if (mode_ == ASCII) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
ASSERT_EQ(UC16, mode_);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(Operand(eax), Immediate(0x2028 - 0x0b));
__ cmp(eax, 1);
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
}
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -78,10 +78,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler {
// Checks whether the given offset from the current position is before
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
virtual void GoTo(Label* label);
......
......@@ -1408,14 +1408,6 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
int cp_offset,
bool check_offset,
bool preloaded) {
if (cc->is_standard() &&
macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
cp_offset,
check_offset,
on_failure)) {
return;
}
ZoneList<CharacterRange>* ranges = cc->ranges();
int max_char;
if (ascii) {
......@@ -1466,6 +1458,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
}
if (cc->is_standard() &&
macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
on_failure)) {
return;
}
for (int i = 0; i < last_valid_range; i++) {
CharacterRange& range = ranges->at(i);
Label next_range;
......@@ -1603,8 +1601,8 @@ int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
}
int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
int recursion_depth) {
int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
int recursion_depth) {
if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
......@@ -2026,6 +2024,12 @@ static void EmitWordCheck(RegExpMacroAssembler* assembler,
Label* word,
Label* non_word,
bool fall_through_on_word) {
if (assembler->CheckSpecialCharacterClass(
fall_through_on_word ? 'w' : 'W',
fall_through_on_word ? non_word : word)) {
// Optimized implementation available.
return;
}
assembler->CheckCharacterGT('z', non_word);
assembler->CheckCharacterLT('0', non_word);
assembler->CheckCharacterGT('a' - 1, word);
......@@ -2062,17 +2066,60 @@ static void EmitHat(RegExpCompiler* compiler,
assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
new_trace.backtrack(),
false);
// Newline means \n, \r, 0x2028 or 0x2029.
if (!compiler->ascii()) {
assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
if (!assembler->CheckSpecialCharacterClass('n',
new_trace.backtrack())) {
// Newline means \n, \r, 0x2028 or 0x2029.
if (!compiler->ascii()) {
assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
}
assembler->CheckCharacter('\n', &ok);
assembler->CheckNotCharacter('\r', new_trace.backtrack());
}
assembler->CheckCharacter('\n', &ok);
assembler->CheckNotCharacter('\r', new_trace.backtrack());
assembler->Bind(&ok);
on_success->Emit(compiler, &new_trace);
}
// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
// when we know whether the next character must be a word character or not.
// Only the previous character is inspected here: |type| equal to
// AFTER_WORD_CHARACTER means the previous character is expected to be a word
// character; any other |type| is treated as expecting a non-word character.
// When the expectation fails, control goes to the backtrack label of the
// (copied) trace; otherwise the code for |on_success| is emitted next.
static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
RegExpCompiler* compiler,
RegExpNode* on_success,
Trace* trace) {
RegExpMacroAssembler* assembler = compiler->macro_assembler();
Label done;
// Work on a copy so that the caller's trace is left untouched.
Trace new_trace(*trace);
bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
// The expected outcome continues at |done|; the other outcome backtracks.
Label* on_word = expect_word_character ? &done : new_trace.backtrack();
Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
// Check whether previous character was a word character.
switch (trace->at_start()) {
case Trace::TRUE:
// Known to be at the start of input: the non-word outcome is taken without
// loading a character. When a word character is expected this is an
// unconditional jump to the backtrack label; otherwise nothing is emitted
// and execution continues at |done|.
if (expect_word_character) {
assembler->GoTo(on_non_word);
}
break;
case Trace::UNKNOWN:
ASSERT_EQ(0, trace->cp_offset());
// NOTE(review): presumably branches to |on_non_word| when the current
// position is the start of the input - confirm against CheckAtStart.
assembler->CheckAtStart(on_non_word);
// Fall through.
case Trace::FALSE:
// Load the character just before the current trace offset and classify it.
// NOTE(review): the (NULL, false) arguments look like "no failure label, no
// bounds check" - confirm against LoadCurrentCharacter's declaration.
int prev_char_offset = trace->cp_offset() - 1;
assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
// We may or may not have loaded the previous character.
new_trace.InvalidateCurrentCharacter();
}
assembler->Bind(&done);
on_success->Emit(compiler, &new_trace);
}
// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
RegExpCompiler* compiler,
......@@ -2182,10 +2229,15 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
case AFTER_NEWLINE:
EmitHat(compiler, on_success(), trace);
return;
case AT_NON_BOUNDARY:
case AT_BOUNDARY:
case AT_NON_BOUNDARY: {
EmitBoundaryCheck(type_, compiler, on_success(), trace);
return;
}
case AFTER_WORD_CHARACTER:
case AFTER_NONWORD_CHARACTER: {
EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
}
}
on_success()->Emit(compiler, trace);
}
......@@ -2768,7 +2820,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
// to generate probably can't use it.
if (i != first_normal_choice) {
alt_gen->expects_preload = false;
new_trace.set_characters_preloaded(0);
new_trace.InvalidateCurrentCharacter();
}
if (i < choice_count - 1) {
new_trace.set_backtrack(&alt_gen->after);
......@@ -3259,6 +3311,12 @@ void DotPrinter::VisitAssertion(AssertionNode* that) {
case AssertionNode::AFTER_NEWLINE:
stream()->Add("label=\"(?<=\\n)\", shape=septagon");
break;
case AssertionNode::AFTER_WORD_CHARACTER:
stream()->Add("label=\"(?<=\\w)\", shape=septagon");
break;
case AssertionNode::AFTER_NONWORD_CHARACTER:
stream()->Add("label=\"(?<=\\W)\", shape=septagon");
break;
}
stream()->Add("];\n");
PrintAttributes(that);
......@@ -3461,6 +3519,20 @@ bool RegExpCharacterClass::is_standard() {
set_.set_standard_set_type('.');
return true;
}
if (CompareRanges(set_.ranges(),
kLineTerminatorRanges,
kLineTerminatorRangeCount)) {
set_.set_standard_set_type('n');
return true;
}
if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('w');
return true;
}
if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('W');
return true;
}
return false;
}
......@@ -3987,6 +4059,101 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
}
// Returns true when |ranges| is in canonical form: every range starts at
// least two characters after the end of the range before it, i.e. the list
// is strictly increasing with no overlapping and no adjacent ranges.
// Lists with zero or one range are canonical by definition.
bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
  ASSERT_NOT_NULL(ranges);
  const int range_count = ranges->length();
  for (int index = 1; index < range_count; index++) {
    // Compare each range against its immediate predecessor.
    int previous_to = ranges->at(index - 1).to();
    if (ranges->at(index).from() <= previous_to + 1) return false;
  }
  return true;
}
// Computes how the canonicalized character set |range| (the "first" set)
// relates to the set of word characters given by kWordRanges (the "second"
// set). The returned SetRelation records whether there are elements only in
// the argument set, only in the word set, and/or in both.
//
// Both range lists are walked in parallel in increasing character order.
// The walk stops early once all three kinds of elements have been seen,
// since the result cannot become any more (im)precise after that.
SetRelation CharacterRange::WordCharacterRelation(
    ZoneList<CharacterRange>* range) {
  ASSERT(IsCanonical(range));
  int i = 0;  // Word character range index.
  int j = 0;  // Argument range index.
  ASSERT_NE(0, kWordRangeCount);
  SetRelation result;
  if (range->length() == 0) {
    // Empty argument set: everything in the word set is outside it.
    result.SetElementsInSecondSet();
    return result;
  }
  CharacterRange argument_range = range->at(0);
  CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
  while (i < kWordRangeCount && j < range->length()) {
    // Check the two ranges for the five cases:
    // - no overlap.
    // - partial overlap (there are elements in both ranges that aren't
    //   in the other, and there are also elements that are in both).
    // - argument range entirely inside word range.
    // - word range entirely inside argument range.
    // - ranges are completely equal.

    // First check for no overlap. The earlier range is not in the other set.
    if (argument_range.from() > word_range.to()) {
      // Ranges are disjoint. The earlier word range contains elements that
      // cannot be in the argument set.
      result.SetElementsInSecondSet();
    } else if (word_range.from() > argument_range.to()) {
      // Ranges are disjoint. The earlier argument range contains elements that
      // cannot be in the word set.
      result.SetElementsInFirstSet();
    } else if (word_range.from() <= argument_range.from() &&
               word_range.to() >= argument_range.to()) {
      // Argument range completely inside (or equal to) the word range.
      // Both end points of the argument range must be checked; testing only
      // its start would also accept ranges that extend past the word range.
      result.SetElementsInBothSets();
      if (word_range.from() < argument_range.from() ||
          word_range.to() > argument_range.to()) {
        // The word range is strictly larger than the argument range.
        result.SetElementsInSecondSet();
      }
    } else if (word_range.from() >= argument_range.from() &&
               word_range.to() <= argument_range.to()) {
      // Word range strictly inside the argument range (equal ranges were
      // handled by the previous case).
      result.SetElementsInBothSets();
      result.SetElementsInFirstSet();
    } else {
      // There is overlap, and neither is a subrange of the other
      result.SetElementsInFirstSet();
      result.SetElementsInSecondSet();
      result.SetElementsInBothSets();
    }
    if (result.NonTrivialIntersection()) {
      // The result is as (im)precise as we can possibly make it.
      return result;
    }
    // Progress the range(s) with minimal to-character.
    uc16 word_to = word_range.to();
    uc16 argument_to = argument_range.to();
    if (argument_to <= word_to) {
      j++;
      if (j < range->length()) {
        argument_range = range->at(j);
      }
    }
    if (word_to <= argument_to) {
      // kWordRanges stores (from, to) pairs, hence the stride of two.
      i += 2;
      if (i < kWordRangeCount) {
        word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
      }
    }
  }
  // Check if anything wasn't compared in the loop.
  if (i < kWordRangeCount) {
    // word range contains something not in argument range.
    result.SetElementsInSecondSet();
  } else if (j < range->length()) {
    // Argument range contains something not in word range.
    result.SetElementsInFirstSet();
  }
  return result;
}
static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
int bottom,
int top) {
......@@ -4096,6 +4263,287 @@ ZoneList<CharacterRange>* CharacterSet::ranges() {
}
// Copies |count| consecutive elements of |list| starting at index |from| to
// the positions starting at index |to|. Source and destination may overlap;
// the copy direction is chosen so that no element is overwritten before it
// has been read.
static void MoveRanges(ZoneList<CharacterRange>* list,
                       int from,
                       int to,
                       int count) {
  if (from < to) {
    // Destination is above the source: copy from the back.
    int offset = count;
    while (offset-- > 0) {
      list->at(to + offset) = list->at(from + offset);
    }
    return;
  }
  // Destination is at or below the source: copy from the front.
  int offset = 0;
  while (offset < count) {
    list->at(to + offset) = list->at(from + offset);
    offset++;
  }
}
// Inserts |insert| into list[0..count[, which must be sorted by from-value,
// non-overlapping and non-adjacent. The result occupies at most
// list[0..count]. Existing ranges that overlap or touch the inserted range
// are collapsed into a single range, so the returned number of canonical
// ranges lies anywhere in 1..count+1.
static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
                                      int count,
                                      CharacterRange insert) {
  uc16 insert_from = insert.from();
  uc16 insert_to = insert.to();
  // [first_affected..after_affected[ are the existing ranges that overlap or
  // are adjacent to the inserted range. Ranges outside of that window are
  // not changed by the insertion.
  int first_affected = 0;
  int after_affected = count;
  for (int index = count; index-- > 0;) {
    CharacterRange existing = list->at(index);
    if (existing.from() > insert_to + 1) {
      // Entirely above the inserted range, with a gap.
      after_affected = index;
      continue;
    }
    if (existing.to() + 1 < insert_from) {
      // Entirely below the inserted range, with a gap. Everything further
      // down is below as well.
      first_affected = index + 1;
      break;
    }
  }

  const int affected = after_affected - first_affected;
  if (affected == 0) {
    // Nothing to merge with: open a slot at first_affected and store the
    // range there.
    if (first_affected < count) {
      MoveRanges(list, first_affected, first_affected + 1,
                 count - first_affected);
    }
    list->at(first_affected) = insert;
    return count + 1;
  }

  // One or more existing ranges are replaced by their union with the
  // inserted range.
  int merged_from = Min(list->at(first_affected).from(), insert_from);
  int merged_to = Max(list->at(after_affected - 1).to(), insert_to);
  if (affected > 1 && after_affected < count) {
    // Several ranges collapse into one: move the unaffected tail down.
    MoveRanges(list, after_affected, first_affected + 1,
               count - after_affected);
  }
  list->at(first_affected) = CharacterRange(merged_from, merged_to);
  return count - affected + 1;
}
// Brings the explicit range list of this set into canonical form.
// Special/default classes (no explicit range list) are always considered
// canonical; the result of calling ranges() on them will be sorted.
void CharacterSet::Canonicalize() {
  if (ranges_ != NULL) {
    CharacterRange::Canonicalize(ranges_);
  }
}
// Rewrites |character_ranges| in place so that it is canonical: sorted by
// from-value with no overlapping and no adjacent ranges. The list may become
// shorter, since overlapping and adjacent ranges are combined.
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
  const int total = character_ranges->length();
  if (total <= 1) return;

  // Determine the length of the leading part that is already canonical
  // (increasing, non-overlapping, non-adjacent).
  int prefix = 1;
  int highest = character_ranges->at(0).to();
  for (; prefix < total; prefix++) {
    CharacterRange candidate = character_ranges->at(prefix);
    if (candidate.from() <= highest + 1) break;
    highest = candidate.to();
  }
  // The whole list is canonical already.
  if (prefix == total) return;

  // Insertion sort for the rest: every remaining range is inserted into the
  // canonical part in front of it. An insertion can shrink the canonical
  // part because it may merge ranges.
  int canonical_length = prefix;
  for (int next = prefix; next < total; next++) {
    canonical_length = InsertRangeInCanonicalList(character_ranges,
                                                  canonical_length,
                                                  character_ranges->at(next));
  }
  character_ranges->Rewind(canonical_length);

  ASSERT(CharacterRange::IsCanonical(character_ranges));
}
// Helper for CharacterRange::Merge. Appends |range| to the canonicalized
// list |set|. When |range| starts directly after the last range of the list,
// that last range is extended instead of adding a new entry. A NULL |set|
// means the caller is not interested in this output, and nothing happens.
static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
  if (set == NULL) return;
  const int last = set->length() - 1;
  // Ranges must arrive in increasing order.
  ASSERT(last < 0 || set->at(last).to() < range.from());
  if (last >= 0) {
    CharacterRange tail = set->at(last);
    if (tail.to() + 1 == range.from()) {
      // Adjacent to the current tail: grow the tail.
      set->at(last) = CharacterRange(tail.from(), range.to());
      return;
    }
  }
  set->Add(range);
}
static void AddRangeToSelectedSet(int selector,
ZoneList<CharacterRange>* first_set,
ZoneList<CharacterRange>* second_set,
ZoneList<CharacterRange>* intersection_set,
CharacterRange range) {
switch (selector) {
case kInsideFirst:
AddRangeToSet(first_set, range);
break;
case kInsideSecond:
AddRangeToSet(second_set, range);
break;
case kInsideBoth:
AddRangeToSet(intersection_set, range);
break;
}
}
// Splits the union of two canonical range lists into three parts:
//   |first_set_only_out|  - characters only in |first_set|,
//   |second_set_only_out| - characters only in |second_set|,
//   |both_sets_out|       - characters in both sets.
// Each output may be NULL when the caller does not need that part, and the
// same list may be passed for several outputs (e.g. to compute a union).
// Non-NULL outputs must be empty on entry.
void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
                           ZoneList<CharacterRange>* second_set,
                           ZoneList<CharacterRange>* first_set_only_out,
                           ZoneList<CharacterRange>* second_set_only_out,
                           ZoneList<CharacterRange>* both_sets_out) {
  // Inputs are canonicalized.
  ASSERT(CharacterRange::IsCanonical(first_set));
  ASSERT(CharacterRange::IsCanonical(second_set));
  // Outputs are empty, if applicable.
  ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
  ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
  ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);

  // Merge sets by iterating through the lists in order of lowest "from" value,
  // and putting intervals into one of three sets.

  // Trivial cases: one input is empty, so the other one is the whole result.
  // The outputs are optional, so they have to be checked before use.
  if (first_set->length() == 0) {
    if (second_set_only_out != NULL) {
      second_set_only_out->AddAll(*second_set);
    }
    return;
  }
  if (second_set->length() == 0) {
    if (first_set_only_out != NULL) {
      first_set_only_out->AddAll(*first_set);
    }
    return;
  }
  // Indices into input lists.
  int i1 = 0;
  int i2 = 0;
  // Cache length of input lists.
  int n1 = first_set->length();
  int n2 = second_set->length();
  // Current range. May be invalid if state is kInsideNone.
  int from = 0;
  int to = -1;
  // Where current range comes from.
  int state = kInsideNone;

  while (i1 < n1 || i2 < n2) {
    CharacterRange next_range;
    int range_source;
    // Pick the unprocessed range with the lowest "from" value. The first
    // list may only be read while it still has elements left.
    if (i2 == n2 ||
        (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
      next_range = first_set->at(i1++);
      range_source = kInsideFirst;
    } else {
      next_range = second_set->at(i2++);
      range_source = kInsideSecond;
    }
    if (to < next_range.from()) {
      // Ranges disjoint: |current|  |next|
      AddRangeToSelectedSet(state,
                            first_set_only_out,
                            second_set_only_out,
                            both_sets_out,
                            CharacterRange(from, to));
      from = next_range.from();
      to = next_range.to();
      state = range_source;
    } else {
      if (from < next_range.from()) {
        // The part of the current range before the overlap belongs only to
        // the current range's source.
        AddRangeToSelectedSet(state,
                              first_set_only_out,
                              second_set_only_out,
                              both_sets_out,
                              CharacterRange(from, next_range.from()-1));
      }
      if (to < next_range.to()) {
        // Ranges overlap:  |current|
        //                       |next|
        AddRangeToSelectedSet(state | range_source,
                              first_set_only_out,
                              second_set_only_out,
                              both_sets_out,
                              CharacterRange(next_range.from(), to));
        from = to + 1;
        to = next_range.to();
        state = range_source;
      } else {
        // Range included:  |current|  , possibly ending at same character.
        //                    |next|
        AddRangeToSelectedSet(
            state | range_source,
            first_set_only_out,
            second_set_only_out,
            both_sets_out,
            CharacterRange(next_range.from(), next_range.to()));
        from = next_range.to() + 1;
        // If ranges end at same character, both ranges are consumed completely.
        if (next_range.to() == to) state = kInsideNone;
      }
    }
  }
  // Flush whatever is left of the current range.
  AddRangeToSelectedSet(state,
                        first_set_only_out,
                        second_set_only_out,
                        both_sets_out,
                        CharacterRange(from, to));
}
// Fills |negated_ranges| (which must be empty) with the complement of the
// canonical list |ranges| with respect to [0 .. String::kMaxUC16CharCode].
// The complement of an empty list is the single range covering everything.
void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
                            ZoneList<CharacterRange>* negated_ranges) {
  ASSERT(CharacterRange::IsCanonical(ranges));
  ASSERT_EQ(0, negated_ranges->length());
  int range_count = ranges->length();
  // Lowest character not yet known to be covered by |ranges|. Kept in an int
  // so that "one past the maximal character" is representable. Tracking the
  // first uncovered character (rather than the last covered one) makes sure
  // character 0 is included when |ranges| does not start at 0.
  int from = 0;
  int i = 0;
  if (range_count > 0 && ranges->at(0).from() == 0) {
    from = ranges->at(0).to() + 1;
    i = 1;
  }
  while (i < range_count) {
    CharacterRange range = ranges->at(i);
    // Canonical input guarantees a non-empty gap in front of |range|.
    negated_ranges->Add(CharacterRange(from, range.from() - 1));
    from = range.to() + 1;
    i++;
  }
  if (from <= String::kMaxUC16CharCode) {
    negated_ranges->Add(CharacterRange(from, String::kMaxUC16CharCode));
  }
}
// -------------------------------------------------------------------
// Interest propagation
......@@ -4387,9 +4835,203 @@ void Analysis::VisitBackReference(BackReferenceNode* that) {
// Analyzes the successor of an assertion node and, for \b and \B, tries to
// specialize the assertion. When the character following the assertion is
// known to always be a word character, or known to never be one, the
// boundary test only depends on the previous character, so the node type is
// changed to AFTER_WORD_CHARACTER or AFTER_NONWORD_CHARACTER.
void Analysis::VisitAssertion(AssertionNode* that) {
  EnsureAnalyzed(that->on_success());
  const AssertionNode::AssertionNodeType original_type = that->type();
  const bool is_boundary = (original_type == AssertionNode::AT_BOUNDARY);
  if (!is_boundary && original_type != AssertionNode::AT_NON_BOUNDARY) {
    // Only the two boundary assertions can be specialized.
    return;
  }
  // Relate the set of possible following characters to the word characters.
  ZoneList<CharacterRange>* next_chars = that->FirstCharacterSet();
  CharacterRange::Canonicalize(next_chars);
  SetRelation relation = CharacterRange::WordCharacterRelation(next_chars);
  if (relation.ContainedIn()) {
    // Following character is definitely a word character, so a boundary
    // requires a non-word character in front, and a non-boundary a word
    // character.
    that->set_type(is_boundary ? AssertionNode::AFTER_NONWORD_CHARACTER
                               : AssertionNode::AFTER_WORD_CHARACTER);
  } else if (relation.Disjoint()) {
    // Following character is definitely *not* a word character.
    that->set_type(is_boundary ? AssertionNode::AFTER_WORD_CHARACTER
                               : AssertionNode::AFTER_NONWORD_CHARACTER);
  }
}
// Returns the cached first-character set of this node, computing it on the
// first call. When no exact answer can be found within kFirstCharBudget, the
// set of every character is stored instead, i.e. all characters are
// considered possible.
ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
  if (first_character_set_ != NULL) return first_character_set_;
  if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
    ZoneList<CharacterRange>* everything = new ZoneList<CharacterRange>(1);
    everything->Add(CharacterRange::Everything());
    first_character_set_ = everything;
  }
  return first_character_set_;
}
// Base-class fallback for computing the first-character set.
// Ignores the budget, leaves the cached set untouched and always reports
// failure; node types that can do better override this virtual method.
int RegExpNode::ComputeFirstCharacterSet(int budget) {
// Default behavior is to not be able to determine the first character.
return kComputeFirstCharacterSetFail;
}
int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
budget--;
if (budget >= 0) {
// Find loop min-iteration. It's the value of the guarded choice node
// with a GEQ guard, if any.
int min_repetition = 0;
for (int i = 0; i <= 1; i++) {
GuardedAlternative alternative = alternatives()->at(i);
ZoneList<Guard*>* guards = alternative.guards();
if (guards != NULL && guards->length() > 0) {
Guard* guard = guards->at(0);
if (guard->op() == Guard::GEQ) {
min_repetition = guard->value();
break;
}
}
}
budget = loop_node()->ComputeFirstCharacterSet(budget);
if (budget >= 0) {
ZoneList<CharacterRange>* character_set =
loop_node()->first_character_set();
if (body_can_be_zero_length() || min_repetition == 0) {
budget = continue_node()->ComputeFirstCharacterSet(budget);
if (budget < 0) return budget;
ZoneList<CharacterRange>* body_set =
continue_node()->first_character_set();
ZoneList<CharacterRange>* union_set =
new ZoneList<CharacterRange>(Max(character_set->length(),
body_set->length()));
CharacterRange::Merge(character_set,
body_set,
union_set,
union_set,
union_set);
character_set = union_set;
}
set_first_character_set(character_set);
}
}
return budget;
}
int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
budget--;
if (budget >= 0) {
GuardedAlternative successor = this->alternatives()->at(1);
RegExpNode* successor_node = successor.node();
budget = successor_node->ComputeFirstCharacterSet(budget);
if (budget >= 0) {
set_first_character_set(successor_node->first_character_set());
}
}
return budget;
}
// The first character set of an EndNode is unknowable. Just use the
// default implementation that fails and returns all characters as possible.

// Computes the first-character set of an assertion node.
//
// An AT_END assertion can only succeed when no character follows, so its
// set is empty. Every other assertion type consumes no input and therefore
// shares the first-character set of its successor.
//
// Returns the remaining node budget. If the budget is exhausted the result
// is negative and no set is cached on this node.
int AssertionNode::ComputeFirstCharacterSet(int budget) {
  budget -= 1;
  if (budget >= 0) {
    switch (type_) {
      case AT_END: {
        set_first_character_set(new ZoneList<CharacterRange>(0));
        break;
      }
      case AT_START:
      case AT_BOUNDARY:
      case AT_NON_BOUNDARY:
      case AFTER_NEWLINE:
      case AFTER_NONWORD_CHARACTER:
      case AFTER_WORD_CHARACTER: {
        ASSERT_NOT_NULL(on_success());
        budget = on_success()->ComputeFirstCharacterSet(budget);
        // Only adopt the successor's set if it was actually computed; on
        // budget exhaustion the successor's cached value is not meaningful
        // and this node must stay uncached.
        if (budget >= 0) {
          set_first_character_set(on_success()->first_character_set());
        }
        break;
      }
    }
  }
  return budget;
}
// An action node adopts the first-character set of its successor, except
// for POSITIVE_SUBMATCH_SUCCESS actions, for which the computation is
// reported as failed without spending any budget.
// Returns the remaining budget; negative means failure and nothing cached.
int ActionNode::ComputeFirstCharacterSet(int budget) {
  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;

  budget--;
  if (budget < 0) return budget;

  ASSERT_NOT_NULL(on_success());
  budget = on_success()->ComputeFirstCharacterSet(budget);
  if (budget < 0) return budget;

  set_first_character_set(on_success()->first_character_set());
  return budget;
}
// Back references never yield a first-character set: the budget is ignored
// and failure is always reported, so callers fall back to their own
// handling of a failed computation.
int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
// We don't know anything about the first character of a backreference
// at this point.
return kComputeFirstCharacterSetFail;
}
// Computes the first-character set of a text node from its first element.
//
// For an atom the set is the single first character of the atom. For a
// character class it is the class's ranges, or their complement when the
// class is negated.
//
// Returns the remaining node budget; if it is negative the budget was
// exhausted and no set is cached on this node.
int TextNode::ComputeFirstCharacterSet(int budget) {
  budget--;
  if (budget >= 0) {
    ASSERT_NE(0, elements()->length());
    TextElement text = elements()->at(0);
    if (text.type == TextElement::ATOM) {
      RegExpAtom* atom = text.data.u_atom;
      ASSERT_NE(0, atom->length());
      uc16 first_char = atom->data()[0];
      ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
      range->Add(CharacterRange(first_char, first_char));
      set_first_character_set(range);
    } else {
      ASSERT(text.type == TextElement::CHAR_CLASS);
      RegExpCharacterClass* char_class = text.data.u_char_class;
      ZoneList<CharacterRange>* ranges = char_class->ranges();
      // The ranges of a character class are not guaranteed to be sorted or
      // non-overlapping. Negate() requires canonical input, and the size
      // estimate below inspects the first and last range, so bring the list
      // into canonical form first. Canonicalizing keeps the covered
      // characters unchanged.
      CharacterRange::Canonicalize(ranges);
      if (char_class->is_negated()) {
        // The complement of n canonical ranges has at most n + 1 ranges;
        // one less for each end of the character space that is covered.
        int length = ranges->length();
        int new_length = length + 1;
        if (length > 0) {
          if (ranges->at(0).from() == 0) new_length--;
          if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
            new_length--;
          }
        }
        ZoneList<CharacterRange>* negated_ranges =
            new ZoneList<CharacterRange>(new_length);
        CharacterRange::Negate(ranges, negated_ranges);
        set_first_character_set(negated_ranges);
      } else {
        set_first_character_set(ranges);
      }
    }
  }
  return budget;
}
// -------------------------------------------------------------------
// Dispatch table construction
......@@ -4448,7 +5090,6 @@ void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
}
static int CompareRangeByFrom(const CharacterRange* a,
const CharacterRange* b) {
return Compare<uc16>(a->from(), b->from());
......
......@@ -177,6 +177,57 @@ class RegExpImpl {
};
// Represents the location of one element relative to the intersection of
// two sets. Corresponds to the four areas of a Venn diagram.
// The numeric values are also used as bit positions by SetRelation
// (e.g. kInFirst = 1 << kInsideFirst), so they must stay small and distinct.
enum ElementInSetsRelation {
kInsideNone = 0,  // Element is in neither set.
kInsideFirst = 1,  // Element is in the first set only.
kInsideSecond = 2,  // Element is in the second set only.
kInsideBoth = 3  // Element is in both sets.
};
// Represents the relation of two sets.
// Sets can be either disjoint, partially or fully overlapping, or equal.
class SetRelation BASE_EMBEDDED {
public:
// Relation is represented by a bit saying whether there are elements in
// one set that is not in the other, and a bit saying that there are elements
// that are in both sets.
// Location of an element. Corresponds to the internal areas of
// a Venn diagram.
enum {
kInFirst = 1 << kInsideFirst,
kInSecond = 1 << kInsideSecond,
kInBoth = 1 << kInsideBoth
};
SetRelation() : bits_(0) {}
~SetRelation() {}
// Add the existence of objects in a particular
void SetElementsInFirstSet() { bits_ |= kInFirst; }
void SetElementsInSecondSet() { bits_ |= kInSecond; }
void SetElementsInBothSets() { bits_ |= kInBoth; }
// Check the currently known relation of the sets (common functions only,
// for other combinations, use value() to get the bits and check them
// manually).
// Sets are completely disjoint.
bool Disjoint() { return (bits_ & kInBoth) == 0; }
// Sets are equal.
bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
// First set contains second.
bool Contains() { return (bits_ & kInSecond) == 0; }
// Second set contains first.
bool ContainedIn() { return (bits_ & kInFirst) == 0; }
bool NonTrivialIntersection() {
return (bits_ == (kInFirst | kInSecond | kInBoth));
}
int value() { return bits_; }
private:
int bits_;
};
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) { }
......@@ -208,7 +259,39 @@ class CharacterRange {
Vector<const uc16> overlay,
ZoneList<CharacterRange>** included,
ZoneList<CharacterRange>** excluded);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
// Convert range list to canonical form. The characters covered by the ranges
// will still be the same, but no character is in more than one range, and
// adjacent ranges are merged. The resulting list may be shorter than the
// original, but cannot be longer.
static void Canonicalize(ZoneList<CharacterRange>* ranges);
// Check how the set of characters defined by a CharacterRange list relates
// to the set of word characters. List must be in canonical form.
static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
// Takes two character range lists (representing character sets) in canonical
// form and merges them.
// The characters that are only covered by the first set are added to
// first_set_only_out. The characters that are only in the second set are
// added to second_set_only_out, and the characters that are in both are
// added to both_sets_out.
// The pointers to first_set_only_out, second_set_only_out and both_sets_out
// should be to empty lists, but they need not be distinct, and may be NULL.
// If NULL, the characters are dropped, and if two arguments are the same
// pointer, the result is the union of the two sets that would be created
// if the pointers had been distinct.
// This way, the Merge function can compute all the usual set operations:
// union (all three out-sets are equal), intersection (only both_sets_out is
// non-NULL), and set difference (only first_set_only_out is non-NULL).
static void Merge(ZoneList<CharacterRange>* first_set,
ZoneList<CharacterRange>* second_set,
ZoneList<CharacterRange>* first_set_only_out,
ZoneList<CharacterRange>* second_set_only_out,
ZoneList<CharacterRange>* both_sets_out);
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst);
static const int kRangeCanonicalizeMax = 0x346;
static const int kStartMarker = (1 << 24);
static const int kPayloadMask = (1 << 24) - 1;
......@@ -482,7 +565,7 @@ class QuickCheckDetails {
class RegExpNode: public ZoneObject {
public:
RegExpNode() : trace_count_(0) { }
RegExpNode() : first_character_set_(NULL), trace_count_(0) { }
virtual ~RegExpNode();
virtual void Accept(NodeVisitor* visitor) = 0;
// Generates a goto to this node or actually generates the code at this point.
......@@ -533,8 +616,29 @@ class RegExpNode: public ZoneObject {
SiblingList* siblings() { return &siblings_; }
void set_siblings(SiblingList* other) { siblings_ = *other; }
// Return the set of possible next characters recognized by the regexp
// (or a safe subset, potentially the set of all characters).
ZoneList<CharacterRange>* FirstCharacterSet();
// Compute (if possible within the budget of traversed nodes) the
// possible first characters of the input matched by this node and
// its continuation. Returns the remaining budget after the computation.
// If the budget is spent, the result is negative, and the cached
// first_character_set_ value isn't set.
virtual int ComputeFirstCharacterSet(int budget);
// Get and set the cached first character set value.
// The getter returns the raw cached pointer and performs no computation; it
// is NULL until a set has been stored (the constructor initializes it to
// NULL). Use FirstCharacterSet() to compute the value on demand.
ZoneList<CharacterRange>* first_character_set() {
return first_character_set_;
}
// Stores the given list pointer as the cached value; the list is not copied.
void set_first_character_set(ZoneList<CharacterRange>* character_set) {
first_character_set_ = character_set;
}
protected:
enum LimitResult { DONE, CONTINUE };
static const int kComputeFirstCharacterSetFail = -1;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
// Returns a sibling of this node whose interests and assumptions
......@@ -555,9 +659,11 @@ class RegExpNode: public ZoneObject {
virtual RegExpNode* Clone() = 0;
private:
static const int kFirstCharBudget = 10;
Label label_;
NodeInfo info_;
SiblingList siblings_;
ZoneList<CharacterRange>* first_character_set_;
// This variable keeps track of how many times code has been generated for
// this node (in different traces). We don't keep track of where the
// generated code is located unless the code is generated at the start of
......@@ -648,7 +754,7 @@ class ActionNode: public SeqRegExpNode {
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
virtual ActionNode* Clone() { return new ActionNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
private:
union {
struct {
......@@ -714,7 +820,7 @@ class TextNode: public SeqRegExpNode {
return result;
}
void CalculateOffsets();
virtual int ComputeFirstCharacterSet(int budget);
private:
enum TextEmitPassType {
NON_ASCII_MATCH, // Check for characters that can't match.
......@@ -744,7 +850,12 @@ class AssertionNode: public SeqRegExpNode {
AT_START,
AT_BOUNDARY,
AT_NON_BOUNDARY,
AFTER_NEWLINE
AFTER_NEWLINE,
// Types not directly expressible in regexp syntax.
// Used for modifying a boundary node if its following character is
// known to be word and/or non-word.
AFTER_NONWORD_CHARACTER,
AFTER_WORD_CHARACTER
};
static AssertionNode* AtEnd(RegExpNode* on_success) {
return new AssertionNode(AT_END, on_success);
......@@ -768,8 +879,10 @@ class AssertionNode: public SeqRegExpNode {
RegExpCompiler* compiler,
int filled_in,
bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
void set_type(AssertionNodeType type) { type_ = type; }
private:
AssertionNode(AssertionNodeType t, RegExpNode* on_success)
: SeqRegExpNode(on_success), type_(t) { }
......@@ -797,7 +910,7 @@ class BackReferenceNode: public SeqRegExpNode {
return;
}
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
private:
int start_reg_;
int end_reg_;
......@@ -819,7 +932,6 @@ class EndNode: public RegExpNode {
UNREACHABLE();
}
virtual EndNode* Clone() { return new EndNode(*this); }
private:
Action action_;
};
......@@ -953,6 +1065,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
virtual int ComputeFirstCharacterSet(int budget);
};
......@@ -971,6 +1084,7 @@ class LoopChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
......@@ -1126,7 +1240,7 @@ class Trace {
void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
void set_stop_node(RegExpNode* node) { stop_node_ = node; }
void set_loop_label(Label* label) { loop_label_ = label; }
void set_characters_preloaded(int cpre) { characters_preloaded_ = cpre; }
void set_characters_preloaded(int count) { characters_preloaded_ = count; }
void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
void set_flush_budget(int to) { flush_budget_ = to; }
void set_quick_check_performed(QuickCheckDetails* d) {
......
......@@ -371,7 +371,7 @@ class RegExpBuilder: public ZoneObject {
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max, bool is_greedy);
void AddQuantifierToAtom(int min, int max, RegExpQuantifier::Type type);
RegExpTree* ToRegExp();
private:
void FlushCharacters();
......@@ -503,7 +503,9 @@ RegExpTree* RegExpBuilder::ToRegExp() {
}
void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
void RegExpBuilder::AddQuantifierToAtom(int min,
int max,
RegExpQuantifier::Type type) {
if (pending_empty_) {
pending_empty_ = false;
return;
......@@ -543,7 +545,7 @@ void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
UNREACHABLE();
return;
}
terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));
terms_.Add(new RegExpQuantifier(min, max, type, atom));
LAST(ADD_TERM);
}
......@@ -4278,12 +4280,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
default:
continue;
}
bool is_greedy = true;
RegExpQuantifier::Type type = RegExpQuantifier::GREEDY;
if (current() == '?') {
is_greedy = false;
type = RegExpQuantifier::NON_GREEDY;
Advance();
} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
// FLAG_regexp_possessive_quantifier is a debug-only flag.
type = RegExpQuantifier::POSSESSIVE;
Advance();
}
builder->AddQuantifierToAtom(min, max, is_greedy);
builder->AddQuantifierToAtom(min, max, type);
}
}
......
......@@ -307,18 +307,11 @@ void RegExpMacroAssemblerTracer::CheckCharacters(Vector<const uc16> str,
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
bool supported = assembler_->CheckSpecialCharacterClass(type,
cp_offset,
check_offset,
on_no_match);
PrintF(" CheckSpecialCharacterClass(type='%c', offset=%d, "
"check_offset=%s, label[%08x]): %s;\n",
PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n",
type,
cp_offset,
check_offset ? "true" : "false",
on_no_match,
supported ? "true" : "false");
return supported;
......
......@@ -69,8 +69,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
uc16 and_with,
Label* on_not_equal);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -123,8 +123,6 @@ class RegExpMacroAssembler {
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
return false;
}
......
......@@ -60,7 +60,7 @@ namespace internal {
* - r8 : code object pointer. Used to convert between absolute and
* code-object-relative addresses.
*
* The registers rax, rbx, rcx, r9 and r11 are free to use for computations.
* The registers rax, rbx, r9 and r11 are free to use for computations.
* If changed to use r12+, they should be saved as callee-save registers.
*
* Each call to a C++ method should retain these registers.
......@@ -496,27 +496,22 @@ void RegExpMacroAssemblerX64::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
// (c - min) <= (max - min) check, using the sequence:
// lea(rax, Operand(current_character(), -min)) or sub(rax, Immediate(min))
// cmp(rax, Immediate(max - min))
switch (type) {
case 's':
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmpl(current_character(), Immediate(' '));
__ j(equal, &success);
// Check range 0x09..0x0d
__ subl(current_character(), Immediate('\t'));
__ cmpl(current_character(), Immediate('\r' - '\t'));
__ lea(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
BranchOrBacktrack(above, on_no_match);
__ bind(&success);
return true;
......@@ -524,72 +519,116 @@ bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmpl(current_character(), Immediate(' '));
BranchOrBacktrack(equal, on_no_match);
__ subl(current_character(), Immediate('\t'));
__ cmpl(current_character(), Immediate('\r' - '\t'));
__ lea(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
}
return false;
case 'd':
// Match ASCII digits ('0'..'9')
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ subl(current_character(), Immediate('0'));
__ cmpl(current_character(), Immediate('9' - '0'));
__ lea(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ subl(current_character(), Immediate('0'));
__ cmpl(current_character(), Immediate('9' - '0'));
__ lea(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ xor_(current_character(), Immediate(0x01));
__ movl(rax, current_character());
__ xor_(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ subl(current_character(), Immediate(0x0b));
__ cmpl(current_character(), Immediate(0x0c - 0x0b));
__ subl(rax, Immediate(0x0b));
__ cmpl(rax, Immediate(0x0c - 0x0b));
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ subl(current_character(), Immediate(0x2028 - 0x0b));
__ cmpl(current_character(), Immediate(1));
__ subl(rax, Immediate(0x2028 - 0x0b));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
__ movl(rax, current_character());
__ xor_(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ subl(rax, Immediate(0x0b));
__ cmpl(rax, Immediate(0x0c - 0x0b));
if (mode_ == ASCII) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ subl(rax, Immediate(0x2028 - 0x0b));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
Label done, check_digits;
__ cmpl(current_character(), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmpl(current_character(), Immediate('_'));
__ j(equal, &done);
// Convert to lower case if letter.
__ movl(rax, current_character());
__ orl(rax, Immediate(0x20));
// check rax in range ['a'..'z'].
__ subl(rax, Immediate('a'));
__ cmpl(rax, Immediate('z' - 'a'));
BranchOrBacktrack(above, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmpl(current_character(), Immediate('0'));
BranchOrBacktrack(below, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
Label done, check_digits;
__ cmpl(current_character(), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmpl(current_character(), Immediate('_'));
BranchOrBacktrack(equal, on_no_match);
// Convert to lower case if letter.
__ movl(rax, current_character());
__ orl(rax, Immediate(0x20));
// check current character in range ['a'..'z'], nondestructively.
__ subl(rax, Immediate('a'));
__ cmpl(rax, Immediate('z' - 'a'));
BranchOrBacktrack(below_equal, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmpl(current_character(), Immediate('0'));
BranchOrBacktrack(above_equal, on_no_match);
__ bind(&done);
return true;
}
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -73,8 +73,6 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler {
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -58,6 +58,16 @@
using namespace v8::internal;
static bool CheckParse(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
RegExpCompileData result;
return v8::internal::ParseRegExp(&reader, false, &result);
}
static SmartPointer<const char> Parse(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
......@@ -106,7 +116,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
}
// Convenience assertions for the parser tests.
// CHECK_PARSE_ERROR: |input| must be rejected by the parser (CheckParse
// returns false).
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
// CHECK_PARSE_EQ: the string produced by Parse(input) must equal |expected|.
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
// CHECK_SIMPLE: CheckSimple(input) must equal |simple|.
// NOTE(review): unlike the macros above, this one carries its own trailing
// semicolon, so `CHECK_SIMPLE(...);` expands to an extra empty statement.
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
#define CHECK_MIN_MAX(input, min, max) \
......@@ -600,6 +610,34 @@ TEST(DispatchTableConstruction) {
}
}
// Test of debug-only syntax.
#ifdef DEBUG
TEST(ParsePossessiveRepetition) {
  // Remember the flag so it can be put back once the test is done.
  const bool saved_flag_value = FLAG_regexp_possessive_quantifier;

  // With the flag on, a '+' following a quantifier parses as a possessive
  // quantifier, printed with a 'p' marker.
  FLAG_regexp_possessive_quantifier = true;
  CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
  CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
  CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
  CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
  CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");

  // With the flag off, the same syntax has to be rejected.
  FLAG_regexp_possessive_quantifier = false;
  CHECK_PARSE_ERROR("a*+");
  CHECK_PARSE_ERROR("a++");
  CHECK_PARSE_ERROR("a?+");
  CHECK_PARSE_ERROR("a{10,20}+");
  CHECK_PARSE_ERROR("a{10,20}+b");

  FLAG_regexp_possessive_quantifier = saved_flag_value;
}
#endif
// Tests of interpreter.
......@@ -1550,7 +1588,68 @@ TEST(CharClassDifference) {
}
// Exercises CharacterSet::Canonicalize() on a shared range list: the list is
// refilled for each scenario and inspected after canonicalization.
// NOTE(review): the checks below use ASSERT_EQ rather than the CHECK_EQ used
// by the other tests in this file; if ASSERT_EQ is compiled out in
// non-debug builds, this test verifies nothing there -- confirm.
TEST(CanonicalizeCharacterSets) {
ZoneScope scope(DELETE_ON_EXIT);
ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4);
CharacterSet set(list);
// Scenario 1: already sorted, separated ranges stay exactly as they are.
list->Add(CharacterRange(10, 20));
list->Add(CharacterRange(30, 40));
list->Add(CharacterRange(50, 60));
set.Canonicalize();
ASSERT_EQ(3, list->length());
ASSERT_EQ(10, list->at(0).from());
ASSERT_EQ(20, list->at(0).to());
ASSERT_EQ(30, list->at(1).from());
ASSERT_EQ(40, list->at(1).to());
ASSERT_EQ(50, list->at(2).from());
ASSERT_EQ(60, list->at(2).to());
// Scenario 2: the same ranges added out of order come back sorted.
list->Rewind(0);
list->Add(CharacterRange(10, 20));
list->Add(CharacterRange(50, 60));
list->Add(CharacterRange(30, 40));
set.Canonicalize();
ASSERT_EQ(3, list->length());
ASSERT_EQ(10, list->at(0).from());
ASSERT_EQ(20, list->at(0).to());
ASSERT_EQ(30, list->at(1).from());
ASSERT_EQ(40, list->at(1).to());
ASSERT_EQ(50, list->at(2).from());
ASSERT_EQ(60, list->at(2).to());
// Scenario 3: unordered ranges and single characters, none of them
// touching, are sorted but not merged.
list->Rewind(0);
list->Add(CharacterRange(30, 40));
list->Add(CharacterRange(10, 20));
list->Add(CharacterRange(25, 25));
list->Add(CharacterRange(100, 100));
list->Add(CharacterRange(1, 1));
set.Canonicalize();
ASSERT_EQ(5, list->length());
ASSERT_EQ(1, list->at(0).from());
ASSERT_EQ(1, list->at(0).to());
ASSERT_EQ(10, list->at(1).from());
ASSERT_EQ(20, list->at(1).to());
ASSERT_EQ(25, list->at(2).from());
ASSERT_EQ(25, list->at(2).to());
ASSERT_EQ(30, list->at(3).from());
ASSERT_EQ(40, list->at(3).to());
ASSERT_EQ(100, list->at(4).from());
ASSERT_EQ(100, list->at(4).to());
// Scenario 4: 10-19 and 21-30 are bridged by the single character 20, so
// all three adjacent ranges collapse into one range 10-30.
list->Rewind(0);
list->Add(CharacterRange(10, 19));
list->Add(CharacterRange(21, 30));
list->Add(CharacterRange(20, 20));
set.Canonicalize();
ASSERT_EQ(1, list->length());
ASSERT_EQ(10, list->at(0).from());
ASSERT_EQ(30, list->at(0).to());
}
// Runs the Execute() test helper on two sample patterns after initializing
// V8. The helper's flag arguments are not visible from here.
TEST(Graph) {
V8::Initialize(NULL);
// NOTE(review): "\1" inside a C++ string literal is an octal escape (the
// character with code 1), not the two characters backslash-1. If a regexp
// back reference was intended this would need to be "\\1" -- confirm.
Execute("(?:(?:x(.))?\1)+$", false, true, true);
// Word-boundary pattern; the backslashes are escaped for the C++ literal.
Execute("\\b\\w+\\b", false, true, true);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment