Commit 46504c15 authored by lrn@chromium.org

Attempt to make \b\w+ faster. Slight performance increase on, e.g., string unpacking.

Review URL: http://codereview.chromium.org/507051


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3563 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 40d6cbca
......@@ -465,8 +465,6 @@ void RegExpMacroAssemblerARM::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
......@@ -475,11 +473,6 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmp(current_character(), Operand(' '));
__ b(eq, &success);
......@@ -493,11 +486,6 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmp(current_character(), Operand(' '));
......@@ -510,33 +498,18 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
return false;
case 'd':
// Match ASCII digits ('0'..'9').
// Implemented as the unsigned range check (c - '0') <= ('9' - '0'); the
// difference is computed into r0 so current_character() is left intact.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(r0, current_character(), Operand('0'));
// Compare the offset value in r0, not the raw character: comparing the raw
// character against 9 would reject every digit ('0' is 0x30) and accept
// the control characters 0x00..0x09 instead.
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(hi, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(r0, current_character(), Operand('0'));
__ cmp(r0, Operand('9' - '0'));
BranchOrBacktrack(ls, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(r0, r0, Operand(0x0b));
......@@ -552,13 +525,71 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type,
}
return true;
}
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
__ eor(r0, current_character(), Operand(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(r0, r0, Operand(0x0b));
__ cmp(r0, Operand(0x0c - 0x0b));
if (mode_ == ASCII) {
BranchOrBacktrack(hi, on_no_match);
} else {
Label done;
__ b(ls, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(r0, r0, Operand(0x2028 - 0x0b));
__ cmp(r0, Operand(1));
BranchOrBacktrack(hi, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
// Match word character (0-9, A-Z, a-z and _).
Label digits, done;
__ cmp(current_character(), Operand('9'));
__ b(ls, &digits);
__ cmp(current_character(), Operand('_'));
__ b(eq, &done);
__ orr(r0, current_character(), Operand(0x20));
__ sub(r0, r0, Operand('a'));
__ cmp(r0, Operand('z' - 'a'));
BranchOrBacktrack(hi, on_no_match);
__ jmp(&done);
__ bind(&digits);
__ cmp(current_character(), Operand('0'));
BranchOrBacktrack(lo, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
// Match non-word character (not 0-9, A-Z, a-z and _).
Label digits, done;
__ cmp(current_character(), Operand('9'));
__ b(ls, &digits);
__ cmp(current_character(), Operand('_'));
BranchOrBacktrack(eq, on_no_match);
__ orr(r0, current_character(), Operand(0x20));
__ sub(r0, r0, Operand('a'));
__ cmp(r0, Operand('z' - 'a'));
BranchOrBacktrack(ls, on_no_match);
__ jmp(&done);
__ bind(&digits);
__ cmp(current_character(), Operand('0'));
BranchOrBacktrack(hs, on_no_match);
__ bind(&done);
return true;
}
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -80,8 +80,6 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler {
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -433,7 +433,7 @@ void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
} else {
stream()->Add("%i ", that->max());
}
stream()->Add(that->is_greedy() ? "g " : "n ");
stream()->Add(that->is_greedy() ? "g " : that->is_possessive() ? "p " : "n ");
that->body()->Accept(this, data);
stream()->Add(")");
return NULL;
......
......@@ -1526,6 +1526,7 @@ class CharacterSet BASE_EMBEDDED {
standard_set_type_ = special_set_type;
}
bool is_standard() { return standard_set_type_ != 0; }
void Canonicalize();
private:
ZoneList<CharacterRange>* ranges_;
// If non-zero, the value represents a standard set (e.g., all whitespace
......@@ -1619,12 +1620,13 @@ class RegExpText: public RegExpTree {
class RegExpQuantifier: public RegExpTree {
public:
RegExpQuantifier(int min, int max, bool is_greedy, RegExpTree* body)
: min_(min),
enum Type { GREEDY, NON_GREEDY, POSSESSIVE };
RegExpQuantifier(int min, int max, Type type, RegExpTree* body)
: body_(body),
min_(min),
max_(max),
is_greedy_(is_greedy),
body_(body),
min_match_(min * body->min_match()) {
min_match_(min * body->min_match()),
type_(type) {
if (max > 0 && body->max_match() > kInfinity / max) {
max_match_ = kInfinity;
} else {
......@@ -1648,15 +1650,17 @@ class RegExpQuantifier: public RegExpTree {
virtual int max_match() { return max_match_; }
int min() { return min_; }
int max() { return max_; }
bool is_greedy() { return is_greedy_; }
bool is_possessive() { return type_ == POSSESSIVE; }
bool is_non_greedy() { return type_ == NON_GREEDY; }
bool is_greedy() { return type_ == GREEDY; }
RegExpTree* body() { return body_; }
private:
RegExpTree* body_;
int min_;
int max_;
bool is_greedy_;
RegExpTree* body_;
int min_match_;
int max_match_;
Type type_;
};
......
......@@ -329,6 +329,9 @@ DEFINE_bool(collect_heap_spill_statistics, false,
"(requires heap_stats)")
// Regexp
DEFINE_bool(regexp_possessive_quantifier,
false,
"enable possessive quantifier syntax for testing")
DEFINE_bool(trace_regexp_bytecodes, false, "trace regexp bytecode execution")
DEFINE_bool(trace_regexp_assembler,
false,
......
......@@ -477,8 +477,6 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
......@@ -487,17 +485,12 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmp(current_character(), ' ');
__ j(equal, &success);
// Check range 0x09..0x0d
__ sub(Operand(current_character()), Immediate('\t'));
__ cmp(current_character(), '\r' - '\t');
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
BranchOrBacktrack(above, on_no_match);
__ bind(&success);
return true;
......@@ -505,72 +498,118 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmp(current_character(), ' ');
BranchOrBacktrack(equal, on_no_match);
__ sub(Operand(current_character()), Immediate('\t'));
__ cmp(current_character(), '\r' - '\t');
__ lea(eax, Operand(current_character(), -'\t'));
__ cmp(eax, '\r' - '\t');
BranchOrBacktrack(below_equal, on_no_match);
return true;
}
return false;
case 'd':
// Match ASCII digits ('0'..'9')
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(Operand(current_character()), Immediate('0'));
__ cmp(current_character(), '9' - '0');
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ sub(Operand(current_character()), Immediate('0'));
__ cmp(current_character(), '9' - '0');
__ lea(eax, Operand(current_character(), -'0'));
__ cmp(eax, '9' - '0');
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ xor_(Operand(current_character()), Immediate(0x01));
__ mov(Operand(eax), current_character());
__ xor_(Operand(eax), Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(Operand(current_character()), Immediate(0x0b));
__ cmp(current_character(), 0x0c - 0x0b);
__ sub(Operand(eax), Immediate(0x0b));
__ cmp(eax, 0x0c - 0x0b);
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(Operand(current_character()), Immediate(0x2028 - 0x0b));
__ cmp(current_character(), 1);
__ sub(Operand(eax), Immediate(0x2028 - 0x0b));
__ cmp(eax, 0x2029 - 0x2028);
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
case 'w': {
Label done, check_digits;
__ cmp(Operand(current_character()), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmp(Operand(current_character()), Immediate('_'));
__ j(equal, &done);
// Convert to lower case if letter.
__ mov(Operand(eax), current_character());
__ or_(eax, 0x20);
// check current character in range ['a'..'z'], nondestructively.
__ sub(Operand(eax), Immediate('a'));
__ cmp(Operand(eax), Immediate('z' - 'a'));
BranchOrBacktrack(above, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmp(Operand(current_character()), Immediate('0'));
BranchOrBacktrack(below, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
Label done, check_digits;
__ cmp(Operand(current_character()), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmp(Operand(current_character()), Immediate('_'));
BranchOrBacktrack(equal, on_no_match);
// Convert to lower case if letter.
__ mov(Operand(eax), current_character());
__ or_(eax, 0x20);
// check current character in range ['a'..'z'], nondestructively.
__ sub(Operand(eax), Immediate('a'));
__ cmp(Operand(eax), Immediate('z' - 'a'));
BranchOrBacktrack(below_equal, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmp(Operand(current_character()), Immediate('0'));
BranchOrBacktrack(above_equal, on_no_match);
__ bind(&done);
return true;
}
// Non-standard classes (with no syntactic shorthand) used internally.
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
return true;
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 or 0x2029).
// The opposite of '.'.
__ mov(Operand(eax), current_character());
__ xor_(Operand(eax), Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ sub(Operand(eax), Immediate(0x0b));
__ cmp(eax, 0x0c - 0x0b);
if (mode_ == ASCII) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
ASSERT_EQ(UC16, mode_);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ sub(Operand(eax), Immediate(0x2028 - 0x0b));
__ cmp(eax, 1);
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
}
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -78,10 +78,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler {
// Checks whether the given offset from the current position is before
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
virtual void GoTo(Label* label);
......
This diff is collapsed.
......@@ -177,6 +177,57 @@ class RegExpImpl {
};
// Represents the location of one element relative to the intersection of
// two sets. Corresponds to the four areas of a Venn diagram.
// The numeric values form a two-bit mask: the low bit (1) is set when the
// element is in the first set, the next bit (2) when it is in the second.
enum ElementInSetsRelation {
kInsideNone = 0,  // In neither set.
kInsideFirst = 1,  // In the first set only.
kInsideSecond = 2,  // In the second set only.
kInsideBoth = 3  // In both sets (1 | 2).
};
// Represents the relation of two sets.
// Sets can be either disjoint, partially or fully overlapping, or equal.
class SetRelation BASE_EMBEDDED {
public:
// Relation is represented by a bit saying whether there are elements in
// one set that is not in the other, and a bit saying that there are elements
// that are in both sets.
// Location of an element. Corresponds to the internal areas of
// a Venn diagram.
enum {
kInFirst = 1 << kInsideFirst,
kInSecond = 1 << kInsideSecond,
kInBoth = 1 << kInsideBoth
};
SetRelation() : bits_(0) {}
~SetRelation() {}
// Add the existence of objects in a particular
void SetElementsInFirstSet() { bits_ |= kInFirst; }
void SetElementsInSecondSet() { bits_ |= kInSecond; }
void SetElementsInBothSets() { bits_ |= kInBoth; }
// Check the currently known relation of the sets (common functions only,
// for other combinations, use value() to get the bits and check them
// manually).
// Sets are completely disjoint.
bool Disjoint() { return (bits_ & kInBoth) == 0; }
// Sets are equal.
bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
// First set contains second.
bool Contains() { return (bits_ & kInSecond) == 0; }
// Second set contains first.
bool ContainedIn() { return (bits_ & kInFirst) == 0; }
bool NonTrivialIntersection() {
return (bits_ == (kInFirst | kInSecond | kInBoth));
}
int value() { return bits_; }
private:
int bits_;
};
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) { }
......@@ -208,7 +259,39 @@ class CharacterRange {
Vector<const uc16> overlay,
ZoneList<CharacterRange>** included,
ZoneList<CharacterRange>** excluded);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
// Convert range list to canonical form. The characters covered by the ranges
// will still be the same, but no character is in more than one range, and
// adjacent ranges are merged. The resulting list may be shorter than the
// original, but cannot be longer.
static void Canonicalize(ZoneList<CharacterRange>* ranges);
// Check how the set of characters defined by a CharacterRange list relates
// to the set of word characters. List must be in canonical form.
static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
// Takes two character range lists (representing character sets) in canonical
// form and merges them.
// The characters that are only covered by the first set are added to
// first_set_only_out. the characters that are only in the second set are
// added to second_set_only_out, and the characters that are in both are
// added to both_sets_out.
// The pointers to first_set_only_out, second_set_only_out and both_sets_out
// should be to empty lists, but they need not be distinct, and may be NULL.
// If NULL, the characters are dropped, and if two arguments are the same
// pointer, the result is the union of the two sets that would be created
// if the pointers had been distinct.
// This way, the Merge function can compute all the usual set operations:
// union (all three out-sets are equal), intersection (only both_sets_out is
// non-NULL), and set difference (only first_set is non-NULL).
static void Merge(ZoneList<CharacterRange>* first_set,
ZoneList<CharacterRange>* second_set,
ZoneList<CharacterRange>* first_set_only_out,
ZoneList<CharacterRange>* second_set_only_out,
ZoneList<CharacterRange>* both_sets_out);
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst);
static const int kRangeCanonicalizeMax = 0x346;
static const int kStartMarker = (1 << 24);
static const int kPayloadMask = (1 << 24) - 1;
......@@ -482,7 +565,7 @@ class QuickCheckDetails {
class RegExpNode: public ZoneObject {
public:
RegExpNode() : trace_count_(0) { }
RegExpNode() : first_character_set_(NULL), trace_count_(0) { }
virtual ~RegExpNode();
virtual void Accept(NodeVisitor* visitor) = 0;
// Generates a goto to this node or actually generates the code at this point.
......@@ -533,8 +616,29 @@ class RegExpNode: public ZoneObject {
SiblingList* siblings() { return &siblings_; }
void set_siblings(SiblingList* other) { siblings_ = *other; }
// Return the set of possible next characters recognized by the regexp
// (or a safe subset, potentially the set of all characters).
ZoneList<CharacterRange>* FirstCharacterSet();
// Compute (if possible within the budget of traversed nodes) the
// possible first characters of the input matched by this node and
// its continuation. Returns the remaining budget after the computation.
// If the budget is spent, the result is negative, and the cached
// first_character_set_ value isn't set.
virtual int ComputeFirstCharacterSet(int budget);
// Get and set the cached first character set value.
ZoneList<CharacterRange>* first_character_set() {
return first_character_set_;
}
void set_first_character_set(ZoneList<CharacterRange>* character_set) {
first_character_set_ = character_set;
}
protected:
enum LimitResult { DONE, CONTINUE };
static const int kComputeFirstCharacterSetFail = -1;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
// Returns a sibling of this node whose interests and assumptions
......@@ -555,9 +659,11 @@ class RegExpNode: public ZoneObject {
virtual RegExpNode* Clone() = 0;
private:
static const int kFirstCharBudget = 10;
Label label_;
NodeInfo info_;
SiblingList siblings_;
ZoneList<CharacterRange>* first_character_set_;
// This variable keeps track of how many times code has been generated for
// this node (in different traces). We don't keep track of where the
// generated code is located unless the code is generated at the start of
......@@ -648,7 +754,7 @@ class ActionNode: public SeqRegExpNode {
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
virtual ActionNode* Clone() { return new ActionNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
private:
union {
struct {
......@@ -714,7 +820,7 @@ class TextNode: public SeqRegExpNode {
return result;
}
void CalculateOffsets();
virtual int ComputeFirstCharacterSet(int budget);
private:
enum TextEmitPassType {
NON_ASCII_MATCH, // Check for characters that can't match.
......@@ -744,7 +850,12 @@ class AssertionNode: public SeqRegExpNode {
AT_START,
AT_BOUNDARY,
AT_NON_BOUNDARY,
AFTER_NEWLINE
AFTER_NEWLINE,
// Types not directly expressible in regexp syntax.
// Used for modifying a boundary node if its following character is
// known to be word and/or non-word.
AFTER_NONWORD_CHARACTER,
AFTER_WORD_CHARACTER
};
static AssertionNode* AtEnd(RegExpNode* on_success) {
return new AssertionNode(AT_END, on_success);
......@@ -768,8 +879,10 @@ class AssertionNode: public SeqRegExpNode {
RegExpCompiler* compiler,
int filled_in,
bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
void set_type(AssertionNodeType type) { type_ = type; }
private:
AssertionNode(AssertionNodeType t, RegExpNode* on_success)
: SeqRegExpNode(on_success), type_(t) { }
......@@ -797,7 +910,7 @@ class BackReferenceNode: public SeqRegExpNode {
return;
}
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
private:
int start_reg_;
int end_reg_;
......@@ -819,7 +932,6 @@ class EndNode: public RegExpNode {
UNREACHABLE();
}
virtual EndNode* Clone() { return new EndNode(*this); }
private:
Action action_;
};
......@@ -953,6 +1065,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
virtual int ComputeFirstCharacterSet(int budget);
};
......@@ -971,6 +1084,7 @@ class LoopChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
......@@ -1126,7 +1240,7 @@ class Trace {
void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
void set_stop_node(RegExpNode* node) { stop_node_ = node; }
void set_loop_label(Label* label) { loop_label_ = label; }
void set_characters_preloaded(int cpre) { characters_preloaded_ = cpre; }
void set_characters_preloaded(int count) { characters_preloaded_ = count; }
void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
void set_flush_budget(int to) { flush_budget_ = to; }
void set_quick_check_performed(QuickCheckDetails* d) {
......
......@@ -371,7 +371,7 @@ class RegExpBuilder: public ZoneObject {
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max, bool is_greedy);
void AddQuantifierToAtom(int min, int max, RegExpQuantifier::Type type);
RegExpTree* ToRegExp();
private:
void FlushCharacters();
......@@ -503,7 +503,9 @@ RegExpTree* RegExpBuilder::ToRegExp() {
}
void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
void RegExpBuilder::AddQuantifierToAtom(int min,
int max,
RegExpQuantifier::Type type) {
if (pending_empty_) {
pending_empty_ = false;
return;
......@@ -543,7 +545,7 @@ void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
UNREACHABLE();
return;
}
terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));
terms_.Add(new RegExpQuantifier(min, max, type, atom));
LAST(ADD_TERM);
}
......@@ -4278,12 +4280,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
default:
continue;
}
bool is_greedy = true;
RegExpQuantifier::Type type = RegExpQuantifier::GREEDY;
if (current() == '?') {
is_greedy = false;
type = RegExpQuantifier::NON_GREEDY;
Advance();
} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
// FLAG_regexp_possessive_quantifier is a debug-only flag.
type = RegExpQuantifier::POSSESSIVE;
Advance();
}
builder->AddQuantifierToAtom(min, max, is_greedy);
builder->AddQuantifierToAtom(min, max, type);
}
}
......
......@@ -307,18 +307,11 @@ void RegExpMacroAssemblerTracer::CheckCharacters(Vector<const uc16> str,
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
bool supported = assembler_->CheckSpecialCharacterClass(type,
cp_offset,
check_offset,
on_no_match);
PrintF(" CheckSpecialCharacterClass(type='%c', offset=%d, "
"check_offset=%s, label[%08x]): %s;\n",
PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n",
type,
cp_offset,
check_offset ? "true" : "false",
on_no_match,
supported ? "true" : "false");
return supported;
......
......@@ -69,8 +69,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
uc16 and_with,
Label* on_not_equal);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -123,8 +123,6 @@ class RegExpMacroAssembler {
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
return false;
}
......
......@@ -60,7 +60,7 @@ namespace internal {
* - r8 : code object pointer. Used to convert between absolute and
* code-object-relative addresses.
*
* The registers rax, rbx, rcx, r9 and r11 are free to use for computations.
* The registers rax, rbx, r9 and r11 are free to use for computations.
* If changed to use r12+, they should be saved as callee-save registers.
*
* Each call to a C++ method should retain these registers.
......@@ -496,27 +496,22 @@ void RegExpMacroAssemblerX64::CheckNotCharacterAfterMinusAnd(
bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match) {
// Range checks (c in min..max) are generally implemented by an unsigned
// (c - min) <= (max - min) check
// (c - min) <= (max - min) check, using the sequence:
// lea(rax, Operand(current_character(), -min)) or sub(rax, Immediate(min))
// cmp(rax, Immediate(max - min))
switch (type) {
case 's':
// Match space-characters
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
Label success;
__ cmpl(current_character(), Immediate(' '));
__ j(equal, &success);
// Check range 0x09..0x0d
__ subl(current_character(), Immediate('\t'));
__ cmpl(current_character(), Immediate('\r' - '\t'));
__ lea(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
BranchOrBacktrack(above, on_no_match);
__ bind(&success);
return true;
......@@ -524,72 +519,116 @@ bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type,
return false;
case 'S':
// Match non-space characters.
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
if (mode_ == ASCII) {
// ASCII space characters are '\t'..'\r' and ' '.
__ cmpl(current_character(), Immediate(' '));
BranchOrBacktrack(equal, on_no_match);
__ subl(current_character(), Immediate('\t'));
__ cmpl(current_character(), Immediate('\r' - '\t'));
__ lea(rax, Operand(current_character(), -'\t'));
__ cmpl(rax, Immediate('\r' - '\t'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
}
return false;
case 'd':
// Match ASCII digits ('0'..'9')
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ subl(current_character(), Immediate('0'));
__ cmpl(current_character(), Immediate('9' - '0'));
__ lea(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(above, on_no_match);
return true;
case 'D':
// Match non ASCII-digits
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ subl(current_character(), Immediate('0'));
__ cmpl(current_character(), Immediate('9' - '0'));
__ lea(rax, Operand(current_character(), -'0'));
__ cmpl(rax, Immediate('9' - '0'));
BranchOrBacktrack(below_equal, on_no_match);
return true;
case '.': {
// Match non-newlines (not 0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
if (check_offset) {
LoadCurrentCharacter(cp_offset, on_no_match, 1);
} else {
LoadCurrentCharacterUnchecked(cp_offset, 1);
}
__ xor_(current_character(), Immediate(0x01));
__ movl(rax, current_character());
__ xor_(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ subl(current_character(), Immediate(0x0b));
__ cmpl(current_character(), Immediate(0x0c - 0x0b));
__ subl(rax, Immediate(0x0b));
__ cmpl(rax, Immediate(0x0c - 0x0b));
BranchOrBacktrack(below_equal, on_no_match);
if (mode_ == UC16) {
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ subl(current_character(), Immediate(0x2028 - 0x0b));
__ cmpl(current_character(), Immediate(1));
__ subl(rax, Immediate(0x2028 - 0x0b));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(below_equal, on_no_match);
}
return true;
}
case 'n': {
// Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
__ movl(rax, current_character());
__ xor_(rax, Immediate(0x01));
// See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
__ subl(rax, Immediate(0x0b));
__ cmpl(rax, Immediate(0x0c - 0x0b));
if (mode_ == ASCII) {
BranchOrBacktrack(above, on_no_match);
} else {
Label done;
BranchOrBacktrack(below_equal, &done);
// Compare original value to 0x2028 and 0x2029, using the already
// computed (current_char ^ 0x01 - 0x0b). I.e., check for
// 0x201d (0x2028 - 0x0b) or 0x201e.
__ subl(rax, Immediate(0x2028 - 0x0b));
__ cmpl(rax, Immediate(0x2029 - 0x2028));
BranchOrBacktrack(above, on_no_match);
__ bind(&done);
}
return true;
}
case 'w': {
Label done, check_digits;
__ cmpl(current_character(), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmpl(current_character(), Immediate('_'));
__ j(equal, &done);
// Convert to lower case if letter.
__ movl(rax, current_character());
__ orl(rax, Immediate(0x20));
// check rax in range ['a'..'z'].
__ subl(rax, Immediate('a'));
__ cmpl(rax, Immediate('z' - 'a'));
BranchOrBacktrack(above, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmpl(current_character(), Immediate('0'));
BranchOrBacktrack(below, on_no_match);
__ bind(&done);
return true;
}
case 'W': {
Label done, check_digits;
__ cmpl(current_character(), Immediate('9'));
__ j(less_equal, &check_digits);
__ cmpl(current_character(), Immediate('_'));
BranchOrBacktrack(equal, on_no_match);
// Convert to lower case if letter.
__ movl(rax, current_character());
__ orl(rax, Immediate(0x20));
// check current character in range ['a'..'z'], nondestructively.
__ subl(rax, Immediate('a'));
__ cmpl(rax, Immediate('z' - 'a'));
BranchOrBacktrack(below_equal, on_no_match);
__ jmp(&done);
__ bind(&check_digits);
// Check current character in range ['0'..'9'].
__ cmpl(current_character(), Immediate('0'));
BranchOrBacktrack(above_equal, on_no_match);
__ bind(&done);
return true;
}
case '*':
// Match any character.
if (check_offset) {
CheckPosition(cp_offset, on_no_match);
}
return true;
// No custom implementation (yet): w, W, s(UC16), S(UC16).
// No custom implementation (yet): s(UC16), S(UC16).
default:
return false;
}
......
......@@ -73,8 +73,6 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler {
// the end of the string.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
int cp_offset,
bool check_offset,
Label* on_no_match);
virtual void Fail();
virtual Handle<Object> GetCode(Handle<String> source);
......
......@@ -58,6 +58,16 @@
using namespace v8::internal;
static bool CheckParse(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
RegExpCompileData result;
return v8::internal::ParseRegExp(&reader, false, &result);
}
static SmartPointer<const char> Parse(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
......@@ -106,7 +116,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
}
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
#define CHECK_MIN_MAX(input, min, max) \
......@@ -600,6 +610,34 @@ TEST(DispatchTableConstruction) {
}
}
// Test of debug-only syntax.
// Checks that a trailing '+' after a quantifier is parsed as a possessive
// quantifier (printed as 'p') only while FLAG_regexp_possessive_quantifier
// is set, and is a parse error otherwise.
#ifdef DEBUG
TEST(ParsePossessiveRepetition) {
// Remember the current flag value so it can be restored at the end.
bool old_flag_value = FLAG_regexp_possessive_quantifier;
// Enable possessive quantifier syntax.
FLAG_regexp_possessive_quantifier = true;
// Each of '*', '+', '?' and '{m,n}' followed by '+' becomes possessive.
CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
// Disable possessive quantifier syntax.
FLAG_regexp_possessive_quantifier = false;
// With the flag off the same patterns must be rejected by the parser.
CHECK_PARSE_ERROR("a*+");
CHECK_PARSE_ERROR("a++");
CHECK_PARSE_ERROR("a?+");
CHECK_PARSE_ERROR("a{10,20}+");
CHECK_PARSE_ERROR("a{10,20}+b");
// Restore the flag so later tests see the original setting.
FLAG_regexp_possessive_quantifier = old_flag_value;
}
#endif
// Tests of interpreter.
......@@ -1550,7 +1588,68 @@ TEST(CharClassDifference) {
}
// Exercises CharacterSet::Canonicalize() on range lists that are already
// canonical, out of order, or contain adjacent ranges that must be merged.
// CHECK_EQ is used (rather than ASSERT_EQ) so the expectations are verified
// in every build configuration, matching the CHECK-based macros used by the
// other tests in this file.
TEST(CanonicalizeCharacterSets) {
  ZoneScope scope(DELETE_ON_EXIT);
  ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4);
  CharacterSet set(list);

  // Already sorted and disjoint: the list must come back unchanged.
  list->Add(CharacterRange(10, 20));
  list->Add(CharacterRange(30, 40));
  list->Add(CharacterRange(50, 60));
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Same ranges with the last two swapped: must be sorted by from value.
  list->Rewind(0);
  list->Add(CharacterRange(10, 20));
  list->Add(CharacterRange(50, 60));
  list->Add(CharacterRange(30, 40));
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Five unordered, non-adjacent ranges including single-character ones:
  // all must survive, in sorted order, with nothing merged.
  list->Rewind(0);
  list->Add(CharacterRange(30, 40));
  list->Add(CharacterRange(10, 20));
  list->Add(CharacterRange(25, 25));
  list->Add(CharacterRange(100, 100));
  list->Add(CharacterRange(1, 1));
  set.Canonicalize();
  CHECK_EQ(5, list->length());
  CHECK_EQ(1, list->at(0).from());
  CHECK_EQ(1, list->at(0).to());
  CHECK_EQ(10, list->at(1).from());
  CHECK_EQ(20, list->at(1).to());
  CHECK_EQ(25, list->at(2).from());
  CHECK_EQ(25, list->at(2).to());
  CHECK_EQ(30, list->at(3).from());
  CHECK_EQ(40, list->at(3).to());
  CHECK_EQ(100, list->at(4).from());
  CHECK_EQ(100, list->at(4).to());

  // 10-19, 21-30 and the single character 20 are adjacent: they must be
  // merged into the one range 10-30.
  list->Rewind(0);
  list->Add(CharacterRange(10, 19));
  list->Add(CharacterRange(21, 30));
  list->Add(CharacterRange(20, 20));
  set.Canonicalize();
  CHECK_EQ(1, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(30, list->at(0).to());
}
// Initializes V8 and passes regexp sources to Execute().
// NOTE(review): presumably a smoke test of regexp graph output; the meaning
// of the three boolean arguments is not visible here — confirm against
// Execute()'s declaration.
TEST(Graph) {
V8::Initialize(NULL);
// NOTE(review): "\1" in a C++ string literal is the octal escape for the
// character with code 1, not the two characters of a regexp back reference.
Execute("(?:(?:x(.))?\1)+$", false, true, true);
// Word-boundary / word-character pattern (the C++ literal spells \b\w+\b).
Execute("\\b\\w+\\b", false, true, true);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment