Commit 00122b76 authored by lrn@chromium.org's avatar lrn@chromium.org

Each RegExtTree node can now report the min and max size of strings it can match.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@988 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 4fe63706
......@@ -332,7 +332,7 @@ void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
stream()->Add("(# %i ", that->min());
if (that->max() == RegExpQuantifier::kInfinity) {
if (that->max() == RegExpTree::kInfinity) {
stream()->Add("- ");
} else {
stream()->Add("%i ", that->max());
......@@ -381,4 +381,34 @@ SmartPointer<const char> RegExpTree::ToString() {
}
RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
: alternatives_(alternatives) {
RegExpTree* first_alternative = alternatives->at(0);
min_match_ = first_alternative->min_match();
max_match_ = first_alternative->max_match();
for (int i = 1; i < alternatives->length(); i++) {
RegExpTree* alternative = alternatives->at(i);
min_match_ = Min(min_match_, alternative->min_match());
max_match_ = Max(max_match_, alternative->max_match());
}
}
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
: nodes_(nodes) {
min_match_ = 0;
max_match_ = 0;
for (int i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->at(i);
min_match_ += node->min_match();
int node_max_match = node->max_match();
if (kInfinity - max_match_ < node_max_match) {
max_match_ = kInfinity;
} else {
max_match_ += node->max_match();
}
}
}
} } // namespace v8::internal
......@@ -1213,11 +1213,14 @@ class ThisFunction: public Expression {
class RegExpTree: public ZoneObject {
public:
static const int kInfinity = (1<<31)-1;
virtual ~RegExpTree() { }
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) = 0;
virtual bool IsTextElement() { return false; }
virtual int min_match() = 0;
virtual int max_match() = 0;
virtual void AppendToText(RegExpText* text);
SmartPointer<const char> ToString();
#define MAKE_ASTYPE(Name) \
......@@ -1230,47 +1233,37 @@ class RegExpTree: public ZoneObject {
class RegExpDisjunction: public RegExpTree {
public:
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
: alternatives_(alternatives) { }
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success);
virtual RegExpDisjunction* AsDisjunction();
virtual bool IsDisjunction();
virtual int min_match() { return min_match_; }
virtual int max_match() { return max_match_; }
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
private:
ZoneList<RegExpTree*>* alternatives_;
int min_match_;
int max_match_;
};
class RegExpAlternative: public RegExpTree {
public:
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes) : nodes_(nodes) { }
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success);
virtual RegExpAlternative* AsAlternative();
virtual bool IsAlternative();
virtual int min_match() { return min_match_; }
virtual int max_match() { return max_match_; }
ZoneList<RegExpTree*>* nodes() { return nodes_; }
private:
ZoneList<RegExpTree*>* nodes_;
};
class RegExpText: public RegExpTree {
public:
RegExpText() : elements_(2) { }
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success);
virtual RegExpText* AsText();
virtual bool IsText();
virtual bool IsTextElement() { return true; }
virtual void AppendToText(RegExpText* text);
void AddElement(TextElement elm) { elements_.Add(elm); }
ZoneList<TextElement>* elements() { return &elements_; }
private:
ZoneList<TextElement> elements_;
int min_match_;
int max_match_;
};
......@@ -1290,6 +1283,8 @@ class RegExpAssertion: public RegExpTree {
RegExpNode* on_success);
virtual RegExpAssertion* AsAssertion();
virtual bool IsAssertion();
virtual int min_match() { return 0; }
virtual int max_match() { return 0; }
Type type() { return type_; }
private:
Type type_;
......@@ -1312,6 +1307,8 @@ class RegExpCharacterClass: public RegExpTree {
virtual RegExpCharacterClass* AsCharacterClass();
virtual bool IsCharacterClass();
virtual bool IsTextElement() { return true; }
virtual int min_match() { return 1; }
virtual int max_match() { return 1; }
virtual void AppendToText(RegExpText* text);
ZoneList<CharacterRange>* ranges() { return ranges_; }
bool is_negated() { return is_negated_; }
......@@ -1330,20 +1327,53 @@ class RegExpAtom: public RegExpTree {
virtual RegExpAtom* AsAtom();
virtual bool IsAtom();
virtual bool IsTextElement() { return true; }
virtual int min_match() { return data_.length(); }
virtual int max_match() { return data_.length(); }
virtual void AppendToText(RegExpText* text);
Vector<const uc16> data() { return data_; }
int length() { return data_.length(); }
private:
Vector<const uc16> data_;
};
class RegExpText: public RegExpTree {
public:
RegExpText() : elements_(2), length_(0) {}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success);
virtual RegExpText* AsText();
virtual bool IsText();
virtual bool IsTextElement() { return true; }
virtual int min_match() { return length_; }
virtual int max_match() { return length_; }
virtual void AppendToText(RegExpText* text);
void AddElement(TextElement elm) {
elements_.Add(elm);
length_ += elm.length();
};
ZoneList<TextElement>* elements() { return &elements_; }
private:
ZoneList<TextElement> elements_;
int length_;
};
class RegExpQuantifier: public RegExpTree {
public:
RegExpQuantifier(int min, int max, bool is_greedy, RegExpTree* body)
: min_(min),
max_(max),
is_greedy_(is_greedy),
body_(body) { }
body_(body),
min_match_(min * body->min_match()) {
if (max > 0 && body->max_match() > kInfinity / max) {
max_match_ = kInfinity;
} else {
max_match_ = max * body->max_match();
}
}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success);
......@@ -1355,18 +1385,19 @@ class RegExpQuantifier: public RegExpTree {
RegExpNode* on_success);
virtual RegExpQuantifier* AsQuantifier();
virtual bool IsQuantifier();
virtual int min_match() { return min_match_; }
virtual int max_match() { return max_match_; }
int min() { return min_; }
int max() { return max_; }
bool is_greedy() { return is_greedy_; }
RegExpTree* body() { return body_; }
// We just use a very large integer value as infinity because 2^30
// is infinite in practice.
static const int kInfinity = (1 << 30);
private:
int min_;
int max_;
bool is_greedy_;
RegExpTree* body_;
int min_match_;
int max_match_;
};
......@@ -1389,6 +1420,8 @@ class RegExpCapture: public RegExpTree {
RegExpNode* on_success);
virtual RegExpCapture* AsCapture();
virtual bool IsCapture();
virtual int min_match() { return body_->min_match(); }
virtual int max_match() { return body_->max_match(); }
RegExpTree* body() { return body_; }
int index() { return index_; }
inline CaptureAvailability available() { return available_; }
......@@ -1414,6 +1447,8 @@ class RegExpLookahead: public RegExpTree {
RegExpNode* on_success);
virtual RegExpLookahead* AsLookahead();
virtual bool IsLookahead();
virtual int min_match() { return 0; }
virtual int max_match() { return 0; }
RegExpTree* body() { return body_; }
bool is_positive() { return is_positive_; }
private:
......@@ -1431,6 +1466,8 @@ class RegExpBackReference: public RegExpTree {
RegExpNode* on_success);
virtual RegExpBackReference* AsBackReference();
virtual bool IsBackReference();
virtual int min_match() { return capture_->min_match(); }
virtual int max_match() { return capture_->max_match(); }
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
private:
......@@ -1446,6 +1483,8 @@ class RegExpEmpty: public RegExpTree {
RegExpNode* on_success);
virtual RegExpEmpty* AsEmpty();
virtual bool IsEmpty();
virtual int min_match() { return 0; }
virtual int max_match() { return 0; }
static RegExpEmpty* GetInstance() { return &kInstance; }
private:
static RegExpEmpty kInstance;
......
......@@ -1193,6 +1193,16 @@ TextElement TextElement::CharClass(
}
int TextElement::length() {
if (type == ATOM) {
return data.u_atom->length();
} else {
ASSERT(type == CHAR_CLASS);
return 1;
}
}
DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
if (table_ == NULL) {
table_ = new DispatchTable();
......@@ -2667,7 +2677,7 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
// TODO(someone): clear captures on repetition and handle empty
// matches.
bool has_min = min > 0;
bool has_max = max < RegExpQuantifier::kInfinity;
bool has_max = max < RegExpTree::kInfinity;
bool needs_counter = has_min || has_max;
int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
ChoiceNode* center = new LoopChoiceNode(2);
......@@ -3784,7 +3794,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
// since we don't even handle ^ yet I'm saving that optimization for
// later.
RegExpNode* node = RegExpQuantifier::ToNode(0,
RegExpQuantifier::kInfinity,
RegExpTree::kInfinity,
false,
new RegExpCharacterClass('*'),
&compiler,
......
......@@ -439,6 +439,7 @@ class TextElement {
explicit TextElement(Type t) : type(t), cp_offset(-1) { }
static TextElement Atom(RegExpAtom* atom);
static TextElement CharClass(RegExpCharacterClass* char_class);
int length();
Type type;
union {
RegExpAtom* u_atom;
......
......@@ -470,9 +470,8 @@ void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
} else if (terms_.length() > 0) {
ASSERT(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
if (atom->IsLookahead() || atom->IsAssertion()) {
// Guaranteed not to match a non-empty string.
// Assertion as an atom can happen as, e.g., (?:\b)
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
return;
......@@ -3797,12 +3796,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// {
case '*':
min = 0;
max = RegExpQuantifier::kInfinity;
max = RegExpTree::kInfinity;
Advance();
break;
case '+':
min = 1;
max = RegExpQuantifier::kInfinity;
max = RegExpTree::kInfinity;
Advance();
break;
case '?':
......@@ -3968,7 +3967,7 @@ bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
} else if (current() == ',') {
Advance();
if (current() == '}') {
max = RegExpQuantifier::kInfinity;
max = RegExpTree::kInfinity;
Advance();
} else {
while (IsDecimalDigit(current())) {
......
......@@ -76,9 +76,36 @@ static bool CheckSimple(const char* input) {
return result.simple;
}
struct MinMaxPair {
int min_match;
int max_match;
};
static MinMaxPair CheckMinMaxMatch(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
RegExpCompileData result;
CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
int min_match = result.tree->min_match();
int max_match = result.tree->max_match();
MinMaxPair pair = { min_match, max_match };
return pair;
}
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
#define CHECK_MIN_MAX(input, min, max) \
{ MinMaxPair min_max = CheckMinMaxMatch(input); \
CHECK_EQ(min, min_max.min_match); \
CHECK_EQ(max, min_max.max_match); \
}
TEST(Parser) {
V8::Initialize(NULL);
......@@ -251,6 +278,55 @@ TEST(Parser) {
CHECK_PARSE_EQ("{12z}", "'{12z}'");
CHECK_PARSE_EQ("{12,", "'{12,'");
CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
CHECK_MIN_MAX("a", 1, 1);
CHECK_MIN_MAX("abc", 3, 3);
CHECK_MIN_MAX("a[bc]d", 3, 3);
CHECK_MIN_MAX("a|bc", 1, 2);
CHECK_MIN_MAX("ab|c", 1, 2);
CHECK_MIN_MAX("a||bc", 0, 2);
CHECK_MIN_MAX("|", 0, 0);
CHECK_MIN_MAX("(?:ab)", 2, 2);
CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
CHECK_MIN_MAX("(ab)", 2, 2);
CHECK_MIN_MAX("(ab|cde)", 2, 3);
CHECK_MIN_MAX("(ab)\\1", 4, 4);
CHECK_MIN_MAX("(ab|cde)\\1", 4, 6);
CHECK_MIN_MAX("(?:ab)?", 0, 2);
CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
CHECK_MIN_MAX("a?", 0, 1);
CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("a??", 0, 1);
CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a?)?", 0, 1);
CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
CHECK_MIN_MAX("a{0}", 0, 0);
CHECK_MIN_MAX("(?:a+){0}", 0, 0);
CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
CHECK_MIN_MAX("a\\bc", 2, 2);
CHECK_MIN_MAX("a\\Bc", 2, 2);
CHECK_MIN_MAX("a\\sc", 3, 3);
CHECK_MIN_MAX("a\\Sc", 3, 3);
CHECK_MIN_MAX("a(?=b)c", 2, 2);
CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
}
TEST(ParserRegression) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment