Commit 09e3c761 authored by lrn@chromium.org

Quantified look-aheads are sometimes removed entirely, leaving only a
single atom node. A flag was not set in this case, leading the wrapper
code to think the pattern was equal to the atom and to use the pattern
in the indexOf operation.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@971 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 2a84fa41
...@@ -270,23 +270,24 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, ...@@ -270,23 +270,24 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
"malformed_regexp"); "malformed_regexp");
return Handle<Object>::null(); return Handle<Object>::null();
} }
if (parse_result.simple && !flags.is_ignore_case()) {
// Parse-tree is a single atom that is equal to the pattern.
result = AtomCompile(re, pattern, flags, pattern);
} else if (parse_result.tree->IsAtom() &&
!flags.is_ignore_case() &&
parse_result.capture_count == 0) {
// TODO(lrn) Accept capture_count > 0 on atoms.
RegExpAtom* atom = parse_result.tree->AsAtom(); RegExpAtom* atom = parse_result.tree->AsAtom();
if (atom != NULL && !flags.is_ignore_case()) {
if (parse_result.has_character_escapes) {
Vector<const uc16> atom_pattern = atom->data(); Vector<const uc16> atom_pattern = atom->data();
Handle<String> atom_string = Handle<String> atom_string =
Factory::NewStringFromTwoByte(atom_pattern); Factory::NewStringFromTwoByte(atom_pattern);
result = AtomCompile(re, pattern, flags, atom_string); result = AtomCompile(re, pattern, flags, atom_string);
} else { } else if (FLAG_irregexp) {
result = AtomCompile(re, pattern, flags, pattern);
}
} else {
if (FLAG_irregexp) {
result = IrregexpPrepare(re, pattern, flags); result = IrregexpPrepare(re, pattern, flags);
} else { } else {
result = JscrePrepare(re, pattern, flags); result = JscrePrepare(re, pattern, flags);
} }
}
Object* data = re->data(); Object* data = re->data();
if (data->IsFixedArray()) { if (data->IsFixedArray()) {
// If compilation succeeded then the data is set on the regexp // If compilation succeeded then the data is set on the regexp
......
...@@ -1117,12 +1117,12 @@ struct RegExpCompileData { ...@@ -1117,12 +1117,12 @@ struct RegExpCompileData {
: tree(NULL), : tree(NULL),
node(NULL), node(NULL),
has_lookbehind(false), has_lookbehind(false),
has_character_escapes(false), simple(true),
capture_count(0) { } capture_count(0) { }
RegExpTree* tree; RegExpTree* tree;
RegExpNode* node; RegExpNode* node;
bool has_lookbehind; bool has_lookbehind;
bool has_character_escapes; bool simple;
Handle<String> error; Handle<String> error;
int capture_count; int capture_count;
}; };
......
...@@ -527,7 +527,9 @@ class RegExpParser { ...@@ -527,7 +527,9 @@ class RegExpParser {
void Advance(int dist); void Advance(int dist);
void Reset(int pos); void Reset(int pos);
bool HasCharacterEscapes(); // Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
int captures_started() { return captures_ == NULL ? 0 : captures_->length(); } int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
int position() { return next_pos_ - 1; } int position() { return next_pos_ - 1; }
...@@ -548,7 +550,7 @@ class RegExpParser { ...@@ -548,7 +550,7 @@ class RegExpParser {
int next_pos_; int next_pos_;
FlatStringReader* in_; FlatStringReader* in_;
Handle<String>* error_; Handle<String>* error_;
bool has_character_escapes_; bool simple_;
ZoneList<RegExpCapture*>* captures_; ZoneList<RegExpCapture*>* captures_;
bool is_scanned_for_captures_; bool is_scanned_for_captures_;
// The capture count is only valid after we have scanned for captures. // The capture count is only valid after we have scanned for captures.
...@@ -3502,7 +3504,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, ...@@ -3502,7 +3504,7 @@ RegExpParser::RegExpParser(FlatStringReader* in,
next_pos_(0), next_pos_(0),
in_(in), in_(in),
error_(error), error_(error),
has_character_escapes_(false), simple_(true),
captures_(NULL), captures_(NULL),
is_scanned_for_captures_(false), is_scanned_for_captures_(false),
capture_count_(0), capture_count_(0),
...@@ -3550,11 +3552,8 @@ void RegExpParser::Advance(int dist) { ...@@ -3550,11 +3552,8 @@ void RegExpParser::Advance(int dist) {
} }
// Reports whether the parsed string atoms contain any characters that were bool RegExpParser::simple() {
// escaped in the original pattern. If not, all atoms are proper substrings return simple_;
// of the original pattern.
bool RegExpParser::HasCharacterEscapes() {
return has_character_escapes_;
} }
RegExpTree* RegExpParser::ReportError(Vector<const char> message) { RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
...@@ -3769,7 +3768,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -3769,7 +3768,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2); Advance(2);
break; break;
} }
has_character_escapes_ = true; simple_ = false;
break; break;
case '{': { case '{': {
int dummy; int dummy;
...@@ -3822,6 +3821,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -3822,6 +3821,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
is_greedy = false; is_greedy = false;
Advance(); Advance();
} }
simple_ = false; // Adding quantifier might *remove* look-ahead.
builder.AddQuantifierToAtom(min, max, is_greedy); builder.AddQuantifierToAtom(min, max, is_greedy);
} }
} }
...@@ -4307,15 +4307,17 @@ bool ParseRegExp(FlatStringReader* input, ...@@ -4307,15 +4307,17 @@ bool ParseRegExp(FlatStringReader* input,
// Make sure we have a stack guard. // Make sure we have a stack guard.
StackGuard guard; StackGuard guard;
RegExpParser parser(input, &result->error, multiline); RegExpParser parser(input, &result->error, multiline);
result->tree = parser.ParsePattern(); RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) { if (parser.failed()) {
ASSERT(result->tree == NULL); ASSERT(tree == NULL);
ASSERT(!result->error.is_null()); ASSERT(!result->error.is_null());
} else { } else {
ASSERT(result->tree != NULL); ASSERT(tree != NULL);
ASSERT(result->error.is_null()); ASSERT(result->error.is_null());
result->has_character_escapes = parser.HasCharacterEscapes(); result->tree = tree;
result->capture_count = parser.captures_started(); int capture_count = parser.captures_started();
result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
result->capture_count = capture_count;
} }
return !parser.failed(); return !parser.failed();
} }
......
...@@ -63,7 +63,7 @@ static SmartPointer<const char> Parse(const char* input) { ...@@ -63,7 +63,7 @@ static SmartPointer<const char> Parse(const char* input) {
return output; return output;
} }
static bool ParseEscapes(const char* input) { static bool CheckSimple(const char* input) {
V8::Initialize(NULL); V8::Initialize(NULL);
v8::HandleScope scope; v8::HandleScope scope;
unibrow::Utf8InputBuffer<> buffer(input, strlen(input)); unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
...@@ -73,13 +73,12 @@ static bool ParseEscapes(const char* input) { ...@@ -73,13 +73,12 @@ static bool ParseEscapes(const char* input) {
CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
return result.has_character_escapes; return result.simple;
} }
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
#define CHECK_ESCAPES(input, has_escapes) CHECK_EQ(has_escapes, \ #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
ParseEscapes(input));
TEST(Parser) { TEST(Parser) {
V8::Initialize(NULL); V8::Initialize(NULL);
...@@ -168,6 +167,11 @@ TEST(Parser) { ...@@ -168,6 +167,11 @@ TEST(Parser) {
CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
CHECK_PARSE_EQ("(?=a)?a", "'a'");
CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
CHECK_PARSE_EQ("(?!a)?a", "'a'");
CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))"); CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))"); CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
...@@ -186,47 +190,50 @@ TEST(Parser) { ...@@ -186,47 +190,50 @@ TEST(Parser) {
CHECK_PARSE_EQ("\\u003z", "'u003z'"); CHECK_PARSE_EQ("\\u003z", "'u003z'");
CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
CHECK_ESCAPES("a", false); CHECK_SIMPLE("a", true);
CHECK_ESCAPES("a|b", false); CHECK_SIMPLE("a|b", false);
CHECK_ESCAPES("a\\n", true); CHECK_SIMPLE("a\\n", false);
CHECK_ESCAPES("^a", false); CHECK_SIMPLE("^a", false);
CHECK_ESCAPES("a$", false); CHECK_SIMPLE("a$", false);
CHECK_ESCAPES("a\\b!", false); CHECK_SIMPLE("a\\b!", false);
CHECK_ESCAPES("a\\Bb", false); CHECK_SIMPLE("a\\Bb", false);
CHECK_ESCAPES("a*", false); CHECK_SIMPLE("a*", false);
CHECK_ESCAPES("a*?", false); CHECK_SIMPLE("a*?", false);
CHECK_ESCAPES("a?", false); CHECK_SIMPLE("a?", false);
CHECK_ESCAPES("a??", false); CHECK_SIMPLE("a??", false);
CHECK_ESCAPES("a{0,1}?", false); CHECK_SIMPLE("a{0,1}?", false);
CHECK_ESCAPES("a{1,1}?", false); CHECK_SIMPLE("a{1,1}?", false);
CHECK_ESCAPES("a{1,2}?", false); CHECK_SIMPLE("a{1,2}?", false);
CHECK_ESCAPES("a+?", false); CHECK_SIMPLE("a+?", false);
CHECK_ESCAPES("(a)", false); CHECK_SIMPLE("(a)", false);
CHECK_ESCAPES("(a)\\1", false); CHECK_SIMPLE("(a)\\1", false);
CHECK_ESCAPES("(\\1a)", false); CHECK_SIMPLE("(\\1a)", false);
CHECK_ESCAPES("\\1(a)", false); CHECK_SIMPLE("\\1(a)", false);
CHECK_ESCAPES("a\\s", false); CHECK_SIMPLE("a\\s", false);
CHECK_ESCAPES("a\\S", false); CHECK_SIMPLE("a\\S", false);
CHECK_ESCAPES("a\\d", false); CHECK_SIMPLE("a\\d", false);
CHECK_ESCAPES("a\\D", false); CHECK_SIMPLE("a\\D", false);
CHECK_ESCAPES("a\\w", false); CHECK_SIMPLE("a\\w", false);
CHECK_ESCAPES("a\\W", false); CHECK_SIMPLE("a\\W", false);
CHECK_ESCAPES("a.", false); CHECK_SIMPLE("a.", false);
CHECK_ESCAPES("a\\q", true); CHECK_SIMPLE("a\\q", false);
CHECK_ESCAPES("a[a]", false); CHECK_SIMPLE("a[a]", false);
CHECK_ESCAPES("a[^a]", false); CHECK_SIMPLE("a[^a]", false);
CHECK_ESCAPES("a[a-z]", false); CHECK_SIMPLE("a[a-z]", false);
CHECK_ESCAPES("a[\\q]", false); CHECK_SIMPLE("a[\\q]", false);
CHECK_ESCAPES("a(?:b)", false); CHECK_SIMPLE("a(?:b)", false);
CHECK_ESCAPES("a(?=b)", false); CHECK_SIMPLE("a(?=b)", false);
CHECK_ESCAPES("a(?!b)", false); CHECK_SIMPLE("a(?!b)", false);
CHECK_ESCAPES("\\x60", true); CHECK_SIMPLE("\\x60", false);
CHECK_ESCAPES("\\u0060", true); CHECK_SIMPLE("\\u0060", false);
CHECK_ESCAPES("\\cA", true); CHECK_SIMPLE("\\cA", false);
CHECK_ESCAPES("\\q", true); CHECK_SIMPLE("\\q", false);
CHECK_ESCAPES("\\1112", true); CHECK_SIMPLE("\\1112", false);
CHECK_ESCAPES("\\0", true); CHECK_SIMPLE("\\0", false);
CHECK_ESCAPES("(a)\\1", false); CHECK_SIMPLE("(a)\\1", false);
CHECK_SIMPLE("(?=a)?a", false);
CHECK_SIMPLE("(?!a)?a\\1", false);
CHECK_SIMPLE("(?:(?=a))a\\1", false);
CHECK_PARSE_EQ("a{}", "'a{}'"); CHECK_PARSE_EQ("a{}", "'a{}'");
CHECK_PARSE_EQ("a{,}", "'a{,}'"); CHECK_PARSE_EQ("a{,}", "'a{,}'");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment