- Added lookbehind propagation for the initial node; now, if the

  initial node is interested in what precedes it the automaton is
  given an initial all-consuming character class that determines it.
- Added verification of some node information invariants.  We now
  check that if a node expresses interest in what precedes it that
  information is available to it after assertion expansion.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@964 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent e8b56e6f
...@@ -184,7 +184,7 @@ class OldSpace; ...@@ -184,7 +184,7 @@ class OldSpace;
class Property; class Property;
class Proxy; class Proxy;
class RegExpNode; class RegExpNode;
struct RegExpParseResult; struct RegExpCompileData;
class RegExpTree; class RegExpTree;
class RegExpCompiler; class RegExpCompiler;
class RegExpVisitor; class RegExpVisitor;
......
...@@ -260,7 +260,7 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, ...@@ -260,7 +260,7 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
} else { } else {
FlattenString(pattern); FlattenString(pattern);
ZoneScope zone_scope(DELETE_ON_EXIT); ZoneScope zone_scope(DELETE_ON_EXIT);
RegExpParseResult parse_result; RegExpCompileData parse_result;
FlatStringReader reader(pattern); FlatStringReader reader(pattern);
if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) { if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
// Throw an exception if we fail to parse the pattern. // Throw an exception if we fail to parse the pattern.
...@@ -706,20 +706,19 @@ static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re, ...@@ -706,20 +706,19 @@ static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
pattern->Flatten(shape); pattern->Flatten(shape);
} }
RegExpParseResult parse_result; RegExpCompileData compile_data;
FlatStringReader reader(pattern); FlatStringReader reader(pattern);
if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) { if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
// Throw an exception if we fail to parse the pattern. // Throw an exception if we fail to parse the pattern.
// THIS SHOULD NOT HAPPEN. We already parsed it successfully once. // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
ThrowRegExpException(re, ThrowRegExpException(re,
pattern, pattern,
parse_result.error, compile_data.error,
"malformed_regexp"); "malformed_regexp");
return Handle<FixedArray>::null(); return Handle<FixedArray>::null();
} }
Handle<FixedArray> compiled_entry = Handle<FixedArray> compiled_entry =
RegExpEngine::Compile(&parse_result, RegExpEngine::Compile(&compile_data,
NULL,
flags.is_ignore_case(), flags.is_ignore_case(),
flags.is_multiline(), flags.is_multiline(),
pattern, pattern,
...@@ -2603,9 +2602,7 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, ...@@ -2603,9 +2602,7 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) { RegExpNode* on_success) {
ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); return new TextNode(this, on_success);
elms->Add(TextElement::CharClass(this));
return new TextNode(elms, on_success);
} }
...@@ -3265,7 +3262,7 @@ OutSet* DispatchTable::Get(uc16 value) { ...@@ -3265,7 +3262,7 @@ OutSet* DispatchTable::Get(uc16 value) {
// Analysis // Analysis
void Analysis::EnsureAnalyzed(RegExpNode* that) { void AssertionPropagation::EnsureAnalyzed(RegExpNode* that) {
if (that->info()->been_analyzed || that->info()->being_analyzed) if (that->info()->been_analyzed || that->info()->being_analyzed)
return; return;
that->info()->being_analyzed = true; that->info()->being_analyzed = true;
...@@ -3275,7 +3272,7 @@ void Analysis::EnsureAnalyzed(RegExpNode* that) { ...@@ -3275,7 +3272,7 @@ void Analysis::EnsureAnalyzed(RegExpNode* that) {
} }
void Analysis::VisitEnd(EndNode* that) { void AssertionPropagation::VisitEnd(EndNode* that) {
// nothing to do // nothing to do
} }
...@@ -3298,7 +3295,7 @@ void TextNode::CalculateOffsets() { ...@@ -3298,7 +3295,7 @@ void TextNode::CalculateOffsets() {
} }
void Analysis::VisitText(TextNode* that) { void AssertionPropagation::VisitText(TextNode* that) {
if (ignore_case_) { if (ignore_case_) {
that->MakeCaseIndependent(); that->MakeCaseIndependent();
} }
...@@ -3314,7 +3311,7 @@ void Analysis::VisitText(TextNode* that) { ...@@ -3314,7 +3311,7 @@ void Analysis::VisitText(TextNode* that) {
} }
void Analysis::VisitAction(ActionNode* that) { void AssertionPropagation::VisitAction(ActionNode* that) {
RegExpNode* target = that->on_success(); RegExpNode* target = that->on_success();
EnsureAnalyzed(target); EnsureAnalyzed(target);
// If the next node is interested in what it follows then this node // If the next node is interested in what it follows then this node
...@@ -3323,7 +3320,7 @@ void Analysis::VisitAction(ActionNode* that) { ...@@ -3323,7 +3320,7 @@ void Analysis::VisitAction(ActionNode* that) {
} }
void Analysis::VisitChoice(ChoiceNode* that) { void AssertionPropagation::VisitChoice(ChoiceNode* that) {
NodeInfo* info = that->info(); NodeInfo* info = that->info();
for (int i = 0; i < that->alternatives()->length(); i++) { for (int i = 0; i < that->alternatives()->length(); i++) {
RegExpNode* node = that->alternatives()->at(i).node(); RegExpNode* node = that->alternatives()->at(i).node();
...@@ -3335,7 +3332,7 @@ void Analysis::VisitChoice(ChoiceNode* that) { ...@@ -3335,7 +3332,7 @@ void Analysis::VisitChoice(ChoiceNode* that) {
} }
void Analysis::VisitBackReference(BackReferenceNode* that) { void AssertionPropagation::VisitBackReference(BackReferenceNode* that) {
EnsureAnalyzed(that->on_success()); EnsureAnalyzed(that->on_success());
} }
...@@ -3650,15 +3647,118 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) { ...@@ -3650,15 +3647,118 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
} }
Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, #ifdef DEBUG
RegExpNode** node_return,
class VisitNodeScope {
public:
explicit VisitNodeScope(RegExpNode* node) : node_(node) {
ASSERT(!node->info()->visited);
node->info()->visited = true;
}
~VisitNodeScope() {
node_->info()->visited = false;
}
private:
RegExpNode* node_;
};
class NodeValidator : public NodeVisitor {
public:
virtual void ValidateInfo(NodeInfo* info) = 0;
#define DECLARE_VISIT(Type) \
virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
};
class PostAnalysisNodeValidator : public NodeValidator {
public:
virtual void ValidateInfo(NodeInfo* info);
};
class PostExpansionNodeValidator : public NodeValidator {
public:
virtual void ValidateInfo(NodeInfo* info);
};
void PostAnalysisNodeValidator::ValidateInfo(NodeInfo* info) {
ASSERT(info->been_analyzed);
}
void PostExpansionNodeValidator::ValidateInfo(NodeInfo* info) {
ASSERT_EQ(info->determine_newline, info->does_determine_newline);
ASSERT_EQ(info->determine_start, info->does_determine_start);
ASSERT_EQ(info->determine_word, info->does_determine_word);
ASSERT_EQ(info->follows_word_interest,
(info->follows_word != NodeInfo::UNKNOWN));
if (false) {
// These are still unimplemented.
ASSERT_EQ(info->follows_start_interest,
(info->follows_start != NodeInfo::UNKNOWN));
ASSERT_EQ(info->follows_newline_interest,
(info->follows_newline != NodeInfo::UNKNOWN));
}
}
void NodeValidator::VisitAction(ActionNode* that) {
if (that->info()->visited) return;
VisitNodeScope scope(that);
ValidateInfo(that->info());
that->on_success()->Accept(this);
}
void NodeValidator::VisitBackReference(BackReferenceNode* that) {
if (that->info()->visited) return;
VisitNodeScope scope(that);
ValidateInfo(that->info());
that->on_success()->Accept(this);
}
void NodeValidator::VisitChoice(ChoiceNode* that) {
if (that->info()->visited) return;
VisitNodeScope scope(that);
ValidateInfo(that->info());
ZoneList<GuardedAlternative>* alts = that->alternatives();
for (int i = 0; i < alts->length(); i++)
alts->at(i).node()->Accept(this);
}
void NodeValidator::VisitEnd(EndNode* that) {
if (that->info()->visited) return;
VisitNodeScope scope(that);
ValidateInfo(that->info());
}
void NodeValidator::VisitText(TextNode* that) {
if (that->info()->visited) return;
VisitNodeScope scope(that);
ValidateInfo(that->info());
that->on_success()->Accept(this);
}
#endif
Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
bool ignore_case, bool ignore_case,
bool is_multiline, bool is_multiline,
Handle<String> pattern, Handle<String> pattern,
bool is_ascii) { bool is_ascii) {
RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii); RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
// Wrap the body of the regexp in capture #0. // Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
0, 0,
&compiler, &compiler,
compiler.accept()); compiler.accept());
...@@ -3673,17 +3773,43 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, ...@@ -3673,17 +3773,43 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
new RegExpCharacterClass('*'), new RegExpCharacterClass('*'),
&compiler, &compiler,
captured_body); captured_body);
if (node_return != NULL) *node_return = node; AssertionPropagation analysis(ignore_case);
Analysis analysis(ignore_case);
analysis.EnsureAnalyzed(node); analysis.EnsureAnalyzed(node);
NodeInfo info = *node->info(); NodeInfo info = *node->info();
data->has_lookbehind = info.HasLookbehind();
if (data->has_lookbehind) {
// If this node needs information about the preceding text we let
// it start with a character class that consumes a single character
// and proceeds to wherever is appropriate. This means that if
// has_lookbehind is set the code generator must start one character
// before the start position.
node = new TextNode(new RegExpCharacterClass('*'), node);
analysis.EnsureAnalyzed(node);
}
#ifdef DEBUG
PostAnalysisNodeValidator post_analysis_validator;
node->Accept(&post_analysis_validator);
#endif
node = node->EnsureExpanded(&info); node = node->EnsureExpanded(&info);
#ifdef DEBUG
PostExpansionNodeValidator post_expansion_validator;
node->Accept(&post_expansion_validator);
#endif
data->node = node;
if (is_multiline && !FLAG_attempt_multiline_irregexp) { if (is_multiline && !FLAG_attempt_multiline_irregexp) {
return Handle<FixedArray>::null(); return Handle<FixedArray>::null();
} }
if (data->has_lookbehind) {
return Handle<FixedArray>::null();
}
if (FLAG_irregexp_native) { if (FLAG_irregexp_native) {
#ifdef ARM #ifdef ARM
// Unimplemented, fall-through to bytecode implementation. // Unimplemented, fall-through to bytecode implementation.
...@@ -3695,10 +3821,10 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, ...@@ -3695,10 +3821,10 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
mode = RegExpMacroAssemblerIA32::UC16; mode = RegExpMacroAssemblerIA32::UC16;
} }
RegExpMacroAssemblerIA32 macro_assembler(mode, RegExpMacroAssemblerIA32 macro_assembler(mode,
(input->capture_count + 1) * 2); (data->capture_count + 1) * 2);
return compiler.Assemble(&macro_assembler, return compiler.Assemble(&macro_assembler,
node, node,
input->capture_count, data->capture_count,
pattern); pattern);
#endif #endif
} }
...@@ -3706,7 +3832,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, ...@@ -3706,7 +3832,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
RegExpMacroAssemblerIrregexp macro_assembler(codes); RegExpMacroAssemblerIrregexp macro_assembler(codes);
return compiler.Assemble(&macro_assembler, return compiler.Assemble(&macro_assembler,
node, node,
input->capture_count, data->capture_count,
pattern); pattern);
} }
......
...@@ -528,6 +528,12 @@ struct NodeInfo { ...@@ -528,6 +528,12 @@ struct NodeInfo {
does_determine_start = that->does_determine_start; does_determine_start = that->does_determine_start;
} }
bool HasLookbehind() {
return follows_word_interest ||
follows_newline_interest ||
follows_start_interest;
}
// Sets the interests of this node to include the interests of the // Sets the interests of this node to include the interests of the
// following node. // following node.
void AddFromFollowing(NodeInfo* that) { void AddFromFollowing(NodeInfo* that) {
...@@ -894,7 +900,7 @@ class ChoiceNode: public RegExpNode { ...@@ -894,7 +900,7 @@ class ChoiceNode: public RegExpNode {
private: private:
friend class DispatchTableConstructor; friend class DispatchTableConstructor;
friend class Analysis; friend class AssertionPropagation;
void GenerateGuard(RegExpMacroAssembler* macro_assembler, void GenerateGuard(RegExpMacroAssembler* macro_assembler,
Guard *guard, Guard *guard,
GenerationVariant* variant); GenerationVariant* variant);
...@@ -1052,9 +1058,45 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) ...@@ -1052,9 +1058,45 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
}; };
class Analysis: public NodeVisitor { // Assertion propagation moves information about assertions such as
// \b to the affected nodes. For instance, in /.\b./ information must
// be propagated to the first '.' that whatever follows needs to know
// if it matched a word or a non-word, and to the second '.' that it
// has to check if it succeeds a word or non-word. In this case the
// result will be something like:
//
// +-------+ +------------+
// | . | | . |
// +-------+ ---> +------------+
// | word? | | check word |
// +-------+ +------------+
//
// At a later phase all nodes that determine information for their
// following nodes are split into several 'sibling' nodes. In this
// case the first '.' is split into one node that only matches words
// and one that only matches non-words. The second '.' is also split,
// into one node that assumes that the previous character was a word
// character and one that assumes that is was non-word. In this case
// the result is
//
// +------------------+ +------------------+
// /--> | intersect(., \w) | ---> | intersect(., \W) |
// | +------------------+ +------------------+
// | | follows \w |
// | +------------------+
// --?
// | +------------------+ +------------------+
// \--> | intersect(., \W) | ---> | intersect(., \w) |
// +------------------+ +------------------+
// | follows \W |
// +------------------+
//
// This way we don't need to explicitly check the previous character
// but can always assume that whoever consumed the previous character
// has propagated the relevant information forward.
class AssertionPropagation: public NodeVisitor {
public: public:
explicit Analysis(bool ignore_case) explicit AssertionPropagation(bool ignore_case)
: ignore_case_(ignore_case) { } : ignore_case_(ignore_case) { }
void EnsureAnalyzed(RegExpNode* node); void EnsureAnalyzed(RegExpNode* node);
...@@ -1066,12 +1108,20 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) ...@@ -1066,12 +1108,20 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
private: private:
bool ignore_case_; bool ignore_case_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); DISALLOW_IMPLICIT_CONSTRUCTORS(AssertionPropagation);
}; };
struct RegExpParseResult { struct RegExpCompileData {
RegExpCompileData()
: tree(NULL),
node(NULL),
has_lookbehind(false),
has_character_escapes(false),
capture_count(0) { }
RegExpTree* tree; RegExpTree* tree;
RegExpNode* node;
bool has_lookbehind;
bool has_character_escapes; bool has_character_escapes;
Handle<String> error; Handle<String> error;
int capture_count; int capture_count;
...@@ -1080,8 +1130,7 @@ struct RegExpParseResult { ...@@ -1080,8 +1130,7 @@ struct RegExpParseResult {
class RegExpEngine: public AllStatic { class RegExpEngine: public AllStatic {
public: public:
static Handle<FixedArray> Compile(RegExpParseResult* input, static Handle<FixedArray> Compile(RegExpCompileData* input,
RegExpNode** node_return,
bool ignore_case, bool ignore_case,
bool multiline, bool multiline,
Handle<String> pattern, Handle<String> pattern,
......
...@@ -4302,7 +4302,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream, ...@@ -4302,7 +4302,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
bool ParseRegExp(FlatStringReader* input, bool ParseRegExp(FlatStringReader* input,
bool multiline, bool multiline,
RegExpParseResult* result) { RegExpCompileData* result) {
ASSERT(result != NULL); ASSERT(result != NULL);
// Make sure we have a stack guard. // Make sure we have a stack guard.
StackGuard guard; StackGuard guard;
......
...@@ -147,7 +147,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream, ...@@ -147,7 +147,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
bool ParseRegExp(FlatStringReader* input, bool ParseRegExp(FlatStringReader* input,
bool multiline, bool multiline,
RegExpParseResult* result); RegExpCompileData* result);
// Support for doing lazy compilation. The script is the script containing full // Support for doing lazy compilation. The script is the script containing full
......
...@@ -55,7 +55,7 @@ static SmartPointer<const char> Parse(const char* input) { ...@@ -55,7 +55,7 @@ static SmartPointer<const char> Parse(const char* input) {
v8::HandleScope scope; v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT); ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input)); FlatStringReader reader(CStrVector(input));
RegExpParseResult result; RegExpCompileData result;
CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
...@@ -69,7 +69,7 @@ static bool ParseEscapes(const char* input) { ...@@ -69,7 +69,7 @@ static bool ParseEscapes(const char* input) {
unibrow::Utf8InputBuffer<> buffer(input, strlen(input)); unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
ZoneScope zone_scope(DELETE_ON_EXIT); ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input)); FlatStringReader reader(CStrVector(input));
RegExpParseResult result; RegExpCompileData result;
CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL); CHECK(result.tree != NULL);
CHECK(result.error.is_null()); CHECK(result.error.is_null());
...@@ -259,7 +259,7 @@ static void ExpectError(const char* input, ...@@ -259,7 +259,7 @@ static void ExpectError(const char* input,
v8::HandleScope scope; v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT); ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input)); FlatStringReader reader(CStrVector(input));
RegExpParseResult result; RegExpCompileData result;
CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result)); CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree == NULL); CHECK(result.tree == NULL);
CHECK(!result.error.is_null()); CHECK(!result.error.is_null());
...@@ -358,13 +358,12 @@ TEST(CharacterClassEscapes) { ...@@ -358,13 +358,12 @@ TEST(CharacterClassEscapes) {
static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) { static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
V8::Initialize(NULL); V8::Initialize(NULL);
FlatStringReader reader(CStrVector(input)); FlatStringReader reader(CStrVector(input));
RegExpParseResult result; RegExpCompileData compile_data;
if (!v8::internal::ParseRegExp(&reader, multiline, &result)) if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data))
return NULL; return NULL;
RegExpNode* node = NULL;
Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input)); Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input));
RegExpEngine::Compile(&result, &node, false, multiline, pattern, is_ascii); RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
return node; return compile_data.node;
} }
...@@ -1128,14 +1127,6 @@ TEST(LatinCanonicalize) { ...@@ -1128,14 +1127,6 @@ TEST(LatinCanonicalize) {
} }
TEST(SimplePropagation) {
v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT);
RegExpNode* node = Compile("(a|^b|c)", false, true);
CHECK(node->info()->follows_start_interest);
}
static uc32 CanonRange(uc32 c) { static uc32 CanonRange(uc32 c) {
unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
...@@ -1301,5 +1292,5 @@ TEST(CharClassDifference) { ...@@ -1301,5 +1292,5 @@ TEST(CharClassDifference) {
TEST(Graph) { TEST(Graph) {
V8::Initialize(NULL); V8::Initialize(NULL);
Execute("(?=[d#.])", false, true, true); Execute("\\bboy\\b", false, true, true);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment