Commit 2b77e718 authored by erik.corry@gmail.com's avatar erik.corry@gmail.com

Add support for \b and ^ and $ in multiline mode, completing Irregexp

features.  Switch on Irregexp by default.
Review URL: http://codereview.chromium.org/18193

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1104 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent d6e33a94
......@@ -1382,7 +1382,7 @@ class RegExpCharacterClass: public RegExpTree {
// W : non-ASCII word character
// d : ASCII digit
// D : non-ASCII digit
// . : non-unicode newline
// . : non-unicode non-newline
// * : All characters
uc16 standard_type() { return set_.standard_set_type(); }
// Returns the ZoneList of CharacterRange entries obtained from set_.ranges().
ZoneList<CharacterRange>* ranges() { return set_.ranges(); }
......
......@@ -72,8 +72,9 @@ V(LOOKUP_HI_MAP8, 36, 99) /* l_himap8 start8 byte_map_addr32 addr32* */ \
V(CHECK_REGISTER_LT, 37, 8) /* check_reg_lt register_index value16 addr32 */ \
V(CHECK_REGISTER_GE, 38, 8) /* check_reg_ge register_index value16 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 39, 6) /* check_register_eq_pos index addr32 */ \
V(CHECK_NOT_AT_START, 40, 5) /* check_not_at_start addr32 */ \
V(CHECK_GREEDY, 41, 5) /* check_greedy addr32 */
V(CHECK_AT_START, 40, 5) /* check_at_start addr32 */ \
V(CHECK_NOT_AT_START, 41, 5) /* check_not_at_start addr32 */ \
V(CHECK_GREEDY, 42, 5) /* check_greedy addr32 */
// Declares an integer constant BC_<name> whose value is `code`.  The `length`
// argument is accepted but not used by this expansion.
#define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code;
......
......@@ -199,12 +199,11 @@ DEFINE_bool(usage_computation, true, "compute variable usage counts")
DEFINE_bool(preemption, false,
"activate a 100ms timer that switches between V8 threads")
// irregexp
// Irregexp
DEFINE_bool(irregexp, false, "new regular expression code")
DEFINE_bool(trace_regexps, false, "trace Irregexp execution")
DEFINE_bool(irregexp_native, false, "use native code Irregexp implementation (IA32 only)")
DEFINE_bool(disable_jscre, false, "abort if JSCRE is used. Only useful with --irregexp")
DEFINE_bool(attempt_multiline_irregexp, false, "attempt to use Irregexp for multiline regexps")
// Testing flags test/cctest/test-{flags,api,serialization}.cc
DEFINE_bool(testing_bool_flag, true, "testing_bool_flag")
......
......@@ -490,6 +490,13 @@ static bool RawMatch(const byte* code_base,
}
break;
}
// CHECK_AT_START: when current == 0, continue at code_base plus the value
// read by Load32(pc + 1); otherwise step over this instruction
// (BC_CHECK_AT_START_LENGTH bytes).  This is the mirror image of the
// CHECK_NOT_AT_START case that follows.
BYTECODE(CHECK_AT_START)
if (current == 0) {
pc = code_base + Load32(pc + 1);
} else {
pc += BC_CHECK_AT_START_LENGTH;
}
break;
BYTECODE(CHECK_NOT_AT_START)
if (current == 0) {
pc += BC_CHECK_NOT_AT_START_LENGTH;
......
This diff is collapsed.
......@@ -410,6 +410,7 @@ class DispatchTable : public ZoneObject {
VISIT(Action) \
VISIT(Choice) \
VISIT(BackReference) \
VISIT(Assertion) \
VISIT(Text)
......@@ -619,12 +620,6 @@ class RegExpNode: public ZoneObject {
// the deferred actions in the current trace and generating a goto.
static const int kMaxCopiesCodeGenerated = 10;
// Propagates the given interest information forward. When seeing
// \bfoo for instance, the \b is implemented by propagating forward
// to the 'foo' string that it should only succeed if its first
// character is a letter xor the previous character was a letter.
virtual RegExpNode* PropagateForward(NodeInfo* info) = 0;
NodeInfo* info() { return &info_; }
void AddSibling(RegExpNode* node) { siblings_.Add(node); }
......@@ -744,7 +739,6 @@ class ActionNode: public SeqRegExpNode {
int filled_in) {
return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
}
virtual RegExpNode* PropagateForward(NodeInfo* info);
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
......@@ -797,7 +791,6 @@ class TextNode: public SeqRegExpNode {
elms_->Add(TextElement::CharClass(that));
}
virtual void Accept(NodeVisitor* visitor);
virtual RegExpNode* PropagateForward(NodeInfo* info);
virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int recursion_depth);
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
......@@ -831,6 +824,47 @@ class TextNode: public SeqRegExpNode {
};
// A SeqRegExpNode tagged with an assertion type.  The constructor is private,
// so instances are obtained through the static factory functions (one per
// AssertionNodeType value) or through Clone().
class AssertionNode: public SeqRegExpNode {
 public:
  // The kinds of assertion a node can carry.
  // NOTE(review): presumably these correspond to $, ^, \b, \B and multiline ^
  // respectively -- confirm against the Emit implementation.
  enum AssertionNodeType {
    AT_END,
    AT_START,
    AT_BOUNDARY,
    AT_NON_BOUNDARY,
    AFTER_NEWLINE
  };

  // Factory functions.  Each allocates a new node of the named type that
  // continues to on_success.
  static AssertionNode* AtEnd(RegExpNode* on_success) {
    return new AssertionNode(AT_END, on_success);
  }
  static AssertionNode* AtStart(RegExpNode* on_success) {
    return new AssertionNode(AT_START, on_success);
  }
  static AssertionNode* AtBoundary(RegExpNode* on_success) {
    return new AssertionNode(AT_BOUNDARY, on_success);
  }
  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
    return new AssertionNode(AT_NON_BOUNDARY, on_success);
  }
  static AssertionNode* AfterNewline(RegExpNode* on_success) {
    return new AssertionNode(AFTER_NEWLINE, on_success);
  }

  virtual void Accept(NodeVisitor* visitor);
  virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
  virtual int EatsAtLeast(int recursion_depth);

  // This node adds nothing to the quick check itself; the request is handed
  // unchanged to the successor node.
  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                    RegExpCompiler* compiler,
                                    int filled_in) {
    on_success()->GetQuickCheckDetails(details, compiler, filled_in);
  }

  // Copy-constructs a new node from this one.
  virtual AssertionNode* Clone() { return new AssertionNode(*this); }

  AssertionNodeType type() { return type_; }

 private:
  AssertionNode(AssertionNodeType assertion_type, RegExpNode* on_success)
      : SeqRegExpNode(on_success), type_(assertion_type) { }

  AssertionNodeType type_;
};
class BackReferenceNode: public SeqRegExpNode {
public:
BackReferenceNode(int start_reg,
......@@ -843,13 +877,12 @@ class BackReferenceNode: public SeqRegExpNode {
int start_register() { return start_reg_; }
int end_register() { return end_reg_; }
virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int recursion_depth) { return 0; }
virtual int EatsAtLeast(int recursion_depth);
// No-op: returns immediately without modifying details or consulting the
// compiler or characters_filled_in arguments.
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in) {
return;
}
virtual RegExpNode* PropagateForward(NodeInfo* info);
virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
private:
......@@ -871,12 +904,8 @@ class EndNode: public RegExpNode {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
virtual RegExpNode* PropagateForward(NodeInfo* info);
virtual EndNode* Clone() { return new EndNode(*this); }
protected:
void EmitInfoChecks(RegExpMacroAssembler* macro, Trace* trace);
private:
Action action_;
};
......@@ -947,7 +976,6 @@ class ChoiceNode: public RegExpNode {
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in);
virtual RegExpNode* PropagateForward(NodeInfo* info);
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
......@@ -1133,8 +1161,7 @@ class Trace {
// Copies the QuickCheckDetails pointed to by d into quick_check_performed_.
// d is dereferenced unconditionally, so it must not be NULL.
void set_quick_check_performed(QuickCheckDetails* d) {
quick_check_performed_ = *d;
}
void clear_quick_check_performed() {
}
void InvalidateCurrentCharacter();
void AdvanceCurrentPositionInTrace(int by, bool ascii);
private:
int FindAffectedRegisters(OutSet* affected_registers);
......
......@@ -174,6 +174,20 @@ void RegExpMacroAssemblerIA32::CheckCharacterGT(uc16 limit, Label* on_greater) {
}
// Emits code that branches to on_at_start when the frame's kAtStart slot is
// non-zero and kInputEndOffset + edi equals kInputStartOffset.  In every
// other case execution falls through past the local `ok` label.  eax is
// overwritten by the emitted code.
// NOTE(review): presumably edi holds the current position as an offset from
// the end of the input -- confirm against the register conventions.
void RegExpMacroAssemblerIA32::CheckAtStart(Label* on_at_start) {
Label ok;
// Did we start the match at the start of the string at all?
__ cmp(Operand(ebp, kAtStart), Immediate(0));
// kAtStart == 0: skip the position comparison entirely.
BranchOrBacktrack(equal, &ok);
// If we did, are we still at the start of the input?
__ mov(eax, Operand(ebp, kInputEndOffset));
__ add(eax, Operand(edi));
__ cmp(eax, Operand(ebp, kInputStartOffset));
BranchOrBacktrack(equal, on_at_start);
__ bind(&ok);
}
void RegExpMacroAssemblerIA32::CheckNotAtStart(Label* on_not_at_start) {
// Did we start the match at the start of the string at all?
__ cmp(Operand(ebp, kAtStart), Immediate(0));
......
......@@ -43,6 +43,7 @@ class RegExpMacroAssemblerIA32: public RegExpMacroAssembler {
virtual void AdvanceRegister(int reg, int by);
virtual void Backtrack();
virtual void Bind(Label* label);
virtual void CheckAtStart(Label* on_at_start);
virtual void CheckBitmap(uc16 start, Label* bitmap, Label* on_zero);
virtual void CheckCharacter(uint32_t c, Label* on_equal);
virtual void CheckCharacterAfterAnd(uint32_t c,
......
......@@ -256,6 +256,12 @@ void RegExpMacroAssemblerIrregexp::CheckCharacter(uint32_t c, Label* on_equal) {
}
// Emits the BC_CHECK_AT_START opcode and then hands on_at_start to
// EmitOrLink, in the same opcode-then-label order used by CheckNotAtStart.
void RegExpMacroAssemblerIrregexp::CheckAtStart(Label* on_at_start) {
Emit(BC_CHECK_AT_START);
EmitOrLink(on_at_start);
}
void RegExpMacroAssemblerIrregexp::CheckNotAtStart(Label* on_not_at_start) {
Emit(BC_CHECK_NOT_AT_START);
EmitOrLink(on_not_at_start);
......
......@@ -81,6 +81,7 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
virtual void CheckCharacterLT(uc16 limit, Label* on_less);
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position);
virtual void CheckAtStart(Label* on_at_start);
virtual void CheckNotAtStart(Label* on_not_at_start);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
......
......@@ -210,6 +210,12 @@ void RegExpMacroAssemblerTracer::CheckCharacter(uint32_t c, Label* on_equal) {
}
// Prints a trace line for the call, including the label argument, and then
// forwards the call to the wrapped assembler_.
// NOTE(review): the label pointer is printed with %08x, which expects an
// unsigned int; this matches CheckNotAtStart but is only well-defined where
// pointers and ints have the same size -- confirm before porting.
void RegExpMacroAssemblerTracer::CheckAtStart(Label* on_at_start) {
PrintF(" CheckAtStart(label[%08x]);\n", on_at_start);
assembler_->CheckAtStart(on_at_start);
}
void RegExpMacroAssemblerTracer::CheckNotAtStart(Label* on_not_at_start) {
PrintF(" CheckNotAtStart(label[%08x]);\n", on_not_at_start);
assembler_->CheckNotAtStart(on_not_at_start);
......
......@@ -41,6 +41,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
virtual void AdvanceRegister(int reg, int by); // r[reg] += by.
virtual void Backtrack();
virtual void Bind(Label* label);
virtual void CheckAtStart(Label* on_at_start);
virtual void CheckBitmap(uc16 start, Label* bitmap, Label* on_zero);
virtual void CheckCharacter(uint32_t c, Label* on_equal);
virtual void CheckCharacterAfterAnd(uint32_t c,
......
......@@ -61,6 +61,7 @@ class RegExpMacroAssembler {
// stack by an earlier PushBacktrack(Label*).
virtual void Backtrack() = 0;
virtual void Bind(Label* label) = 0;
virtual void CheckAtStart(Label* on_at_start) = 0;
// Check the current character against a bitmap. The range of the current
// character must be from start to start + length_of_bitmap_in_bits.
virtual void CheckBitmap(
......
......@@ -572,6 +572,9 @@ function splitMatch(separator, subject, current_index, start_index) {
if (ovector == null) return null;
var nof_results = ovector.length >> 1;
var result = new $Array(nof_results + 1);
// Section 15.5.4.14 paragraph two says that we do not allow zero length
// matches at the end of the string.
if (ovector[0] === subject.length) return null;
result[0] = ovector[1];
result[1] = subject.slice(current_index, ovector[0]);
for (var i = 1; i < nof_results; i++) {
......
......@@ -240,11 +240,8 @@ ecma_3/RegExp/regress-119909: PASS || FAIL_OK
# 'minimum repeat count' is reached, the empty string must not match.
# In this case, we are similar but not identical to JSC. Hard to
# support the JS behavior with PCRE, so maybe emulate JSC?
#
# Note: We do not support toSource currently so we cannot run this
# test. We should make an isolated test case for the regexp issue.
ecma_3/RegExp/regress-209919: FAIL_OK
js1_5/extensions/regress-459606: FAIL_OK
ecma_3/RegExp/regress-209919: PASS || FAIL_OK
js1_5/extensions/regress-459606: PASS || FAIL_OK
# PCRE's match limit is reached. SpiderMonkey hangs on the first one,
......@@ -265,11 +262,6 @@ ecma_3/RegExp/regress-307456: PASS || FAIL_OK
js1_5/Regress/regress-230216-2: FAIL_OK
# According to ECMA-262, \b is a 'word' boundary, where words are only
# ASCII characters. PCRE supports non-ASCII word characters.
js1_5/Regress/regress-247179: FAIL_OK
# Regexp too long for PCRE.
js1_5/Regress/regress-280769: PASS || FAIL
js1_5/Regress/regress-280769-1: PASS || FAIL
......@@ -471,7 +463,7 @@ ecma_3/Unicode/uc-001: FAIL_OK
# A non-breaking space doesn't match \s in a regular expression. This behaviour
# matches JSC. All the VMs have different behaviours in which characters match
# \s so we do the same as JSC until they change.
ecma_3/Unicode/uc-002: FAIL_OK
ecma_3/Unicode/uc-002: PASS || FAIL_OK
# String.prototype.split on empty strings always returns an array
......@@ -521,10 +513,12 @@ js1_5/Regress/regress-336100: FAIL_OK
# Regular expression test failures due to PCRE. We match JSC (i.e., Perl)
# behavior and not the ECMA spec.
ecma_3/RegExp/15.10.2-1: FAIL_OK
ecma_3/RegExp/perlstress-001: FAIL_OK
ecma_3/RegExp/perlstress-001: PASS || FAIL_OK
ecma_3/RegExp/regress-334158: PASS || FAIL
# This test fails due to http://code.google.com/p/v8/issues/detail?id=187
# Failure to clear captures when a lookahead is unwound.
ecma_3/RegExp/15.10.2-1: PASS || FAIL_OK
# This test requires a failure if we try to compile a function with more
# than 65536 arguments. This seems to be a Mozilla restriction.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment