Commit 7b4b4959 authored by erik.corry@gmail.com

* Have an ASCII and a UC16 interpreter for Irregexp bytecodes, so that we
  never have to convert an ASCII string to UC16 for Irregexp.
* Generate slightly different code when we know the subject string
  is ASCII.
Review URL: http://codereview.chromium.org/13247

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@941 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 6a8fdf04
...@@ -60,6 +60,22 @@ static bool BackRefMatchesNoCase(int from, ...@@ -60,6 +60,22 @@ static bool BackRefMatchesNoCase(int from,
} }
// Compares two runs of |len| characters inside an ASCII subject while
// ignoring the case of the letters A-Z/a-z. The first run begins at index
// |from|, the second at index |current|. Returns true when every pair of
// characters is equal after folding upper-case ASCII letters to lower case.
static bool BackRefMatchesNoCase(int from,
                                 int current,
                                 int len,
                                 Vector<const char> subject) {
  // Width of the 'A'..'Z' range; combined with unsigned arithmetic a single
  // comparison tells us whether a character is an upper-case ASCII letter.
  const unsigned int kUpperCaseSpan = 'Z' - 'A';
  for (int remaining = len; remaining > 0; remaining--) {
    unsigned int captured = subject[from++];
    unsigned int candidate = subject[current++];
    if (captured != candidate) {
      // Setting bit 0x20 maps an upper-case ASCII letter to its lower-case
      // counterpart; other characters are left untouched.
      if (captured - 'A' <= kUpperCaseSpan) captured |= 0x20;
      if (candidate - 'A' <= kUpperCaseSpan) candidate |= 0x20;
      if (captured != candidate) return false;
    }
  }
  return true;
}
#ifdef DEBUG #ifdef DEBUG
static void TraceInterpreter(const byte* code_base, static void TraceInterpreter(const byte* code_base,
const byte* pc, const byte* pc,
...@@ -96,8 +112,9 @@ static void TraceInterpreter(const byte* code_base, ...@@ -96,8 +112,9 @@ static void TraceInterpreter(const byte* code_base,
template <typename Char>
static bool RawMatch(const byte* code_base, static bool RawMatch(const byte* code_base,
Vector<const uc16> subject, Vector<const Char> subject,
int* registers, int* registers,
int current, int current,
int current_char) { int current_char) {
...@@ -405,23 +422,32 @@ static bool RawMatch(const byte* code_base, ...@@ -405,23 +422,32 @@ static bool RawMatch(const byte* code_base,
bool IrregexpInterpreter::Match(Handle<ByteArray> code_array, bool IrregexpInterpreter::Match(Handle<ByteArray> code_array,
Handle<String> subject16, Handle<String> subject,
int* registers, int* registers,
int start_position) { int start_position) {
ASSERT(StringShape(*subject16).IsTwoByteRepresentation()); ASSERT(subject->IsFlat(StringShape(*subject)));
ASSERT(subject16->IsFlat(StringShape(*subject16)));
AssertNoAllocation a; AssertNoAllocation a;
const byte* code_base = code_array->GetDataStartAddress(); const byte* code_base = code_array->GetDataStartAddress();
StringShape subject_shape(*subject);
uc16 previous_char = '\n'; uc16 previous_char = '\n';
Vector<const uc16> subject_vector = if (subject_shape.IsAsciiRepresentation()) {
Vector<const uc16>(subject16->GetTwoByteData(), subject16->length()); Vector<const char> subject_vector = subject->ToAsciiVector();
if (start_position != 0) previous_char = subject_vector[start_position - 1]; if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(code_base, return RawMatch(code_base,
subject_vector, subject_vector,
registers, registers,
start_position, start_position,
previous_char); previous_char);
} else {
Vector<const uc16> subject_vector = subject->ToUC16Vector();
if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(code_base,
subject_vector,
registers,
start_position,
previous_char);
}
} }
} } // namespace v8::internal } } // namespace v8::internal
...@@ -36,7 +36,7 @@ namespace v8 { namespace internal { ...@@ -36,7 +36,7 @@ namespace v8 { namespace internal {
class IrregexpInterpreter { class IrregexpInterpreter {
public: public:
static bool Match(Handle<ByteArray> code, static bool Match(Handle<ByteArray> code,
Handle<String> subject16, Handle<String> subject,
int* captures, int* captures,
int start_position); int start_position);
}; };
......
...@@ -883,12 +883,13 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp, ...@@ -883,12 +883,13 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value(); int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
if (!subject->IsFlat(StringShape(*subject))) {
FlattenString(subject);
}
switch (tag) { switch (tag) {
case RegExpMacroAssembler::kIA32Implementation: { case RegExpMacroAssembler::kIA32Implementation: {
#ifndef ARM #ifndef ARM
if (!subject->IsFlat(StringShape(*subject))) {
FlattenString(subject);
}
Handle<Code> code = IrregexpNativeCode(irregexp); Handle<Code> code = IrregexpNativeCode(irregexp);
StringShape shape(*subject); StringShape shape(*subject);
...@@ -962,10 +963,8 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp, ...@@ -962,10 +963,8 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
} }
Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp); Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
Handle<String> two_byte_subject = CachedStringToTwoByte(subject);
rc = IrregexpInterpreter::Match(byte_codes, rc = IrregexpInterpreter::Match(byte_codes,
two_byte_subject, subject,
offsets_vector, offsets_vector,
previous_index); previous_index);
break; break;
...@@ -1191,7 +1190,7 @@ DispatchTable* ChoiceNode::GetTable(bool ignore_case) { ...@@ -1191,7 +1190,7 @@ DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
class RegExpCompiler { class RegExpCompiler {
public: public:
RegExpCompiler(int capture_count, bool ignore_case); RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
int AllocateRegister() { return next_register_++; } int AllocateRegister() { return next_register_++; }
...@@ -1215,6 +1214,7 @@ class RegExpCompiler { ...@@ -1215,6 +1214,7 @@ class RegExpCompiler {
inline void DecrementRecursionDepth() { recursion_depth_--; } inline void DecrementRecursionDepth() { recursion_depth_--; }
inline bool ignore_case() { return ignore_case_; } inline bool ignore_case() { return ignore_case_; }
inline bool ascii() { return ascii_; }
private: private:
EndNode* accept_; EndNode* accept_;
...@@ -1223,6 +1223,7 @@ class RegExpCompiler { ...@@ -1223,6 +1223,7 @@ class RegExpCompiler {
int recursion_depth_; int recursion_depth_;
RegExpMacroAssembler* macro_assembler_; RegExpMacroAssembler* macro_assembler_;
bool ignore_case_; bool ignore_case_;
bool ascii_;
}; };
...@@ -1239,11 +1240,12 @@ class RecursionCheck { ...@@ -1239,11 +1240,12 @@ class RecursionCheck {
// Attempts to compile the regexp using an Irregexp code generator. Returns // Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded. // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
: next_register_(2 * (capture_count + 1)), : next_register_(2 * (capture_count + 1)),
work_list_(NULL), work_list_(NULL),
recursion_depth_(0), recursion_depth_(0),
ignore_case_(ignore_case) { ignore_case_(ignore_case),
ascii_(ascii) {
accept_ = new EndNode(EndNode::ACCEPT); accept_ = new EndNode(EndNode::ACCEPT);
} }
...@@ -1682,7 +1684,6 @@ static inline void EmitAtomLetters( ...@@ -1682,7 +1684,6 @@ static inline void EmitAtomLetters(
chars[0], chars[0],
chars[1], chars[1],
on_failure)) { on_failure)) {
ok.Unuse();
} else { } else {
macro_assembler->CheckCharacter(chars[0], &ok); macro_assembler->CheckCharacter(chars[0], &ok);
macro_assembler->CheckNotCharacter(chars[1], on_failure); macro_assembler->CheckNotCharacter(chars[1], on_failure);
...@@ -1711,8 +1712,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ...@@ -1711,8 +1712,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
RegExpCharacterClass* cc, RegExpCharacterClass* cc,
int cp_offset, int cp_offset,
Label* on_failure, Label* on_failure,
bool check_offset) { bool check_offset,
bool ascii) {
ZoneList<CharacterRange>* ranges = cc->ranges(); ZoneList<CharacterRange>* ranges = cc->ranges();
const int max_char = ascii ?
String::kMaxAsciiCharCode :
String::kMaxUC16CharCode;
Label success; Label success;
...@@ -1721,16 +1726,27 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ...@@ -1721,16 +1726,27 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
int range_count = ranges->length(); int range_count = ranges->length();
if (range_count == 0) { int last_valid_range = range_count - 1;
while (last_valid_range >= 0) {
CharacterRange& range = ranges->at(last_valid_range);
if (range.from() <= max_char) {
break;
}
last_valid_range--;
}
if (last_valid_range < 0) {
if (!cc->is_negated()) { if (!cc->is_negated()) {
// TODO(plesner): We can remove this when the node level does our
// ASCII optimizations for us.
macro_assembler->GoTo(on_failure); macro_assembler->GoTo(on_failure);
} }
return; return;
} }
if (range_count == 1 && if (last_valid_range == 0 &&
!cc->is_negated() && !cc->is_negated() &&
ranges->at(0).IsEverything(0xffff)) { ranges->at(0).IsEverything(max_char)) {
// This is a common case hit by non-anchored expressions. // This is a common case hit by non-anchored expressions.
// TODO(erikcorry): We should have a macro assembler instruction that just // TODO(erikcorry): We should have a macro assembler instruction that just
// checks for end of string without loading the character. // checks for end of string without loading the character.
...@@ -1748,18 +1764,22 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ...@@ -1748,18 +1764,22 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->LoadCurrentCharacterUnchecked(cp_offset); macro_assembler->LoadCurrentCharacterUnchecked(cp_offset);
} }
for (int i = 0; i < range_count - 1; i++) { for (int i = 0; i <= last_valid_range; i++) {
CharacterRange& range = ranges->at(i); CharacterRange& range = ranges->at(i);
Label next_range; Label next_range;
uc16 from = range.from(); uc16 from = range.from();
uc16 to = range.to(); uc16 to = range.to();
if (from > max_char) {
continue;
}
if (to > max_char) to = max_char;
if (to == from) { if (to == from) {
macro_assembler->CheckCharacter(to, char_is_in_class); macro_assembler->CheckCharacter(to, char_is_in_class);
} else { } else {
if (from != 0) { if (from != 0) {
macro_assembler->CheckCharacterLT(from, &next_range); macro_assembler->CheckCharacterLT(from, &next_range);
} }
if (to != 0xffff) { if (to != max_char) {
macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
} else { } else {
macro_assembler->GoTo(char_is_in_class); macro_assembler->GoTo(char_is_in_class);
...@@ -1768,10 +1788,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ...@@ -1768,10 +1788,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->Bind(&next_range); macro_assembler->Bind(&next_range);
} }
CharacterRange& range = ranges->at(range_count - 1); CharacterRange& range = ranges->at(last_valid_range);
uc16 from = range.from(); uc16 from = range.from();
uc16 to = range.to(); uc16 to = range.to();
if (to > max_char) to = max_char;
ASSERT(to >= from);
if (to == from) { if (to == from) {
if (cc->is_negated()) { if (cc->is_negated()) {
macro_assembler->CheckCharacter(to, on_failure); macro_assembler->CheckCharacter(to, on_failure);
...@@ -1786,7 +1809,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ...@@ -1786,7 +1809,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->CheckCharacterLT(from, on_failure); macro_assembler->CheckCharacterLT(from, on_failure);
} }
} }
if (to != 0xffff) { if (to != String::kMaxUC16CharCode) {
if (cc->is_negated()) { if (cc->is_negated()) {
macro_assembler->CheckCharacterLT(to + 1, on_failure); macro_assembler->CheckCharacterLT(to + 1, on_failure);
} else { } else {
...@@ -1875,7 +1898,25 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { ...@@ -1875,7 +1898,25 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
macro_assembler->GoTo(backtrack); macro_assembler->GoTo(backtrack);
return true; return true;
} }
// First, handle straight character matches. // First check for non-ASCII text.
// TODO(plesner): We should do this at node level.
if (compiler->ascii()) {
for (int i = element_count - 1; i >= 0; i--) {
TextElement elm = elms_->at(i);
if (elm.type == TextElement::ATOM) {
Vector<const uc16> quarks = elm.data.u_atom->data();
for (int j = quarks.length() - 1; j >= 0; j--) {
if (quarks[j] > String::kMaxAsciiCharCode) {
macro_assembler->GoTo(backtrack);
return true;
}
}
} else {
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
}
}
}
// Second, handle straight character matches.
int checked_up_to = -1; int checked_up_to = -1;
for (int i = element_count - 1; i >= 0; i--) { for (int i = element_count - 1; i >= 0; i--) {
TextElement elm = elms_->at(i); TextElement elm = elms_->at(i);
...@@ -1902,7 +1943,7 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { ...@@ -1902,7 +1943,7 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
} }
} }
// Second, handle case independent letter matches if any. // Third, handle case independent letter matches if any.
if (compiler->ignore_case()) { if (compiler->ignore_case()) {
for (int i = element_count - 1; i >= 0; i--) { for (int i = element_count - 1; i >= 0; i--) {
TextElement elm = elms_->at(i); TextElement elm = elms_->at(i);
...@@ -1930,7 +1971,8 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) { ...@@ -1930,7 +1971,8 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
cc, cc,
cp_offset, cp_offset,
backtrack, backtrack,
checked_up_to < cp_offset); checked_up_to < cp_offset,
compiler->ascii());
if (cp_offset > checked_up_to) checked_up_to = cp_offset; if (cp_offset > checked_up_to) checked_up_to = cp_offset;
} }
} }
...@@ -2791,7 +2833,7 @@ static void AddClassNegated(const uc16 *elmv, ...@@ -2791,7 +2833,7 @@ static void AddClassNegated(const uc16 *elmv,
int elmc, int elmc,
ZoneList<CharacterRange>* ranges) { ZoneList<CharacterRange>* ranges) {
ASSERT(elmv[0] != 0x0000); ASSERT(elmv[0] != 0x0000);
ASSERT(elmv[elmc-1] != 0xFFFF); ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
uc16 last = 0x0000; uc16 last = 0x0000;
for (int i = 0; i < elmc; i += 2) { for (int i = 0; i < elmc; i += 2) {
ASSERT(last <= elmv[i] - 1); ASSERT(last <= elmv[i] - 1);
...@@ -2799,7 +2841,7 @@ static void AddClassNegated(const uc16 *elmv, ...@@ -2799,7 +2841,7 @@ static void AddClassNegated(const uc16 *elmv,
ranges->Add(CharacterRange(last, elmv[i] - 1)); ranges->Add(CharacterRange(last, elmv[i] - 1));
last = elmv[i + 1] + 1; last = elmv[i + 1] + 1;
} }
ranges->Add(CharacterRange(last, 0xFFFF)); ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
} }
...@@ -3187,7 +3229,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) { ...@@ -3187,7 +3229,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) {
entry->AddValue(value); entry->AddValue(value);
// Bail out if the last interval ended at 0xFFFF since otherwise // Bail out if the last interval ended at 0xFFFF since otherwise
// adding 1 will wrap around to 0. // adding 1 will wrap around to 0.
if (entry->to() == 0xFFFF) if (entry->to() == String::kMaxUC16CharCode)
break; break;
ASSERT(entry->to() + 1 > current.from()); ASSERT(entry->to() + 1 > current.from());
current.set_from(entry->to() + 1); current.set_from(entry->to() + 1);
...@@ -3562,14 +3604,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { ...@@ -3562,14 +3604,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
if (last < range.from()) if (last < range.from())
AddRange(CharacterRange(last, range.from() - 1)); AddRange(CharacterRange(last, range.from() - 1));
if (range.to() >= last) { if (range.to() >= last) {
if (range.to() == 0xFFFF) { if (range.to() == String::kMaxUC16CharCode) {
return; return;
} else { } else {
last = range.to() + 1; last = range.to() + 1;
} }
} }
} }
AddRange(CharacterRange(last, 0xFFFF)); AddRange(CharacterRange(last, String::kMaxUC16CharCode));
} }
...@@ -3611,7 +3653,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, ...@@ -3611,7 +3653,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
bool is_multiline, bool is_multiline,
Handle<String> pattern, Handle<String> pattern,
bool is_ascii) { bool is_ascii) {
RegExpCompiler compiler(input->capture_count, ignore_case); RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);
// Wrap the body of the regexp in capture #0. // Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
0, 0,
......
...@@ -48,6 +48,9 @@ namespace v8 { namespace internal { ...@@ -48,6 +48,9 @@ namespace v8 { namespace internal {
const int kGetterIndex = 0; const int kGetterIndex = 0;
const int kSetterIndex = 1; const int kSetterIndex = 1;
const int String::kMaxAsciiCharCode;
const int String::kMaxUC16CharCode;
bool Object::IsInstanceOf(FunctionTemplateInfo* expected) { bool Object::IsInstanceOf(FunctionTemplateInfo* expected) {
// There is a constraint on the object; check // There is a constraint on the object; check
if (!this->IsJSObject()) return false; if (!this->IsJSObject()) return false;
......
...@@ -3212,6 +3212,7 @@ class String: public HeapObject { ...@@ -3212,6 +3212,7 @@ class String: public HeapObject {
// Max ascii char code. // Max ascii char code.
static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar; static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
static const int kMaxUC16CharCode = 0xffff;
// Minimum length for a cons or sliced string. // Minimum length for a cons or sliced string.
static const int kMinNonFlatLength = 13; static const int kMinNonFlatLength = 13;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment