Commit 5aac3a5b authored by lrn@chromium.org's avatar lrn@chromium.org

KMP algorithm is still left in the source. If this change checks out to be...

KMP algorithm is still left in the source. If this change checks out to be faster, it should be removed.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@463 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 097d3506
......@@ -218,7 +218,7 @@ Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
}
LOG(RegExpExecEvent(re, start_index, subject));
int value = Runtime::StringMatchKmp(subject, needle, start_index);
int value = Runtime::StringMatch(subject, needle, start_index);
if (value == -1) return Factory::null_value();
Handle<JSArray> result = Factory::NewJSArray(2);
SetElement(result, 0, Handle<Smi>(Smi::FromInt(value)));
......@@ -239,7 +239,7 @@ Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
LOG(RegExpExecEvent(re, index, subject));
int value = -1;
if (index + needle_length <= subject_length) {
value = Runtime::StringMatchKmp(subject, needle, index);
value = Runtime::StringMatch(subject, needle, index);
}
if (value == -1) break;
HandleScope scope;
......
......@@ -1053,6 +1053,64 @@ static int SimpleIndexOf(Vector<const schar> subject,
return -1;
}
// Maximal length (+1) of suffix that is indexed. Also the size of the
// maximal bad-character skip.
static const int kBMHSignificantSuffixLength = 0xff;
// Significant bits taken from characters to use in bad-character
// skips, to reduce size of the table for Unicode letters.
static const int kBMHSignificantBitsMask = 0xff;
// Number of elements in bad-char table.
static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
// Simplified Boyer-Moore string matching. Only uses bad-char skipping,
// and restricts table to a suffix of long strings (also restricting
// the maximum possible skip-length) in order to reduce space.
template <typename schar, typename pchar>
static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
Vector<const pchar> pattern,
int start_index) {
ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.
static byte bad_char_map[kBMHBadCharCount];
int m = pattern.length();
int n = subject.length();
// Cap bad char table to last p chars of pattern. Also max skip value.
int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;
memset(bad_char_map, p, kBMHBadCharCount);
// Run forwards to populate bad_char_table, so that *last* instance
// of character equivalence class is the one registered.
// Notice: Doesn't include last character.
for (int i = p < m ? m - p : 0; i < m - 1; i++) {
pchar c = pattern[i];
if (sizeof(schar) == 1 && c > 255) return -1;
bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
}
for (int i = start_index + m - 1, j = m - 1; i < n;) {
schar c = subject[i];
if (c == pattern[j]) {
if (j == 0) {
return i;
}
j--;
i--;
} else {
int skip = bad_char_map[c & kBMHSignificantBitsMask];
if (skip < (m - j)) {
skip = m - j;
}
i += skip;
j = m - 1;
}
}
return -1;
}
// Full KMP pattern match.
template <typename schar, typename pchar> // Pattern & subject char types
static int KMPIndexOf(Vector<const schar> subject,
......@@ -1101,35 +1159,40 @@ static int KMPIndexOf(Vector<const schar> subject,
// Dispatch to different algorithms for different length of pattern/subject
template <typename schar, typename pchar>
static int StringMatchKMP(Vector<const schar> sub,
Vector<const pchar> pat,
int start_index) {
static int StringMatchStrategy(Vector<const schar> sub,
Vector<const pchar> pat,
int start_index) {
int pattern_length = pat.length();
// Searching for one specific character is common. For one
// character patterns the KMP algorithm is guaranteed to slow down
// the search, so we just run through the subject string.
if (pat.length() == 1) {
if (pattern_length == 1) {
return SingleCharIndexOf(sub, pat[0], start_index);
}
// For small searches, KMP is not worth the setup overhead.
if (sub.length() - start_index < 100) {
// For small searches, a complex sort is not worth the setup overhead.
if (sub.length() - start_index < 25) {
return SimpleIndexOf(sub, pat, start_index);
}
// For patterns with a larger length we use the KMP algorithm.
return KMPIndexOf(sub, pat, start_index);
return BoyerMooreHorspoolIndexOf(sub, pat, start_index);
}
// Perform string match of pattern on subject, starting at start index.
// Caller must ensure that 0 <= start_index <= sub->length(),
// and should check that pat->length() + start_index <= sub->length()
int Runtime::StringMatchKmp(Handle<String> sub,
Handle<String> pat,
int start_index) {
int Runtime::StringMatch(Handle<String> sub,
Handle<String> pat,
int start_index) {
ASSERT(0 <= start_index);
ASSERT(start_index <= sub->length());
if (pat->length() == 0) return start_index;
int pattern_length = pat->length();
if (pattern_length == 0) return start_index;
int subject_length = sub->length();
if (start_index + pattern_length > subject_length) return -1;
FlattenString(sub);
FlattenString(pat);
......@@ -1138,15 +1201,15 @@ int Runtime::StringMatchKmp(Handle<String> sub,
if (pat->is_ascii()) {
Vector<const char> pat_vector = ToAsciiVector(*pat);
if (sub->is_ascii()) {
return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index);
}
return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index);
}
Vector<const uc16> pat_vector = ToUC16Vector(*pat);
if (sub->is_ascii()) {
return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
return StringMatchStrategy(ToAsciiVector(*sub), pat_vector, start_index);
}
return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index);
}
......@@ -1161,7 +1224,7 @@ static Object* Runtime_StringIndexOf(Arguments args) {
uint32_t start_index;
if (!Array::IndexFromObject(index, &start_index)) return Smi::FromInt(-1);
int position = Runtime::StringMatchKmp(sub, pat, start_index);
int position = Runtime::StringMatch(sub, pat, start_index);
return Smi::FromInt(position);
}
......
......@@ -335,7 +335,7 @@ class Runtime : public AllStatic {
// Get the runtime function with the given name.
static Function* FunctionForName(const char* name);
static int StringMatchKmp(Handle<String> sub, Handle<String> pat, int index);
static int StringMatch(Handle<String> sub, Handle<String> pat, int index);
// TODO(1240886): The following three methods are *not* handle safe,
// but accept handle arguments. This seems fragile.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment