Commit dc87d51f authored by lrn@chromium.org

Changed Boyer-Moore's bad-char table code:

- Reduce it to half size if the pattern is ASCII, saving on initialization
- If pattern is ASCII and subject is not, any non-ASCII char can cause a
  full pattern-length shift, even if we haven't indexed the entire pattern.
- Use memset to initialize buffer in the common case where the pattern is
  shorter than the max significant suffix limit.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@519 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent ebd11b73
...@@ -965,7 +965,7 @@ static const int kBMMinPatternLength = 5; ...@@ -965,7 +965,7 @@ static const int kBMMinPatternLength = 5;
// Holds the two buffers used by Boyer-Moore string search's Good Suffix // Holds the two buffers used by Boyer-Moore string search's Good Suffix
// shift. Only allows the last kBMMaxShift characters of the needle // shift. Only allows the last kBMMaxShift characters of the needle
// to be indexed. // to be indexed.
class BMGoodSuffixBuffers: public AllStatic { class BMGoodSuffixBuffers {
public: public:
BMGoodSuffixBuffers() {} BMGoodSuffixBuffers() {}
inline void init(int needle_length) { inline void init(int needle_length) {
...@@ -1005,12 +1005,18 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern, ...@@ -1005,12 +1005,18 @@ static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
// Run forwards to populate bad_char_table, so that *last* instance // Run forwards to populate bad_char_table, so that *last* instance
// of character equivalence class is the one registered. // of character equivalence class is the one registered.
// Notice: Doesn't include the last character. // Notice: Doesn't include the last character.
for (int i = 0; i < kBMAlphabetSize; i++) { int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1
bad_char_occurence[i] = start - 1; : kBMAlphabetSize;
if (start == 0) { // All patterns less than kBMMaxShift in length.
memset(bad_char_occurence, -1, table_size * sizeof(*bad_char_occurence));
} else {
for (int i = 0; i < table_size; i++) {
bad_char_occurence[i] = start - 1;
}
} }
for (int i = start; i < pattern.length() - 1; i++) { for (int i = start; i < pattern.length() - 1; i++) {
pchar c = pattern[i]; pchar c = pattern[i];
int bucket = c % kBMAlphabetSize; int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize;
bad_char_occurence[bucket] = i; bad_char_occurence[bucket] = i;
} }
} }
...@@ -1065,6 +1071,19 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern, ...@@ -1065,6 +1071,19 @@ static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
} }
} }
// Looks up the bad-character table entry for a character code read from the
// subject string.
//
// schar is the subject's character type and pchar is the pattern's; only
// their sizes are inspected.
//
// - One-byte subject: the table is indexed directly with char_code.
// - Wider subject, one-byte pattern: a char_code above
//   String::kMaxAsciiCharCode yields -1 without touching the table;
//   otherwise the table is indexed directly with char_code.
// - Wider subject, wider pattern: the table is indexed with
//   char_code % kBMAlphabetSize.
template <typename schar, typename pchar>
static inline int CharOccurence(int char_code) {
  const bool subject_is_one_byte = (sizeof(schar) == 1);
  const bool pattern_is_one_byte = (sizeof(pchar) == 1);
  if (!subject_is_one_byte) {
    if (!pattern_is_one_byte) {
      // Both sides are wide: fold the code into the table's bucket range.
      return bad_char_occurence[char_code % kBMAlphabetSize];
    }
    // Wide subject, one-byte pattern: codes past the ASCII limit are
    // reported as -1 instead of being looked up.
    if (char_code > String::kMaxAsciiCharCode) {
      return -1;
    }
  }
  // Remaining cases index the table with the raw character code.
  return bad_char_occurence[char_code];
}
// Restricted simplified Boyer-Moore string matching. Restricts tables to a // Restricted simplified Boyer-Moore string matching. Restricts tables to a
// suffix of long pattern strings and handles only equivalence classes // suffix of long pattern strings and handles only equivalence classes
...@@ -1090,7 +1109,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject, ...@@ -1090,7 +1109,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject,
int j = m - 1; int j = m - 1;
int c; int c;
while (last_char != (c = subject[idx + j])) { while (last_char != (c = subject[idx + j])) {
int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; int bc_occ = CharOccurence<schar, pchar>(c);
int shift = j - bc_occ; int shift = j - bc_occ;
idx += shift; idx += shift;
badness += 1 - shift; // at most zero, so badness cannot increase. badness += 1 - shift; // at most zero, so badness cannot increase.
...@@ -1105,7 +1124,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject, ...@@ -1105,7 +1124,7 @@ static int BoyerMooreSimplified(Vector<const schar> subject,
complete = true; complete = true;
return idx; return idx;
} else { } else {
int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; int bc_occ = CharOccurence<schar, pchar>(c);
int shift = bc_occ < j ? j - bc_occ : 1; int shift = bc_occ < j ? j - bc_occ : 1;
idx += shift; idx += shift;
// Badness increases by the number of characters we have // Badness increases by the number of characters we have
...@@ -1141,7 +1160,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, ...@@ -1141,7 +1160,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
int j = m - 1; int j = m - 1;
schar c; schar c;
while (last_char != (c = subject[idx + j])) { while (last_char != (c = subject[idx + j])) {
int shift = j - bad_char_occurence[c % kBMAlphabetSize]; int shift = j - CharOccurence<schar, pchar>(c);
idx += shift; idx += shift;
if (idx > n - m) { if (idx > n - m) {
return -1; return -1;
...@@ -1155,7 +1174,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, ...@@ -1155,7 +1174,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
idx += 1; idx += 1;
} else { } else {
int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift. int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
int bc_occ = bad_char_occurence[c % kBMAlphabetSize]; int bc_occ = CharOccurence<schar, pchar>(c);
int shift = j - bc_occ; // Bad-char shift. int shift = j - bc_occ; // Bad-char shift.
shift = (gs_shift > shift) ? gs_shift : shift; shift = (gs_shift > shift) ? gs_shift : shift;
idx += shift; idx += shift;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment