Commit 19fceabf authored by lrn@chromium.org

The BoyerMooreStringSearch now uses separate functions to build its tables.

This heightens readability.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@497 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 51c7fa95
...@@ -993,65 +993,34 @@ class BMGoodSuffixBuffers: public AllStatic { ...@@ -993,65 +993,34 @@ class BMGoodSuffixBuffers: public AllStatic {
static int bad_char_occurence[kBMAlphabetSize]; static int bad_char_occurence[kBMAlphabetSize];
static BMGoodSuffixBuffers bmgs_buffers; static BMGoodSuffixBuffers bmgs_buffers;
// Restricted Boyer-Moore string matching. Restricts tables to a // Compute the bad-char table for Boyer-Moore in the static buffer.
// suffix of long pattern strings and handles only equivalence classes // Return false if the pattern contains non-ASCII characters that cannot be
// of the full alphabet. This allows us to ensure that tables take only // in the searched string.
// a fixed amount of space. template <typename pchar, bool check_ascii>
template <typename schar, typename pchar> static bool BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
static int BoyerMooreIndexOf(Vector<const schar> subject, int start) {
Vector<const pchar> pattern,
int start_index) {
int m = pattern.length();
int n = subject.length();
// Only preprocess at most kBMMaxShift last characters of pattern.
int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
int len = m - start;
// Run forwards to populate bad_char_table, so that *last* instance // Run forwards to populate bad_char_table, so that *last* instance
// of character equivalence class is the one registered. // of character equivalence class is the one registered.
// Notice: Doesn't include last character. // Notice: Doesn't include the last character.
for (int i = 0; i < kBMAlphabetSize; i++) { for (int i = 0; i < kBMAlphabetSize; i++) {
bad_char_occurence[i] = start - 1; bad_char_occurence[i] = start - 1;
} }
for (int i = start; i < m; i++) { for (int i = start; i < pattern.length(); i++) {
uc32 c = pattern[i]; uc32 c = pattern[i];
bad_char_occurence[c % kBMAlphabetSize] = i; bad_char_occurence[c % kBMAlphabetSize] = i;
if (sizeof(schar) == 1 && if (check_ascii &&
sizeof(pchar) > 1 &&
c > String::kMaxAsciiCharCode) { c > String::kMaxAsciiCharCode) {
return -1; return false;
}
}
// End of Bad Char computation.
int badness = 0; // How bad we are doing without a good-suffix table.
int idx; // No matches found prior to this index.
// Perform search
for (idx = start_index; idx <= n - m;) {
int j = m - 1;
schar c;
while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
if (j < 0) {
return idx;
} else {
int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
int shift = bc_occ < j ? j - bc_occ : 1;
idx += shift;
// Badness increases by the number of characters we have
// checked, and decreases by the number of characters we
// can skip by shifting. It's a measure of how we are doing
// compared to reading each character exactly once.
badness += (m - j) - shift;
if (badness > m) break;
} }
} }
return true;
}
// If we are not done, we got here because we should build the Good Suffix template <typename pchar>
// table and continue searching. static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
if (idx <= n - m) { int start,
int len) {
int m = pattern.length();
// Compute Good Suffix tables. // Compute Good Suffix tables.
bmgs_buffers.init(m); bmgs_buffers.init(m);
...@@ -1070,7 +1039,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, ...@@ -1070,7 +1039,7 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
suffix--; suffix--;
bmgs_buffers.suffix(i) = suffix; bmgs_buffers.suffix(i) = suffix;
if (suffix == m) { if (suffix == m) {
// no suffix to extend, so we check against last_char only. // No suffix to extend, so we check against last_char only.
while (i > start && pattern[i - 1] != last_char) { while (i > start && pattern[i - 1] != last_char) {
if (bmgs_buffers.shift(m) == len) { if (bmgs_buffers.shift(m) == len) {
bmgs_buffers.shift(m) = m - i; bmgs_buffers.shift(m) = m - i;
...@@ -1095,8 +1064,57 @@ static int BoyerMooreIndexOf(Vector<const schar> subject, ...@@ -1095,8 +1064,57 @@ static int BoyerMooreIndexOf(Vector<const schar> subject,
} }
} }
} }
// End of Good Suffix computation. }
// Restricted Boyer-Moore string matching. Restricts tables to a
// suffix of long pattern strings and handles only equivalence classes
// of the full alphabet. This allows us to ensure that tables take only
// a fixed amount of space.
template <typename schar, typename pchar>
static int BoyerMooreIndexOf(Vector<const schar> subject,
Vector<const pchar> pattern,
int start_index) {
int m = pattern.length();
int n = subject.length();
// Only preprocess at most kBMMaxShift last characters of pattern.
int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
int len = m - start;
if (sizeof(pchar) > 1 && sizeof(schar) == 1) {
BoyerMoorePopulateBadCharTable<pchar, true>(pattern, start);
} else {
if (!BoyerMoorePopulateBadCharTable<pchar, false>(pattern, start)) {
return -1;
}
}
int badness = 0; // How bad we are doing without a good-suffix table.
int idx; // No matches found prior to this index.
// Perform search
for (idx = start_index; idx <= n - m;) {
int j = m - 1;
schar c;
while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
if (j < 0) {
return idx;
} else {
int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
int shift = bc_occ < j ? j - bc_occ : 1;
idx += shift;
// Badness increases by the number of characters we have
// checked, and decreases by the number of characters we
// can skip by shifting. It's a measure of how we are doing
// compared to reading each character exactly once.
badness += (m - j) - shift;
if (badness > m) break;
}
}
// If we are not done, we got here because we should build the Good Suffix
// table and continue searching.
if (idx <= n - m) {
BoyerMoorePopulateGoodSuffixTable(pattern, start, len);
// Continue search from i. // Continue search from i.
do { do {
int j = m - 1; int j = m - 1;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment