Commit 9e0609db authored by lrn@chromium.org

Most operations are faster than before.


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@492 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 49c5ed00
...@@ -952,148 +952,175 @@ static Object* Runtime_CharFromCode(Arguments args) { ...@@ -952,148 +952,175 @@ static Object* Runtime_CharFromCode(Arguments args) {
} }
template <typename schar, typename pchar> // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
static int SingleCharIndexOf(Vector<const schar> string, // limit, we can fix the size of tables.
pchar pattern_char, static const int kBMMaxShift = 0xff;
int start_index) { static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.
for (int i = start_index, n = string.length(); i < n; i++) {
if (pattern_char == string[i]) { // Holds the two buffers used by Boyer-Moore string search's Good Suffix
return i; // shift. Only allows the last kBMMaxShift characters of the needle
// to be indexed.
class BMGoodSuffixBuffers: public AllStatic {
public:
BMGoodSuffixBuffers() {}
inline void init(int needle_length) {
ASSERT(needle_length > 1);
int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
int len = needle_length - start;
biased_suffixes_ = suffixes_ - start;
biased_good_suffix_shift_ = good_suffix_shift_ - start;
for (int i = 0; i <= len; i++) {
good_suffix_shift_[i] = len;
} }
} }
return -1; inline int& suffix(int index) {
} ASSERT(biased_suffixes_ + index >= suffixes_);
return biased_suffixes_[index];
// Trivial string search for shorter strings.
template <typename pchar, typename schar>
static int SimpleIndexOf(Vector<const schar> subject,
Vector<const pchar> pattern,
int start_index) {
int pattern_length = pattern.length();
int subject_length = subject.length();
// We know our pattern is at least 2 characters, we cache the first so
// the common case of the first character not matching is faster.
pchar pattern_first_char = pattern[0];
for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
if (subject[i] != pattern_first_char) continue;
bool failure = false;
for (int j = 1; j < pattern_length; j++) {
if (pattern[j] != subject[j+i]) {
failure = true;
break;
}
}
if (!failure) {
return i;
}
} }
return -1; inline int& shift(int index) {
} ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
return biased_good_suffix_shift_[index];
// Maximal length (+1) of suffix that is indexed. Also the size of the }
// maximal bad-character skip. private:
static const int kBMHSignificantSuffixLength = 0xff; int suffixes_[kBMMaxShift + 1];
int good_suffix_shift_[kBMMaxShift + 1];
// Significant bits taken from characters to use in bad-character int *biased_suffixes_;
// skips, to reduce size of the table for Unicode letters. int *biased_good_suffix_shift_;
static const int kBMHSignificantBitsMask = 0xff; DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
};
// Number of elements in bad-char table. // buffers reused by BoyerMoore
static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1; static int bad_char_occurence[kBMAlphabetSize];
static BMGoodSuffixBuffers bmgs_buffers;
// Simplified Boyer-Moore string matching. Only uses bad-char skipping, // Restricted Boyer-Moore string matching. Restricts tables to a
// and restricts table to a suffix of long strings (also restricting // suffix of long pattern strings and handles only equivalence classes
// the maximum possible skip-length) in order to reduce space. // of the full alphabet. This allows us to ensure that tables take only
// a fixed amount of space.
template <typename schar, typename pchar> template <typename schar, typename pchar>
static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject, static int BoyerMooreIndexOf(Vector<const schar> subject,
Vector<const pchar> pattern, Vector<const pchar> pattern,
int start_index) { int start_index) {
ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.
static byte bad_char_map[kBMHBadCharCount];
int m = pattern.length(); int m = pattern.length();
int n = subject.length(); int n = subject.length();
// Cap bad char table to last p chars of pattern. Also max skip value.
int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;
memset(bad_char_map, p, kBMHBadCharCount); // Only preprocess at most kBMMaxShift last characters of pattern.
int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
int len = m - start;
// Run forwards to populate bad_char_table, so that *last* instance // Run forwards to populate bad_char_table, so that *last* instance
// of character equivalence class is the one registered. // of character equivalence class is the one registered.
// Notice: Doesn't include last character. // Notice: Doesn't include last character.
for (int i = p < m ? m - p : 0; i < m - 1; i++) { for (int i = 0; i < kBMAlphabetSize; i++) {
bad_char_occurence[i] = start - 1;
}
for (int i = start; i < m; i++) {
uc32 c = pattern[i]; uc32 c = pattern[i];
bad_char_occurence[c % kBMAlphabetSize] = i;
if (sizeof(schar) == 1 && if (sizeof(schar) == 1 &&
sizeof(pchar) > 1 && sizeof(pchar) > 1 &&
c > String::kMaxAsciiCharCode) { c > String::kMaxAsciiCharCode) {
return -1; return -1;
} }
bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
} }
// End of Bad Char computation.
for (int i = start_index + m - 1, j = m - 1; i < n;) { // Compute Good Suffix shift table.
schar c = subject[i]; bmgs_buffers.init(m);
if (c == pattern[j]) {
if (j == 0) { bmgs_buffers.shift(m-1) = 1;
return i; bmgs_buffers.suffix(m) = m + 1;
pchar last_char = pattern[m - 1];
int suffix = m + 1;
for (int i = m; i > start;) {
for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
if (bmgs_buffers.shift(suffix) == len) {
bmgs_buffers.shift(suffix) = suffix - i;
} }
j--; suffix = bmgs_buffers.suffix(suffix);
i--; }
} else { i--;
int skip = bad_char_map[c & kBMHSignificantBitsMask]; suffix--;
if (skip < (m - j)) { bmgs_buffers.suffix(i) = suffix;
skip = m - j; if (suffix == m) {
// no suffix to extend, so we check against last_char only.
while (i > start && pattern[i - 1] != last_char) {
if (bmgs_buffers.shift(m) == len) {
bmgs_buffers.shift(m) = m - i;
}
i--;
bmgs_buffers.suffix(i) = m;
}
if (i > start) {
i--;
suffix--;
bmgs_buffers.suffix(i) = suffix;
} }
i += skip;
j = m - 1;
} }
} }
return -1; if (suffix < m) {
} for (int i = start; i <= m; i++) {
if (bmgs_buffers.shift(i) == len) {
bmgs_buffers.shift(i) = suffix - start;
// Full KMP pattern match. }
template <typename schar, typename pchar> // Pattern & subject char types if (i == suffix) {
static int KMPIndexOf(Vector<const schar> subject, suffix = bmgs_buffers.suffix(suffix);
Vector<const pchar> pattern, }
int start_index) { }
int subject_length = subject.length(); }
int pattern_length = pattern.length(); // End of Good Suffix computation.
SmartPointer<int> next_table(NewArray<int>(pattern_length));
// Compute KMP "next" table
int i = 0;
int j = -1;
next_table[0] = -1;
pchar p = pattern[0]; // Perform search
while (i < pattern_length - 1) { for (int i = start_index; i <= n - m;) {
while (j > -1 && p != pattern[j]) { int j = m - 1;
j = next_table[j]; schar c;
} while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;
i++; if (j < 0) {
j++; return i;
p = pattern[i]; } else if (j < start) {
if (p == pattern[j]) { // we have matched more than our tables allow us to be smart about.
next_table[i] = next_table[j]; i += 1;
} else { } else {
next_table[i] = j; int gs_shift = bmgs_buffers.shift(j + 1);
int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
int bc_shift = j - bc_occ;
i += (gs_shift > bc_shift) ? gs_shift : bc_shift;
} }
} }
return -1;
}
// Search using the 'next' table. template <typename schar, typename pchar>
int pattern_index = 0; static int SingleCharIndexOf(Vector<const schar> string,
int subject_index = start_index; pchar pattern_char,
while (subject_index < subject_length) { int start_index) {
schar subject_char = subject[subject_index]; for (int i = start_index, n = string.length(); i < n; i++) {
while (pattern_index > -1 && pattern[pattern_index] != subject_char) { if (pattern_char == string[i]) {
pattern_index = next_table[pattern_index]; return i;
} }
pattern_index++; }
subject_index++; return -1;
if (pattern_index >= pattern_length) { }
return subject_index - pattern_index;
// Trivial string search for shorter strings.
template <typename pchar, typename schar>
static int SimpleIndexOf(Vector<const schar> subject,
Vector<const pchar> pattern,
int start_index) {
int pattern_length = pattern.length();
int subject_length = subject.length();
// We know our pattern is at least 2 characters, we cache the first so
// the common case of the first character not matching is faster.
pchar pattern_first_char = pattern[0];
for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {
if (subject[i] != pattern_first_char) continue;
int j = 1;
while (pattern[j] == subject[j+i]) {
j++;
if (j == pattern_length) {
return i;
}
} }
} }
return -1; return -1;
...@@ -1105,19 +1132,15 @@ static int StringMatchStrategy(Vector<const schar> sub, ...@@ -1105,19 +1132,15 @@ static int StringMatchStrategy(Vector<const schar> sub,
Vector<const pchar> pat, Vector<const pchar> pat,
int start_index) { int start_index) {
int pattern_length = pat.length(); int pattern_length = pat.length();
// Searching for one specific character is common. For one ASSERT(pattern_length > 1);
// character patterns the KMP algorithm is guaranteed to slow down
// the search, so we just run through the subject string.
if (pattern_length == 1) {
return SingleCharIndexOf(sub, pat[0], start_index);
}
// For small searches, a complex sort is not worth the setup overhead. // For small searches, a complex sort is not worth the setup overhead.
if (sub.length() - start_index < 25) { int subject_length = sub.length() - start_index;
if (subject_length < 100 || pattern_length < 4) {
return SimpleIndexOf(sub, pat, start_index); return SimpleIndexOf(sub, pat, start_index);
} }
return BoyerMooreHorspoolIndexOf(sub, pat, start_index); return BoyerMooreIndexOf(sub, pat, start_index);
} }
// Perform string match of pattern on subject, starting at start index. // Perform string match of pattern on subject, starting at start index.
...@@ -1136,6 +1159,17 @@ int Runtime::StringMatch(Handle<String> sub, ...@@ -1136,6 +1159,17 @@ int Runtime::StringMatch(Handle<String> sub,
if (start_index + pattern_length > subject_length) return -1; if (start_index + pattern_length > subject_length) return -1;
FlattenString(sub); FlattenString(sub);
// Searching for one specific character is common. For one
// character patterns linear search is necessary, so any smart
// algorithm is unnecessary overhead.
if (pattern_length == 1) {
AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
if (sub->is_ascii_representation()) {
return SingleCharIndexOf(sub->ToAsciiVector(), pat->Get(0), start_index);
}
return SingleCharIndexOf(sub->ToUC16Vector(), pat->Get(0), start_index);
}
FlattenString(pat); FlattenString(pat);
AssertNoAllocation no_heap_allocation; // ensure vectors stay valid AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
......
...@@ -27,6 +27,12 @@ ...@@ -27,6 +27,12 @@
var s = "test test test"; var s = "test test test";
// Single-character patterns, searched from the start of the string and
// from explicit start positions.
assertEquals(0, s.indexOf("t"));
assertEquals(3, s.indexOf("t", 1));
assertEquals(5, s.indexOf("t", 4));
assertEquals(1, s.indexOf("e"));
assertEquals(2, s.indexOf("s"));
assertEquals(5, s.indexOf("test", 4)); assertEquals(5, s.indexOf("test", 4));
assertEquals(5, s.indexOf("test", 5)); assertEquals(5, s.indexOf("test", 5));
assertEquals(10, s.indexOf("test", 6)); assertEquals(10, s.indexOf("test", 6));
...@@ -47,3 +53,104 @@ assertEquals(4, reString.indexOf("[a-z]+")); ...@@ -47,3 +53,104 @@ assertEquals(4, reString.indexOf("[a-z]+"));
assertEquals(10, reString.indexOf("(asdf)?")); assertEquals(10, reString.indexOf("(asdf)?"));
assertEquals(1, String.prototype.indexOf.length); assertEquals(1, String.prototype.indexOf.length);
// Random greek capital letters: Kappa, Alpha, Sigma, Sigma, Epsilon.
// All code units are above U+00FF.
var twoByteString = "\u039a\u0391\u03a3\u03a3\u0395";

// Test single char pattern. The last argument is the message reported when
// the assertion fails, so it names the letter being searched for
// (U+039A is Kappa, not Lamda).
assertEquals(0, twoByteString.indexOf("\u039a"), "Kappa");
assertEquals(1, twoByteString.indexOf("\u0391"), "Alpha");
assertEquals(2, twoByteString.indexOf("\u03a3"), "First Sigma");
assertEquals(3, twoByteString.indexOf("\u03a3",3), "Second Sigma");
assertEquals(4, twoByteString.indexOf("\u0395"), "Epsilon");
assertEquals(-1, twoByteString.indexOf("\u0392"), "Not beta");

// Test multi-char pattern
assertEquals(0, twoByteString.indexOf("\u039a\u0391"), "Kappa Alpha");
assertEquals(1, twoByteString.indexOf("\u0391\u03a3"), "Alpha Sigma");
assertEquals(2, twoByteString.indexOf("\u03a3\u03a3"), "Sigma Sigma");
assertEquals(3, twoByteString.indexOf("\u03a3\u0395"), "Sigma Epsilon");
// Alpha is followed by two Sigmas in the subject, so this must not match.
assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"),
             "Not Alpha Sigma Epsilon");

// Single char pattern once more, this time without a failure message.
assertEquals(4, twoByteString.indexOf("\u0395"));
// Exercise the more elaborate indexOf search algorithms, which are only
// chosen for long strings.
// Build a long string that isn't a simple repeat of a shorter string:
// each round appends the next letter followed by a copy of everything
// built so far ("A", "ABA", "ABACABA", ...).
var long = "A";
var i = 66;  // char code of 'B'
while (i < 76) {  // last letter used is 'K' (char code 75)
  long += String.fromCharCode(i) + long;
  i++;
}

// A 15 character pattern; the assertion below expects it at every index
// that is a multiple of 16 in long.
var pattern = "ABACABADABACABA";
i = 0;
while (i < long.length - pattern.length) {
  var index = long.indexOf(pattern, i);
  // (i + 15) & ~0xf rounds i up to the next multiple of 16.
  assertEquals((i + 15) & ~0xf, index, "Long ABACABA...-string at index " + i);
  i += 7;
}

// Patterns containing 'J'; expected matches are around the first and second
// 'J' in long.
assertEquals(510, long.indexOf("AJABACA"), "Long AJABACA, First J");
assertEquals(1534, long.indexOf("AJABACA", 511), "Long AJABACA, Second J");

pattern = "JABACABADABACABA";
assertEquals(511, long.indexOf(pattern), "Long JABACABA..., First J");
assertEquals(1535, long.indexOf(pattern, 512), "Long JABACABA..., Second J");
// Long natural-language text used as the search subject for the remaining
// indexOf tests.
var lipsum = "lorem ipsum per se esse fugiendum. itaque aiunt hanc quasi "
+ "naturalem atque insitam in animis nostris inesse notionem, ut "
+ "alterum esse appetendum, alterum aspernandum sentiamus. Alii autem,"
+ " quibus ego assentior, cum a philosophis compluribus permulta "
+ "dicantur, cur nec voluptas in bonis sit numeranda nec in malis "
+ "dolor, non existimant oportere nimium nos causae confidere, sed et"
+ " argumentandum et accurate disserendum et rationibus conquisitis de"
+ " voluptate et dolore disputandum putant.\n"
+ "Sed ut perspiciatis, unde omnis iste natus error sit voluptatem "
+ "accusantium doloremque laudantium, totam rem aperiam eaque ipsa,"
+ "quae ab illo inventore veritatis et quasi architecto beatae vitae "
+ "dicta sunt, explicabo. nemo enim ipsam voluptatem, quia voluptas"
+ "sit, aspernatur aut odit aut fugit, sed quia consequuntur magni"
+ " dolores eos, qui ratione voluptatem sequi nesciunt, neque porro"
+ " quisquam est, qui dolorem ipsum, quia dolor sit, amet, "
+ "consectetur, adipisci velit, sed quia non numquam eius modi"
+ " tempora incidunt, ut labore et dolore magnam aliquam quaerat "
+ "voluptatem. ut enim ad minima veniam, quis nostrum exercitationem "
+ "ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi "
+ "consequatur? quis autem vel eum iure reprehenderit, qui in ea "
+ "voluptate velit esse, quam nihil molestiae consequatur, vel illum, "
+ "qui dolorem eum fugiat, quo voluptas nulla pariatur?\n"
+ "At vero eos et accusamus et iusto odio dignissimos ducimus, qui "
+ "blanditiis praesentium voluptatum deleniti atque corrupti, quos "
+ "dolores et quas molestias excepturi sint, obcaecati cupiditate "
+ "non provident, similique sunt in culpa, qui officia deserunt "
+ "mollitia animi, id est laborum et dolorum fuga. et harum quidem "
+ "rerum facilis est et expedita distinctio. nam libero tempore, "
+ "cum soluta nobis est eligendi optio, cumque nihil impedit, quo "
+ "minus id, quod maxime placeat, facere possimus, omnis voluptas "
+ "assumenda est, omnis dolor repellendus. temporibus autem "
+ "quibusdam et aut officiis debitis aut rerum necessitatibus "
+ "saepe eveniet, ut et voluptates repudiandae sint et molestiae "
+ "non recusandae. itaque earum rerum hic tenetur a sapiente "
+ "delectus, ut aut reiciendis voluptatibus maiores alias consequatur "
+ "aut perferendis doloribus asperiores repellat.";
// The pattern opens with the same words as the text itself ("lorem ipsum")
// but continues differently, so the expected match is a later occurrence
// rather than index 0.
assertEquals(893, lipsum.indexOf("lorem ipsum, quia dolor sit, amet"),
"Lipsum");
// Test a lot of substrings of differing length and start-position.
// Each pattern is the slice of lipsum starting at i with length len, so it
// is known to occur at index i.
for (var i = 255; i < lipsum.length; i += 3) {
  for (var len = 661; i + len < lipsum.length; len += 4) {
    var substring = lipsum.substring(i, i + len);
    var index = -1;
    // Step through successive matches, resuming one past the previous hit,
    // until a match at or beyond i is reported.
    do {
      index = lipsum.indexOf(substring, index + 1);
      assertTrue(index != -1,
                 "Lipsum substring " + i + ".." + (i + len-1) + " not found");
      // Every reported match must really contain the pattern. The expected
      // value goes first, as in the other assertions in this file.
      assertEquals(substring, lipsum.substring(index, index + len),
                   "Wrong lipsum substring found: " + i + ".." + (i + len - 1) + "/" +
                   index + ".." + (index + len - 1));
    } while (index >= 0 && index < i);
    // The loop must end exactly on the position the pattern was cut from.
    assertEquals(i, index, "Lipsum match at " + i + ".." + (i + len - 1));
  }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment