string-search.h 19.8 KB
Newer Older
1
// Copyright 2011 the V8 project authors. All rights reserved.
2 3
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4

5 6
#ifndef V8_STRINGS_STRING_SEARCH_H_
#define V8_STRINGS_STRING_SEARCH_H_
7

8
#include "src/base/strings.h"
9
#include "src/base/vector.h"
10
#include "src/execution/isolate.h"
11

12 13 14
namespace v8 {
namespace internal {

15 16 17 18 19 20 21 22 23 24 25 26
//---------------------------------------------------------------------
// String Search object.
//---------------------------------------------------------------------

// Class holding constants and methods that apply to all string search variants,
// independently of subject and pattern char size.
class StringSearchBase {
 protected:
  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
  // limit, we can fix the size of tables. For a needle longer than this limit,
  // search will not be optimal, since we only build tables for a suffix
  // of the string, but it is a safe approximation.
27
  static const int kBMMaxShift = Isolate::kBMMaxShift;
28 29 30 31 32 33 34 35

  // Reduce alphabet to this size.
  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
  // proportional to the input alphabet. We reduce the alphabet size by
  // equating input characters modulo a smaller alphabet size. This gives
  // a potentially less efficient searching, but is a safe approximation.
  // For needles using only characters in the same Unicode 256-code point page,
  // there is no search speed degradation.
36
  static const int kLatin1AlphabetSize = 256;
37
  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38 39 40 41 42 43

  // Bad-char shift table stored in the state. It's length is the alphabet size.
  // For patterns below this length, the skip length of Boyer-Moore is too short
  // to compensate for the algorithmic overhead compared to simple brute force.
  static const int kBMMinPatternLength = 7;

44
  static inline bool IsOneByteString(base::Vector<const uint8_t> string) {
45 46 47
    return true;
  }

48
  static inline bool IsOneByteString(base::Vector<const base::uc16> string) {
49
    return String::IsOneByte(string.begin(), string.length());
50 51
  }

52
  friend class Isolate;
53 54 55 56
};

template <typename PatternChar, typename SubjectChar>
class StringSearch : private StringSearchBase {
57
 public:
58
  StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)
59 60
      : isolate_(isolate),
        pattern_(pattern),
61
        start_(std::max(0, pattern.length() - kBMMaxShift)) {
62
    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
63
      if (!IsOneByteString(pattern_)) {
64 65 66 67 68 69 70 71 72 73 74 75
        strategy_ = &FailSearch;
        return;
      }
    }
    int pattern_length = pattern_.length();
    if (pattern_length < kBMMinPatternLength) {
      if (pattern_length == 1) {
        strategy_ = &SingleCharSearch;
        return;
      }
      strategy_ = &LinearSearch;
      return;
76
    }
77
    strategy_ = &InitialSearch;
78
  }
79

80
  int Search(base::Vector<const SubjectChar> subject, int index) {
81
    return strategy_(this, subject, index);
82
  }
83 84 85

  static inline int AlphabetSize() {
    if (sizeof(PatternChar) == 1) {
86 87
      // Latin1 needle.
      return kLatin1AlphabetSize;
88
    } else {
89
      DCHECK_EQ(sizeof(PatternChar), 2);
90 91 92
      // UC16 needle.
      return kUC16AlphabetSize;
    }
93
  }
94

95
 private:
96
  using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
97
                                 base::Vector<const SubjectChar>, int);
98 99

  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100
                        base::Vector<const SubjectChar>, int) {
101 102
    return -1;
  }
103

104
  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
105
                              base::Vector<const SubjectChar> subject,
106 107 108
                              int start_index);

  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
109 110
                          base::Vector<const SubjectChar> subject,
                          int start_index);
111 112

  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
113 114
                           base::Vector<const SubjectChar> subject,
                           int start_index);
115 116 117

  static int BoyerMooreHorspoolSearch(
      StringSearch<PatternChar, SubjectChar>* search,
118
      base::Vector<const SubjectChar> subject, int start_index);
119 120

  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
121
                              base::Vector<const SubjectChar> subject,
122 123 124 125 126 127
                              int start_index);

  void PopulateBoyerMooreHorspoolTable();

  void PopulateBoyerMooreTable();

128
  static inline bool exceedsOneByte(uint8_t c) { return false; }
129 130 131 132 133

  static inline bool exceedsOneByte(uint16_t c) {
    return c > String::kMaxOneByteCharCodeU;
  }

134 135 136 137 138 139
  static inline int CharOccurrence(int* bad_char_occurrence,
                                   SubjectChar char_code) {
    if (sizeof(SubjectChar) == 1) {
      return bad_char_occurrence[static_cast<int>(char_code)];
    }
    if (sizeof(PatternChar) == 1) {
140
      if (exceedsOneByte(char_code)) {
141 142
        return -1;
      }
143
      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
144
    }
145 146
    // Both pattern and subject are UC16. Reduce character to equivalence
    // class.
147 148 149 150
    int equiv_class = char_code % kUC16AlphabetSize;
    return bad_char_occurrence[equiv_class];
  }

151 152 153 154 155
  // The following tables are shared by all searches.
  // TODO(lrn): Introduce a way for a pattern to keep its tables
  // between searches (e.g., for an Atom RegExp).

  // Store for the BoyerMoore(Horspool) bad char shift table.
156 157
  // Return a table covering the last kBMMaxShift+1 positions of
  // pattern.
158
  int* bad_char_table() { return isolate_->bad_char_shift_table(); }
159

160
  // Store for the BoyerMoore good suffix shift table.
161 162 163
  int* good_suffix_shift_table() {
    // Return biased pointer that maps the range  [start_..pattern_.length()
    // to the kGoodSuffixShiftTable array.
164
    return isolate_->good_suffix_shift_table() - start_;
165 166
  }

167 168
  // Table used temporarily while building the BoyerMoore good suffix
  // shift table.
169 170 171
  int* suffix_table() {
    // Return biased pointer that maps the range  [start_..pattern_.length()
    // to the kSuffixTable array.
172
    return isolate_->suffix_table() - start_;
173 174
  }

175
  Isolate* isolate_;
176
  // The pattern to search for.
177
  base::Vector<const PatternChar> pattern_;
178 179
  // Pointer to implementation of the search.
  SearchFunction strategy_;
180
  // Cache value of max(0, pattern_length() - kBMMaxShift)
181
  int start_;
182 183
};

184 185 186 187 188 189
// Rounds the address held in |value| down by clearing the bits selected by
// (|alignment| - 1). The mask is only a true alignment mask when |alignment|
// is a power of two.
template <typename T, typename U>
inline T AlignDown(T value, U alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(value);
  const uintptr_t aligned_address = address & ~(alignment - 1);
  return reinterpret_cast<T>(aligned_address);
}

190
// Returns the numerically larger of the two bytes that make up |character|.
inline uint8_t GetHighestValueByte(base::uc16 character) {
  const uint8_t low_byte = static_cast<uint8_t>(character & 0xFF);
  const uint8_t high_byte = static_cast<uint8_t>(character >> 8);
  return low_byte < high_byte ? high_byte : low_byte;
}

// A one-byte character consists of a single byte, which is returned as is.
inline uint8_t GetHighestValueByte(uint8_t character) {
  return character;
}

197
template <typename PatternChar, typename SubjectChar>
198 199 200
inline int FindFirstCharacter(base::Vector<const PatternChar> pattern,
                              base::Vector<const SubjectChar> subject,
                              int index) {
201
  const PatternChar pattern_first_char = pattern[0];
202 203
  const int max_n = (subject.length() - pattern.length() + 1);

204 205 206 207 208 209 210 211 212
  if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
    // Special-case looking for the 0 char in other than one-byte strings.
    // memchr mostly fails in this case due to every other byte being 0 in text
    // that is mostly ascii characters.
    for (int i = index; i < max_n; ++i) {
      if (subject[i] == 0) return i;
    }
    return -1;
  }
213 214 215 216 217
  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
  int pos = index;
  do {
    DCHECK_GE(max_n - pos, 0);
218
    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
219
        memchr(subject.begin() + pos, search_byte,
220
               (max_n - pos) * sizeof(SubjectChar)));
221
    if (char_pos == nullptr) return -1;
222
    char_pos = AlignDown(char_pos, sizeof(SubjectChar));
223
    pos = static_cast<int>(char_pos - subject.begin());
224 225 226
    if (subject[pos] == search_char) return pos;
  } while (++pos < max_n);

227 228 229
  return -1;
}

230 231 232
//---------------------------------------------------------------------
// Single Character Pattern Search Strategy
//---------------------------------------------------------------------
233

234 235 236
// Strategy for patterns of exactly one character: delegates to
// FindFirstCharacter, after rejecting a pattern character that is wider than
// the subject's characters and lies outside the one-byte range.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    base::Vector<const SubjectChar> subject, int index) {
  DCHECK_EQ(1, search->pattern_.length());
  const PatternChar only_char = search->pattern_[0];
  const bool pattern_is_wider = sizeof(PatternChar) > sizeof(SubjectChar);
  if (pattern_is_wider && exceedsOneByte(only_char)) {
    return -1;
  }
  return FindFirstCharacter(search->pattern_, subject, index);
}

//---------------------------------------------------------------------
// Linear Search Strategy
//---------------------------------------------------------------------

template <typename PatternChar, typename SubjectChar>
253
inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
254
                        int length) {
255
  DCHECK_GT(length, 0);
256 257 258 259 260 261 262 263 264 265 266 267 268 269
  int pos = 0;
  do {
    if (pattern[pos] != subject[pos]) {
      return false;
    }
    pos++;
  } while (pos < length);
  return true;
}

// Brute-force strategy for short patterns. It never switches to another
// strategy: locate the first pattern character, then compare the rest.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    base::Vector<const SubjectChar> subject, int index) {
  base::Vector<const PatternChar> pattern = search->pattern_;
  DCHECK_GT(pattern.length(), 1);
  const int pattern_length = pattern.length();
  // Last subject position at which the pattern can still start.
  const int last_start = subject.length() - pattern_length;
  for (int candidate = index; candidate <= last_start; ++candidate) {
    candidate = FindFirstCharacter(pattern, subject, candidate);
    if (candidate == -1) return -1;
    DCHECK_LE(candidate, last_start);
    // The first character already matches; compare the remaining ones. The
    // comparison lives in its own function so it can bail out with a return.
    if (CharCompare(pattern.begin() + 1, subject.begin() + candidate + 1,
                    pattern_length - 1)) {
      return candidate;
    }
  }
  return -1;
}

//---------------------------------------------------------------------
// Boyer-Moore string search
//---------------------------------------------------------------------

// Boyer-Moore strategy using both the bad-character table and the
// good-suffix shift table. The tables must have been populated beforehand
// (BoyerMooreHorspoolSearch calls PopulateBoyerMooreTable before switching
// to this strategy). Returns the match position or -1.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    base::Vector<const SubjectChar> subject, int start_index) {
  base::Vector<const PatternChar> pattern = search->pattern_;
  const int pattern_length = pattern.length();
  const int last_pos = pattern_length - 1;
  // Last subject position at which the pattern can still start.
  const int max_index = subject.length() - pattern_length;
  // Only the last kBMMaxShift characters of the pattern were preprocessed;
  // pattern positions below |table_start| are not covered by the tables.
  const int table_start = search->start_;

  int* occurrences = search->bad_char_table();
  int* gs_shifts = search->good_suffix_shift_table();

  const PatternChar last_char = pattern[last_pos];
  int index = start_index;
  while (index <= max_index) {
    // Slide forward using the bad-character rule until the subject character
    // under the pattern's last position equals the pattern's last character.
    int subject_char = subject[index + last_pos];
    while (last_char != subject_char) {
      index += last_pos - CharOccurrence(occurrences, subject_char);
      if (index > max_index) {
        return -1;
      }
      subject_char = subject[index + last_pos];
    }

    // Compare right to left; on a mismatch |subject_char| holds the
    // mismatching subject character and |j| its pattern position.
    int j = last_pos;
    while (j >= 0) {
      subject_char = subject[index + j];
      if (pattern[j] != subject_char) break;
      j--;
    }
    if (j < 0) {
      return index;
    }

    if (j < table_start) {
      // We have matched more than our tables allow us to be smart about.
      // Fall back on the BMH shift.
      index += last_pos - CharOccurrence(occurrences,
                                         static_cast<SubjectChar>(last_char));
      continue;
    }

    // Take the larger of the bad-character and good-suffix shifts.
    const int bad_char_shift = j - CharOccurrence(occurrences, subject_char);
    const int good_suffix_shift = gs_shifts[j + 1];
    index += good_suffix_shift > bad_char_shift ? good_suffix_shift
                                                : bad_char_shift;
  }

  return -1;
}

344 345 346
// Fills the good-suffix shift table consumed by BoyerMooreSearch, using the
// suffix table as scratch space. Only the pattern range
// [start_, pattern_.length()) is preprocessed.
template <typename PatternChar, typename SubjectChar>
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
  const int pattern_length = pattern_.length();
  const PatternChar* pattern = pattern_.begin();
  // Only look at the last kBMMaxShift characters of pattern (from start_
  // to pattern_length).
  const int start = start_;
  const int length = pattern_length - start;

  // Both tables are biased so that pattern indices can be used as table
  // indices even though only the part of the pattern from |start| on is
  // covered.
  int* shifts = good_suffix_shift_table();
  int* suffixes = this->suffix_table();

  // Initialize tables. |length| doubles as the "not yet assigned" marker in
  // |shifts|.
  for (int pos = start; pos < pattern_length; pos++) {
    shifts[pos] = length;
  }
  shifts[pattern_length] = 1;
  suffixes[pattern_length] = pattern_length + 1;

  if (pattern_length <= start) {
    return;
  }

  // Find suffixes, walking the pattern from its end down to |start|.
  const PatternChar last_char = pattern[pattern_length - 1];
  int suffix = pattern_length + 1;
  int i = pattern_length;
  while (i > start) {
    const PatternChar c = pattern[i - 1];
    while (suffix <= pattern_length && c != pattern[suffix - 1]) {
      if (shifts[suffix] == length) {
        shifts[suffix] = suffix - i;
      }
      suffix = suffixes[suffix];
    }
    --suffix;
    --i;
    suffixes[i] = suffix;
    if (suffix == pattern_length) {
      // No suffix to extend, so we check against last_char only.
      while ((i > start) && (pattern[i - 1] != last_char)) {
        if (shifts[pattern_length] == length) {
          shifts[pattern_length] = pattern_length - i;
        }
        --i;
        suffixes[i] = pattern_length;
      }
      if (i > start) {
        --suffix;
        --i;
        suffixes[i] = suffix;
      }
    }
  }

  // Build shift table using suffixes: every entry still holding the marker
  // gets the shift derived from the current suffix.
  if (suffix < pattern_length) {
    for (int pos = start; pos <= pattern_length; pos++) {
      if (shifts[pos] == length) {
        shifts[pos] = suffix - start;
      }
      if (pos == suffix) {
        suffix = suffixes[suffix];
      }
    }
  }
}

410 411 412
//---------------------------------------------------------------------
// Boyer-Moore-Horspool string search.
//---------------------------------------------------------------------
413

414 415 416
// Boyer-Moore-Horspool strategy: uses only the bad-character table. While
// searching it tracks how much work is being wasted ("badness"); once that
// becomes positive it builds the good-suffix table and hands over to
// BoyerMooreSearch for this and all later searches. Returns the match
// position or -1.
template <typename PatternChar, typename SubjectChar>
int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    StringSearch<PatternChar, SubjectChar>* search,
    base::Vector<const SubjectChar> subject, int start_index) {
  base::Vector<const PatternChar> pattern = search->pattern_;
  const int pattern_length = pattern.length();
  const int last_pos = pattern_length - 1;
  // Last subject position at which the pattern can still start.
  const int max_index = subject.length() - pattern_length;
  int* occurrences = search->bad_char_table();

  // How bad we are doing without a good-suffix table.
  int badness = -pattern_length;

  const PatternChar last_char = pattern[last_pos];
  // Shift applied after a partial match that ended in a mismatch.
  const int last_char_shift =
      last_pos -
      CharOccurrence(occurrences, static_cast<SubjectChar>(last_char));

  // Perform search. No matches were found prior to |index|.
  int index = start_index;
  while (index <= max_index) {
    // Slide forward until the subject character under the pattern's last
    // position equals the pattern's last character.
    int subject_char = subject[index + last_pos];
    while (last_char != subject_char) {
      const int shift = last_pos - CharOccurrence(occurrences, subject_char);
      index += shift;
      badness += 1 - shift;  // at most zero, so badness cannot increase.
      if (index > max_index) {
        return -1;
      }
      subject_char = subject[index + last_pos];
    }

    // The last character matches; compare the rest right to left.
    int j = last_pos - 1;
    while (j >= 0 && pattern[j] == subject[index + j]) j--;
    if (j < 0) {
      return index;
    }

    index += last_char_shift;
    // Badness increases by the number of characters we have
    // checked, and decreases by the number of characters we
    // can skip by shifting. It's a measure of how we are doing
    // compared to reading each character exactly once.
    badness += (pattern_length - j) - last_char_shift;
    if (badness > 0) {
      search->PopulateBoyerMooreTable();
      search->strategy_ = &BoyerMooreSearch;
      return BoyerMooreSearch(search, subject, index);
    }
  }
  return -1;
}

464 465 466
// Fills the bad-character table: for each character bucket, the index of the
// last occurrence of a character from that bucket within the preprocessed
// part of the pattern, not counting the pattern's final character.
template <typename PatternChar, typename SubjectChar>
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
  int* occurrences = bad_char_table();

  // Only preprocess at most kBMMaxShift last characters of pattern.
  const int first = start_;
  const int alphabet_size = AlphabetSize();

  // Default entry for buckets whose characters do not occur in the
  // preprocessed range: one position before that range.
  if (first != 0) {
    const int default_occurrence = first - 1;
    for (int bucket = 0; bucket < alphabet_size; bucket++) {
      occurrences[bucket] = default_occurrence;
    }
  } else {
    // The whole pattern is preprocessed, so the default is -1, which memset
    // can write byte-wise.
    memset(occurrences, -1, alphabet_size * sizeof(*occurrences));
  }

  // Run forwards to populate the table, so that the *last* instance of a
  // character equivalence class is the one registered.
  // Notice: Doesn't include the last character.
  const int last_pos = pattern_.length() - 1;
  for (int i = first; i < last_pos; i++) {
    const PatternChar c = pattern_[i];
    const int bucket = (sizeof(PatternChar) == 1) ? c : c % alphabet_size;
    occurrences[bucket] = i;
  }
}

490 491 492
//---------------------------------------------------------------------
// Linear string search with bailout to BMH.
//---------------------------------------------------------------------
493

494 495
// Simple linear search for short patterns, which bails out if the string
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
496
template <typename PatternChar, typename SubjectChar>
497 498
int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    StringSearch<PatternChar, SubjectChar>* search,
499 500
    base::Vector<const SubjectChar> subject, int index) {
  base::Vector<const PatternChar> pattern = search->pattern_;
501 502 503 504 505 506 507 508
  int pattern_length = pattern.length();
  // Badness is a count of how much work we have done.  When we have
  // done enough work we decide it's probably worth switching to a better
  // algorithm.
  int badness = -10 - (pattern_length << 2);

  // We know our pattern is at least 2 characters, we cache the first so
  // the common case of the first character not matching is faster.
509
  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
510
    badness++;
511
    if (badness <= 0) {
512 513 514
      i = FindFirstCharacter(pattern, subject, i);
      if (i == -1) return -1;
      DCHECK_LE(i, n);
515 516 517 518 519 520 521 522 523
      int j = 1;
      do {
        if (pattern[j] != subject[i + j]) {
          break;
        }
        j++;
      } while (j < pattern_length);
      if (j == pattern_length) {
        return i;
524
      }
525
      badness += j;
526
    } else {
527 528 529
      search->PopulateBoyerMooreHorspoolTable();
      search->strategy_ = &BoyerMooreHorspoolSearch;
      return BoyerMooreHorspoolSearch(search, subject, i);
530 531 532 533 534
    }
  }
  return -1;
}

535 536 537 538
// Perform a a single stand-alone search.
// If searching multiple times for the same pattern, a search
// object should be constructed once and the Search function then called
// for each search.
539
template <typename SubjectChar, typename PatternChar>
540 541
int SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject,
                 base::Vector<const PatternChar> pattern, int start_index) {
542
  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
543
  return search.Search(subject, start_index);
544 545
}

546 547 548 549
// A wrapper function around SearchString that wraps raw pointers to the subject
// and pattern as vectors before calling SearchString. Used from the
// StringIndexOf builtin.
template <typename SubjectChar, typename PatternChar>
550 551 552
intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
                         int subject_length, const PatternChar* pattern_ptr,
                         int pattern_length, int start_index) {
553
  DisallowGarbageCollection no_gc;
554 555
  base::Vector<const SubjectChar> subject(subject_ptr, subject_length);
  base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
556 557 558
  return SearchString(isolate, subject, pattern, start_index);
}

559 560
}  // namespace internal
}  // namespace v8
561

562
#endif  // V8_STRINGS_STRING_SEARCH_H_