test-regexp.cc 58.5 KB
Newer Older
1
// Copyright 2012 the V8 project authors. All rights reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

28 29
#include <cstdlib>
#include <sstream>
30

31
#include "src/v8.h"
32

33 34 35
#include "src/ast.h"
#include "src/char-predicates-inl.h"
#include "src/jsregexp.h"
36
#include "src/ostreams.h"
37 38
#include "src/parser.h"
#include "src/regexp-macro-assembler.h"
39
#include "src/regexp-macro-assembler-irregexp.h"
40
#include "src/string-stream.h"
41
#ifdef V8_INTERPRETED_REGEXP
42
#include "src/interpreter-irregexp.h"
43
#else  // V8_INTERPRETED_REGEXP
44
#include "src/macro-assembler.h"
45
#if V8_TARGET_ARCH_ARM
46
#include "src/arm/assembler-arm.h"  // NOLINT
47 48
#include "src/arm/macro-assembler-arm.h"
#include "src/arm/regexp-macro-assembler-arm.h"
49
#endif
50
#if V8_TARGET_ARCH_ARM64
51 52 53
#include "src/arm64/assembler-arm64.h"
#include "src/arm64/macro-assembler-arm64.h"
#include "src/arm64/regexp-macro-assembler-arm64.h"
54
#endif
55 56 57 58 59
#if V8_TARGET_ARCH_PPC
#include "src/ppc/assembler-ppc.h"
#include "src/ppc/macro-assembler-ppc.h"
#include "src/ppc/regexp-macro-assembler-ppc.h"
#endif
60
#if V8_TARGET_ARCH_MIPS
61 62 63
#include "src/mips/assembler-mips.h"
#include "src/mips/macro-assembler-mips.h"
#include "src/mips/regexp-macro-assembler-mips.h"
64
#endif
65 66 67 68 69
#if V8_TARGET_ARCH_MIPS64
#include "src/mips64/assembler-mips64.h"
#include "src/mips64/macro-assembler-mips64.h"
#include "src/mips64/regexp-macro-assembler-mips64.h"
#endif
70
#if V8_TARGET_ARCH_X64
71 72 73
#include "src/x64/assembler-x64.h"
#include "src/x64/macro-assembler-x64.h"
#include "src/x64/regexp-macro-assembler-x64.h"
74
#endif
75
#if V8_TARGET_ARCH_IA32
76 77 78
#include "src/ia32/assembler-ia32.h"
#include "src/ia32/macro-assembler-ia32.h"
#include "src/ia32/regexp-macro-assembler-ia32.h"
79
#endif
danno@chromium.org's avatar
danno@chromium.org committed
80
#if V8_TARGET_ARCH_X87
81 82 83
#include "src/x87/assembler-x87.h"
#include "src/x87/macro-assembler-x87.h"
#include "src/x87/regexp-macro-assembler-x87.h"
danno@chromium.org's avatar
danno@chromium.org committed
84
#endif
85
#endif  // V8_INTERPRETED_REGEXP
86
#include "test/cctest/cctest.h"
87 88 89 90

using namespace v8::internal;


91
static bool CheckParse(const char* input) {
92
  v8::HandleScope scope(CcTest::isolate());
93
  Zone zone;
94
  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
95
  RegExpCompileData result;
96 97
  return v8::internal::RegExpParser::ParseRegExp(
      CcTest::i_isolate(), &zone, &reader, false, false, &result);
98 99 100
}


101
static void CheckParseEq(const char* input, const char* expected) {
102
  v8::HandleScope scope(CcTest::isolate());
103
  Zone zone;
104
  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
105
  RegExpCompileData result;
106 107
  CHECK(v8::internal::RegExpParser::ParseRegExp(
      CcTest::i_isolate(), &zone, &reader, false, false, &result));
108 109
  CHECK(result.tree != NULL);
  CHECK(result.error.is_null());
110
  std::ostringstream os;
111
  result.tree->Print(os, &zone);
112
  CHECK_EQ(0, strcmp(expected, os.str().c_str()));
113 114
}

115

116
static bool CheckSimple(const char* input) {
117
  v8::HandleScope scope(CcTest::isolate());
118
  Zone zone;
119
  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
120
  RegExpCompileData result;
121 122
  CHECK(v8::internal::RegExpParser::ParseRegExp(
      CcTest::i_isolate(), &zone, &reader, false, false, &result));
123 124
  CHECK(result.tree != NULL);
  CHECK(result.error.is_null());
125
  return result.simple;
126 127
}

128 129 130 131 132
struct MinMaxPair {
  int min_match;
  int max_match;
};

133

134
static MinMaxPair CheckMinMaxMatch(const char* input) {
135
  v8::HandleScope scope(CcTest::isolate());
136
  Zone zone;
137
  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
138
  RegExpCompileData result;
139 140
  CHECK(v8::internal::RegExpParser::ParseRegExp(
      CcTest::i_isolate(), &zone, &reader, false, false, &result));
141 142 143 144 145 146 147 148 149
  CHECK(result.tree != NULL);
  CHECK(result.error.is_null());
  int min_match = result.tree->min_match();
  int max_match = result.tree->max_match();
  MinMaxPair pair = { min_match, max_match };
  return pair;
}


150
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
151
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
152 153 154 155 156
#define CHECK_MIN_MAX(input, min, max)                                         \
  { MinMaxPair min_max = CheckMinMaxMatch(input);                              \
    CHECK_EQ(min, min_max.min_match);                                          \
    CHECK_EQ(max, min_max.max_match);                                          \
  }
157 158

TEST(Parser) {
159 160
  CHECK_PARSE_ERROR("?");

161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
  CheckParseEq("abc", "'abc'");
  CheckParseEq("", "%");
  CheckParseEq("abc|def", "(| 'abc' 'def')");
  CheckParseEq("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
  CheckParseEq("^xxx$", "(: @^i 'xxx' @$i)");
  CheckParseEq("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
  CheckParseEq("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
  CheckParseEq("a*", "(# 0 - g 'a')");
  CheckParseEq("a*?", "(# 0 - n 'a')");
  CheckParseEq("abc+", "(: 'ab' (# 1 - g 'c'))");
  CheckParseEq("abc+?", "(: 'ab' (# 1 - n 'c'))");
  CheckParseEq("xyz?", "(: 'xy' (# 0 1 g 'z'))");
  CheckParseEq("xyz??", "(: 'xy' (# 0 1 n 'z'))");
  CheckParseEq("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
  CheckParseEq("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
  CheckParseEq("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
  CheckParseEq("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
  CheckParseEq("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
  CheckParseEq("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
  CheckParseEq("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
  CheckParseEq("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
  CheckParseEq("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
  CheckParseEq("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
  CheckParseEq("(?:foo)", "'foo'");
  CheckParseEq("(?: foo )", "' foo '");
  CheckParseEq("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
  CheckParseEq("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
  CheckParseEq("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
  CheckParseEq("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
  CheckParseEq("()", "(^ %)");
  CheckParseEq("(?=)", "(-> + %)");
  CheckParseEq("[]", "^[\\x00-\\uffff]");  // Doesn't compile on windows
  CheckParseEq("[^]", "[\\x00-\\uffff]");  // \uffff isn't in codepage 1252
  CheckParseEq("[x]", "[x]");
  CheckParseEq("[xyz]", "[x y z]");
  CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
  CheckParseEq("[-123]", "[- 1 2 3]");
  CheckParseEq("[^123]", "^[1 2 3]");
  CheckParseEq("]", "']'");
  CheckParseEq("}", "'}'");
  CheckParseEq("[a-b-c]", "[a-b - c]");
  CheckParseEq("[\\d]", "[0-9]");
  CheckParseEq("[x\\dz]", "[x 0-9 z]");
  CheckParseEq("[\\d-z]", "[0-9 - z]");
  CheckParseEq("[\\d-\\d]", "[0-9 - 0-9]");
  CheckParseEq("[z-\\d]", "[z - 0-9]");
207
  // Control character outside character class.
208 209 210 211 212
  CheckParseEq("\\cj\\cJ\\ci\\cI\\ck\\cK", "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
  CheckParseEq("\\c!", "'\\c!'");
  CheckParseEq("\\c_", "'\\c_'");
  CheckParseEq("\\c~", "'\\c~'");
  CheckParseEq("\\c1", "'\\c1'");
213
  // Control character inside character class.
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
  CheckParseEq("[\\c!]", "[\\ c !]");
  CheckParseEq("[\\c_]", "[\\x1f]");
  CheckParseEq("[\\c~]", "[\\ c ~]");
  CheckParseEq("[\\ca]", "[\\x01]");
  CheckParseEq("[\\cz]", "[\\x1a]");
  CheckParseEq("[\\cA]", "[\\x01]");
  CheckParseEq("[\\cZ]", "[\\x1a]");
  CheckParseEq("[\\c1]", "[\\x11]");

  CheckParseEq("[a\\]c]", "[a ] c]");
  CheckParseEq("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
  CheckParseEq("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
  CheckParseEq("\\0", "'\\x00'");
  CheckParseEq("\\8", "'8'");
  CheckParseEq("\\9", "'9'");
  CheckParseEq("\\11", "'\\x09'");
  CheckParseEq("\\11a", "'\\x09a'");
  CheckParseEq("\\011", "'\\x09'");
  CheckParseEq("\\00011", "'\\x0011'");
  CheckParseEq("\\118", "'\\x098'");
  CheckParseEq("\\111", "'I'");
  CheckParseEq("\\1111", "'I1'");
  CheckParseEq("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
  CheckParseEq("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
  CheckParseEq("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
  CheckParseEq("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
  CheckParseEq("(x)(x)(x)\\1*",
               "(: (^ 'x') (^ 'x') (^ 'x')"
               " (# 0 - g (<- 1)))");
  CheckParseEq("(x)(x)(x)\\2*",
               "(: (^ 'x') (^ 'x') (^ 'x')"
               " (# 0 - g (<- 2)))");
  CheckParseEq("(x)(x)(x)\\3*",
               "(: (^ 'x') (^ 'x') (^ 'x')"
               " (# 0 - g (<- 3)))");
  CheckParseEq("(x)(x)(x)\\4*",
               "(: (^ 'x') (^ 'x') (^ 'x')"
               " (# 0 - g '\\x04'))");
  CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
  CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
  CheckParseEq("(a)\\1", "(: (^ 'a') (<- 1))");
  CheckParseEq("(a\\1)", "(^ 'a')");
  CheckParseEq("(\\1a)", "(^ 'a')");
  CheckParseEq("(?=a)?a", "'a'");
  CheckParseEq("(?=a){0,10}a", "'a'");
  CheckParseEq("(?=a){1,10}a", "(: (-> + 'a') 'a')");
  CheckParseEq("(?=a){9,10}a", "(: (-> + 'a') 'a')");
  CheckParseEq("(?!a)?a", "'a'");
  CheckParseEq("\\1(a)", "(^ 'a')");
  CheckParseEq("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
  CheckParseEq("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
  CheckParseEq("[\\0]", "[\\x00]");
  CheckParseEq("[\\11]", "[\\x09]");
  CheckParseEq("[\\11a]", "[\\x09 a]");
  CheckParseEq("[\\011]", "[\\x09]");
  CheckParseEq("[\\00011]", "[\\x00 1 1]");
  CheckParseEq("[\\118]", "[\\x09 8]");
  CheckParseEq("[\\111]", "[I]");
  CheckParseEq("[\\1111]", "[I 1]");
  CheckParseEq("\\x34", "'\x34'");
  CheckParseEq("\\x60", "'\x60'");
  CheckParseEq("\\x3z", "'x3z'");
  CheckParseEq("\\c", "'\\c'");
  CheckParseEq("\\u0034", "'\x34'");
  CheckParseEq("\\u003z", "'u003z'");
  CheckParseEq("foo[z]*", "(: 'foo' (# 0 - g [z]))");
284

285
  CHECK_SIMPLE("", false);
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
  CHECK_SIMPLE("a", true);
  CHECK_SIMPLE("a|b", false);
  CHECK_SIMPLE("a\\n", false);
  CHECK_SIMPLE("^a", false);
  CHECK_SIMPLE("a$", false);
  CHECK_SIMPLE("a\\b!", false);
  CHECK_SIMPLE("a\\Bb", false);
  CHECK_SIMPLE("a*", false);
  CHECK_SIMPLE("a*?", false);
  CHECK_SIMPLE("a?", false);
  CHECK_SIMPLE("a??", false);
  CHECK_SIMPLE("a{0,1}?", false);
  CHECK_SIMPLE("a{1,1}?", false);
  CHECK_SIMPLE("a{1,2}?", false);
  CHECK_SIMPLE("a+?", false);
  CHECK_SIMPLE("(a)", false);
  CHECK_SIMPLE("(a)\\1", false);
  CHECK_SIMPLE("(\\1a)", false);
  CHECK_SIMPLE("\\1(a)", false);
  CHECK_SIMPLE("a\\s", false);
  CHECK_SIMPLE("a\\S", false);
  CHECK_SIMPLE("a\\d", false);
  CHECK_SIMPLE("a\\D", false);
  CHECK_SIMPLE("a\\w", false);
  CHECK_SIMPLE("a\\W", false);
  CHECK_SIMPLE("a.", false);
  CHECK_SIMPLE("a\\q", false);
  CHECK_SIMPLE("a[a]", false);
  CHECK_SIMPLE("a[^a]", false);
  CHECK_SIMPLE("a[a-z]", false);
  CHECK_SIMPLE("a[\\q]", false);
  CHECK_SIMPLE("a(?:b)", false);
  CHECK_SIMPLE("a(?=b)", false);
  CHECK_SIMPLE("a(?!b)", false);
  CHECK_SIMPLE("\\x60", false);
  CHECK_SIMPLE("\\u0060", false);
  CHECK_SIMPLE("\\cA", false);
  CHECK_SIMPLE("\\q", false);
  CHECK_SIMPLE("\\1112", false);
  CHECK_SIMPLE("\\0", false);
  CHECK_SIMPLE("(a)\\1", false);
  CHECK_SIMPLE("(?=a)?a", false);
  CHECK_SIMPLE("(?!a)?a\\1", false);
  CHECK_SIMPLE("(?:(?=a))a\\1", false);
330

331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
  CheckParseEq("a{}", "'a{}'");
  CheckParseEq("a{,}", "'a{,}'");
  CheckParseEq("a{", "'a{'");
  CheckParseEq("a{z}", "'a{z}'");
  CheckParseEq("a{1z}", "'a{1z}'");
  CheckParseEq("a{12z}", "'a{12z}'");
  CheckParseEq("a{12,", "'a{12,'");
  CheckParseEq("a{12,3b", "'a{12,3b'");
  CheckParseEq("{}", "'{}'");
  CheckParseEq("{,}", "'{,}'");
  CheckParseEq("{", "'{'");
  CheckParseEq("{z}", "'{z}'");
  CheckParseEq("{1z}", "'{1z}'");
  CheckParseEq("{12z}", "'{12z}'");
  CheckParseEq("{12,", "'{12,'");
  CheckParseEq("{12,3b", "'{12,3b'");
347 348 349 350 351 352 353 354 355 356 357 358 359

  CHECK_MIN_MAX("a", 1, 1);
  CHECK_MIN_MAX("abc", 3, 3);
  CHECK_MIN_MAX("a[bc]d", 3, 3);
  CHECK_MIN_MAX("a|bc", 1, 2);
  CHECK_MIN_MAX("ab|c", 1, 2);
  CHECK_MIN_MAX("a||bc", 0, 2);
  CHECK_MIN_MAX("|", 0, 0);
  CHECK_MIN_MAX("(?:ab)", 2, 2);
  CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
  CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
  CHECK_MIN_MAX("(ab)", 2, 2);
  CHECK_MIN_MAX("(ab|cde)", 2, 3);
360 361
  CHECK_MIN_MAX("(ab)\\1", 2, 4);
  CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
  CHECK_MIN_MAX("(?:ab)?", 0, 2);
  CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a?", 0, 1);
  CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a??", 0, 1);
  CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)?", 0, 1);
  CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a{0}", 0, 0);
  CHECK_MIN_MAX("(?:a+){0}", 0, 0);
  CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
  CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
  CHECK_MIN_MAX("a\\bc", 2, 2);
  CHECK_MIN_MAX("a\\Bc", 2, 2);
  CHECK_MIN_MAX("a\\sc", 3, 3);
  CHECK_MIN_MAX("a\\Sc", 3, 3);
  CHECK_MIN_MAX("a(?=b)c", 2, 2);
  CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
  CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
396 397
}

398

399
TEST(ParserRegression) {
400 401 402 403
  CheckParseEq("[A-Z$-][x]", "(! [A-Z $ -] [x])");
  CheckParseEq("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
  CheckParseEq("{", "'{'");
  CheckParseEq("a|", "(| 'a' %)");
404 405 406 407
}

static void ExpectError(const char* input,
                        const char* expected) {
408
  v8::HandleScope scope(CcTest::isolate());
409
  Zone zone;
410
  FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
411
  RegExpCompileData result;
412 413
  CHECK(!v8::internal::RegExpParser::ParseRegExp(
            CcTest::i_isolate(), &zone, &reader, false, false, &result));
414 415
  CHECK(result.tree == NULL);
  CHECK(!result.error.is_null());
rmcilroy's avatar
rmcilroy committed
416
  v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
417
  CHECK_EQ(0, strcmp(expected, str.get()));
418 419 420
}


421
TEST(Errors) {
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
  const char* kEndBackslash = "\\ at end of pattern";
  ExpectError("\\", kEndBackslash);
  const char* kUnterminatedGroup = "Unterminated group";
  ExpectError("(foo", kUnterminatedGroup);
  const char* kInvalidGroup = "Invalid group";
  ExpectError("(?", kInvalidGroup);
  const char* kUnterminatedCharacterClass = "Unterminated character class";
  ExpectError("[", kUnterminatedCharacterClass);
  ExpectError("[a-", kUnterminatedCharacterClass);
  const char* kNothingToRepeat = "Nothing to repeat";
  ExpectError("*", kNothingToRepeat);
  ExpectError("?", kNothingToRepeat);
  ExpectError("+", kNothingToRepeat);
  ExpectError("{1}", kNothingToRepeat);
  ExpectError("{1,2}", kNothingToRepeat);
  ExpectError("{1,}", kNothingToRepeat);
438 439 440 441

  // Check that we don't allow more than kMaxCapture captures
  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
  const char* kTooManyCaptures = "Too many captures";
442
  std::ostringstream os;
443
  for (int i = 0; i <= kMaxCaptures; i++) {
444
    os << "()";
445
  }
446
  ExpectError(os.str().c_str(), kTooManyCaptures);
447 448 449 450 451 452 453 454 455 456 457 458 459
}


static bool IsDigit(uc16 c) {
  return ('0' <= c && c <= '9');
}


static bool NotDigit(uc16 c) {
  return !IsDigit(c);
}


460 461 462 463
static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
  // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
  // WhiteSpace (7.2) and LineTerminator (7.3) values.
  return v8::internal::WhiteSpaceOrLineTerminator::Is(c);
464 465 466
}


467 468
static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
  return !IsWhiteSpaceOrLineTerminator(c);
469 470 471 472
}


static bool NotWord(uc16 c) {
473
  return !IsRegExpWord(c);
474 475 476 477
}


static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
478
  Zone zone;
479
  ZoneList<CharacterRange>* ranges =
480 481
      new(&zone) ZoneList<CharacterRange>(2, &zone);
  CharacterRange::AddClassEscape(c, ranges, &zone);
482 483 484 485 486 487 488 489 490 491 492 493
  for (unsigned i = 0; i < (1 << 16); i++) {
    bool in_class = false;
    for (int j = 0; !in_class && j < ranges->length(); j++) {
      CharacterRange& range = ranges->at(j);
      in_class = (range.from() <= i && i <= range.to());
    }
    CHECK_EQ(pred(i), in_class);
  }
}


TEST(CharacterClassEscapes) {
494
  TestCharacterClassEscapes('.', IsRegExpNewline);
495 496
  TestCharacterClassEscapes('d', IsDigit);
  TestCharacterClassEscapes('D', NotDigit);
497 498
  TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
  TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
499
  TestCharacterClassEscapes('w', IsRegExpWord);
500 501 502 503
  TestCharacterClassEscapes('W', NotWord);
}


504 505
static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
                           bool is_one_byte, Zone* zone) {
506
  Isolate* isolate = CcTest::i_isolate();
507
  FlatStringReader reader(isolate, CStrVector(input));
508
  RegExpCompileData compile_data;
509 510 511
  if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone,
                                               &reader, multiline, unicode,
                                               &compile_data))
512
    return NULL;
513 514 515
  Handle<String> pattern = isolate->factory()
                               ->NewStringFromUtf8(CStrVector(input))
                               .ToHandleChecked();
516
  Handle<String> sample_subject =
517
      isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
518 519
  RegExpEngine::Compile(isolate, zone, &compile_data, false, false, multiline,
                        false, pattern, sample_subject, is_one_byte);
520
  return compile_data.node;
521 522 523
}


524 525
static void Execute(const char* input, bool multiline, bool unicode,
                    bool is_one_byte, bool dot_output = false) {
526
  v8::HandleScope scope(CcTest::isolate());
527
  Zone zone;
528
  RegExpNode* node = Compile(input, multiline, unicode, is_one_byte, &zone);
529 530 531
  USE(node);
#ifdef DEBUG
  if (dot_output) {
532
    RegExpEngine::DotPrint(input, node, false);
533 534 535 536 537 538 539 540 541 542
  }
#endif  // DEBUG
}


class TestConfig {
 public:
  typedef int Key;
  typedef int Value;
  static const int kNoKey;
543
  static int NoValue() { return 0; }
544 545 546 547 548 549 550 551 552 553 554 555 556 557
  static inline int Compare(int a, int b) {
    if (a < b)
      return -1;
    else if (a > b)
      return 1;
    else
      return 0;
  }
};


const int TestConfig::kNoKey = 0;


558
static unsigned PseudoRandom(int i, int j) {
559 560 561 562 563
  return ~(~((i * 781) ^ (j * 329)));
}


TEST(SplayTreeSimple) {
564
  static const unsigned kLimit = 1000;
565
  Zone zone;
566
  ZoneSplayTree<TestConfig> tree(&zone);
567 568
  bool seen[kLimit];
  for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
569
#define CHECK_MAPS_EQUAL() do {                                      \
570 571
    for (unsigned k = 0; k < kLimit; k++)                            \
      CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
572 573 574
  } while (false)
  for (int i = 0; i < 50; i++) {
    for (int j = 0; j < 50; j++) {
575
      int next = PseudoRandom(i, j) % kLimit;
576
      if (seen[next]) {
577 578 579 580 581 582 583
        // We've already seen this one.  Check the value and remove
        // it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.Find(next, &loc));
        CHECK_EQ(next, loc.key());
        CHECK_EQ(3 * next, loc.value());
        tree.Remove(next);
584
        seen[next] = false;
585 586 587 588 589 590 591 592
        CHECK_MAPS_EQUAL();
      } else {
        // Check that it wasn't there already and then add it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(!tree.Find(next, &loc));
        CHECK(tree.Insert(next, &loc));
        CHECK_EQ(next, loc.key());
        loc.set_value(3 * next);
593
        seen[next] = true;
594 595 596
        CHECK_MAPS_EQUAL();
      }
      int val = PseudoRandom(j, i) % kLimit;
597 598 599 600 601
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindGreatestLessThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        break;
602 603
      }
      val = PseudoRandom(i + j, i - j) % kLimit;
604 605 606 607 608
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindLeastGreaterThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        break;
609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
      }
    }
  }
}


TEST(DispatchTableConstruction) {
  // Initialize test data.
  static const int kLimit = 1000;
  static const int kRangeCount = 8;
  static const int kRangeSize = 16;
  uc16 ranges[kRangeCount][2 * kRangeSize];
  for (int i = 0; i < kRangeCount; i++) {
    Vector<uc16> range(ranges[i], 2 * kRangeSize);
    for (int j = 0; j < 2 * kRangeSize; j++) {
      range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
    }
    range.Sort();
    for (int j = 1; j < 2 * kRangeSize; j++) {
      CHECK(range[j-1] <= range[j]);
    }
  }
  // Enter test data into dispatch table.
632
  Zone zone;
633
  DispatchTable table(&zone);
634 635 636
  for (int i = 0; i < kRangeCount; i++) {
    uc16* range = ranges[i];
    for (int j = 0; j < 2 * kRangeSize; j += 2)
637
      table.AddRange(CharacterRange(range[j], range[j + 1]), i, &zone);
638 639 640 641 642 643 644 645 646 647 648 649 650 651
  }
  // Check that the table looks as we would expect
  for (int p = 0; p < kLimit; p++) {
    OutSet* outs = table.Get(p);
    for (int j = 0; j < kRangeCount; j++) {
      uc16* range = ranges[j];
      bool is_on = false;
      for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
        is_on = (range[k] <= p && p <= range[k + 1]);
      CHECK_EQ(is_on, outs->Get(j));
    }
  }
}

652

653 654 655 656 657 658 659 660 661
// Test of debug-only syntax.
#ifdef DEBUG

TEST(ParsePossessiveRepetition) {
  bool old_flag_value = FLAG_regexp_possessive_quantifier;

  // Enable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = true;

662 663 664 665 666
  CheckParseEq("a*+", "(# 0 - p 'a')");
  CheckParseEq("a++", "(# 1 - p 'a')");
  CheckParseEq("a?+", "(# 0 1 p 'a')");
  CheckParseEq("a{10,20}+", "(# 10 20 p 'a')");
  CheckParseEq("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
667 668 669 670 671 672 673 674 675 676 677 678 679 680

  // Disable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = false;

  CHECK_PARSE_ERROR("a*+");
  CHECK_PARSE_ERROR("a++");
  CHECK_PARSE_ERROR("a?+");
  CHECK_PARSE_ERROR("a{10,20}+");
  CHECK_PARSE_ERROR("a{10,20}+b");

  FLAG_regexp_possessive_quantifier = old_flag_value;
}

#endif
681

682 683
// Tests of interpreter.

684

685
#ifndef V8_INTERPRETED_REGEXP
686

lrn@chromium.org's avatar
lrn@chromium.org committed
687
#if V8_TARGET_ARCH_IA32
688
typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
lrn@chromium.org's avatar
lrn@chromium.org committed
689
#elif V8_TARGET_ARCH_X64
690
typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
lrn@chromium.org's avatar
lrn@chromium.org committed
691 692
#elif V8_TARGET_ARCH_ARM
typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
693 694
#elif V8_TARGET_ARCH_ARM64
typedef RegExpMacroAssemblerARM64 ArchRegExpMacroAssembler;
695 696
#elif V8_TARGET_ARCH_PPC
typedef RegExpMacroAssemblerPPC ArchRegExpMacroAssembler;
697
#elif V8_TARGET_ARCH_MIPS
698
typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
699 700
#elif V8_TARGET_ARCH_MIPS64
typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
danno@chromium.org's avatar
danno@chromium.org committed
701 702
#elif V8_TARGET_ARCH_X87
typedef RegExpMacroAssemblerX87 ArchRegExpMacroAssembler;
703 704
#endif

705 706
class ContextInitializer {
 public:
707
  ContextInitializer()
708 709
      : scope_(CcTest::isolate()),
        env_(v8::Context::New(CcTest::isolate())) {
710 711 712 713 714 715 716
    env_->Enter();
  }
  ~ContextInitializer() {
    env_->Exit();
  }
 private:
  v8::HandleScope scope_;
717
  v8::Handle<v8::Context> env_;
718 719
};

720

721 722 723 724 725
static ArchRegExpMacroAssembler::Result Execute(Code* code,
                                                String* input,
                                                int start_offset,
                                                const byte* input_start,
                                                const byte* input_end,
726
                                                int* captures) {
727
  return NativeRegExpMacroAssembler::Execute(
728
      code,
729
      input,
730
      start_offset,
731 732
      input_start,
      input_end,
733
      captures,
734
      0,
735
      CcTest::i_isolate());
736 737
}

738

739
TEST(MacroAssemblerNativeSuccess) {
740
  v8::V8::Initialize();
741
  ContextInitializer initializer;
742
  Isolate* isolate = CcTest::i_isolate();
743
  Factory* factory = isolate->factory();
744
  Zone zone;
745

746 747
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             4);
748 749 750

  m.Succeed();

751
  Handle<String> source = factory->NewStringFromStaticChars("");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
752
  Handle<Object> code_object = m.GetCode(source);
753 754 755
  Handle<Code> code = Handle<Code>::cast(code_object);

  int captures[4] = {42, 37, 87, 117};
756
  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
757
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
758 759
  const byte* start_adr =
      reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
760

761 762 763 764 765 766
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + seq_input->length(),
767
              captures);
768

769
  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
770 771 772 773 774 775 776
  CHECK_EQ(-1, captures[0]);
  CHECK_EQ(-1, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);
}


777
TEST(MacroAssemblerNativeSimple) {
778
  v8::V8::Initialize();
779
  ContextInitializer initializer;
780
  Isolate* isolate = CcTest::i_isolate();
781
  Factory* factory = isolate->factory();
782
  Zone zone;
783

784 785
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             4);
786

787 788 789 790 791 792 793 794 795
  Label fail, backtrack;
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(1, NULL, false);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(0, NULL, false);
  m.CheckNotCharacter('f', NULL);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
796
  m.WriteCurrentPositionToRegister(0, 0);
797
  m.WriteCurrentPositionToRegister(1, 3);
798
  m.AdvanceCurrentPosition(3);
799
  m.PushBacktrack(&backtrack);
800
  m.Succeed();
801 802
  m.Bind(&backtrack);
  m.Backtrack();
803 804 805
  m.Bind(&fail);
  m.Fail();

806
  Handle<String> source = factory->NewStringFromStaticChars("^foo");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
807
  Handle<Object> code_object = m.GetCode(source);
808 809 810
  Handle<Code> code = Handle<Code>::cast(code_object);

  int captures[4] = {42, 37, 87, 117};
811
  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
812
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
813 814
  Address start_adr = seq_input->GetCharsAddress();

815 816 817 818 819 820
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
821
              captures);
822

823
  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
824 825 826 827 828
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

829
  input = factory->NewStringFromStaticChars("barbarbar");
830
  seq_input = Handle<SeqOneByteString>::cast(input);
831 832
  start_adr = seq_input->GetCharsAddress();

833 834 835 836 837
  result = Execute(*code,
                   *input,
                   0,
                   start_adr,
                   start_adr + input->length(),
838
                   captures);
839

840
  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
841 842 843
}


844
TEST(MacroAssemblerNativeSimpleUC16) {
845
  v8::V8::Initialize();
846
  ContextInitializer initializer;
847
  Isolate* isolate = CcTest::i_isolate();
848
  Factory* factory = isolate->factory();
849
  Zone zone;
850

851 852
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::UC16,
                             4);
853

854 855 856 857 858 859 860 861 862
  Label fail, backtrack;
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(1, NULL, false);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(0, NULL, false);
  m.CheckNotCharacter('f', NULL);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
863
  m.WriteCurrentPositionToRegister(0, 0);
864
  m.WriteCurrentPositionToRegister(1, 3);
865
  m.AdvanceCurrentPosition(3);
866
  m.PushBacktrack(&backtrack);
867
  m.Succeed();
868 869
  m.Bind(&backtrack);
  m.Backtrack();
870 871 872
  m.Bind(&fail);
  m.Fail();

873
  Handle<String> source = factory->NewStringFromStaticChars("^foo");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
874
  Handle<Object> code_object = m.GetCode(source);
875 876 877
  Handle<Code> code = Handle<Code>::cast(code_object);

  int captures[4] = {42, 37, 87, 117};
878
  const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
879
                              static_cast<uc16>(0x2603)};
880 881
  Handle<String> input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data, 6)).ToHandleChecked();
882 883 884
  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

885 886 887 888 889 890
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
891
              captures);
892

893
  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
894 895 896 897 898
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

899
  const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
900
                               static_cast<uc16>(0x2603)};
901 902
  input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data2, 9)).ToHandleChecked();
903 904 905
  seq_input = Handle<SeqTwoByteString>::cast(input);
  start_adr = seq_input->GetCharsAddress();

906 907 908 909 910
  result = Execute(*code,
                   *input,
                   0,
                   start_adr,
                   start_adr + input->length() * 2,
911
                   captures);
912

913
  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
914 915 916
}


917
TEST(MacroAssemblerNativeBacktrack) {
918
  v8::V8::Initialize();
919
  ContextInitializer initializer;
920
  Isolate* isolate = CcTest::i_isolate();
921
  Factory* factory = isolate->factory();
922
  Zone zone;
923

924 925
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             0);
926 927 928 929 930 931 932 933 934 935 936 937

  Label fail;
  Label backtrack;
  m.LoadCurrentCharacter(10, &fail);
  m.Succeed();
  m.Bind(&fail);
  m.PushBacktrack(&backtrack);
  m.LoadCurrentCharacter(10, NULL);
  m.Succeed();
  m.Bind(&backtrack);
  m.Fail();

938
  Handle<String> source = factory->NewStringFromStaticChars("..........");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
939
  Handle<Object> code_object = m.GetCode(source);
940 941
  Handle<Code> code = Handle<Code>::cast(code_object);

942
  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
943
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
944 945
  Address start_adr = seq_input->GetCharsAddress();

946 947 948 949 950 951
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
952
              NULL);
953

954
  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
955 956
}

957

958
TEST(MacroAssemblerNativeBackReferenceLATIN1) {
959
  v8::V8::Initialize();
960
  ContextInitializer initializer;
961
  Isolate* isolate = CcTest::i_isolate();
962
  Factory* factory = isolate->factory();
963
  Zone zone;
964

965 966
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             4);
967

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
968
  m.WriteCurrentPositionToRegister(0, 0);
969
  m.AdvanceCurrentPosition(2);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
970
  m.WriteCurrentPositionToRegister(1, 0);
971 972 973 974 975 976 977
  Label nomatch;
  m.CheckNotBackReference(0, &nomatch);
  m.Fail();
  m.Bind(&nomatch);
  m.AdvanceCurrentPosition(2);
  Label missing_match;
  m.CheckNotBackReference(0, &missing_match);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
978
  m.WriteCurrentPositionToRegister(2, 0);
979 980 981 982
  m.Succeed();
  m.Bind(&missing_match);
  m.Fail();

983
  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\1");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
984
  Handle<Object> code_object = m.GetCode(source);
985 986
  Handle<Code> code = Handle<Code>::cast(code_object);

987
  Handle<String> input = factory->NewStringFromStaticChars("fooofo");
988
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
989 990
  Address start_adr = seq_input->GetCharsAddress();

lrn@chromium.org's avatar
lrn@chromium.org committed
991
  int output[4];
992 993 994 995 996 997
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
998
              output);
999 1000

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1001 1002 1003
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
lrn@chromium.org's avatar
lrn@chromium.org committed
1004
  CHECK_EQ(-1, output[3]);
1005 1006 1007
}


1008
TEST(MacroAssemblerNativeBackReferenceUC16) {
1009 1010
  v8::V8::Initialize();
  ContextInitializer initializer;
1011
  Isolate* isolate = CcTest::i_isolate();
1012
  Factory* factory = isolate->factory();
1013
  Zone zone;
1014

1015 1016
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::UC16,
                             4);
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032

  m.WriteCurrentPositionToRegister(0, 0);
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(1, 0);
  Label nomatch;
  m.CheckNotBackReference(0, &nomatch);
  m.Fail();
  m.Bind(&nomatch);
  m.AdvanceCurrentPosition(2);
  Label missing_match;
  m.CheckNotBackReference(0, &missing_match);
  m.WriteCurrentPositionToRegister(2, 0);
  m.Succeed();
  m.Bind(&missing_match);
  m.Fail();

1033
  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\1");
1034 1035 1036 1037
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
1038 1039
  Handle<String> input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data, 6)).ToHandleChecked();
1040 1041 1042
  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

lrn@chromium.org's avatar
lrn@chromium.org committed
1043
  int output[4];
1044 1045
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
1046 1047 1048 1049 1050
              *input,
              0,
              start_adr,
              start_adr + input->length() * 2,
              output);
1051

1052
  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1053 1054 1055
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
lrn@chromium.org's avatar
lrn@chromium.org committed
1056
  CHECK_EQ(-1, output[3]);
1057 1058 1059 1060
}



1061
TEST(MacroAssemblernativeAtStart) {
1062
  v8::V8::Initialize();
1063
  ContextInitializer initializer;
1064
  Isolate* isolate = CcTest::i_isolate();
1065
  Factory* factory = isolate->factory();
1066
  Zone zone;
1067

1068 1069
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             0);
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080

  Label not_at_start, newline, fail;
  m.CheckNotAtStart(&not_at_start);
  // Check that prevchar = '\n' and current = 'f'.
  m.CheckCharacter('\n', &newline);
  m.Bind(&fail);
  m.Fail();
  m.Bind(&newline);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('f', &fail);
  m.Succeed();
1081

1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
  m.Bind(&not_at_start);
  // Check that prevchar = 'o' and current = 'b'.
  Label prevo;
  m.CheckCharacter('o', &prevo);
  m.Fail();
  m.Bind(&prevo);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('b', &fail);
  m.Succeed();

1092
  Handle<String> source = factory->NewStringFromStaticChars("(^f|ob)");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1093
  Handle<Object> code_object = m.GetCode(source);
1094 1095
  Handle<Code> code = Handle<Code>::cast(code_object);

1096
  Handle<String> input = factory->NewStringFromStaticChars("foobar");
1097
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1098 1099
  Address start_adr = seq_input->GetCharsAddress();

1100 1101 1102 1103 1104 1105
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
1106
              NULL);
1107 1108 1109 1110 1111 1112 1113 1114

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);

  result = Execute(*code,
                   *input,
                   3,
                   start_adr + 3,
                   start_adr + input->length(),
1115
                   NULL);
1116 1117

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1118 1119 1120
}


1121
TEST(MacroAssemblerNativeBackRefNoCase) {
1122
  v8::V8::Initialize();
1123
  ContextInitializer initializer;
1124
  Isolate* isolate = CcTest::i_isolate();
1125
  Factory* factory = isolate->factory();
1126
  Zone zone;
1127

1128 1129
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             4);
1130 1131 1132

  Label fail, succ;

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1133 1134
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(2, 0);
1135
  m.AdvanceCurrentPosition(3);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1136
  m.WriteCurrentPositionToRegister(3, 0);
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149
  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "AbC".
  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "ABC".
  Label expected_fail;
  m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
  m.Bind(&fail);
  m.Fail();

  m.Bind(&expected_fail);
  m.AdvanceCurrentPosition(3);  // Skip "xYz"
  m.CheckNotBackReferenceIgnoreCase(2, &succ);
  m.Fail();

  m.Bind(&succ);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1150
  m.WriteCurrentPositionToRegister(1, 0);
1151 1152
  m.Succeed();

erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1153
  Handle<String> source =
1154
      factory->NewStringFromStaticChars("^(abc)\1\1(?!\1)...(?!\1)");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1155
  Handle<Object> code_object = m.GetCode(source);
1156 1157
  Handle<Code> code = Handle<Code>::cast(code_object);

1158
  Handle<String> input = factory->NewStringFromStaticChars("aBcAbCABCxYzab");
1159
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1160 1161 1162
  Address start_adr = seq_input->GetCharsAddress();

  int output[4];
1163 1164 1165 1166 1167 1168
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
1169
              output);
1170 1171

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1172 1173 1174 1175 1176 1177 1178 1179
  CHECK_EQ(0, output[0]);
  CHECK_EQ(12, output[1]);
  CHECK_EQ(0, output[2]);
  CHECK_EQ(3, output[3]);
}



1180
TEST(MacroAssemblerNativeRegisters) {
1181
  v8::V8::Initialize();
1182
  ContextInitializer initializer;
1183
  Isolate* isolate = CcTest::i_isolate();
1184
  Factory* factory = isolate->factory();
1185
  Zone zone;
1186

1187 1188
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             6);
1189 1190 1191 1192

  uc16 foo_chars[3] = {'f', 'o', 'o'};
  Vector<const uc16> foo(foo_chars, 3);

lrn@chromium.org's avatar
lrn@chromium.org committed
1193
  enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1194 1195
  Label fail;
  Label backtrack;
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1196
  m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
1197
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1198 1199 1200 1201
  m.PushBacktrack(&backtrack);
  m.WriteStackPointerToRegister(sp);
  // Fill stack and registers
  m.AdvanceCurrentPosition(2);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1202
  m.WriteCurrentPositionToRegister(out1, 0);
1203
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
  m.PushBacktrack(&fail);
  // Drop backtrack stack frames.
  m.ReadStackPointerFromRegister(sp);
  // And take the first backtrack (to &backtrack)
  m.Backtrack();

  m.PushCurrentPosition();
  m.AdvanceCurrentPosition(2);
  m.PopCurrentPosition();

  m.Bind(&backtrack);
  m.PopRegister(out1);
  m.ReadCurrentPositionFromRegister(out1);
  m.AdvanceCurrentPosition(3);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1218
  m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
1219 1220 1221 1222 1223 1224 1225

  Label loop;
  m.SetRegister(loop_cnt, 0);  // loop counter
  m.Bind(&loop);
  m.AdvanceRegister(loop_cnt, 1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterLT(loop_cnt, 3, &loop);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1226
  m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
1227 1228 1229 1230 1231 1232 1233

  Label loop2;
  m.SetRegister(loop_cnt, 2);  // loop counter
  m.Bind(&loop2);
  m.AdvanceRegister(loop_cnt, -1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterGE(loop_cnt, 0, &loop2);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1234
  m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
1235 1236 1237

  Label loop3;
  Label exit_loop3;
1238 1239
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1240 1241 1242
  m.ReadCurrentPositionFromRegister(out3);
  m.Bind(&loop3);
  m.AdvanceCurrentPosition(1);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1243
  m.CheckGreedyLoop(&exit_loop3);
1244 1245
  m.GoTo(&loop3);
  m.Bind(&exit_loop3);
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1246
  m.PopCurrentPosition();
lrn@chromium.org's avatar
lrn@chromium.org committed
1247
  m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
1248 1249 1250 1251 1252 1253

  m.Succeed();

  m.Bind(&fail);
  m.Fail();

1254
  Handle<String> source = factory->NewStringFromStaticChars("<loop test>");
erik.corry@gmail.com's avatar
erik.corry@gmail.com committed
1255
  Handle<Object> code_object = m.GetCode(source);
1256 1257 1258
  Handle<Code> code = Handle<Code>::cast(code_object);

  // String long enough for test (content doesn't matter).
1259
  Handle<String> input = factory->NewStringFromStaticChars("foofoofoofoofoo");
1260
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1261 1262
  Address start_adr = seq_input->GetCharsAddress();

lrn@chromium.org's avatar
lrn@chromium.org committed
1263
  int output[6];
1264 1265
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
lrn@chromium.org's avatar
lrn@chromium.org committed
1266 1267 1268 1269
              *input,
              0,
              start_adr,
              start_adr + input->length(),
1270
              output);
1271

1272
  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1273 1274 1275 1276 1277
  CHECK_EQ(0, output[0]);
  CHECK_EQ(3, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(9, output[3]);
  CHECK_EQ(9, output[4]);
lrn@chromium.org's avatar
lrn@chromium.org committed
1278
  CHECK_EQ(-1, output[5]);
1279 1280
}

1281

1282
TEST(MacroAssemblerStackOverflow) {
1283
  v8::V8::Initialize();
1284
  ContextInitializer initializer;
1285
  Isolate* isolate = CcTest::i_isolate();
1286
  Factory* factory = isolate->factory();
1287
  Zone zone;
1288

1289 1290
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             0);
1291 1292 1293 1294 1295 1296 1297

  Label loop;
  m.Bind(&loop);
  m.PushBacktrack(&loop);
  m.GoTo(&loop);

  Handle<String> source =
1298
      factory->NewStringFromStaticChars("<stack overflow test>");
1299 1300 1301 1302
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // String long enough for test (content doesn't matter).
1303
  Handle<String> input = factory->NewStringFromStaticChars("dummy");
1304
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1305 1306
  Address start_adr = seq_input->GetCharsAddress();

1307 1308 1309 1310 1311 1312
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
1313
              NULL);
1314

1315
  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
1316 1317
  CHECK(isolate->has_pending_exception());
  isolate->clear_pending_exception();
1318 1319 1320
}


1321
TEST(MacroAssemblerNativeLotsOfRegisters) {
1322 1323
  v8::V8::Initialize();
  ContextInitializer initializer;
1324
  Isolate* isolate = CcTest::i_isolate();
1325
  Factory* factory = isolate->factory();
1326
  Zone zone;
1327

1328 1329
  ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
                             2);
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344

  // At least 2048, to ensure the allocated space for registers
  // span one full page.
  const int large_number = 8000;
  m.WriteCurrentPositionToRegister(large_number, 42);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 1);
  Label done;
  m.CheckNotBackReference(0, &done);  // Performs a system-stack push.
  m.Bind(&done);
  m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PopRegister(1);
  m.Succeed();

  Handle<String> source =
1345
      factory->NewStringFromStaticChars("<huge register space test>");
1346 1347 1348 1349
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // String long enough for test (content doesn't matter).
1350
  Handle<String> input = factory->NewStringFromStaticChars("sample text");
1351
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1352 1353 1354
  Address start_adr = seq_input->GetCharsAddress();

  int captures[2];
1355 1356 1357 1358 1359 1360
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
1361
              captures);
1362 1363

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1364 1365 1366
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(42, captures[1]);

1367
  isolate->clear_pending_exception();
1368 1369
}

1370
#else  // V8_INTERPRETED_REGEXP
1371 1372 1373

TEST(MacroAssembler) {
  byte codes[1024];
1374 1375 1376
  Zone zone;
  RegExpMacroAssemblerIrregexp m(CcTest::i_isolate(), Vector<byte>(codes, 1024),
                                 &zone);
1377
  // ^f(o)o.
1378 1379
  Label start, fail, backtrack;

1380 1381 1382 1383 1384 1385
  m.SetRegister(4, 42);
  m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.AdvanceRegister(4, 42);
  m.GoTo(&start);
  m.Fail();
  m.Bind(&start);
1386 1387 1388 1389 1390 1391 1392 1393
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(0, NULL);
  m.CheckNotCharacter('f', NULL);
  m.LoadCurrentCharacter(1, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
1394
  m.WriteCurrentPositionToRegister(0, 0);
1395 1396 1397
  m.WriteCurrentPositionToRegister(1, 3);
  m.WriteCurrentPositionToRegister(2, 1);
  m.WriteCurrentPositionToRegister(3, 2);
1398
  m.AdvanceCurrentPosition(3);
1399
  m.PushBacktrack(&backtrack);
1400
  m.Succeed();
1401 1402
  m.Bind(&backtrack);
  m.ClearRegisters(2, 3);
1403
  m.Backtrack();
1404
  m.Bind(&fail);
1405 1406 1407
  m.PopRegister(0);
  m.Fail();

1408
  Isolate* isolate = CcTest::i_isolate();
1409 1410
  Factory* factory = isolate->factory();
  HandleScope scope(isolate);
1411

1412
  Handle<String> source = factory->NewStringFromStaticChars("^f(o)o");
1413 1414 1415 1416
  Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
  int captures[5];

  const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1417 1418
  Handle<String> f1_16 = factory->NewStringFromTwoByte(
      Vector<const uc16>(str1, 6)).ToHandleChecked();
1419

1420
  CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0));
1421 1422 1423 1424 1425 1426 1427
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(1, captures[2]);
  CHECK_EQ(2, captures[3]);
  CHECK_EQ(84, captures[4]);

  const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1428 1429
  Handle<String> f2_16 = factory->NewStringFromTwoByte(
      Vector<const uc16>(str2, 6)).ToHandleChecked();
1430

1431
  CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0));
1432 1433 1434
  CHECK_EQ(42, captures[0]);
}

1435
#endif  // V8_INTERPRETED_REGEXP
1436 1437


1438 1439 1440 1441
TEST(AddInverseToTable) {
  static const int kLimit = 1000;
  static const int kRangeCount = 16;
  for (int t = 0; t < 10; t++) {
1442
    Zone zone;
1443
    ZoneList<CharacterRange>* ranges =
1444
        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
1445 1446 1447 1448
    for (int i = 0; i < kRangeCount; i++) {
      int from = PseudoRandom(t + 87, i + 25) % kLimit;
      int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
      if (to > kLimit) to = kLimit;
1449
      ranges->Add(CharacterRange(from, to), &zone);
1450
    }
1451 1452
    DispatchTable table(&zone);
    DispatchTableConstructor cons(&table, false, &zone);
1453 1454 1455 1456 1457 1458 1459 1460 1461 1462
    cons.set_choice_index(0);
    cons.AddInverse(ranges);
    for (int i = 0; i < kLimit; i++) {
      bool is_on = false;
      for (int j = 0; !is_on && j < kRangeCount; j++)
        is_on = ranges->at(j).Contains(i);
      OutSet* set = table.Get(i);
      CHECK_EQ(is_on, set->Get(0) == false);
    }
  }
1463
  Zone zone;
1464
  ZoneList<CharacterRange>* ranges =
1465 1466 1467 1468
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &zone);
  DispatchTable table(&zone);
  DispatchTableConstructor cons(&table, false, &zone);
1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489
  cons.set_choice_index(0);
  cons.AddInverse(ranges);
  CHECK(!table.Get(0xFFFE)->Get(0));
  CHECK(table.Get(0xFFFF)->Get(0));
}


static uc32 canonicalize(uc32 c) {
  unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
  int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
  if (count == 0) {
    return c;
  } else {
    CHECK_EQ(1, count);
    return canon[0];
  }
}


TEST(LatinCanonicalize) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1490 1491
  for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) {
    unibrow::uchar upper = lower + ('A' - 'a');
1492 1493 1494 1495 1496 1497 1498 1499 1500 1501
    CHECK_EQ(canonicalize(lower), canonicalize(upper));
    unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    int length = un_canonicalize.get(lower, '\0', uncanon);
    CHECK_EQ(2, length);
    CHECK_EQ(upper, uncanon[0]);
    CHECK_EQ(lower, uncanon[1]);
  }
  for (uc32 c = 128; c < (1 << 21); c++)
    CHECK_GE(canonicalize(c), 128);
  unibrow::Mapping<unibrow::ToUppercase> to_upper;
1502 1503
  // Canonicalization is only defined for the Basic Multilingual Plane.
  for (uc32 c = 0; c < (1 << 16); c++) {
1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517
    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
    int length = to_upper.get(c, '\0', upper);
    if (length == 0) {
      length = 1;
      upper[0] = c;
    }
    uc32 u = upper[0];
    if (length > 1 || (c >= 128 && u < 128))
      u = c;
    CHECK_EQ(u, canonicalize(c));
  }
}


1518
static uc32 CanonRangeEnd(uc32 c) {
1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
  unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
  int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
  if (count == 0) {
    return c;
  } else {
    CHECK_EQ(1, count);
    return canon[0];
  }
}


TEST(RangeCanonicalization) {
  // Check that we arrive at the same result when using the basic
  // range canonicalization primitives as when using immediate
  // canonicalization.
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
  int block_start = 0;
  while (block_start <= 0xFFFF) {
    uc32 block_end = CanonRangeEnd(block_start);
    unsigned block_length = block_end - block_start + 1;
    if (block_length > 1) {
      unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int first_length = un_canonicalize.get(block_start, '\0', first);
      for (unsigned i = 1; i < block_length; i++) {
        unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
        int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
        CHECK_EQ(first_length, succ_length);
        for (int j = 0; j < succ_length; j++) {
          int calc = first[j] + i;
          int found = succ[j];
          CHECK_EQ(calc, found);
        }
1551 1552
      }
    }
1553
    block_start = block_start + block_length;
1554 1555 1556 1557
  }
}


1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573
TEST(UncanonicalizeEquivalence) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  for (int i = 0; i < (1 << 16); i++) {
    int length = un_canonicalize.get(i, '\0', chars);
    for (int j = 0; j < length; j++) {
      unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int length2 = un_canonicalize.get(chars[j], '\0', chars2);
      CHECK_EQ(length, length2);
      for (int k = 0; k < length; k++)
        CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
    }
  }
}


1574
static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
1575
                                      Vector<CharacterRange> expected) {
1576
  Zone zone;
1577
  int count = expected.length();
1578
  ZoneList<CharacterRange>* list =
1579
      new(&zone) ZoneList<CharacterRange>(count, &zone);
1580
  input.AddCaseEquivalents(isolate, &zone, list, false);
1581 1582 1583 1584 1585 1586 1587 1588
  CHECK_EQ(count, list->length());
  for (int i = 0; i < list->length(); i++) {
    CHECK_EQ(expected[i].from(), list->at(i).from());
    CHECK_EQ(expected[i].to(), list->at(i).to());
  }
}


1589 1590
static void TestSimpleRangeCaseIndependence(Isolate* isolate,
                                            CharacterRange input,
1591 1592 1593
                                            CharacterRange expected) {
  EmbeddedVector<CharacterRange, 1> vector;
  vector[0] = expected;
1594
  TestRangeCaseIndependence(isolate, input, vector);
1595 1596 1597 1598
}


TEST(CharacterRangeCaseIndependence) {
1599 1600
  Isolate* isolate = CcTest::i_isolate();
  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('a'),
1601
                                  CharacterRange::Singleton('A'));
1602
  TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('z'),
1603
                                  CharacterRange::Singleton('Z'));
1604
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('a', 'z'),
1605
                                  CharacterRange('A', 'Z'));
1606
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('c', 'f'),
1607
                                  CharacterRange('C', 'F'));
1608
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('a', 'b'),
1609
                                  CharacterRange('A', 'B'));
1610
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('y', 'z'),
1611
                                  CharacterRange('Y', 'Z'));
1612
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('a' - 1, 'z' + 1),
1613
                                  CharacterRange('A', 'Z'));
1614
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('A', 'Z'),
1615
                                  CharacterRange('a', 'z'));
1616
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('C', 'F'),
1617
                                  CharacterRange('c', 'f'));
1618
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('A' - 1, 'Z' + 1),
1619 1620 1621 1622
                                  CharacterRange('a', 'z'));
  // Here we need to add [l-z] to complete the case independence of
  // [A-Za-z] but we expect [a-z] to be added since we always add a
  // whole block at a time.
1623
  TestSimpleRangeCaseIndependence(isolate, CharacterRange('A', 'k'),
1624 1625 1626 1627
                                  CharacterRange('a', 'z'));
}


1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640
static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
  if (ranges == NULL)
    return false;
  for (int i = 0; i < ranges->length(); i++) {
    CharacterRange range = ranges->at(i);
    if (range.from() <= c && c <= range.to())
      return true;
  }
  return false;
}


TEST(CharClassDifference) {
1641
  Zone zone;
1642
  ZoneList<CharacterRange>* base =
1643 1644
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  base->Add(CharacterRange::Everything(), &zone);
1645
  Vector<const int> overlay = CharacterRange::GetWordBounds();
1646 1647
  ZoneList<CharacterRange>* included = NULL;
  ZoneList<CharacterRange>* excluded = NULL;
1648
  CharacterRange::Split(base, overlay, &included, &excluded, &zone);
1649 1650 1651 1652 1653
  for (int i = 0; i < (1 << 16); i++) {
    bool in_base = InClass(i, base);
    if (in_base) {
      bool in_overlay = false;
      for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
1654
        if (overlay[j] <= i && i < overlay[j+1])
1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666
          in_overlay = true;
      }
      CHECK_EQ(in_overlay, InClass(i, included));
      CHECK_EQ(!in_overlay, InClass(i, excluded));
    } else {
      CHECK(!InClass(i, included));
      CHECK(!InClass(i, excluded));
    }
  }
}


1667
TEST(CanonicalizeCharacterSets) {
1668
  Zone zone;
1669
  ZoneList<CharacterRange>* list =
1670
      new(&zone) ZoneList<CharacterRange>(4, &zone);
1671 1672
  CharacterSet set(list);

1673 1674 1675
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(50, 60), &zone);
1676
  set.Canonicalize();
1677 1678 1679 1680 1681 1682 1683
  DCHECK_EQ(3, list->length());
  DCHECK_EQ(10, list->at(0).from());
  DCHECK_EQ(20, list->at(0).to());
  DCHECK_EQ(30, list->at(1).from());
  DCHECK_EQ(40, list->at(1).to());
  DCHECK_EQ(50, list->at(2).from());
  DCHECK_EQ(60, list->at(2).to());
1684 1685

  list->Rewind(0);
1686 1687 1688
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(50, 60), &zone);
  list->Add(CharacterRange(30, 40), &zone);
1689
  set.Canonicalize();
1690 1691 1692 1693 1694 1695 1696
  DCHECK_EQ(3, list->length());
  DCHECK_EQ(10, list->at(0).from());
  DCHECK_EQ(20, list->at(0).to());
  DCHECK_EQ(30, list->at(1).from());
  DCHECK_EQ(40, list->at(1).to());
  DCHECK_EQ(50, list->at(2).from());
  DCHECK_EQ(60, list->at(2).to());
1697 1698

  list->Rewind(0);
1699 1700 1701 1702 1703
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(25, 25), &zone);
  list->Add(CharacterRange(100, 100), &zone);
  list->Add(CharacterRange(1, 1), &zone);
1704
  set.Canonicalize();
1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715
  DCHECK_EQ(5, list->length());
  DCHECK_EQ(1, list->at(0).from());
  DCHECK_EQ(1, list->at(0).to());
  DCHECK_EQ(10, list->at(1).from());
  DCHECK_EQ(20, list->at(1).to());
  DCHECK_EQ(25, list->at(2).from());
  DCHECK_EQ(25, list->at(2).to());
  DCHECK_EQ(30, list->at(3).from());
  DCHECK_EQ(40, list->at(3).to());
  DCHECK_EQ(100, list->at(4).from());
  DCHECK_EQ(100, list->at(4).to());
1716 1717

  list->Rewind(0);
1718 1719 1720
  list->Add(CharacterRange(10, 19), &zone);
  list->Add(CharacterRange(21, 30), &zone);
  list->Add(CharacterRange(20, 20), &zone);
1721
  set.Canonicalize();
1722 1723 1724
  DCHECK_EQ(1, list->length());
  DCHECK_EQ(10, list->at(0).from());
  DCHECK_EQ(30, list->at(0).to());
1725 1726
}

1727 1728

TEST(CharacterRangeMerge) {
1729
  Zone zone;
1730 1731
  ZoneList<CharacterRange> l1(4, &zone);
  ZoneList<CharacterRange> l2(4, &zone);
1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745
  // Create all combinations of intersections of ranges, both singletons and
  // longer.

  int offset = 0;

  // The five kinds of singleton intersections:
  //     X
  //   Y      - outside before
  //    Y     - outside touching start
  //     Y    - overlap
  //      Y   - outside touching end
  //       Y  - outside after

  for (int i = 0; i < 5; i++) {
1746 1747
    l1.Add(CharacterRange::Singleton(offset + 2), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761
    offset += 6;
  }

  // The seven kinds of singleton/non-singleton intersections:
  //    XXX
  //  Y        - outside before
  //   Y       - outside touching start
  //    Y      - inside touching start
  //     Y     - entirely inside
  //      Y    - inside touching end
  //       Y   - outside touching end
  //        Y  - disjoint after

  for (int i = 0; i < 7; i++) {
1762 1763
    l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782
    offset += 8;
  }

  // The eleven kinds of non-singleton intersections:
  //
  //       XXXXXXXX
  // YYYY                  - outside before.
  //   YYYY                - outside touching start.
  //     YYYY              - overlapping start
  //       YYYY            - inside touching start
  //         YYYY          - entirely inside
  //           YYYY        - inside touching end
  //             YYYY      - overlapping end
  //               YYYY    - outside touching end
  //                 YYYY  - outside after
  //       YYYYYYYY        - identical
  //     YYYYYYYYYYYY      - containing entirely.

  for (int i = 0; i < 9; i++) {
1783 1784
    l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);  // Length 8.
    l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
1785 1786
    offset += 22;
  }
1787 1788
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1789
  offset += 22;
1790 1791
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
1792 1793 1794 1795 1796 1797
  offset += 22;

  // Different kinds of multi-range overlap:
  // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
  //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y

1798 1799
  l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
  l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
1800
  for (int i = 0; i < 6; i++) {
1801 1802
    l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
    l2.Add(CharacterRange::Singleton(offset + 8), &zone);
1803 1804 1805
    offset += 9;
  }

1806 1807
  DCHECK(CharacterRange::IsCanonical(&l1));
  DCHECK(CharacterRange::IsCanonical(&l2));
1808

1809 1810 1811
  ZoneList<CharacterRange> first_only(4, &zone);
  ZoneList<CharacterRange> second_only(4, &zone);
  ZoneList<CharacterRange> both(4, &zone);
1812
}
1813 1814


1815
TEST(Graph) {
1816
  Execute("\\b\\w+\\b", false, true, true);
1817
}