// Copyright 2006-2009 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Expect $Object = global.Object;
// Expect $Array = global.Array;

// Local alias for the RegExp function read from the global object.
const $RegExp = global.RegExp;

// A recursive descent parser for Patterns according to the grammar of
// ECMA-262 15.10.1, with deviations noted below.
function DoConstructRegExp(object, pattern, flags, isConstructorCall) {
  // RegExp : Called as constructor; see ECMA-262, section 15.10.4.
  if (IS_REGEXP(pattern)) {
    if (!IS_UNDEFINED(flags)) {
      throw MakeTypeError('regexp_flags', []);
    }
    flags = (pattern.global ? 'g' : '')
        + (pattern.ignoreCase ? 'i' : '')
        + (pattern.multiline ? 'm' : '');
    pattern = pattern.source;
  }

  pattern = IS_UNDEFINED(pattern) ? '' : ToString(pattern);
  flags = IS_UNDEFINED(flags) ? '' : ToString(flags);

  var global = false;
  var ignoreCase = false;
  var multiline = false;

  for (var i = 0; i < flags.length; i++) {
55
    var c = StringCharAt.call(flags, i);
56 57
    switch (c) {
      case 'g':
58
        // Allow duplicate flags to be consistent with JSC and others.
59 60 61 62 63 64 65 66 67 68
        global = true;
        break;
      case 'i':
        ignoreCase = true;
        break;
      case 'm':
        multiline = true;
        break;
      default:
        // Ignore flags that have no meaning to be consistent with
69
        // JSC.
70 71 72 73
        break;
    }
  }

74
  if (!isConstructorCall) {
75
    regExpCache.type = 'none';
76
  }
77
  %RegExpInitializeObject(object, pattern, global, ignoreCase, multiline);
78 79 80

  // Call internal function to compile the pattern.
  %RegExpCompile(object, pattern, flags);
81
}
82 83 84


function RegExpConstructor(pattern, flags) {
85
  if (%_IsConstructCall()) {
86 87 88 89 90 91 92 93
    DoConstructRegExp(this, pattern, flags, true);
  } else {
    // RegExp : Called as function; see ECMA-262, section 15.10.3.1.
    if (IS_REGEXP(pattern) && IS_UNDEFINED(flags)) {
      return pattern;
    }
    return new $RegExp(pattern, flags);
  }
94
}
95 96 97 98


// Deprecated RegExp.prototype.compile method.  We behave like the constructor
// were called again.  In SpiderMonkey, this method returns the regexp object.
99
// In JSC, it returns undefined.  For compatibility with JSC, we match their
100 101
// behavior.
function CompileRegExp(pattern, flags) {
102
  // Both JSC and SpiderMonkey treat a missing pattern argument as the
103
  // empty subject string, and an actual undefined value passed as the
104
  // pattern as the string 'undefined'.  Note that JSC is inconsistent
105 106
  // here, treating undefined values differently in
  // RegExp.prototype.compile and in the constructor, where they are
107
  // the empty string.  For compatibility with JSC, we match their
108 109 110 111 112 113 114 115 116 117
  // behavior.
  if (IS_UNDEFINED(pattern) && %_ArgumentsLength() != 0) {
    DoConstructRegExp(this, 'undefined', flags, false);
  } else {
    DoConstructRegExp(this, pattern, flags, false);
  }
}


function DoRegExpExec(regexp, string, index) {
118 119 120
  var result = %_RegExpExec(regexp, string, index, lastMatchInfo);
  if (result !== null) lastMatchInfoOverride = null;
  return result;
121 122
}

// Record describing the most recent cacheable regexp operation.
//
//   type          - name of the operation that filled the record ('exec' and
//                   'test' are used in this file); 'none' marks it invalid.
//   regExp, subject, lastIndex
//                 - the inputs the cached answer belongs to.
//   replaceString - not used in this file.
//   answer        - the cached result.
// All fields other than type and answerSaved start out as 0.
function RegExpCache() {
  this.type = 'none';
  this.regExp = 0;
  this.subject = 0;
  this.replaceString = 0;
  this.lastIndex = 0;
  this.answer = 0;
  // answerSaved marks whether the contents of answer is valid for a cache
  // hit in RegExpExec, StringMatch and StringSplit.
  this.answerSaved = false;
}


// The single cache record shared by the functions in this file.
var regExpCache = new RegExpCache();


140
function CloneRegExpResult(array) {
141
  if (array == null) return null;
142 143 144
  var length = array.length;
  var answer = %_RegExpConstructResult(length, array.index, array.input);
  for (var i = 0; i < length; i++) {
145 146 147 148 149 150
    answer[i] = array[i];
  }
  return answer;
}


151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
function BuildResultFromMatchInfo(lastMatchInfo, s) {
  var numResults = NUMBER_OF_CAPTURES(lastMatchInfo) >> 1;
  var result = %_RegExpConstructResult(numResults, lastMatchInfo[CAPTURE0], s);
  if (numResults === 1) {
    var matchStart = lastMatchInfo[CAPTURE(0)];
    var matchEnd = lastMatchInfo[CAPTURE(1)];
    result[0] = SubString(s, matchStart, matchEnd);
  } else {
    for (var i = 0; i < numResults; i++) {
      var matchStart = lastMatchInfo[CAPTURE(i << 1)];
      var matchEnd = lastMatchInfo[CAPTURE((i << 1) + 1)];
      if (matchStart != -1 && matchEnd != -1) {
        result[i] = SubString(s, matchStart, matchEnd);
      } else {
        // Make sure the element is present. Avoid reading the undefined
        // property from the global object since this may change.
        result[i] = void 0;
      }
    }
  }
  return result;
}


// Executes |regexp| on |string| from position |start| without validating the
// arguments: must be called with a RegExp, a string and a positive integer.
// Returns the result array on a match and null otherwise.
function RegExpExecNoTests(regexp, string, start) {
  var matchInfo = DoRegExpExec(regexp, string, start);
  if (matchInfo === null) return null;
  return BuildResultFromMatchInfo(matchInfo, string);
}


186
function RegExpExec(string) {
187 188 189 190 191 192
  if (!IS_REGEXP(this)) {
    throw MakeTypeError('incompatible_method_receiver',
                        ['RegExp.prototype.exec', this]);
  }

  var cache = regExpCache;
193
  var saveAnswer = false;
194 195 196 197 198

  if (%_ObjectEquals(cache.type, 'exec') &&
      %_ObjectEquals(cache.lastIndex, this.lastIndex) &&
      %_ObjectEquals(cache.regExp, this) &&
      %_ObjectEquals(cache.subject, string)) {
199
    if (cache.answerSaved) {
200
      return CloneRegExpResult(cache.answer);
201
    } else {
202
      saveAnswer = true;
203 204 205
    }
  }

206
  if (%_ArgumentsLength() == 0) {
207
    var regExpInput = LAST_INPUT(lastMatchInfo);
208 209 210
    if (IS_UNDEFINED(regExpInput)) {
      throw MakeError('no_input_to_regexp', [this]);
    }
211
    string = regExpInput;
212
  }
213 214 215 216 217 218
  var s;
  if (IS_STRING(string)) {
    s = string;
  } else {
    s = ToString(string);
  }
219
  var lastIndex = this.lastIndex;
220

221 222 223 224 225 226 227
  var i = this.global ? TO_INTEGER(lastIndex) : 0;

  if (i < 0 || i > s.length) {
    this.lastIndex = 0;
    return null;
  }

228
  %_Log('regexp', 'regexp-exec,%0r,%1S,%2i', [this, s, lastIndex]);
229
  // matchIndices is either null or the lastMatchInfo array.
230
  var matchIndices = %_RegExpExec(this, s, i, lastMatchInfo);
231 232

  if (matchIndices == null) {
233 234 235 236
    if (this.global) {
      this.lastIndex = 0;
      if (lastIndex != 0) return matchIndices;
    }
237 238 239 240
    cache.lastIndex = lastIndex;
    cache.regExp = this;
    cache.subject = s;
    cache.answer = matchIndices;  // Null.
241
    cache.answerSaved = true;     // Safe since no cloning is needed.
242
    cache.type = 'exec';
243
    return matchIndices;        // No match.
244
  }
245
  lastMatchInfoOverride = null;
246
  var result = BuildResultFromMatchInfo(matchIndices, s);
247

248 249 250
  if (this.global) {
    this.lastIndex = lastMatchInfo[CAPTURE1];
  } else {
251 252 253
    cache.regExp = this;
    cache.subject = s;
    cache.lastIndex = lastIndex;
254
    if (saveAnswer) cache.answer = CloneRegExpResult(result);
255
    cache.answerSaved = saveAnswer;
256
    cache.type = 'exec';
257
  }
258 259
  return result;

260
}
261 262


263 264 265 266
// One-element cache for the simplified test regexp.
var regexp_key;
var regexp_val;

267
// Section 15.10.6.3 doesn't actually make sense, but the intention seems to be
268 269 270
// that test is defined in terms of String.prototype.exec. However, it probably
// means the original value of String.prototype.exec, which is what everybody
// else implements.
271
function RegExpTest(string) {
272
  if (!IS_REGEXP(this)) {
273
    throw MakeTypeError('incompatible_method_receiver',
274 275 276 277 278 279 280 281 282
                        ['RegExp.prototype.test', this]);
  }
  if (%_ArgumentsLength() == 0) {
    var regExpInput = LAST_INPUT(lastMatchInfo);
    if (IS_UNDEFINED(regExpInput)) {
      throw MakeError('no_input_to_regexp', [this]);
    }
    string = regExpInput;
  }
283 284 285 286 287 288 289
  var s;
  if (IS_STRING(string)) {
    s = string;
  } else {
    s = ToString(string);
  }

290
  var lastIndex = this.lastIndex;
291 292 293 294 295 296 297 298
  var cache = regExpCache;
  if (%_ObjectEquals(cache.type, 'test') &&
      %_ObjectEquals(cache.regExp, this) &&
      %_ObjectEquals(cache.subject, string) &&
      %_ObjectEquals(cache.lastIndex, lastIndex)) {
    return cache.answer;
  }

299 300
  // Remove irrelevant preceeding '.*' in a test regexp. The expression
  // checks whether this.source starts with '.*' and that the third
301
  // char is not a '?'
302
  if (%_StringCharCodeAt(this.source,0) == 46 && // '.'
303 304
      %_StringCharCodeAt(this.source,1) == 42 && // '*'
      %_StringCharCodeAt(this.source,2) != 63) { // '?'
305
    if (!%_ObjectEquals(regexp_key, this)) {
306
      regexp_key = this;
307
      regexp_val = new $RegExp(this.source.substring(2, this.source.length),
308 309 310 311 312 313
                               (this.global ? 'g' : '')
                               + (this.ignoreCase ? 'i' : '')
                               + (this.multiline ? 'm' : ''));
    }
    if (!regexp_val.test(s)) return false;
  }
314

315
  var length = s.length;
316 317
  var i = this.global ? TO_INTEGER(lastIndex) : 0;

318 319 320 321 322
  cache.type = 'test';
  cache.regExp = this;
  cache.subject = s;
  cache.lastIndex = i;

323
  if (i < 0 || i > length) {
324
    this.lastIndex = 0;
325
    cache.answer = false;
326 327 328 329 330
    return false;
  }

  %_Log('regexp', 'regexp-exec,%0r,%1S,%2i', [this, s, lastIndex]);
  // matchIndices is either null or the lastMatchInfo array.
331
  var matchIndices = %_RegExpExec(this, s, i, lastMatchInfo);
332 333 334

  if (matchIndices == null) {
    if (this.global) this.lastIndex = 0;
335
    cache.answer = false;
336 337
    return false;
  }
338
  lastMatchInfoOverride = null;
339
  if (this.global) this.lastIndex = lastMatchInfo[CAPTURE1];
340
  cache.answer = true;
341
  return true;
342
}
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357


// Implements RegExp.prototype.toString: returns the source between slashes,
// followed by 'g', 'i' and 'm' (in that order) for each of the global,
// ignoreCase and multiline properties of the receiver that is truthy.
function RegExpToString() {
  // If this.source is an empty string, output /(?:)/.
  // http://bugzilla.mozilla.org/show_bug.cgi?id=225550
  // ecma_2/RegExp/properties-001.js.
  var src = this.source ? this.source : '(?:)';
  var result = '/' + src + '/';
  if (this.global)
    result += 'g';
  if (this.ignoreCase)
    result += 'i';
  if (this.multiline)
    result += 'm';
  return result;
}


// Getters for the static properties lastMatch, lastParen, leftContext, and
// rightContext of the RegExp constructor.  The properties are computed based
// on the captures array of the last successful match and the subject string
// of the last successful match.

// Getter for RegExp.lastMatch: the text of the last successful match.  When
// lastMatchInfoOverride is set, its first element is returned instead.
function RegExpGetLastMatch() {
  if (lastMatchInfoOverride !== null) {
    return lastMatchInfoOverride[0];
  }
  var regExpSubject = LAST_SUBJECT(lastMatchInfo);
  return SubString(regExpSubject,
                   lastMatchInfo[CAPTURE0],
                   lastMatchInfo[CAPTURE1]);
}

// Getter for RegExp.lastParen: the text of the last capture of the last
// successful match, or '' when the regexp had no captures or the capture has
// no text.  When lastMatchInfoOverride is set, the capture is read from it:
// its last two elements are treated as the match position and the subject,
// and the element before them as the last capture.
function RegExpGetLastParen() {
  if (lastMatchInfoOverride) {
    var override = lastMatchInfoOverride;
    if (override.length <= 3) return '';
    var lastCapture = override[override.length - 3];
    // The slot may hold undefined (presumably for a group that did not take
    // part in the match; confirm against the code that fills
    // lastMatchInfoOverride).  Report it as '' like the path below does for
    // -1 indices, so the getter always yields a string.
    return IS_UNDEFINED(lastCapture) ? '' : lastCapture;
  }
  var length = NUMBER_OF_CAPTURES(lastMatchInfo);
  if (length <= 2) return '';  // There were no captures.
  // We match the SpiderMonkey behavior: return the substring defined by the
  // last pair (after the first pair) of elements of the capture array even if
  // it is empty.
  var regExpSubject = LAST_SUBJECT(lastMatchInfo);
  var start = lastMatchInfo[CAPTURE(length - 2)];
  var end = lastMatchInfo[CAPTURE(length - 1)];
  if (start != -1 && end != -1) {
    return SubString(regExpSubject, start, end);
  }
  return "";
}

// Getter for RegExp.leftContext: the part of the subject that precedes the
// last successful match.  When lastMatchInfoOverride is set, the subject is
// its last element and the match position its second to last element.
function RegExpGetLeftContext() {
  var start_index;
  var subject;
  if (!lastMatchInfoOverride) {
    start_index = lastMatchInfo[CAPTURE0];
    subject = LAST_SUBJECT(lastMatchInfo);
  } else {
    var override = lastMatchInfoOverride;
    start_index = override[override.length - 2];
    subject = override[override.length - 1];
  }
  return SubString(subject, 0, start_index);
}

// Getter for RegExp.rightContext: the part of the subject that follows the
// last successful match.  When lastMatchInfoOverride is set, the subject is
// its last element, the match position its second to last element and the
// matched text its first element.
function RegExpGetRightContext() {
  var start_index;
  var subject;
  if (!lastMatchInfoOverride) {
    start_index = lastMatchInfo[CAPTURE1];
    subject = LAST_SUBJECT(lastMatchInfo);
  } else {
    var override = lastMatchInfoOverride;
    subject = override[override.length - 1];
    // The right context begins where the match ends, i.e. at the match
    // position plus the length of the matched text (not of the subject).
    start_index = override[override.length - 2] + override[0].length;
  }
  return SubString(subject, start_index, subject.length);
}


// The properties $1..$9 are the first nine capturing substrings of the last
// successful match, or ''.  The function RegExpMakeCaptureGetter will be
// called with indices from 1 to 9.
//
// Returns the getter function for capture number |n|.
function RegExpMakeCaptureGetter(n) {
  return function() {
    if (lastMatchInfoOverride) {
      // The last two elements of the override array are not captures.
      if (n < lastMatchInfoOverride.length - 2) {
        var capture = lastMatchInfoOverride[n];
        // The slot may hold undefined (presumably for a group that did not
        // take part in the match; confirm against the code that fills
        // lastMatchInfoOverride).  Report it as '' like the path below does
        // for -1 indices, so the getter always yields a string.
        return IS_UNDEFINED(capture) ? '' : capture;
      }
      return '';
    }
    var index = n * 2;
    if (index >= NUMBER_OF_CAPTURES(lastMatchInfo)) return '';
    var matchStart = lastMatchInfo[CAPTURE(index)];
    var matchEnd = lastMatchInfo[CAPTURE(index + 1)];
    if (matchStart == -1 || matchEnd == -1) return '';
    return SubString(LAST_SUBJECT(lastMatchInfo), matchStart, matchEnd);
  };
}


// Property of the builtins object for recording the result of the last
// regexp match.  The property lastMatchInfo includes the matchIndices
// array of the last successful regexp match (an array of start/end index
// pairs for the match and all the captured substrings), the invariant is
// that there are at least two capture indices.  The array also contains
// the subject string for the last successful match.
var lastMatchInfo = [
    2,                 // REGEXP_NUMBER_OF_CAPTURES
    "",                // Last subject.
    void 0,            // Last input - settable with RegExpSetInput.
    0,                 // REGEXP_FIRST_CAPTURE + 0
    0,                 // REGEXP_FIRST_CAPTURE + 1
];

// Override last match info with an array of actual substrings.
// Used internally by replace regexp with function.
// The array has the format of an "apply" argument for a replacement
// function.
// null means that no override is active and lastMatchInfo is used.
var lastMatchInfoOverride = null;

466 467 468 469 470
// -------------------------------------------------------------------

function SetupRegExp() {
  %FunctionSetInstanceClassName($RegExp, 'RegExp');
  %FunctionSetPrototype($RegExp, new $Object());
471
  %SetProperty($RegExp.prototype, 'constructor', $RegExp, DONT_ENUM);
472 473 474 475 476 477 478 479 480 481 482 483 484
  %SetCode($RegExp, RegExpConstructor);

  InstallFunctions($RegExp.prototype, DONT_ENUM, $Array(
    "exec", RegExpExec,
    "test", RegExpTest,
    "toString", RegExpToString,
    "compile", CompileRegExp
  ));

  // The length of compile is 1 in SpiderMonkey.
  %FunctionSetLength($RegExp.prototype.compile, 1);

  // The properties input, $input, and $_ are aliases for each other.  When this
485
  // value is set the value it is set to is coerced to a string.
486
  // Getter and setter for the input.
487
  function RegExpGetInput() {
488
    var regExpInput = LAST_INPUT(lastMatchInfo);
489 490
    return IS_UNDEFINED(regExpInput) ? "" : regExpInput;
  }
491
  function RegExpSetInput(string) {
492
    regExpCache.type = 'none';
493
    LAST_INPUT(lastMatchInfo) = ToString(string);
494
  };
495 496

  %DefineAccessor($RegExp, 'input', GETTER, RegExpGetInput, DONT_DELETE);
497
  %DefineAccessor($RegExp, 'input', SETTER, RegExpSetInput, DONT_DELETE);
498
  %DefineAccessor($RegExp, '$_', GETTER, RegExpGetInput, DONT_ENUM | DONT_DELETE);
499
  %DefineAccessor($RegExp, '$_', SETTER, RegExpSetInput, DONT_ENUM | DONT_DELETE);
500
  %DefineAccessor($RegExp, '$input', GETTER, RegExpGetInput, DONT_ENUM | DONT_DELETE);
501
  %DefineAccessor($RegExp, '$input', SETTER, RegExpSetInput, DONT_ENUM | DONT_DELETE);
502 503 504 505 506 507 508

  // The properties multiline and $* are aliases for each other.  When this
  // value is set in SpiderMonkey, the value it is set to is coerced to a
  // boolean.  We mimic that behavior with a slight difference: in SpiderMonkey
  // the value of the expression 'RegExp.multiline = null' (for instance) is the
  // boolean false (ie, the value after coercion), while in V8 it is the value
  // null (ie, the value before coercion).
509 510 511 512 513 514 515

  // Getter and setter for multiline.
  var multiline = false;
  function RegExpGetMultiline() { return multiline; };
  function RegExpSetMultiline(flag) { multiline = flag ? true : false; };

  %DefineAccessor($RegExp, 'multiline', GETTER, RegExpGetMultiline, DONT_DELETE);
516
  %DefineAccessor($RegExp, 'multiline', SETTER, RegExpSetMultiline, DONT_DELETE);
517
  %DefineAccessor($RegExp, '$*', GETTER, RegExpGetMultiline, DONT_ENUM | DONT_DELETE);
518
  %DefineAccessor($RegExp, '$*', SETTER, RegExpSetMultiline, DONT_ENUM | DONT_DELETE);
519 520 521 522 523 524 525


  function NoOpSetter(ignored) {}


  // Static properties set by a successful match.
  %DefineAccessor($RegExp, 'lastMatch', GETTER, RegExpGetLastMatch, DONT_DELETE);
526
  %DefineAccessor($RegExp, 'lastMatch', SETTER, NoOpSetter, DONT_DELETE);
527
  %DefineAccessor($RegExp, '$&', GETTER, RegExpGetLastMatch, DONT_ENUM | DONT_DELETE);
528
  %DefineAccessor($RegExp, '$&', SETTER, NoOpSetter, DONT_ENUM | DONT_DELETE);
529
  %DefineAccessor($RegExp, 'lastParen', GETTER, RegExpGetLastParen, DONT_DELETE);
530
  %DefineAccessor($RegExp, 'lastParen', SETTER, NoOpSetter, DONT_DELETE);
531
  %DefineAccessor($RegExp, '$+', GETTER, RegExpGetLastParen, DONT_ENUM | DONT_DELETE);
532
  %DefineAccessor($RegExp, '$+', SETTER, NoOpSetter, DONT_ENUM | DONT_DELETE);
533
  %DefineAccessor($RegExp, 'leftContext', GETTER, RegExpGetLeftContext, DONT_DELETE);
534
  %DefineAccessor($RegExp, 'leftContext', SETTER, NoOpSetter, DONT_DELETE);
535
  %DefineAccessor($RegExp, '$`', GETTER, RegExpGetLeftContext, DONT_ENUM | DONT_DELETE);
536
  %DefineAccessor($RegExp, '$`', SETTER, NoOpSetter, DONT_ENUM | DONT_DELETE);
537
  %DefineAccessor($RegExp, 'rightContext', GETTER, RegExpGetRightContext, DONT_DELETE);
538
  %DefineAccessor($RegExp, 'rightContext', SETTER, NoOpSetter, DONT_DELETE);
539
  %DefineAccessor($RegExp, "$'", GETTER, RegExpGetRightContext, DONT_ENUM | DONT_DELETE);
540
  %DefineAccessor($RegExp, "$'", SETTER, NoOpSetter, DONT_ENUM | DONT_DELETE);
541

542 543
  for (var i = 1; i < 10; ++i) {
    %DefineAccessor($RegExp, '$' + i, GETTER, RegExpMakeCaptureGetter(i), DONT_DELETE);
544
    %DefineAccessor($RegExp, '$' + i, SETTER, NoOpSetter, DONT_DELETE);
545
  }
546 547 548 549
}


SetupRegExp();