Commit 42309697 authored by yangguo, committed by Commit bot

[regexp] parse RegExpUnicodeEscapeSequence according to spec.

See http://tc39.github.io/ecma262/#prod-RegExpUnicodeEscapeSequence

R=erik.corry@gmail.com, erikcorry@chromium.org
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1681893002

Cr-Commit-Position: refs/heads/master@{#33892}
parent 9dd5fe29
......@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
builder->AddUnicodeCharacter(value);
builder->AddEscapedUnicodeCharacter(value);
} else if (!unicode()) {
builder->AddCharacter('u');
} else {
......@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
return true;
}
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
......@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return false;
}
// \u but no {, or \u{...} escapes not allowed.
return ParseHexEscape(4, value);
bool result = ParseHexEscape(4, value);
if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
current() == '\\') {
// Attempt to read trail surrogate.
int start = position();
if (Next() == 'u') {
Advance(2);
uc32 trail;
if (ParseHexEscape(4, &trail) &&
unibrow::Utf16::IsTrailSurrogate(trail)) {
*value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
static_cast<uc16>(trail));
return true;
}
}
Reset(start);
}
return result;
}
ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
......@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
return '\\';
}
case '0':
// With /u, \0 is interpreted as NUL if not followed by another digit.
if (unicode() && !(Next() >= '0' && Next() <= '9')) {
Advance();
return 0;
}
// Fall through.
case '1':
case '2':
case '3':
......@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
default: {
uc32 result = current();
// With /u, no identity escapes except for syntax characters are
// With /u, no identity escapes except for syntax characters and '-' are
// allowed. Otherwise, all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance();
return result;
}
......@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
Advance();
}
if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
// Combine with possibly following trail surrogate.
int start = position();
uc32 second = current();
if (second == '\\') {
second = ParseClassCharacterEscape(CHECK_FAILED);
} else {
Advance();
}
if (unibrow::Utf16::IsTrailSurrogate(second)) {
first = unibrow::Utf16::CombineSurrogatePair(first, second);
} else {
Reset(start);
}
}
return CharacterRange::Singleton(first);
}
......@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
}
}
// Adds a code point that was written in the pattern as an escape sequence
// (the ParseDisjunction hunk passes the value produced by ParseUnicodeEscape).
// It forwards to AddUnicodeCharacter, but brackets that call with
// FlushPendingSurrogate on both sides so the escaped character stays isolated
// from its neighbours.
//
// character: the code point to add (uc32, so it may lie outside the BMP).
void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
// A lead or trail surrogate parsed via escape sequence will not
// pair up with any preceding lead or following trail surrogate.
// Flush before adding: presumably this emits any surrogate still buffered
// from an earlier character so it cannot combine with this one
// (FlushPendingSurrogate's body is not visible here -- confirm).
FlushPendingSurrogate();
AddUnicodeCharacter(character);
// Flush again so that, if |character| was itself left pending, it cannot
// combine with whatever is added next.
FlushPendingSurrogate();
}
// "Adds" an empty expression: nothing is emitted, the builder only records
// that an empty term is pending by setting pending_empty_.
void RegExpBuilder::AddEmpty() {
  pending_empty_ = true;
}
......
......@@ -102,6 +102,7 @@ class RegExpBuilder : public ZoneObject {
RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
void AddEscapedUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
......
......@@ -257,25 +257,31 @@ assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45"));
assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45"));
// Mixed escapes and literal surrogates.
// Literal surrogates.
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10002}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\ud800\udc03-\u{50001}\]+", "u").exec(
"\u{10003}\u{50001}"));
// Unicode escape sequences to represent a non-BMP character cannot have
// mixed notation, and must follow the rules for RegExpUnicodeEscapeSequence.
assertThrows(() => new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u"));
assertThrows(() => new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u"));
assertNull(new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertNull(new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertNull(new RegExp("[\\ud800\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\\{ud800}\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\ud800\\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\ud800\\{udc00}]", "u").exec("\u{10000}"));
assertNull(/\u{d800}\u{dc00}+/u.exec("\ud800\udc00\udc00"));
assertNull(/\ud800\u{dc00}+/u.exec("\ud800\udc00\udc00"));
assertNull(/\u{d800}\udc00+/u.exec("\ud800\udc00\udc00"));
......@@ -5,30 +5,40 @@
// Flags: --harmony-unicode-regexps
// test262/data/test/language/literals/regexp/u-dec-esc
assertThrows("/\\1/u");
assertThrows("/\\1/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-range-a
assertThrows("/[\\w-a]/u");
assertThrows("/[\\w-a]/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-range-b
assertThrows("/[a-\\w]/u");
assertThrows("/[a-\\w]/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-esc
assertThrows("/\\c/u");
assertThrows("/\\c0/u");
assertThrows("/\\c/u", SyntaxError);
assertThrows("/\\c0/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
assertThrows("/(?=.)*/u");
assertThrows("/(?=.)*/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_octal_escape
assertThrows("/[\\1]/u");
assertThrows("/\\00/u");
assertThrows("/\\09/u");
assertThrows("/[\\1]/u", SyntaxError);
assertThrows("/\\00/u", SyntaxError);
assertThrows("/\\09/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
assertThrows("/[\\c]/u");
assertThrows("/[\\c]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
assertThrows("/[\\c0]/u");
assertThrows("/[\\c0]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
assertThrows("/a{/u");
assertThrows("/a{1,/u");
assertThrows("/{/u");
assertThrows("/}/u");
assertThrows("/a{/u", SyntaxError);
assertThrows("/a{1,/u", SyntaxError);
assertThrows("/{/u", SyntaxError);
assertThrows("/}/u", SyntaxError);
// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
assertThrows("/]/u");
assertThrows("/]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_identity_escape
/\//u;
// escaped \0 is allowed inside a character class.
assertEquals(["\0"], /[\0]/u.exec("\0"));
// unless it is followed by another digit.
assertThrows("/[\\00]/u", SyntaxError);
assertThrows("/[\\01]/u", SyntaxError);
assertThrows("/[\\09]/u", SyntaxError);
assertEquals(["\u{0}1\u{0}a\u{0}"], /[1\0a]+/u.exec("b\u{0}1\u{0}a\u{0}2"));
// escaped \- is allowed inside a character class.
assertEquals(["-"], /[a\-z]/u.exec("12-34"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment