Commit 42309697, authored by yangguo, committed by Commit bot

[regexp] parse RegExpUnicodeEscapeSequence according to spec.

See http://tc39.github.io/ecma262/#prod-RegExpUnicodeEscapeSequence

R=erik.corry@gmail.com, erikcorry@chromium.org
BUG=v8:2952
LOG=N

Review URL: https://codereview.chromium.org/1681893002

Cr-Commit-Position: refs/heads/master@{#33892}
parent 9dd5fe29
...@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -478,7 +478,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2); Advance(2);
uc32 value; uc32 value;
if (ParseUnicodeEscape(&value)) { if (ParseUnicodeEscape(&value)) {
builder->AddUnicodeCharacter(value); builder->AddEscapedUnicodeCharacter(value);
} else if (!unicode()) { } else if (!unicode()) {
builder->AddCharacter('u'); builder->AddCharacter('u');
} else { } else {
...@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { ...@@ -797,7 +797,7 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
return true; return true;
} }
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) { bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is // allowed). In the latter case, the number of hex digits between { } is
...@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { ...@@ -815,7 +815,24 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return false; return false;
} }
// \u but no {, or \u{...} escapes not allowed. // \u but no {, or \u{...} escapes not allowed.
return ParseHexEscape(4, value); bool result = ParseHexEscape(4, value);
if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
current() == '\\') {
// Attempt to read trail surrogate.
int start = position();
if (Next() == 'u') {
Advance(2);
uc32 trail;
if (ParseHexEscape(4, &trail) &&
unibrow::Utf16::IsTrailSurrogate(trail)) {
*value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
static_cast<uc16>(trail));
return true;
}
}
Reset(start);
}
return result;
} }
ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
...@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() { ...@@ -938,6 +955,12 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
return '\\'; return '\\';
} }
case '0': case '0':
// With /u, \0 is interpreted as NUL if not followed by another digit.
if (unicode() && !(Next() >= '0' && Next() <= '9')) {
Advance();
return 0;
}
// Fall through.
case '1': case '1':
case '2': case '2':
case '3': case '3':
...@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() { ...@@ -982,9 +1005,9 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
} }
default: { default: {
uc32 result = current(); uc32 result = current();
// With /u, no identity escapes except for syntax characters are // With /u, no identity escapes except for syntax characters and '-' are
// allowed. Otherwise, all identity escapes are allowed. // allowed. Otherwise, all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacterOrSlash(result)) { if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance(); Advance();
return result; return result;
} }
...@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { ...@@ -1020,22 +1043,6 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
Advance(); Advance();
} }
if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
// Combine with possibly following trail surrogate.
int start = position();
uc32 second = current();
if (second == '\\') {
second = ParseClassCharacterEscape(CHECK_FAILED);
} else {
Advance();
}
if (unibrow::Utf16::IsTrailSurrogate(second)) {
first = unibrow::Utf16::CombineSurrogatePair(first, second);
} else {
Reset(start);
}
}
return CharacterRange::Singleton(first); return CharacterRange::Singleton(first);
} }
...@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { ...@@ -1264,6 +1271,13 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
} }
} }
// Adds a code point that was written in the pattern as a unicode escape
// sequence. The character is added via AddUnicodeCharacter(), bracketed by
// FlushPendingSurrogate() calls so that it is handled in isolation from
// whatever was added before it and whatever is added after it.
// NOTE(review): presumably FlushPendingSurrogate() emits any buffered lead
// surrogate as its own term; its body is not visible here - confirm.
void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
// A lead or trail surrogate parsed via escape sequence will not
// pair up with any preceding lead or following trail surrogate.
// First flush: keeps an earlier pending surrogate from pairing with
// |character|.
FlushPendingSurrogate();
AddUnicodeCharacter(character);
// Second flush: keeps |character| from pairing with a surrogate that is
// added afterwards.
FlushPendingSurrogate();
}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; } void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
......
...@@ -102,6 +102,7 @@ class RegExpBuilder : public ZoneObject { ...@@ -102,6 +102,7 @@ class RegExpBuilder : public ZoneObject {
RegExpBuilder(Zone* zone, bool ignore_case, bool unicode); RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
void AddCharacter(uc16 character); void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character); void AddUnicodeCharacter(uc32 character);
void AddEscapedUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a // "Adds" an empty expression. Does nothing except consume a
// following quantifier // following quantifier
void AddEmpty(); void AddEmpty();
......
...@@ -257,25 +257,31 @@ assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\u{12345}\u{12345}")); ...@@ -257,25 +257,31 @@ assertTrue(/\ud808\udf45{3}/u.test("\u{12345}\u{12345}\u{12345}"));
assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45")); assertFalse(new RegExp("\u{12345}{3}", "u").test("\u{12345}\udf45\udf45"));
assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45")); assertFalse(/\u{12345}{3}/u.test("\u{12345}\udf45\udf45"));
// Mixed escapes and literal surrogates. // Literal surrogates.
assertEquals(["\u{10000}\u{10000}"], assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\udc00+", "u").exec("\u{10000}\u{10000}")); new RegExp("\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"], assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\\udc00+", "u").exec("\u{10000}\u{10000}")); new RegExp("\\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10000}\u{10000}"],
new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
assertEquals(["\u{10003}\u{50001}"], assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", "u").exec( new RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}")); "\u{10003}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10003}\u{50001}"));
assertEquals(["\u{50001}"],
new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u").exec(
"\u{10002}\u{50001}"));
assertEquals(["\u{10003}\u{50001}"], assertEquals(["\u{10003}\u{50001}"],
new RegExp("[\ud800\udc03-\u{50001}\]+", "u").exec( new RegExp("[\ud800\udc03-\u{50001}\]+", "u").exec(
"\u{10003}\u{50001}")); "\u{10003}\u{50001}"));
// Unicode escape sequences to represent a non-BMP character cannot have
// mixed notation, and must follow the rules for RegExpUnicodeEscapeSequence.
// An escaped surrogate does not pair with a literal surrogate, so the class
// below contains the range "\udc03-\ud900", whose start (0xDC03) is greater
// than its end (0xD900); the pattern is expected to be rejected.
// NOTE(review): the two assertions below are identical; one of them was
// probably meant to exercise a different escape/literal mix.
assertThrows(() => new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u"));
assertThrows(() => new RegExp("[\\ud800\udc03-\ud900\\udc01\]+", "u"));
// Outside a character class: an escaped lead followed by a literal trail (and
// vice versa) stays two lone surrogates and does not match a surrogate pair.
assertNull(new RegExp("\\ud800\udc00+", "u").exec("\u{10000}\u{10000}"));
assertNull(new RegExp("\ud800\\udc00+", "u").exec("\u{10000}\u{10000}"));
// Inside a character class, for both the \uXXXX and the \u{XXXX} escape form.
// The brace form must be spelled \u{d800} / \u{dc00}; writing \{ud800} would
// be an identity escape of "{" followed by literal characters and would not
// exercise the unicode escape at all.
assertNull(new RegExp("[\\ud800\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\\u{d800}\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\ud800\\udc00]", "u").exec("\u{10000}"));
assertNull(new RegExp("[\ud800\\u{dc00}]", "u").exec("\u{10000}"));
// Two escapes only combine into one code point when both use the \uXXXX
// form; any combination involving \u{...} yields two lone surrogates, which
// cannot match the leading surrogate pair of the subject string.
assertNull(/\u{d800}\u{dc00}+/u.exec("\ud800\udc00\udc00"));
assertNull(/\ud800\u{dc00}+/u.exec("\ud800\udc00\udc00"));
assertNull(/\u{d800}\udc00+/u.exec("\ud800\udc00\udc00"));
...@@ -5,30 +5,40 @@ ...@@ -5,30 +5,40 @@
// Flags: --harmony-unicode-regexps // Flags: --harmony-unicode-regexps
// test262/data/test/language/literals/regexp/u-dec-esc // test262/data/test/language/literals/regexp/u-dec-esc
assertThrows("/\\1/u"); assertThrows("/\\1/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-range-a // test262/language/literals/regexp/u-invalid-char-range-a
assertThrows("/[\\w-a]/u"); assertThrows("/[\\w-a]/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-range-b // test262/language/literals/regexp/u-invalid-char-range-b
assertThrows("/[a-\\w]/u"); assertThrows("/[a-\\w]/u", SyntaxError);
// test262/language/literals/regexp/u-invalid-char-esc // test262/language/literals/regexp/u-invalid-char-esc
assertThrows("/\\c/u"); assertThrows("/\\c/u", SyntaxError);
assertThrows("/\\c0/u"); assertThrows("/\\c0/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion // test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
assertThrows("/(?=.)*/u"); assertThrows("/(?=.)*/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_octal_escape // test262/built-ins/RegExp/unicode_restricted_octal_escape
assertThrows("/[\\1]/u"); assertThrows("/[\\1]/u", SyntaxError);
assertThrows("/\\00/u"); assertThrows("/\\00/u", SyntaxError);
assertThrows("/\\09/u"); assertThrows("/\\09/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha // test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
assertThrows("/[\\c]/u"); assertThrows("/[\\c]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_identity_escape_c // test262/built-ins/RegExp/unicode_restricted_identity_escape_c
assertThrows("/[\\c0]/u"); assertThrows("/[\\c0]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier // test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
assertThrows("/a{/u"); assertThrows("/a{/u", SyntaxError);
assertThrows("/a{1,/u"); assertThrows("/a{1,/u", SyntaxError);
assertThrows("/{/u"); assertThrows("/{/u", SyntaxError);
assertThrows("/}/u"); assertThrows("/}/u", SyntaxError);
// test262/data/test/built-ins/RegExp/unicode_restricted_brackets // test262/data/test/built-ins/RegExp/unicode_restricted_brackets
assertThrows("/]/u"); assertThrows("/]/u", SyntaxError);
// test262/built-ins/RegExp/unicode_identity_escape // test262/built-ins/RegExp/unicode_identity_escape
/\//u; /\//u;
// With /u, an escaped \0 is allowed inside a character class and denotes
// the NUL character ...
assertEquals(["\0"], /[\0]/u.exec("\0"));
// ... unless it is followed by another decimal digit, which is a SyntaxError.
assertThrows("/[\\00]/u", SyntaxError);
assertThrows("/[\\01]/u", SyntaxError);
assertThrows("/[\\09]/u", SyntaxError);
// \0 between other class members: the class matches "1", NUL and "a", so the
// match covers everything between the leading "b" and the trailing "2".
assertEquals(["\u{0}1\u{0}a\u{0}"], /[1\0a]+/u.exec("b\u{0}1\u{0}a\u{0}2"));
// With /u, an escaped \- is allowed inside a character class and matches a
// literal "-" rather than forming the range a-z.
assertEquals(["-"], /[a\-z]/u.exec("12-34"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment