Commit bb6a5357 authored by yangguo, committed by Commit bot

[regexp] restrict pattern syntax for unicode mode.

ES2015 Annex B.1.4 specifies a restricted pattern language for unicode
mode. This change reflects that, based on some test262 test cases.

R=littledan@chromium.org
BUG=v8:2952
LOG=N

Committed: https://crrev.com/e918c4ec464456a374098049ca22eac2107f6223
Cr-Commit-Position: refs/heads/master@{#33584}

Review URL: https://codereview.chromium.org/1645573002

Cr-Commit-Position: refs/heads/master@{#33603}
parent b6c9b703
...@@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) { ...@@ -102,11 +102,28 @@ void RegExpParser::Advance(int dist) {
bool RegExpParser::simple() { return simple_; } bool RegExpParser::simple() { return simple_; }
bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
bool RegExpParser::IsSyntaxCharacter(uc32 c) { switch (c) {
return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || case '^':
c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || case '$':
c == '{' || c == '}' || c == '|'; case '\\':
case '.':
case '*':
case '+':
case '?':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case '|':
case '/':
return true;
default:
break;
}
return false;
} }
...@@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -161,14 +178,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case kEndMarker: case kEndMarker:
if (state->IsSubexpression()) { if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input. // Inside a parenthesized group when hitting end of input.
ReportError(CStrVector("Unterminated group") CHECK_FAILED); return ReportError(CStrVector("Unterminated group"));
} }
DCHECK_EQ(INITIAL, state->group_type()); DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully. // Parsing completed successfully.
return builder->ToRegExp(); return builder->ToRegExp();
case ')': { case ')': {
if (!state->IsSubexpression()) { if (!state->IsSubexpression()) {
ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); return ReportError(CStrVector("Unmatched ')'"));
} }
DCHECK_NE(INITIAL, state->group_type()); DCHECK_NE(INITIAL, state->group_type());
...@@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -276,13 +293,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} }
// Fall through. // Fall through.
default: default:
ReportError(CStrVector("Invalid group") CHECK_FAILED); return ReportError(CStrVector("Invalid group"));
break;
} }
Advance(2); Advance(2);
} else { } else {
if (captures_started_ >= kMaxCaptures) { if (captures_started_ >= kMaxCaptures) {
ReportError(CStrVector("Too many captures") CHECK_FAILED); return ReportError(CStrVector("Too many captures"));
} }
captures_started_++; captures_started_++;
} }
...@@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -360,24 +376,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} }
break; break;
} }
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (unicode()) {
return ReportError(CStrVector("Invalid escape"));
}
uc32 first_digit = Next(); uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') { if (first_digit == '8' || first_digit == '9') {
// If the 'u' flag is present, only syntax characters can be builder->AddCharacter(first_digit);
// escaped, Advance(2);
// no other identity escapes are allowed. If the 'u' flag is not
// present, all identity escapes are allowed.
if (!unicode()) {
builder->AddCharacter(first_digit);
Advance(2);
} else {
return ReportError(CStrVector("Invalid escape"));
}
break; break;
} }
} }
// FALLTHROUGH // FALLTHROUGH
case '0': { case '0': {
Advance(); Advance();
if (unicode() && Next() >= '0' && Next() <= '9') {
// With /u, decimal escapes with a leading 0 are not parsed as octal.
return ReportError(CStrVector("Invalid decimal escape"));
}
uc32 octal = ParseOctalLiteral(); uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal); builder->AddCharacter(octal);
break; break;
...@@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -415,6 +432,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// This is outside the specification. We match JSC in // This is outside the specification. We match JSC in
// reading the backslash as a literal character instead // reading the backslash as a literal character instead
// of as starting an escape. // of as starting an escape.
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
builder->AddCharacter('\\'); builder->AddCharacter('\\');
} else { } else {
Advance(2); Advance(2);
...@@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -430,8 +451,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} else if (!unicode()) { } else if (!unicode()) {
builder->AddCharacter('x'); builder->AddCharacter('x');
} else { } else {
// If the 'u' flag is present, invalid escapes are not treated as // With /u, invalid escapes are not treated as identity escapes.
// identity escapes.
return ReportError(CStrVector("Invalid escape")); return ReportError(CStrVector("Invalid escape"));
} }
break; break;
...@@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -444,20 +464,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
} else if (!unicode()) { } else if (!unicode()) {
builder->AddCharacter('u'); builder->AddCharacter('u');
} else { } else {
// If the 'u' flag is present, invalid escapes are not treated as // With /u, invalid escapes are not treated as identity escapes.
// identity escapes.
return ReportError(CStrVector("Invalid unicode escape")); return ReportError(CStrVector("Invalid unicode escape"));
} }
break; break;
} }
default: default:
Advance(); Advance();
// If the 'u' flag is present, only syntax characters can be // With /u, no identity escapes except for syntax characters
// escaped, no // are allowed. Otherwise, all identity escapes are allowed.
// other identity escapes are allowed. If the 'u' flag is not if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
// present,
// all identity escapes are allowed.
if (!unicode() || IsSyntaxCharacter(current())) {
builder->AddCharacter(current()); builder->AddCharacter(current());
Advance(); Advance();
} else { } else {
...@@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -469,10 +485,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': { case '{': {
int dummy; int dummy;
if (ParseIntervalQuantifier(&dummy, &dummy)) { if (ParseIntervalQuantifier(&dummy, &dummy)) {
ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); return ReportError(CStrVector("Nothing to repeat"));
} }
// fallthrough // fallthrough
} }
case '}':
case ']':
if (unicode()) {
return ReportError(CStrVector("Lone quantifier brackets"));
}
// fallthrough
default: default:
builder->AddUnicodeCharacter(current()); builder->AddUnicodeCharacter(current());
Advance(); Advance();
...@@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -505,13 +527,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': case '{':
if (ParseIntervalQuantifier(&min, &max)) { if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) { if (max < min) {
ReportError(CStrVector("numbers out of order in {} quantifier.") return ReportError(
CHECK_FAILED); CStrVector("numbers out of order in {} quantifier"));
} }
break; break;
} else { } else if (unicode()) {
continue; // With /u, incomplete quantifiers are not allowed.
return ReportError(CStrVector("Incomplete quantifier"));
} }
continue;
default: default:
continue; continue;
} }
...@@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -524,7 +548,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
quantifier_type = RegExpQuantifier::POSSESSIVE; quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance(); Advance();
} }
builder->AddQuantifierToAtom(min, max, quantifier_type); if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
return ReportError(CStrVector("Invalid quantifier"));
}
} }
} }
...@@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() { ...@@ -822,15 +848,24 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
case 'c': { case 'c': {
uc32 controlLetter = Next(); uc32 controlLetter = Next();
uc32 letter = controlLetter & ~('A' ^ 'a'); uc32 letter = controlLetter & ~('A' ^ 'a');
// For compatibility with JSC, inside a character class // For compatibility with JSC, inside a character class we also accept
// we also accept digits and underscore as control characters. // digits and underscore as control characters, unless with /u.
if ((controlLetter >= '0' && controlLetter <= '9') || if (letter >= 'A' && letter <= 'Z') {
controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
Advance(2); Advance(2);
// Control letters mapped to ASCII control characters in the range // Control letters mapped to ASCII control characters in the range
// 0x00-0x1f. // 0x00-0x1f.
return controlLetter & 0x1f; return controlLetter & 0x1f;
} }
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
ReportError(CStrVector("Invalid class escape"));
return 0;
}
if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_') {
Advance(2);
return controlLetter & 0x1f;
}
// We match JSC in reading the backslash as a literal // We match JSC in reading the backslash as a literal
// character instead of as starting an escape. // character instead of as starting an escape.
return '\\'; return '\\';
...@@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() { ...@@ -846,43 +881,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// For compatibility, we interpret a decimal escape that isn't // For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according // a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code. // to the specification) as a 1..3 digit octal character code.
if (unicode()) {
// With /u, decimal escape is not interpreted as octal character code.
ReportError(CStrVector("Invalid class escape"));
return 0;
}
return ParseOctalLiteral(); return ParseOctalLiteral();
case 'x': { case 'x': {
Advance(); Advance();
uc32 value; uc32 value;
if (ParseHexEscape(2, &value)) { if (ParseHexEscape(2, &value)) return value;
return value; if (unicode()) {
} // With /u, invalid escapes are not treated as identity escapes.
if (!unicode()) { ReportError(CStrVector("Invalid escape"));
// If \x is not followed by a two-digit hexadecimal, treat it return 0;
// as an identity escape.
return 'x';
} }
// If the 'u' flag is present, invalid escapes are not treated as // If \x is not followed by a two-digit hexadecimal, treat it
// identity escapes. // as an identity escape.
ReportError(CStrVector("Invalid escape")); return 'x';
return 0;
} }
case 'u': { case 'u': {
Advance(); Advance();
uc32 value; uc32 value;
if (ParseUnicodeEscape(&value)) { if (ParseUnicodeEscape(&value)) return value;
return value; if (unicode()) {
} // With /u, invalid escapes are not treated as identity escapes.
if (!unicode()) { ReportError(CStrVector("Invalid unicode escape"));
return 'u'; return 0;
} }
// If the 'u' flag is present, invalid escapes are not treated as // If \u is not followed by a four-digit hexadecimal, treat it
// identity escapes. // as an identity escape.
ReportError(CStrVector("Invalid unicode escape")); return 'u';
return 0;
} }
default: { default: {
uc32 result = current(); uc32 result = current();
// If the 'u' flag is present, only syntax characters can be escaped, no // With /u, no identity escapes except for syntax characters are
// other identity escapes are allowed. If the 'u' flag is not present, all // allowed. Otherwise, all identity escapes are allowed.
// identity escapes are allowed. if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
if (!unicode() || IsSyntaxCharacter(result)) {
Advance(); Advance();
return result; return result;
} }
...@@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, ...@@ -956,6 +991,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
RegExpTree* RegExpParser::ParseCharacterClass() { RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class"; static const char* kUnterminated = "Unterminated character class";
static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class"; static const char* kRangeOutOfOrder = "Range out of order in character class";
DCHECK_EQ(current(), '['); DCHECK_EQ(current(), '[');
...@@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() { ...@@ -985,13 +1021,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
// Either end is an escaped character class. Treat the '-' verbatim. // Either end is an escaped character class. Treat the '-' verbatim.
if (unicode()) {
// ES2015 21.2.2.15.1 step 1.
return ReportError(CStrVector(kRangeInvalid));
}
AddRangeOrEscape(ranges, char_class, first, zone()); AddRangeOrEscape(ranges, char_class, first, zone());
ranges->Add(CharacterRange::Singleton('-'), zone()); ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone()); AddRangeOrEscape(ranges, char_class_2, next, zone());
continue; continue;
} }
// ES2015 21.2.2.15.1 step 6.
if (first.from() > next.to()) { if (first.from() > next.to()) {
return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); return ReportError(CStrVector(kRangeOutOfOrder));
} }
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else { } else {
...@@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { ...@@ -999,7 +1040,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
} }
} }
if (!has_more()) { if (!has_more()) {
return ReportError(CStrVector(kUnterminated) CHECK_FAILED); return ReportError(CStrVector(kUnterminated));
} }
Advance(); Advance();
if (ranges->length() == 0) { if (ranges->length() == 0) {
...@@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } ...@@ -1162,7 +1203,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
if (NeedsDesugaringForUnicode(cc)) { if (NeedsDesugaringForUnicode(cc)) {
// In unicode mode, character class needs to be desugared, so it // With /u, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText. // must be a standalone term instead of being part of a RegExpText.
AddTerm(cc); AddTerm(cc);
} else { } else {
...@@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() { ...@@ -1275,13 +1316,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
} }
bool RegExpBuilder::AddQuantifierToAtom(
void RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
FlushPendingSurrogate(); FlushPendingSurrogate();
if (pending_empty_) { if (pending_empty_) {
pending_empty_ = false; pending_empty_ = false;
return; return true;
} }
RegExpTree* atom; RegExpTree* atom;
if (characters_ != NULL) { if (characters_ != NULL) {
...@@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom( ...@@ -1304,23 +1344,26 @@ void RegExpBuilder::AddQuantifierToAtom(
} else if (terms_.length() > 0) { } else if (terms_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM); DCHECK(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast(); atom = terms_.RemoveLast();
// With /u, lookarounds are not quantifiable.
if (unicode() && atom->IsLookaround()) return false;
if (atom->max_match() == 0) { if (atom->max_match() == 0) {
// Guaranteed to only match an empty string. // Guaranteed to only match an empty string.
LAST(ADD_TERM); LAST(ADD_TERM);
if (min == 0) { if (min == 0) {
return; return true;
} }
terms_.Add(atom, zone()); terms_.Add(atom, zone());
return; return true;
} }
} else { } else {
// Only call immediately after adding an atom or character! // Only call immediately after adding an atom or character!
UNREACHABLE(); UNREACHABLE();
return; return false;
} }
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
zone()); zone());
LAST(ADD_TERM); LAST(ADD_TERM);
return true;
} }
} // namespace internal } // namespace internal
......
...@@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject { ...@@ -111,7 +111,7 @@ class RegExpBuilder : public ZoneObject {
void AddTerm(RegExpTree* tree); void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree); void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|' void NewAlternative(); // '|'
void AddQuantifierToAtom(int min, int max, bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type); RegExpQuantifier::QuantifierType type);
RegExpTree* ToRegExp(); RegExpTree* ToRegExp();
...@@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED { ...@@ -198,7 +198,7 @@ class RegExpParser BASE_EMBEDDED {
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
static bool IsSyntaxCharacter(uc32 c); static bool IsSyntaxCharacterOrSlash(uc32 c);
static const int kMaxCaptures = 1 << 16; static const int kMaxCaptures = 1 << 16;
static const uc32 kEndMarker = (1 << 21); static const uc32 kEndMarker = (1 << 21);
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-unicode-regexps
// Tests for the restricted RegExp pattern syntax that applies when the
// unicode (/u) flag is present. Each assertThrows call below is given the
// source text of a regexp literal carrying the /u flag that is expected to
// be rejected.
// NOTE(review): assertThrows is defined outside this file; presumably it
// evaluates the string and expects an exception -- confirm against the
// test harness.
// Decimal escape \1 in a pattern that contains no capture group.
// test262/data/test/language/literals/regexp/u-dec-esc
assertThrows("/\\1/u");
// Character class escape (\w) used as the start of a class range.
// test262/language/literals/regexp/u-invalid-char-range-a
assertThrows("/[\\w-a]/u");
// Character class escape (\w) used as the end of a class range.
// test262/language/literals/regexp/u-invalid-char-range-b
assertThrows("/[a-\\w]/u");
// \c that is not followed by an ASCII letter (nothing, or a digit).
// test262/language/literals/regexp/u-invalid-char-esc
assertThrows("/\\c/u");
assertThrows("/\\c0/u");
// Quantifier applied to a lookahead assertion.
// test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion
assertThrows("/(?=.)*/u");
// Decimal escapes that would otherwise be read as octal character codes:
// \1 inside a character class, and \0 followed by another decimal digit.
// test262/built-ins/RegExp/unicode_restricted_octal_escape
assertThrows("/[\\1]/u");
assertThrows("/\\00/u");
assertThrows("/\\09/u");
// \c inside a character class with no control letter following it.
// test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha
assertThrows("/[\\c]/u");
// \c inside a character class followed by a digit instead of a letter.
// test262/built-ins/RegExp/unicode_restricted_identity_escape_c
assertThrows("/[\\c0]/u");
// Incomplete interval quantifiers and lone curly braces.
// test262/built-ins/RegExp/unicode_restricted_incomple_quantifier
assertThrows("/a{/u");
assertThrows("/a{1,/u");
assertThrows("/{/u");
assertThrows("/}/u");
// Lone closing square bracket outside a character class.
// test262/data/test/built-ins/RegExp/unicode_restricted_brackets
assertThrows("/]/u");
// Positive case: an escaped forward slash is still a valid escape with /u,
// so this literal must parse without throwing.
// test262/built-ins/RegExp/unicode_identity_escape
/\//u;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment