Commit 90fd0ee8 authored by lrn@chromium.org's avatar lrn@chromium.org

Change interpretation of malformed \c? escapes in RegExp to match JSC.

Review URL: http://codereview.chromium.org/6171001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@6225 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent e74b0ae7
...@@ -4022,9 +4022,21 @@ RegExpTree* RegExpParser::ParseDisjunction() { ...@@ -4022,9 +4022,21 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter('\v'); builder->AddCharacter('\v');
break; break;
case 'c': { case 'c': {
Advance(2); Advance();
uc32 control = ParseControlLetterEscape(); uc32 controlLetter = Next();
builder->AddCharacter(control); // Special case if it is an ASCII letter.
// Convert lower case letters to uppercase.
uc32 letter = controlLetter & ~('a' ^ 'A');
if (letter < 'A' || 'Z' < letter) {
// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
// This is outside the specification. We match JSC in
// reading the backslash as a literal character instead
// of as starting an escape.
builder->AddCharacter('\\');
} else {
Advance(2);
builder->AddCharacter(controlLetter & 0x1f);
}
break; break;
} }
case 'x': { case 'x': {
...@@ -4299,23 +4311,6 @@ bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { ...@@ -4299,23 +4311,6 @@ bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
} }
// Upper and lower case letters differ by one bit.
STATIC_CHECK(('a' ^ 'A') == 0x20);
uc32 RegExpParser::ParseControlLetterEscape() {
if (!has_more())
return 'c';
uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
if (letter < 'A' || 'Z' < letter) {
// Non-spec error-correction: "\c" followed by non-control letter is
// interpreted as an IdentityEscape of 'c'.
return 'c';
}
Advance();
return letter & 0x1f; // Remainder modulo 32, per specification.
}
uc32 RegExpParser::ParseOctalLiteral() { uc32 RegExpParser::ParseOctalLiteral() {
ASSERT('0' <= current() && current() <= '7'); ASSERT('0' <= current() && current() <= '7');
// For compatibility with some other browsers (not all), we parse // For compatibility with some other browsers (not all), we parse
...@@ -4381,9 +4376,23 @@ uc32 RegExpParser::ParseClassCharacterEscape() { ...@@ -4381,9 +4376,23 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
case 'v': case 'v':
Advance(); Advance();
return '\v'; return '\v';
case 'c': case 'c': {
Advance(); uc32 controlLetter = Next();
return ParseControlLetterEscape(); uc32 letter = controlLetter & ~('A' ^ 'a');
// For compatibility with JSC, inside a character class
// we also accept digits and underscore as control characters.
if ((controlLetter >= '0' && controlLetter <= '9') ||
controlLetter == '_' ||
(letter >= 'A' && letter <= 'Z')) {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1f.
return controlLetter & 0x1f;
}
// We match JSC in reading the backslash as a literal
// character instead of as starting an escape.
return '\\';
}
case '0': case '1': case '2': case '3': case '4': case '5': case '0': case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '6': case '7':
// For compatibility, we interpret a decimal escape that isn't // For compatibility, we interpret a decimal escape that isn't
......
...@@ -321,7 +321,6 @@ class RegExpParser { ...@@ -321,7 +321,6 @@ class RegExpParser {
// and sets the value if it is. // and sets the value if it is.
bool ParseHexEscape(int length, uc32* value); bool ParseHexEscape(int length, uc32* value);
uc32 ParseControlLetterEscape();
uc32 ParseOctalLiteral(); uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it // Tries to parse the input as a back reference. If successful it
......
...@@ -731,11 +731,18 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { ...@@ -731,11 +731,18 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
while (c0_ != '/' || in_character_class) { while (c0_ != '/' || in_character_class) {
if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false; if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
if (c0_ == '\\') { // escaped character if (c0_ == '\\') { // Escape sequence.
AddLiteralCharAdvance(); AddLiteralCharAdvance();
if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false; if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return false;
AddLiteralCharAdvance(); AddLiteralCharAdvance();
} else { // unescaped character // If the escape allows more characters, i.e., \x??, \u????, or \c?,
// only "safe" characters are allowed (letters, digits, underscore),
// otherwise the escape isn't valid and the invalid character has
// its normal meaning. I.e., we can just continue scanning without
// worrying whether the following characters are part of the escape
// or not, since any '/', '\\' or '[' is guaranteed to not be part
// of the escape sequence.
} else { // Unescaped character.
if (c0_ == '[') in_character_class = true; if (c0_ == '[') in_character_class = true;
if (c0_ == ']') in_character_class = false; if (c0_ == ']') in_character_class = false;
AddLiteralCharAdvance(); AddLiteralCharAdvance();
......
...@@ -645,3 +645,58 @@ TEST(StreamScanner) { ...@@ -645,3 +645,58 @@ TEST(StreamScanner) {
TestStreamScanner(&stream3, expectations3, 1, 1 + i); TestStreamScanner(&stream3, expectations3, 1, 1 + i);
} }
} }
void TestScanRegExp(const char* re_source, const char* expected) {
i::Utf8ToUC16CharacterStream stream(
reinterpret_cast<const i::byte*>(re_source),
static_cast<unsigned>(strlen(re_source)));
i::V8JavaScriptScanner scanner;
scanner.Initialize(&stream);
i::Token::Value start = scanner.peek();
CHECK(start == i::Token::DIV || start == i::Token::ASSIGN_DIV);
CHECK(scanner.ScanRegExpPattern(start == i::Token::ASSIGN_DIV));
scanner.Next(); // Current token is now the regexp literal.
CHECK(scanner.is_literal_ascii());
i::Vector<const char> actual = scanner.literal_ascii_string();
for (int i = 0; i < actual.length(); i++) {
CHECK_NE('\0', expected[i]);
CHECK_EQ(expected[i], actual[i]);
}
}
TEST(RegExpScanning) {
// RegExp token with added garbage at the end. The scanner should only
// scan the RegExp until the terminating slash just before "flipperwald".
TestScanRegExp("/b/flipperwald", "b");
// Incomplete escape sequences doesn't hide the terminating slash.
TestScanRegExp("/\\x/flipperwald", "\\x");
TestScanRegExp("/\\u/flipperwald", "\\u");
TestScanRegExp("/\\u1/flipperwald", "\\u1");
TestScanRegExp("/\\u12/flipperwald", "\\u12");
TestScanRegExp("/\\u123/flipperwald", "\\u123");
TestScanRegExp("/\\c/flipperwald", "\\c");
TestScanRegExp("/\\c//flipperwald", "\\c");
// Slashes inside character classes are not terminating.
TestScanRegExp("/[/]/flipperwald", "[/]");
TestScanRegExp("/[\\s-/]/flipperwald", "[\\s-/]");
// Incomplete escape sequences inside a character class doesn't hide
// the end of the character class.
TestScanRegExp("/[\\c/]/flipperwald", "[\\c/]");
TestScanRegExp("/[\\c]/flipperwald", "[\\c]");
TestScanRegExp("/[\\x]/flipperwald", "[\\x]");
TestScanRegExp("/[\\x1]/flipperwald", "[\\x1]");
TestScanRegExp("/[\\u]/flipperwald", "[\\u]");
TestScanRegExp("/[\\u1]/flipperwald", "[\\u1]");
TestScanRegExp("/[\\u12]/flipperwald", "[\\u12]");
TestScanRegExp("/[\\u123]/flipperwald", "[\\u123]");
// Escaped ']'s wont end the character class.
TestScanRegExp("/[\\]/]/flipperwald", "[\\]/]");
// Escaped slashes are not terminating.
TestScanRegExp("/\\//flipperwald", "\\/");
// Starting with '=' works too.
TestScanRegExp("/=/", "=");
TestScanRegExp("/=?/", "=?");
}
...@@ -176,11 +176,23 @@ TEST(Parser) { ...@@ -176,11 +176,23 @@ TEST(Parser) {
CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
// Control character outside character class.
CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
"'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
CHECK_PARSE_EQ("\\c!", "'c!'"); CHECK_PARSE_EQ("\\c!", "'\\c!'");
CHECK_PARSE_EQ("\\c_", "'c_'"); CHECK_PARSE_EQ("\\c_", "'\\c_'");
CHECK_PARSE_EQ("\\c~", "'c~'"); CHECK_PARSE_EQ("\\c~", "'\\c~'");
CHECK_PARSE_EQ("\\c1", "'\\c1'");
// Control character inside character class.
CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
CHECK_PARSE_EQ("[\\c1]", "[\\x11]");
CHECK_PARSE_EQ("[a\\]c]", "[a ] c]"); CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"); CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"); CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]");
...@@ -234,7 +246,7 @@ TEST(Parser) { ...@@ -234,7 +246,7 @@ TEST(Parser) {
CHECK_PARSE_EQ("\\x34", "'\x34'"); CHECK_PARSE_EQ("\\x34", "'\x34'");
CHECK_PARSE_EQ("\\x60", "'\x60'"); CHECK_PARSE_EQ("\\x60", "'\x60'");
CHECK_PARSE_EQ("\\x3z", "'x3z'"); CHECK_PARSE_EQ("\\x3z", "'x3z'");
CHECK_PARSE_EQ("\\c", "'c'"); CHECK_PARSE_EQ("\\c", "'\\c'");
CHECK_PARSE_EQ("\\u0034", "'\x34'"); CHECK_PARSE_EQ("\\u0034", "'\x34'");
CHECK_PARSE_EQ("\\u003z", "'u003z'"); CHECK_PARSE_EQ("\\u003z", "'u003z'");
CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
......
...@@ -84,15 +84,14 @@ assertEquals(result[4], 'D'); ...@@ -84,15 +84,14 @@ assertEquals(result[4], 'D');
assertEquals(result[5], 'E'); assertEquals(result[5], 'E');
assertEquals(result[6], 'F'); assertEquals(result[6], 'F');
// Some tests from the Mozilla tests, where our behavior differs from // Some tests from the Mozilla tests, where our behavior used to differ from
// SpiderMonkey. // SpiderMonkey.
// From ecma_3/RegExp/regress-334158.js // From ecma_3/RegExp/regress-334158.js
assertTrue(/\ca/.test( "\x01" )); assertTrue(/\ca/.test( "\x01" ));
assertFalse(/\ca/.test( "\\ca" )); assertFalse(/\ca/.test( "\\ca" ));
// Passes in KJS, fails in IrregularExpressions. assertFalse(/\ca/.test( "ca" ));
// See http://code.google.com/p/v8/issues/detail?id=152 assertTrue(/\c[a/]/.test( "\\ca" ));
//assertTrue(/\c[a/]/.test( "\x1ba/]" )); assertTrue(/\c[a/]/.test( "\\c/" ));
// Test \c in character class // Test \c in character class
re = /^[\cM]$/; re = /^[\cM]$/;
...@@ -104,11 +103,29 @@ assertFalse(re.test("\x03")); // I.e., read as \cc ...@@ -104,11 +103,29 @@ assertFalse(re.test("\x03")); // I.e., read as \cc
re = /^[\c]]$/; re = /^[\c]]$/;
assertTrue(re.test("c]")); assertTrue(re.test("c]"));
assertFalse(re.test("\\]")); assertTrue(re.test("\\]"));
assertFalse(re.test("\x1d")); // ']' & 0x1f assertFalse(re.test("\x1d")); // ']' & 0x1f
assertFalse(re.test("\\]"));
assertFalse(re.test("\x03]")); // I.e., read as \cc assertFalse(re.test("\x03]")); // I.e., read as \cc
re = /^[\c1]$/; // Digit control characters are masked in character classes.
assertTrue(re.test("\x11"));
assertFalse(re.test("\\"));
assertFalse(re.test("c"));
assertFalse(re.test("1"));
re = /^[\c_]$/; // Underscore control character is masked in character classes.
assertTrue(re.test("\x1f"));
assertFalse(re.test("\\"));
assertFalse(re.test("c"));
assertFalse(re.test("_"));
re = /^[\c$]$/; // Other characters are interpreted literally.
assertFalse(re.test("\x04"));
assertTrue(re.test("\\"));
assertTrue(re.test("c"));
assertTrue(re.test("$"));
assertTrue(/^[Z-\c-e]*$/.test("Z[\\cde"));
// Test that we handle \s and \S correctly inside some bizarre // Test that we handle \s and \S correctly inside some bizarre
// character classes. // character classes.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment