Commit 7939f9ac authored by lrn@chromium.org

Make scanner handle invalid unicode escapes in identifiers correctly.

I.e., don't just convert \u to u in identifiers (like in strings and regexps).

Also make the scanning of RegExp flags not interpret the escapes.

(Fix and reapply of r8942)

BUG=v8:1620
TEST=mjsunit/regress/regress-1620

Review URL: http://codereview.chromium.org/7677012

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@8969 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent c50094bd
......@@ -41,12 +41,12 @@ Scanner::Scanner(UnicodeCache* unicode_cache)
: unicode_cache_(unicode_cache) { }
uc32 Scanner::ScanHexEscape(uc32 c, int length) {
ASSERT(length <= 4); // prevent overflow
uc32 Scanner::ScanHexNumber(int expected_length) {
ASSERT(expected_length <= 4); // prevent overflow
uc32 digits[4];
uc32 digits[4] = { 0, 0, 0, 0 };
uc32 x = 0;
for (int i = 0; i < length; i++) {
for (int i = 0; i < expected_length; i++) {
digits[i] = c0_;
int d = HexValue(c0_);
if (d < 0) {
......@@ -54,12 +54,11 @@ uc32 Scanner::ScanHexEscape(uc32 c, int length) {
// should be illegal, but other JS VMs just return the
// non-escaped version of the original character.
// Push back digits read, except the last one (in c0_).
// Push back digits that we have advanced past.
for (int j = i-1; j >= 0; j--) {
PushBack(digits[j]);
}
// Notice: No handling of error - treat it as "\u"->"u".
return c;
return -1;
}
x = x * 16 + d;
Advance();
......@@ -640,9 +639,17 @@ void JavaScriptScanner::ScanEscape() {
case 'n' : c = '\n'; break;
case 'r' : c = '\r'; break;
case 't' : c = '\t'; break;
case 'u' : c = ScanHexEscape(c, 4); break;
case 'u' : {
c = ScanHexNumber(4);
if (c < 0) c = 'u';
break;
}
case 'v' : c = '\v'; break;
case 'x' : c = ScanHexEscape(c, 2); break;
case 'x' : {
c = ScanHexNumber(2);
if (c < 0) c = 'x';
break;
}
case '0' : // fall through
case '1' : // fall through
case '2' : // fall through
......@@ -802,13 +809,11 @@ Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
Advance();
if (c0_ != 'u') return unibrow::Utf8::kBadChar;
if (c0_ != 'u') return -1;
Advance();
uc32 c = ScanHexEscape('u', 4);
// We do not allow a unicode escape sequence to start another
// unicode escape sequence.
if (c == '\\') return unibrow::Utf8::kBadChar;
return c;
uc32 result = ScanHexNumber(4);
if (result < 0) PushBack('u');
return result;
}
......@@ -926,7 +931,11 @@ Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
// Only allow legal identifier start characters.
if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
if (c < 0 ||
c == '\\' || // No recursive escapes.
!unicode_cache_->IsIdentifierStart(c)) {
return Token::ILLEGAL;
}
AddLiteralChar(c);
return ScanIdentifierSuffix(&literal);
}
......@@ -966,7 +975,11 @@ Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
// Only allow legal identifier part characters.
if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
if (c < 0 ||
c == '\\' ||
!unicode_cache_->IsIdentifierPart(c)) {
return Token::ILLEGAL;
}
AddLiteralChar(c);
} else {
AddLiteralChar(c0_);
......@@ -992,8 +1005,9 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
// the scanner should pass uninterpreted bodies to the RegExp
// constructor.
LiteralScope literal(this);
if (seen_equal)
if (seen_equal) {
AddLiteralChar('=');
}
while (c0_ != '/' || in_character_class) {
if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
......@@ -1025,20 +1039,47 @@ bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
}
bool JavaScriptScanner::ScanLiteralUnicodeEscape() {
ASSERT(c0_ == '\\');
uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
Advance();
int i = 1;
if (c0_ == 'u') {
i++;
while (i < 6) {
Advance();
if (!IsHexDigit(c0_)) break;
chars_read[i] = c0_;
i++;
}
}
if (i < 6) {
// Incomplete escape. Undo all advances and return false.
while (i > 0) {
i--;
PushBack(chars_read[i]);
}
return false;
}
// Complete escape. Add all chars to current literal buffer.
for (int i = 0; i < 6; i++) {
AddLiteralChar(chars_read[i]);
}
return true;
}
bool JavaScriptScanner::ScanRegExpFlags() {
// Scan regular expression flags.
LiteralScope literal(this);
while (unicode_cache_->IsIdentifierPart(c0_)) {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
// We allow any escaped character, unlike the restriction on
// IdentifierPart when it is used to build an IdentifierName.
AddLiteralChar(c);
continue;
if (c0_ != '\\') {
AddLiteralCharAdvance();
} else {
if (!ScanLiteralUnicodeEscape()) {
break;
}
}
AddLiteralCharAdvance();
}
literal.Complete();
......
......@@ -419,7 +419,7 @@ class Scanner {
}
}
uc32 ScanHexEscape(uc32 c, int length);
uc32 ScanHexNumber(int expected_length);
// Return the current source position.
int source_pos() {
......@@ -537,6 +537,10 @@ class JavaScriptScanner : public Scanner {
// Decodes a unicode escape-sequence which is part of an identifier.
// If the escape sequence cannot be decoded the result is negative (-1).
uc32 ScanIdentifierUnicodeEscape();
// Recognizes a unicode escape-sequence and adds its characters,
// uninterpreted, to the current literal. Used for parsing RegExp
// flags.
bool ScanLiteralUnicodeEscape();
// Start position of the octal literal last scanned.
Location octal_pos_;
......
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Don't allow malformed unicode escape sequences in identifiers.
// In strings and regexps we currently allow malformed unicode escape
// sequences without throwing a SyntaxError. Instead "\u22gk" would
// treat the "\u" as an identity escape, and evaluate to "u22gk".
// Due to code sharing, we did the same in identifiers. This should
// no longer be the case.
// See: http://code.google.com/p/v8/issues/detail?id=1620
// Escapes with no hex digits at all must not be accepted as an identifier.
assertThrows("var \\u\\u\\u = 42;");
// Only two hex digits follow the \u: incomplete escape.
assertThrows("var \\u41 = 42;");
// Only three hex digits follow the \u: still incomplete.
assertThrows("var \\u123 = 42;");
// A complete four-digit escape is accepted in an identifier.
eval("var \\u1234 = 42;");
// The string literal below holds the character U+1234 itself, so this reads
// back the variable that was declared through the escape above.
assertEquals(42, eval("\u1234"));
// A malformed escape must not be read as the plain identifier "uuu".
assertThrows("var uuu = 42; var x = \\u\\u\\u");
// Regressions introduced and fixed again while fixing the above.
// Handle 0xFFFD correctly (it's a valid value, and shouldn't be used
// to mark an error).
assertEquals(0xFFFD, "\uFFFD".charCodeAt(0));
// Handle unicode escapes in regexp flags correctly.
// The first four cases put zero to three hex digits after the \u in the
// flags, i.e. an incomplete escape each time.
assertThrows("/x/g\\uim", SyntaxError);
assertThrows("/x/g\\u2im", SyntaxError);
assertThrows("/x/g\\u22im", SyntaxError);
assertThrows("/x/g\\u222im", SyntaxError);
// Here the evaluated source has two backslashes in front of "u2222".
assertThrows("/x/g\\\\u2222im", SyntaxError);
......@@ -99,6 +99,13 @@ S7.8.4_A7.4_T1: FAIL_OK
S7.8.4_A4.3_T5: FAIL_OK
S7.8.4_A7.2_T5: FAIL_OK
# Sputnik expects unicode escape sequences in RegExp flags to be interpreted.
# The specification requires them to be passed uninterpreted to the RegExp
# constructor. We now implement that.
S7.8.5_A3.1_T7: FAIL_OK
S7.8.5_A3.1_T8: FAIL_OK
S7.8.5_A3.1_T9: FAIL_OK
# We allow some keywords to be used as identifiers.
S7.5.3_A1.15: FAIL_OK
S7.5.3_A1.18: FAIL_OK
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment