Commit 2c3b392c authored by olehougaard's avatar olehougaard

Handle byte-order marks as specified in ECMA-262 and in compliance with Safari.

Review URL: http://codereview.chromium.org/15075

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1006 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent ab2d4bc9
......@@ -119,6 +119,18 @@ void UTF16Buffer::PushBack(uc32 ch) {
}
static inline bool IsByteOrderMark(uc32 c) {
  // Returns true when |c| is a byte-order mark in either byte order.
  //
  // U+FEFF is the byte-order mark proper. U+FFFE is guaranteed never
  // to be assigned as a Unicode character, so in a Unicode context it
  // can only be a U+FEFF whose two bytes were read in the opposite
  // byte order. Strictly speaking only U+FEFF needs to be recognized,
  // but the swapped form is accepted as well to stay compatible with
  // Spidermonkey.
  switch (c) {
    case 0xFEFF:  // Byte-order mark.
    case 0xFFFE:  // Byte-swapped byte-order mark.
      return true;
    default:
      return false;
  }
}
uc32 UTF16Buffer::Advance() {
// NOTE: It is of importance to Persian / Farsi resources that we do
// *not* strip format control characters in the scanner; see
......@@ -126,16 +138,17 @@ uc32 UTF16Buffer::Advance() {
// https://bugzilla.mozilla.org/show_bug.cgi?id=274152
//
// So, even though ECMA-262, section 7.1, page 11, dictates that we
// must remove Unicode format-control characters, we do not. This is
// in line with how IE and SpiderMonkey handles it.
// must remove Unicode format-control characters, we only remove the BOM.
// This is in line with how Safari handles it.
if (!pushback_buffer()->is_empty()) {
pos_++;
return last_ = pushback_buffer()->RemoveLast();
} else if (stream_->has_more()) {
} else {
while (stream_->has_more()) {
pos_++;
uc32 next = stream_->GetNext();
return last_ = next;
} else {
if (!IsByteOrderMark(next)) return last_ = next;
}
// note: currently the following increment is necessary to avoid a
// test-parser problem!
pos_++;
......@@ -234,25 +247,11 @@ void Scanner::PushBack(uc32 ch) {
}
static inline bool IsByteOrderMark(uc32 c) {
  // Returns true when |c| is a byte-order mark in either byte order.
  //
  // U+FEFF is the byte-order mark proper. U+FFFE is guaranteed never
  // to be assigned as a Unicode character, so in a Unicode context it
  // can only be a U+FEFF whose two bytes were read in the opposite
  // byte order. Strictly speaking only U+FEFF needs to be recognized,
  // but the swapped form is accepted as well to stay compatible with
  // Spidermonkey.
  switch (c) {
    case 0xFEFF:  // Byte-order mark.
    case 0xFFFE:  // Byte-swapped byte-order mark.
      return true;
    default:
      return false;
  }
}
void Scanner::SkipWhiteSpace(bool initial) {
has_line_terminator_before_next_ = initial;
while (true) {
// We treat byte-order marks (BOMs) as whitespace for better
// compatibility with Spidermonkey and other JavaScript engines.
while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
while (kIsWhiteSpace.get(c0_)) {
// IsWhiteSpace() includes line terminators!
if (kIsLineTerminator.get(c0_))
// Ignore line terminators, but remember them. This is necessary
......
// Copyright 2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// According to section 7.1 of ECMA-262, format control characters
// should be removed before parsing. We're following the discussion at
// https://bugs.webkit.org/show_bug.cgi?id=4931 in only removing the BOM.
// See also https://bugzilla.mozilla.org/show_bug.cgi?id=274152.

// Ignores BOM (and only BOM) in string literals: the leading U+FEFF is
// dropped from the evaluated source, every other format control
// character in the literal is kept.
var format_controls =
    eval('"\uFEFF\u200F\u200E\u00AD\u2062\u200D\u200C\u200B"');
assertEquals('\u200F\u200E\u00AD\u2062\u200D\u200C\u200B',
             format_controls);

// Ignores BOM in identifiers: "x<BOM>y" is read as the identifier "xy".
eval('var x\uFEFFy = 7');
assertEquals(7, xy);

// Doesn't ignore non-BOM format control characters. Each character from
// the string-literal check above is tried inside an identifier, in the
// same order, and must cause the declaration to throw.
assertThrows('var y\u200Fx = 7');
assertThrows('var y\u200Ex = 7');
// U+00AD (soft hyphen), matching the list above; this line previously
// tested U+20AD, which is a currency sign, not a format control.
assertThrows('var y\u00ADx = 7');
assertThrows('var y\u2062x = 7');
assertThrows('var y\u200Dx = 7');
assertThrows('var y\u200Cx = 7');
assertThrows('var y\u200Bx = 7');
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment