Commit b618d2a4 authored by yangguo@chromium.org

Fix inconsistencies wrt whitespaces.

This relands r19196 with fixes.

BUG=v8:3109
LOG=Y
R=mstarzinger@chromium.org

Review URL: https://codereview.chromium.org/141323007

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@19222 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent d2d3fc0e
......@@ -66,6 +66,27 @@ struct IdentifierPart {
}
};
// Predicate for the WhiteSpace production of ECMA-262 5.1, section 7.2.
struct WhiteSpace {
  // Returns true for <TAB>, <VT>, <FF> and <BOM>; every other code point is
  // delegated to unibrow::WhiteSpace, which already covers \u0020 and \u00A0.
  static inline bool Is(uc32 c) {
    switch (c) {
      case 0x0009:  // <TAB>
      case 0x000B:  // <VT>
      case 0x000C:  // <FF>
      case 0xFEFF:  // <BOM>
        return true;
      default:
        return unibrow::WhiteSpace::Is(c);
    }
  }
};
// Predicate for code points that are WhiteSpace (ECMA-262 5.1, 7.2) or
// LineTerminator (ECMA-262 5.1, 7.3).
struct WhiteSpaceOrLineTerminator {
  // Checks the WhiteSpace predicate first and only then falls back to
  // unibrow::LineTerminator.
  static inline bool Is(uc32 c) {
    if (WhiteSpace::Is(c)) return true;
    return unibrow::LineTerminator::Is(c);
  }
};
} } // namespace v8::internal
#endif // V8_CHAR_PREDICATES_H_
......@@ -128,7 +128,7 @@ inline bool AdvanceToNonspace(UnicodeCache* unicode_cache,
Iterator* current,
EndMark end) {
while (*current != end) {
if (!unicode_cache->IsWhiteSpace(**current)) return true;
if (!unicode_cache->IsWhiteSpaceOrLineTerminator(**current)) return true;
++*current;
}
return false;
......
......@@ -122,7 +122,7 @@ class DateParser : public AllStatic {
}
bool SkipWhiteSpace() {
if (unicode_cache_->IsWhiteSpace(ch_)) {
if (unicode_cache_->IsWhiteSpaceOrLineTerminator(ch_)) {
Next();
return true;
}
......
......@@ -3597,9 +3597,12 @@ class AlternativeGenerationList {
// The '2' variant has inclusive from and exclusive to.
static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
0xFEFF, 0xFF00, 0x10000 };
static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
static const int kWordRanges[] = {
......
......@@ -6541,11 +6541,6 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToUpperCase) {
}
static inline bool IsTrimWhiteSpace(unibrow::uchar c) {
return unibrow::WhiteSpace::Is(c) || c == 0x200b || c == 0xfeff;
}
RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
HandleScope scope(isolate);
ASSERT(args.length() == 3);
......@@ -6558,15 +6553,19 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
int length = string->length();
int left = 0;
UnicodeCache* unicode_cache = isolate->unicode_cache();
if (trimLeft) {
while (left < length && IsTrimWhiteSpace(string->Get(left))) {
while (left < length &&
unicode_cache->IsWhiteSpaceOrLineTerminator(string->Get(left))) {
left++;
}
}
int right = length;
if (trimRight) {
while (right > left && IsTrimWhiteSpace(string->Get(right - 1))) {
while (right > left &&
unicode_cache->IsWhiteSpaceOrLineTerminator(
string->Get(right - 1))) {
right--;
}
}
......
......@@ -246,7 +246,8 @@ Token::Value Scanner::Next() {
}
static inline bool IsByteOrderMark(uc32 c) {
// TODO(yangguo): check whether this is actually necessary.
static inline bool IsLittleEndianByteOrderMark(uc32 c) {
// The Unicode value U+FFFE is guaranteed never to be assigned as a
// Unicode character; this implies that in a Unicode context the
// 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
......@@ -254,7 +255,7 @@ static inline bool IsByteOrderMark(uc32 c) {
// not be a U+FFFE character expressed in big-endian byte
// order). Nevertheless, we check for it to be compatible with
// Spidermonkey.
return c == 0xFEFF || c == 0xFFFE;
return c == 0xFFFE;
}
......@@ -262,14 +263,14 @@ bool Scanner::SkipWhiteSpace() {
int start_position = source_pos();
while (true) {
// We treat byte-order marks (BOMs) as whitespace for better
// compatibility with Spidermonkey and other JavaScript engines.
while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
// IsWhiteSpace() includes line terminators!
while (true) {
// Advance as long as character is a WhiteSpace or LineTerminator.
// Remember if the latter is the case.
if (unicode_cache_->IsLineTerminator(c0_)) {
// Ignore line terminators, but remember them. This is necessary
// for automatic semicolon insertion.
has_line_terminator_before_next_ = true;
} else if (!unicode_cache_->IsWhiteSpace(c0_) &&
!IsLittleEndianByteOrderMark(c0_)) {
break;
}
Advance();
}
......
......@@ -139,12 +139,17 @@ class UnicodeCache {
bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
// Reports whether |c| is accepted by the kIsWhiteSpaceOrLineTerminator
// predicate member.
bool IsWhiteSpaceOrLineTerminator(unibrow::uchar c) {
  const bool is_match = kIsWhiteSpaceOrLineTerminator.get(c);
  return is_match;
}
private:
unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
unibrow::Predicate<WhiteSpaceOrLineTerminator, 128>
kIsWhiteSpaceOrLineTerminator;
StaticResource<Utf8Decoder> utf8_decoder_;
DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
......
......@@ -25,7 +25,7 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file was generated at 2012-03-06 09:55:58.934483
// This file was generated at 2014-02-07 15:31:16.733174
#include "unicode-inl.h"
#include <stdlib.h>
......@@ -710,28 +710,6 @@ bool Letter::Is(uchar c) {
}
// Space: point.category == 'Zs'
static const uint16_t kSpaceTable0Size = 4;
static const int32_t kSpaceTable0[4] = {
32, 160, 5760, 6158 }; // NOLINT
static const uint16_t kSpaceTable1Size = 5;
static const int32_t kSpaceTable1[5] = {
1073741824, 10, 47, 95, 4096 }; // NOLINT
bool Space::Is(uchar c) {
int chunk_index = c >> 13;
switch (chunk_index) {
case 0: return LookupPredicate(kSpaceTable0,
kSpaceTable0Size,
c);
case 1: return LookupPredicate(kSpaceTable1,
kSpaceTable1Size,
c);
default: return false;
}
}
// Number: point.category == 'Nd'
static const uint16_t kNumberTable0Size = 56;
......@@ -767,14 +745,14 @@ bool Number::Is(uchar c) {
}
// WhiteSpace: 'Ws' in point.properties
// WhiteSpace: point.category == 'Zs'
static const uint16_t kWhiteSpaceTable0Size = 7;
static const int32_t kWhiteSpaceTable0[7] = {
1073741833, 13, 32, 133, 160, 5760, 6158 }; // NOLINT
static const uint16_t kWhiteSpaceTable1Size = 7;
static const int32_t kWhiteSpaceTable1[7] = {
1073741824, 10, 1073741864, 41, 47, 95, 4096 }; // NOLINT
static const uint16_t kWhiteSpaceTable0Size = 4;
static const int32_t kWhiteSpaceTable0[4] = {
32, 160, 5760, 6158 }; // NOLINT
static const uint16_t kWhiteSpaceTable1Size = 5;
static const int32_t kWhiteSpaceTable1[5] = {
1073741824, 10, 47, 95, 4096 }; // NOLINT
bool WhiteSpace::Is(uchar c) {
int chunk_index = c >> 13;
switch (chunk_index) {
......@@ -1833,8 +1811,6 @@ int UnicodeData::GetByteCount() {
+ kLetterTable5Size * sizeof(int32_t) // NOLINT
+ kLetterTable6Size * sizeof(int32_t) // NOLINT
+ kLetterTable7Size * sizeof(int32_t) // NOLINT
+ kSpaceTable0Size * sizeof(int32_t) // NOLINT
+ kSpaceTable1Size * sizeof(int32_t) // NOLINT
+ kNumberTable0Size * sizeof(int32_t) // NOLINT
+ kNumberTable5Size * sizeof(int32_t) // NOLINT
+ kNumberTable7Size * sizeof(int32_t) // NOLINT
......
......@@ -226,9 +226,6 @@ struct Lowercase {
struct Letter {
static bool Is(uchar c);
};
struct Space {
static bool Is(uchar c);
};
struct Number {
static bool Is(uchar c);
};
......
......@@ -444,27 +444,15 @@ static bool NotDigit(uc16 c) {
}
static bool IsWhiteSpace(uc16 c) {
switch (c) {
case 0x09:
case 0x0A:
case 0x0B:
case 0x0C:
case 0x0d:
case 0x20:
case 0xA0:
case 0x2028:
case 0x2029:
case 0xFEFF:
return true;
default:
return unibrow::Space::Is(c);
}
// Reference predicate for the regexp escape \s. According to ECMA-262 5.1,
// 15.10.2.12, that CharacterClassEscape covers both WhiteSpace (7.2) and
// LineTerminator (7.3) values.
static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
  using v8::internal::WhiteSpaceOrLineTerminator;
  return WhiteSpaceOrLineTerminator::Is(c);
}
static bool NotWhiteSpace(uc16 c) {
return !IsWhiteSpace(c);
// Complement of IsWhiteSpaceOrLineTerminator; used as the reference
// predicate for the regexp escape \S.
// NOTE(review): the identifier is misspelled ("Termiantor"). The spelling is
// kept because TEST(CharacterClassEscapes) refers to this exact name.
static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
  if (IsWhiteSpaceOrLineTerminator(c)) return false;
  return true;
}
......@@ -494,8 +482,8 @@ TEST(CharacterClassEscapes) {
TestCharacterClassEscapes('.', IsRegExpNewline);
TestCharacterClassEscapes('d', IsDigit);
TestCharacterClassEscapes('D', NotDigit);
TestCharacterClassEscapes('s', IsWhiteSpace);
TestCharacterClassEscapes('S', NotWhiteSpace);
TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
TestCharacterClassEscapes('w', IsRegExpWord);
TestCharacterClassEscapes('W', NotWord);
}
......
// Copyright (c) 2009 Apple Computer, Inc. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
//
// 3. Neither the name of the copyright holder(s) nor the names of any
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
// Based on LayoutTests/fast/js/script-tests/string-trim.js
// Unbound references to the three trim methods, so that they can be invoked
// through Function.prototype.call on arbitrary receivers further down.
var trim = String.prototype.trim,
    trimLeft = String.prototype.trimLeft,
    trimRight = String.prototype.trimRight;

// Fixture strings. Everything except testString starts out empty and is
// filled in once the whitespace characters have been collected.
var testString = 'foo bar';
var trimString = '',
    leftTrimString = '',
    rightTrimString = '',
    wsString = '';
// Characters that trim(), trimLeft() and trimRight() are expected to strip.
// Each entry pairs the character (s) with its Unicode name (t).
//
// U+200B ZERO WIDTH SPACE is intentionally not listed: it is category Cf,
// not Zs, so it is neither WhiteSpace (ECMA-262 5.1, 7.2) nor LineTerminator
// (7.3) and trim must leave it in place. U+FEFF BYTE ORDER MARK, which 7.2
// does name as WhiteSpace, is listed instead.
var whitespace = [
  {s : '\u0009', t : 'HORIZONTAL TAB'},
  {s : '\u000A', t : 'LINE FEED OR NEW LINE'},
  {s : '\u000B', t : 'VERTICAL TAB'},
  {s : '\u000C', t : 'FORMFEED'},
  {s : '\u000D', t : 'CARRIAGE RETURN'},
  {s : '\u0020', t : 'SPACE'},
  {s : '\u00A0', t : 'NO-BREAK SPACE'},
  {s : '\u2000', t : 'EN QUAD'},
  {s : '\u2001', t : 'EM QUAD'},
  {s : '\u2002', t : 'EN SPACE'},
  {s : '\u2003', t : 'EM SPACE'},
  {s : '\u2004', t : 'THREE-PER-EM SPACE'},
  {s : '\u2005', t : 'FOUR-PER-EM SPACE'},
  {s : '\u2006', t : 'SIX-PER-EM SPACE'},
  {s : '\u2007', t : 'FIGURE SPACE'},
  {s : '\u2008', t : 'PUNCTUATION SPACE'},
  {s : '\u2009', t : 'THIN SPACE'},
  {s : '\u200A', t : 'HAIR SPACE'},
  {s : '\u3000', t : 'IDEOGRAPHIC SPACE'},
  {s : '\u2028', t : 'LINE SEPARATOR'},
  {s : '\u2029', t : 'PARAGRAPH SEPARATOR'},
  {s : '\uFEFF', t : 'BYTE ORDER MARK'}
];
// Taken on its own, every table entry must trim to the empty string with all
// three variants. The loop also appends each character to wsString.
var i = 0;
while (i < whitespace.length) {
  assertEquals(whitespace[i].s.trim(), '');
  assertEquals(whitespace[i].s.trimLeft(), '');
  assertEquals(whitespace[i].s.trimRight(), '');
  wsString = wsString + whitespace[i].s;
  i++;
}

// Composite fixtures built from the accumulated whitespace run.
trimString = wsString + testString + wsString;
// What trimString looks like once only its left side has been trimmed.
leftTrimString = testString + wsString;
// What trimString looks like once only its right side has been trimmed.
rightTrimString = wsString + testString;

// A string made solely of whitespace collapses to the empty string.
assertEquals(wsString.trim(), '');
assertEquals(wsString.trimLeft(), '');
assertEquals(wsString.trimRight(), '');

// Whitespace on both sides.
assertEquals(trimString.trim(), testString);
assertEquals(trimString.trimLeft(), leftTrimString);
assertEquals(trimString.trimRight(), rightTrimString);

// Whitespace only on the right.
assertEquals(leftTrimString.trim(), testString);
assertEquals(leftTrimString.trimLeft(), leftTrimString);
assertEquals(leftTrimString.trimRight(), testString);

// Whitespace only on the left.
assertEquals(rightTrimString.trim(), testString);
assertEquals(rightTrimString.trimLeft(), testString);
assertEquals(rightTrimString.trimRight(), rightTrimString);
// Non-string receivers: calling the trim functions on them must give the
// same result as String(value), since none of these stringify with
// surrounding whitespace.
var testValues = [
  0,
  Infinity,
  NaN,
  true,
  false,
  ({}),
  ['an','array'],
  ({toString:function(){return 'wibble'}})
];
for (var i = 0; i < testValues.length; i++) {
  assertEquals(trim.call(testValues[i]), String(testValues[i]));
  assertEquals(trimLeft.call(testValues[i]), String(testValues[i]));
  assertEquals(trimRight.call(testValues[i]), String(testValues[i]));
}
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Code units treated as whitespace by the checks in this file: WhiteSpace
// (ECMA-262 5.1, 7.2) plus LineTerminator (7.3). Every value appears exactly
// once.
var whitespaces = [
  // WhiteSpace defined in ECMA-262 5.1, 7.2
  0x0009,  // Tab                  TAB
  0x000B,  // Vertical Tab         VT
  0x000C,  // Form Feed            FF
  0x0020,  // Space                SP
  0x00A0,  // No-break space       NBSP
  0xFEFF,  // Byte Order Mark      BOM

  // LineTerminator defined in ECMA-262 5.1, 7.3
  0x000A,  // Line Feed            LF
  0x000D,  // Carriage Return      CR
  0x2028,  // Line Separator       LS
  0x2029,  // Paragraph Separator  PS

  // Unicode 6.3.0 whitespaces (category 'Zs').
  // 0x2028 and 0x2029 are category Zl/Zp rather than Zs; they are covered by
  // the LineTerminator entries above and are not repeated here.
  0x1680,  // Ogham Space Mark
  0x180E,  // Mongolian Vowel Separator
  0x2000,  // EN QUAD
  0x2001,  // EM QUAD
  0x2002,  // EN SPACE
  0x2003,  // EM SPACE
  0x2004,  // THREE-PER-EM SPACE
  0x2005,  // FOUR-PER-EM SPACE
  0x2006,  // SIX-PER-EM SPACE
  0x2007,  // FIGURE SPACE
  0x2008,  // PUNCTUATION SPACE
  0x2009,  // THIN SPACE
  0x200A,  // HAIR SPACE
  0x202F,  // NARROW NO-BREAK SPACE
  0x205F,  // MEDIUM MATHEMATICAL SPACE
  0x3000,  // IDEOGRAPHIC SPACE
];
// Filler characters combined with the code unit under test.
// The snowman (\u2603) is a two-byte character that is not whitespace; adding
// it forces a two-byte string representation. "\u007E" is the one-byte
// counterpart. The two *space variables are a two-byte and a one-byte
// whitespace character respectively.
var twobyte = "\u2603",
    onebyte = "\u007E",
    twobytespace = "\u2000",
    onebytespace = "\u0020";
// Returns true when the first UTF-16 code unit of |c| is listed in the
// whitespaces table.
function is_whitespace(c) {
  var code = c.charCodeAt(0);
  return whitespaces.indexOf(code) !== -1;
}
// Checks the regexp escapes \s and \S against a two-character string:
// str[0] is the code unit under test, str[1] the filler that follows it.
function test_regexp(str) {
  var test_char = str[0];
  var postfix = str[1];
  var pos_match = str.match(/\s/);
  var neg_match = str.match(/\S/);

  if (!is_whitespace(test_char)) {
    // Not whitespace: \S matches the tested character and \s finds nothing.
    assertEquals(test_char, neg_match[0]);
    assertNull(pos_match);
    return;
  }

  // Whitespace: \s matches the tested character, so the first \S match has
  // to be the filler.
  assertEquals(test_char, pos_match[0]);
  assertEquals(postfix, neg_match[0]);
}
// Surrounds |infix| with copies of |c| (three before, one after) and checks
// that trim() strips them exactly when |c| is whitespace.
function test_trim(c, infix) {
  var str = c + c + c + infix + c;
  var expected = is_whitespace(c) ? infix : str;
  assertEquals(expected, str.trim());
}
// Checks that parseInt() skips two leading copies of |c| only when |c| is
// whitespace; any other non-digit prefix must produce NaN.
function test_parseInt(c, postfix) {
  // A digit prefix would become part of the number, so it is skipped.
  if (c >= "0" && c <= "9") return;

  var str = c + c + "123" + postfix;
  var expected = is_whitespace(c) ? 123 : NaN;
  assertEquals(expected, parseInt(str));
}
// For whitespace |c| only: wraps a single-quoted string literal in two copies
// of |c| on each side and checks that eval() still yields |content|.
function test_eval(c, content) {
  if (is_whitespace(c)) {
    var padding = c + c;
    var str = padding + "'" + content + "'" + padding;
    assertEquals(content, eval(str));
  }
}
// Checks that Number() ignores |c| before and after the digits (and the
// trailing |postfix|) only when |c| is whitespace; otherwise the conversion
// must give NaN.
function test_stringtonumber(c, postfix) {
  // A digit prefix would become part of the number, so it is skipped.
  if (c >= "0" && c <= "9") return;

  var result = 1 + Number(c + "123" + c + postfix);
  var expected = is_whitespace(c) ? 124 : NaN;
  assertEquals(expected, result);
}
// Runs every check against each of the 0x10000 UTF-16 code units, once with
// a one-byte and once with a two-byte companion string.
for (var i = 0; i < 0x10000; i++) {
  // Declared with var: a bare assignment would create an implicit global and
  // would throw a ReferenceError if this file were ever run in strict mode.
  var c = String.fromCharCode(i);
  test_regexp(c + onebyte);
  test_regexp(c + twobyte);
  test_trim(c, onebyte + "trim");
  test_trim(c, twobyte + "trim");
  test_parseInt(c, onebyte);
  test_parseInt(c, twobyte);
  test_eval(c, onebyte);
  test_eval(c, twobyte);
  test_stringtonumber(c, onebytespace);
  test_stringtonumber(c, twobytespace);
}
......@@ -89,20 +89,38 @@ PASS whitespace[19].s.trimRight() is ''
PASS whitespace[20].s.trim() is ''
PASS whitespace[20].s.trimLeft() is ''
PASS whitespace[20].s.trimRight() is ''
PASS whitespace[21].s.trim() is ''
PASS whitespace[21].s.trimLeft() is ''
PASS whitespace[21].s.trimRight() is ''
PASS wsString.trim() is ''
PASS wsString.trimLeft() is ''
PASS wsString.trimRight() is ''
PASS trimString.trim() is testString
PASS trimString.trimLeft() is leftTrimString
PASS trimString.trimRight() is rightTrimString
PASS leftTrimString.trim() is testString
FAIL whitespace[21].s.trim() should be . Was ​.
FAIL whitespace[21].s.trimLeft() should be . Was ​.
FAIL whitespace[21].s.trimRight() should be . Was ​.
FAIL wsString.trim() should be . Was ​.
FAIL wsString.trimLeft() should be . Was ​.
FAIL wsString.trimRight() should be . Was
             

​.
FAIL trimString.trim() should be foo bar. Was ​foo bar
             

​.
FAIL trimString.trimLeft() should be foo bar
             

​. Was ​foo bar
             

​.
FAIL trimString.trimRight() should be
             

​foo bar. Was
             

​foo bar
             

​.
FAIL leftTrimString.trim() should be foo bar. Was foo bar
             

​.
PASS leftTrimString.trimLeft() is leftTrimString
PASS leftTrimString.trimRight() is testString
PASS rightTrimString.trim() is testString
PASS rightTrimString.trimLeft() is testString
FAIL leftTrimString.trimRight() should be foo bar. Was foo bar
             

​.
FAIL rightTrimString.trim() should be foo bar. Was ​foo bar.
FAIL rightTrimString.trimLeft() should be foo bar. Was ​foo bar.
PASS rightTrimString.trimRight() is rightTrimString
PASS trim.call(0) is '0'
PASS trimLeft.call(0) is '0'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment