Commit 4aeb94a4 authored by jshin's avatar jshin Committed by Commit Bot

Use ICU for ID_START, ID_CONTINUE and WhiteSpace check

Use ICU to check ID_Start, ID_Continue and WhiteSpace even for BMP
when V8_INTL_SUPPORT is on (which is default).

Change LineTerminator::Is() to check 4 code points from
ES#sec-line-terminators instead of using tables and Lookup function.

Remove Lowercase::Is(). It's not used anywhere.

Update webkit/{ToNumber,parseFloat}.js to have the correct expectation
for U+180E and the corresponding expected files. This is a follow-up to
an earlier change ( https://codereview.chromium.org/2720953003 ).

CQ_INCLUDE_TRYBOTS=master.tryserver.v8:v8_win_dbg,v8_mac_dbg;master.tryserver.chromium.android:android_arm64_dbg_recipe
CQ_INCLUDE_TRYBOTS=master.tryserver.v8:v8_linux_noi18n_rel_ng

BUG=v8:5370,v8:5155
TEST=unittests --gtest_filter=CharP*
TEST=webkit: ToNumber, parseFloat
TEST=test262: built-ins/Number/S9.3*, built-ins/parse{Int,Float}/S15*
TEST=test262: language/white-space/mong*
TEST=test262: built-ins/String/prototype/trim/u180e
TEST=mjsunit: whitespaces

Review-Url: https://codereview.chromium.org/2331303002
Cr-Commit-Position: refs/heads/master@{#45957}
parent 8e646bd0
......@@ -2469,6 +2469,7 @@ v8_source_set("v8_base") {
} else {
sources -= [
"src/builtins/builtins-intl.cc",
"src/char-predicates.cc",
"src/intl.cc",
"src/intl.h",
"src/objects/intl-objects.cc",
......
......@@ -2,41 +2,43 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_INTL_SUPPORT
#error Internationalization is expected to be enabled.
#endif // V8_INTL_SUPPORT
#include "src/char-predicates.h"
#ifdef V8_INTL_SUPPORT
#include "unicode/uchar.h"
#include "unicode/urename.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
bool SupplementaryPlanes::IsIDStart(uc32 c) {
DCHECK(c > 0xFFFF);
#ifdef V8_INTL_SUPPORT
// This only works for code points in the SMPs, since ICU does not exclude
// code points with properties 'Pattern_Syntax' or 'Pattern_White_Space'.
// Code points in the SMP do not have those properties.
return u_isIDStart(c);
#else
// This is incorrect, but if we don't have ICU, use this as fallback.
return false;
#endif // V8_INTL_SUPPORT
// ES#sec-names-and-keywords Names and Keywords
// Accepts UnicodeIDStart as well as the three ASCII extras '$', '_' and '\'.
bool IdentifierStart::Is(uc32 c) {
  // The ASCII extras are accepted regardless of their Unicode properties.
  const bool is_ascii_extra = (c == '$') || (c == '_') || (c == '\\');
  if (is_ascii_extra) return true;

  // Query the ID_Start binary property directly: u_isIDStart cannot be used
  // because it does not work for Other_ID_Start characters.
  return u_hasBinaryProperty(c, UCHAR_ID_START);
}
// ES#sec-names-and-keywords Names and Keywords
// Accepts UnicodeIDContinue as well as '$', '_', '\', ZWNJ (U+200C) and
// ZWJ (U+200D).
bool IdentifierPart::Is(uc32 c) {
  switch (c) {
    case '$':
    case '_':
    case '\\':
    case 0x200C:  // ZWNJ
    case 0x200D:  // ZWJ
      return true;
    default:
      // Query the ID_Continue binary property directly: u_isIDPart cannot be
      // used because it does not work for Other_ID_Continue characters.
      return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE);
  }
}
bool SupplementaryPlanes::IsIDPart(uc32 c) {
DCHECK(c > 0xFFFF);
#ifdef V8_INTL_SUPPORT
// This only works for code points in the SMPs, since ICU does not exclude
// code points with properties 'Pattern_Syntax' or 'Pattern_White_Space'.
// Code points in the SMP do not have those properties.
return u_isIDPart(c);
#else
// This is incorrect, but if we don't have ICU, use this as fallback.
return false;
#endif // V8_INTL_SUPPORT
// ES#sec-white-space White Space
// Accepts general category Zs plus U+0009, U+000B, U+000C and U+FEFF.
bool WhiteSpace::Is(uc32 c) {
  switch (c) {
    case 0x09:
    case 0x0B:
    case 0x0C:
    case 0xFEFF:
      // Explicitly listed code points that are not space separators.
      return true;
    default:
      // Everything else is white space only if ICU classifies it as Zs.
      return u_charType(c) == U_SPACE_SEPARATOR;
  }
}
} // namespace internal
} // namespace v8
......@@ -26,53 +26,58 @@ inline bool IsBinaryDigit(uc32 c);
inline bool IsRegExpWord(uc32 c);
inline bool IsRegExpNewline(uc32 c);
// Identifier predicates for code points above 0xFFFF; the definitions DCHECK
// that c > 0xFFFF. With V8_INTL_SUPPORT they forward to ICU's u_isIDStart /
// u_isIDPart, otherwise they fall back to returning false.
struct V8_EXPORT_PRIVATE SupplementaryPlanes {
// Whether c (which must be > 0xFFFF) may start an identifier.
static bool IsIDStart(uc32 c);
// Whether c (which must be > 0xFFFF) may continue an identifier.
static bool IsIDPart(uc32 c);
};
// ES#sec-names-and-keywords (ES6 draft section 11.6)
// This includes '_', '$' and '\', and ID_Start according to
// http://www.unicode.org/reports/tr31/, which consists of categories
// 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
// 'Pattern_Syntax' or 'Pattern_White_Space'.
//
// With V8_INTL_SUPPORT, Is() is only declared here and defined out of line
// on top of ICU. Without it, Is() is an inline lookup in the unibrow tables.
#ifdef V8_INTL_SUPPORT
struct V8_EXPORT_PRIVATE IdentifierStart {
  static bool Is(uc32 c);
#else
struct IdentifierStart {
  // Non-BMP characters are not supported without I18N: anything above 0xFFFF
  // is rejected. This must not call into SupplementaryPlanes, because
  // char-predicates.cc is not built when I18N support is disabled.
  static inline bool Is(uc32 c) {
    return (c <= 0xFFFF) ? unibrow::ID_Start::Is(c) : false;
  }
#endif
};
// ES#sec-names-and-keywords (ES6 draft section 11.6)
// This includes \u200c and \u200d, and ID_Continue according to
// http://www.unicode.org/reports/tr31/, which consists of ID_Start,
// the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
// 'Pattern_Syntax' or 'Pattern_White_Space'.
//
// With V8_INTL_SUPPORT, Is() is only declared here and defined out of line
// on top of ICU. Without it, Is() is an inline lookup in the unibrow tables.
#ifdef V8_INTL_SUPPORT
struct V8_EXPORT_PRIVATE IdentifierPart {
  static bool Is(uc32 c);
#else
struct IdentifierPart {
  // Non-BMP characters are not supported without I18N: anything above 0xFFFF
  // is rejected. This must not call into SupplementaryPlanes, because
  // char-predicates.cc is not built when I18N support is disabled.
  static inline bool Is(uc32 c) {
    if (c <= 0xFFFF) {
      return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
    }
    return false;
  }
#endif
};
// ES#sec-white-space (ES6 draft section 11.2)
// This includes all code points of Unicode category 'Zs'.
// Further included are \u0009, \u000b, \u000c, and \ufeff.
// \u180e stopped being a white space as of Unicode 6.3.0; the
// CharPredicatesTest expectations treat it as non-white-space.
// There are no category 'Zs' code points in the SMPs.
//
// With V8_INTL_SUPPORT, Is() is only declared here and defined out of line;
// otherwise it forwards to unibrow::WhiteSpace.
#ifdef V8_INTL_SUPPORT
struct V8_EXPORT_PRIVATE WhiteSpace {
static bool Is(uc32 c);
#else
struct WhiteSpace {
static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
#endif
};
// WhiteSpace and LineTerminator according to ES6 draft section 11.2 and 11.3
// This consists of \u000a, \u000d, \u2028, and \u2029.
// This includes all the characters with Unicode category 'Z' (= Zs+Zl+Zp)
// as well as \u0009 - \u000d and \ufeff.
struct WhiteSpaceOrLineTerminator {
static inline bool Is(uc32 c) {
return WhiteSpace::Is(c) || unibrow::LineTerminator::Is(c);
......
This diff is collapsed.
......@@ -180,12 +180,10 @@ class Utf8 {
struct Uppercase {
static bool Is(uchar c);
};
struct Lowercase {
static bool Is(uchar c);
};
struct Letter {
static bool Is(uchar c);
};
#ifndef V8_INTL_SUPPORT
struct V8_EXPORT_PRIVATE ID_Start {
static bool Is(uchar c);
};
......@@ -195,6 +193,7 @@ struct V8_EXPORT_PRIVATE ID_Continue {
struct V8_EXPORT_PRIVATE WhiteSpace {
static bool Is(uchar c);
};
#endif // !V8_INTL_SUPPORT
struct V8_EXPORT_PRIVATE LineTerminator {
static bool Is(uchar c);
};
......
......@@ -1960,6 +1960,7 @@
}, { # v8_enable_i18n_support==0
'sources!': [
'builtins/builtins-intl.cc',
'char-predicates.cc',
'intl.cc',
'intl.h',
'objects/intl-objects.cc',
......
......@@ -10,32 +10,40 @@ namespace v8 {
namespace internal {
// Checks WhiteSpace::Is against the explicitly listed code points (U+0009,
// U+000B, U+000C, U+FEFF) and a sample of space separators.
TEST(CharPredicatesTest, WhiteSpace) {
EXPECT_TRUE(WhiteSpace::Is(0x0009));
EXPECT_TRUE(WhiteSpace::Is(0x000B));
EXPECT_TRUE(WhiteSpace::Is(0x000C));
EXPECT_TRUE(WhiteSpace::Is(' '));
EXPECT_TRUE(WhiteSpace::Is(0x00A0));
EXPECT_TRUE(WhiteSpace::Is(0x1680));
EXPECT_TRUE(WhiteSpace::Is(0x2000));
EXPECT_TRUE(WhiteSpace::Is(0x2007));
EXPECT_TRUE(WhiteSpace::Is(0x202F));
EXPECT_TRUE(WhiteSpace::Is(0x205F));
EXPECT_TRUE(WhiteSpace::Is(0x3000));
EXPECT_TRUE(WhiteSpace::Is(0xFEFF));
// As of Unicode 6.3.0, \u180E is no longer a white space, so it must be
// rejected.
EXPECT_FALSE(WhiteSpace::Is(0x180E));
}
// Checks WhiteSpaceOrLineTerminator::Is against a sample of white-space code
// points and the four line terminators.
TEST(CharPredicatesTest, WhiteSpaceOrLineTerminator) {
// White spaces
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x0009));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000B));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000C));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(' '));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x00A0));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x1680));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2000));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2007));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x202F));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x205F));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0xFEFF));
// Line terminators
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000A));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x000D));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2028));
EXPECT_TRUE(WhiteSpaceOrLineTerminator::Is(0x2029));
// As of Unicode 6.3.0, \u180E is no longer a white space, so it must be
// rejected.
EXPECT_FALSE(WhiteSpaceOrLineTerminator::Is(0x180E));
}
......@@ -45,7 +53,11 @@ TEST(CharPredicatesTest, IdentifierStart) {
EXPECT_TRUE(IdentifierStart::Is('\\'));
// http://www.unicode.org/reports/tr31/
// curl http://www.unicode.org/Public/UCD/latest/ucd/PropList.txt |
// grep 'Other_ID_Start'
// Other_ID_Start
EXPECT_TRUE(IdentifierStart::Is(0x1885));
EXPECT_TRUE(IdentifierStart::Is(0x1886));
EXPECT_TRUE(IdentifierStart::Is(0x2118));
EXPECT_TRUE(IdentifierStart::Is(0x212E));
EXPECT_TRUE(IdentifierStart::Is(0x309B));
......@@ -53,7 +65,27 @@ TEST(CharPredicatesTest, IdentifierStart) {
// Issue 2892:
// \u2E2F has the Pattern_Syntax property, excluding it from ID_Start.
EXPECT_FALSE(unibrow::ID_Start::Is(0x2E2F));
EXPECT_FALSE(IdentifierStart::Is(0x2E2F));
#ifdef V8_INTL_SUPPORT
// New in Unicode 8.0 (6,847 code points)
// [:ID_Start:] & [[:Age=8.0:] - [:Age=7.0:]]
EXPECT_TRUE(IdentifierStart::Is(0x08B3));
EXPECT_TRUE(IdentifierStart::Is(0x0AF9));
EXPECT_TRUE(IdentifierStart::Is(0x13F8));
EXPECT_TRUE(IdentifierStart::Is(0x9FCD));
EXPECT_TRUE(IdentifierStart::Is(0xAB60));
EXPECT_TRUE(IdentifierStart::Is(0x10CC0));
EXPECT_TRUE(IdentifierStart::Is(0x108E0));
EXPECT_TRUE(IdentifierStart::Is(0x2B820));
// New in Unicode 9.0 (7,177 code points)
// [:ID_Start:] & [[:Age=9.0:] - [:Age=8.0:]]
EXPECT_TRUE(IdentifierStart::Is(0x1C80));
EXPECT_TRUE(IdentifierStart::Is(0x104DB));
EXPECT_TRUE(IdentifierStart::Is(0x1E922));
#endif
}
......@@ -64,8 +96,45 @@ TEST(CharPredicatesTest, IdentifierPart) {
EXPECT_TRUE(IdentifierPart::Is(0x200C));
EXPECT_TRUE(IdentifierPart::Is(0x200D));
#ifdef V8_INTL_SUPPORT
// New in Unicode 8.0 (6,847 code points)
// [:ID_Start:] & [[:Age=8.0:] - [:Age=7.0:]]
EXPECT_TRUE(IdentifierPart::Is(0x08B3));
EXPECT_TRUE(IdentifierPart::Is(0x0AF9));
EXPECT_TRUE(IdentifierPart::Is(0x13F8));
EXPECT_TRUE(IdentifierPart::Is(0x9FCD));
EXPECT_TRUE(IdentifierPart::Is(0xAB60));
EXPECT_TRUE(IdentifierPart::Is(0x10CC0));
EXPECT_TRUE(IdentifierPart::Is(0x108E0));
EXPECT_TRUE(IdentifierPart::Is(0x2B820));
// [[:ID_Continue:]-[:ID_Start:]] & [[:Age=8.0:]-[:Age=7.0:]]
// 162 code points
EXPECT_TRUE(IdentifierPart::Is(0x08E3));
EXPECT_TRUE(IdentifierPart::Is(0xA69E));
EXPECT_TRUE(IdentifierPart::Is(0x11730));
// New in Unicode 9.0 (7,177 code points)
// [:ID_Start:] & [[:Age=9.0:] - [:Age=8.0:]]
EXPECT_TRUE(IdentifierPart::Is(0x1C80));
EXPECT_TRUE(IdentifierPart::Is(0x104DB));
EXPECT_TRUE(IdentifierPart::Is(0x1E922));
// [[:ID_Continue:]-[:ID_Start:]] & [[:Age=9.0:]-[:Age=8.0:]]
// 162 code points
EXPECT_TRUE(IdentifierPart::Is(0x08D4));
EXPECT_TRUE(IdentifierPart::Is(0x1DFB));
EXPECT_TRUE(IdentifierPart::Is(0xA8C5));
EXPECT_TRUE(IdentifierPart::Is(0x11450));
#endif
// http://www.unicode.org/reports/tr31/
// curl http://www.unicode.org/Public/UCD/latest/ucd/PropList.txt |
// grep 'Other_ID_(Continue|Start)'
// Other_ID_Start
EXPECT_TRUE(IdentifierPart::Is(0x1885));
EXPECT_TRUE(IdentifierPart::Is(0x1886));
EXPECT_TRUE(IdentifierPart::Is(0x2118));
EXPECT_TRUE(IdentifierPart::Is(0x212E));
EXPECT_TRUE(IdentifierPart::Is(0x309B));
......@@ -98,6 +167,11 @@ TEST(CharPredicatesTest, SupplementaryPlaneIdentifiers) {
EXPECT_TRUE(IdentifierStart::Is(0x1014D)); // Category Nl
EXPECT_TRUE(IdentifierPart::Is(0x1014D));
// New in Unicode 8.0
// [ [:ID_Start=Yes:] & [:Age=8.0:]] - [:Age=7.0:]
EXPECT_TRUE(IdentifierStart::Is(0x108E0));
EXPECT_TRUE(IdentifierStart::Is(0x10C80));
// Only ID_Continue.
EXPECT_FALSE(IdentifierStart::Is(0x101FD)); // Category Mn
EXPECT_TRUE(IdentifierPart::Is(0x101FD));
......
......@@ -77,7 +77,7 @@ PASS +lf is 0
PASS +ls is 0
PASS +ps is 0
PASS +oghamSpaceMark is 0
FAIL +mongolianVowelSeparator should be 0. Was NaN.
PASS +mongolianVowelSeparator is NaN
PASS +enQuad is 0
PASS +emQuad is 0
PASS +enSpace is 0
......@@ -101,7 +101,7 @@ PASS +(lf + '1') is 1
PASS +(ls + '1') is 1
PASS +(ps + '1') is 1
PASS +(oghamSpaceMark + '1') is 1
FAIL +(mongolianVowelSeparator + '1') should be 1. Was NaN.
PASS +(mongolianVowelSeparator + '1') is NaN
PASS +(enQuad + '1') is 1
PASS +(emQuad + '1') is 1
PASS +(enSpace + '1') is 1
......@@ -125,7 +125,7 @@ PASS +('1' + lf) is 1
PASS +('1' + ls) is 1
PASS +('1' + ps) is 1
PASS +('1' + oghamSpaceMark) is 1
FAIL +('1' + mongolianVowelSeparator) should be 1. Was NaN.
PASS +('1' + mongolianVowelSeparator) is NaN
PASS +('1' + enQuad) is 1
PASS +('1' + emQuad) is 1
PASS +('1' + enSpace) is 1
......
......@@ -105,7 +105,7 @@ shouldBe("+lf", "0");
shouldBe("+ls", "0");
shouldBe("+ps", "0");
shouldBe("+oghamSpaceMark", "0");
shouldBe("+mongolianVowelSeparator", "0");
shouldBe("+mongolianVowelSeparator", "NaN");
shouldBe("+enQuad", "0");
shouldBe("+emQuad", "0");
shouldBe("+enSpace", "0");
......@@ -129,7 +129,7 @@ shouldBe("+(lf + '1')", "1");
shouldBe("+(ls + '1')", "1");
shouldBe("+(ps + '1')", "1");
shouldBe("+(oghamSpaceMark + '1')", "1");
shouldBe("+(mongolianVowelSeparator + '1')", "1");
shouldBe("+(mongolianVowelSeparator + '1')", "NaN");
shouldBe("+(enQuad + '1')", "1");
shouldBe("+(emQuad + '1')", "1");
shouldBe("+(enSpace + '1')", "1");
......@@ -153,7 +153,7 @@ shouldBe("+('1' + lf)", "1");
shouldBe("+('1' + ls)", "1");
shouldBe("+('1' + ps)", "1");
shouldBe("+('1' + oghamSpaceMark)", "1");
shouldBe("+('1' + mongolianVowelSeparator)", "1");
shouldBe("+('1' + mongolianVowelSeparator)", "NaN");
shouldBe("+('1' + enQuad)", "1");
shouldBe("+('1' + emQuad)", "1");
shouldBe("+('1' + enSpace)", "1");
......
......@@ -55,7 +55,7 @@ PASS parseFloat(lf + '1') is 1
PASS parseFloat(ls + '1') is 1
PASS parseFloat(ps + '1') is 1
PASS parseFloat(oghamSpaceMark + '1') is 1
FAIL parseFloat(mongolianVowelSeparator + '1') should be 1. Was NaN.
PASS parseFloat(mongolianVowelSeparator + '1') is NaN
PASS parseFloat(enQuad + '1') is 1
PASS parseFloat(emQuad + '1') is 1
PASS parseFloat(enSpace + '1') is 1
......
......@@ -81,7 +81,7 @@ shouldBe("parseFloat(lf + '1')", "1");
shouldBe("parseFloat(ls + '1')", "1");
shouldBe("parseFloat(ps + '1')", "1");
shouldBe("parseFloat(oghamSpaceMark + '1')", "1");
shouldBe("parseFloat(mongolianVowelSeparator + '1')", "1");
shouldBe("parseFloat(mongolianVowelSeparator + '1')", "NaN");
shouldBe("parseFloat(enQuad + '1')", "1");
shouldBe("parseFloat(emQuad + '1')", "1");
shouldBe("parseFloat(enSpace + '1')", "1");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment