Commit 13d18c00 authored by yangguo, committed by Commit bot

[regexp] extend property classes by script category.

R=littledan@chromium.org
BUG=v8:4743
LOG=N

Committed: https://crrev.com/22f6735ccbe2e341d341e61b9c38ce308b8da655
Cr-Commit-Position: refs/heads/master@{#34553}

Review URL: https://codereview.chromium.org/1774513002

Cr-Commit-Position: refs/heads/master@{#34562}
parent a8dc2c47
...@@ -43,6 +43,10 @@ inline bool IsAlphaNumeric(uc32 c) { ...@@ -43,6 +43,10 @@ inline bool IsAlphaNumeric(uc32 c) {
return IsInRange(AsciiAlphaToLower(c), 'a', 'z') || IsDecimalDigit(c); return IsInRange(AsciiAlphaToLower(c), 'a', 'z') || IsDecimalDigit(c);
} }
// Reports whether |c|, once passed through AsciiAlphaToLower, falls in the
// 'a'..'z' range as judged by IsInRange.
inline bool IsAlpha(uc32 c) {
  const auto lowered = AsciiAlphaToLower(c);
  return IsInRange(lowered, 'a', 'z');
}
inline bool IsDecimalDigit(uc32 c) { inline bool IsDecimalDigit(uc32 c) {
// ECMA-262, 3rd, 7.8.3 (p 16) // ECMA-262, 3rd, 7.8.3 (p 16)
return IsInRange(c, '0', '9'); return IsInRange(c, '0', '9');
......
...@@ -18,6 +18,7 @@ inline bool IsCarriageReturn(uc32 c); ...@@ -18,6 +18,7 @@ inline bool IsCarriageReturn(uc32 c);
inline bool IsLineFeed(uc32 c); inline bool IsLineFeed(uc32 c);
inline bool IsAsciiIdentifier(uc32 c); inline bool IsAsciiIdentifier(uc32 c);
inline bool IsAlphaNumeric(uc32 c); inline bool IsAlphaNumeric(uc32 c);
inline bool IsAlpha(uc32 c);
inline bool IsDecimalDigit(uc32 c); inline bool IsDecimalDigit(uc32 c);
inline bool IsHexDigit(uc32 c); inline bool IsHexDigit(uc32 c);
inline bool IsOctalDigit(uc32 c); inline bool IsOctalDigit(uc32 c);
......
...@@ -838,52 +838,59 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { ...@@ -838,52 +838,59 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
#ifdef V8_I18N_SUPPORT #ifdef V8_I18N_SUPPORT
char property_name[3]; ZoneList<char> property_name(0, zone());
memset(property_name, 0, sizeof(property_name));
if (current() == '{') { if (current() == '{') {
Advance(); for (Advance(); IsAlpha(current()); Advance()) {
if (current() < 'A' || current() > 'Z') return nullptr; property_name.Add(static_cast<char>(current()), zone());
property_name[0] = static_cast<char>(current());
Advance();
if (current() >= 'a' && current() <= 'z') {
property_name[1] = static_cast<char>(current());
Advance();
} }
if (current() != '}') return nullptr; if (current() != '}') return nullptr;
} else if (current() >= 'A' && current() <= 'Z') { } else if (IsAlpha(current())) {
property_name[0] = static_cast<char>(current()); property_name.Add(static_cast<char>(current()), zone());
} else { } else {
return nullptr; return nullptr;
} }
Advance(); Advance();
property_name.Add(0, zone()); // null-terminate string.
int32_t category =
u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name); // Property names are defined in unicode database files. For aliases of
if (category == UCHAR_INVALID_CODE) return nullptr; // these property names, see PropertyValueAliases.txt.
UProperty kPropertyClasses[] = {
USet* set = uset_openEmpty(); // General_Category (gc) found in PropertyValueAliases.txt
UErrorCode ec = U_ZERO_ERROR; UCHAR_GENERAL_CATEGORY_MASK,
uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec); // Script (sc) found in Scripts.txt
ZoneList<CharacterRange>* ranges = nullptr; UCHAR_SCRIPT,
if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) { };
uset_removeAllStrings(set);
int item_count = uset_getItemCount(set); for (int i = 0; i < arraysize(kPropertyClasses); i++) {
ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone()); UProperty property_class = kPropertyClasses[i];
int item_result = 0; int32_t category = u_getPropertyValueEnum(
for (int i = 0; i < item_count; i++) { property_class, property_name.ToConstVector().start());
uc32 start = 0; if (category == UCHAR_INVALID_CODE) continue;
uc32 end = 0;
item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); USet* set = uset_openEmpty();
ranges->Add(CharacterRange::Range(start, end), zone()); UErrorCode ec = U_ZERO_ERROR;
uset_applyIntPropertyValue(set, property_class, category, &ec);
ZoneList<CharacterRange>* ranges = nullptr;
if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
uset_removeAllStrings(set);
int item_count = uset_getItemCount(set);
ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
int item_result = 0;
for (int i = 0; i < item_count; i++) {
uc32 start = 0;
uc32 end = 0;
item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
ranges->Add(CharacterRange::Range(start, end), zone());
}
DCHECK_EQ(U_ZERO_ERROR, ec);
DCHECK_EQ(0, item_result);
} }
DCHECK_EQ(U_ZERO_ERROR, ec); uset_close(set);
DCHECK_EQ(0, item_result); return ranges;
} }
uset_close(set);
return ranges;
#else // V8_I18N_SUPPORT
return nullptr;
#endif // V8_I18N_SUPPORT #endif // V8_I18N_SUPPORT
return nullptr;
} }
bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property
// Asserts that running |re| against the string |s| with RegExp test() yields
// a match.
function t(re, s) {
  var matched = re.test(s);
  assertTrue(matched);
}
// Asserts that running |re| against the string |s| with RegExp test() yields
// no match.
function f(re, s) {
  var matched = re.test(s);
  assertFalse(matched);
}
t(/\p{Common}+/u, ".");
f(/\p{Common}+/u, "supercalifragilisticexpialidocious");
t(/\p{Han}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{Hani}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{Han}+/u, "おはようございます");
f(/\p{Hani}+/u, "Something is rotten in the state of Denmark");
t(/\p{Latin}+/u, "Wie froh bin ich, daß ich weg bin!");
t(/\p{Latn}+/u,
"It was a bright day in April, and the clocks were striking thirteen");
f(/\p{Latin}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{Latn}+/u, "いただきます");
t(/\p{Hiragana}/u, "いただきます");
t(/\p{Hira}/u, "ありがとうございました");
f(/\p{Hiragana}/u,
"Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte");
f(/\p{Hira}/u, "Call me Ishmael");
t(/\p{Phoenician}/u, "\u{10900}\u{1091a}");
t(/\p{Phnx}/u, "\u{1091f}\u{10916}");
f(/\p{Phoenician}/u, "Arthur est un perroquet");
f(/\p{Phnx}/u, "设心狠毒非良士,操卓原来一路人");
t(/\p{Grek}/u, "ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ");
t(/\p{Greek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
f(/\p{Greek}/u, "高贤未服英雄志,屈节偏生杰士疑");
f(/\p{Greek}/u,
"Mr. Jones, of the Manor Farm, had locked the hen-houses for the night");
...@@ -324,7 +324,8 @@ ...@@ -324,7 +324,8 @@
'harmony/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]], 'harmony/unicode-regexp-ignore-case': [PASS, ['no_i18n == True', FAIL]],
'harmony/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]], 'harmony/unicode-regexp-ignore-case-noi18n': [FAIL, ['no_i18n == True', PASS]],
# desugaring regexp property class relies on ICU. # desugaring regexp property class relies on ICU.
'harmony/unicode-regexp-property-class': [PASS, ['no_i18n == True', FAIL]], 'harmony/regexp-property-general-category': [PASS, ['no_i18n == True', FAIL]],
'harmony/regexp-property-script-category': [PASS, ['no_i18n == True', FAIL]],
}], # ALWAYS }], # ALWAYS
['novfp3 == True', { ['novfp3 == True', {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment