Commit 5be770b5 authored by yangguo, committed by Commit bot

[regexp] extend property classes by unicode blocks.

R=littledan@chromium.org
BUG=v8:4810
LOG=N

Review URL: https://codereview.chromium.org/1780183002

Cr-Commit-Position: refs/heads/master@{#34702}
parent 0d3c78d1
...@@ -843,58 +843,71 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { ...@@ -843,58 +843,71 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return result; return result;
} }
// Resolves |property_name| as a value of the ICU property |property| and, on
// success, appends one CharacterRange per item of the resulting set to
// |result| (allocating in |zone|).
//
// Returns false without touching |result| when the name does not map to a
// value of |property|, or when applying that value reports anything other
// than U_ZERO_ERROR or leaves the set empty. Returns true otherwise.
bool LookupPropertyClass(UProperty property, const char* property_name,
                         ZoneList<CharacterRange>* result, Zone* zone) {
  int32_t value_enum = u_getPropertyValueEnum(property, property_name);
  if (value_enum == UCHAR_INVALID_CODE) return false;

  USet* code_points = uset_openEmpty();
  UErrorCode status = U_ZERO_ERROR;
  uset_applyIntPropertyValue(code_points, property, value_enum, &status);

  // Bail out early on failure; the set still has to be released.
  if (status != U_ZERO_ERROR || uset_isEmpty(code_points)) {
    uset_close(code_points);
    return false;
  }

  // Drop string items so that only code point ranges remain to be copied.
  uset_removeAllStrings(code_points);
  int range_count = uset_getItemCount(code_points);
  // Sum of the uset_getItem return values; checked to be zero below.
  int item_sum = 0;
  for (int index = 0; index < range_count; index++) {
    uc32 first = 0;
    uc32 last = 0;
    item_sum +=
        uset_getItem(code_points, index, &first, &last, nullptr, 0, &status);
    result->Add(CharacterRange::Range(first, last), zone);
  }
  DCHECK_EQ(U_ZERO_ERROR, status);
  DCHECK_EQ(0, item_sum);

  uset_close(code_points);
  return true;
}
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) { bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
#ifdef V8_I18N_SUPPORT #ifdef V8_I18N_SUPPORT
ZoneList<char> property_name(0, zone()); List<char> property_name_list;
if (current() == '{') { if (current() == '{') {
for (Advance(); current() != '}'; Advance()) { for (Advance(); current() != '}'; Advance()) {
if (!has_next()) return false; if (!has_next()) return false;
property_name.Add(static_cast<char>(current()), zone()); property_name_list.Add(static_cast<char>(current()));
} }
} else if (current() != kEndMarker) { } else if (current() != kEndMarker) {
property_name.Add(static_cast<char>(current()), zone()); property_name_list.Add(static_cast<char>(current()));
} else { } else {
return false; return false;
} }
Advance(); Advance();
property_name.Add(0, zone()); // null-terminate string. property_name_list.Add(0); // null-terminate string.
// Property names are defined in unicode database files. For aliases of const char* property_name = property_name_list.ToConstVector().start();
// these property names, see PropertyValueAliases.txt.
UProperty kPropertyClasses[] = { #define PROPERTY_NAME_LOOKUP(PROPERTY) \
// General_Category (gc) found in PropertyValueAliases.txt do { \
UCHAR_GENERAL_CATEGORY_MASK, if (LookupPropertyClass(PROPERTY, property_name, result, zone())) { \
// Script (sc) found in Scripts.txt return true; \
UCHAR_SCRIPT, } \
}; } while (false)
for (int i = 0; i < arraysize(kPropertyClasses); i++) { // General_Category (gc) found in PropertyValueAliases.txt
UProperty property_class = kPropertyClasses[i]; PROPERTY_NAME_LOOKUP(UCHAR_GENERAL_CATEGORY_MASK);
int32_t category = u_getPropertyValueEnum( // Script (sc) found in Scripts.txt
property_class, property_name.ToConstVector().start()); PROPERTY_NAME_LOOKUP(UCHAR_SCRIPT);
if (category == UCHAR_INVALID_CODE) continue; // To disambiguate from script names, block names have an "In"-prefix.
if (property_name_list.length() > 3 && property_name[0] == 'I' &&
USet* set = uset_openEmpty(); property_name[1] == 'n') {
UErrorCode ec = U_ZERO_ERROR; // Block (blk) found in Blocks.txt
uset_applyIntPropertyValue(set, property_class, category, &ec); property_name += 2;
if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) { PROPERTY_NAME_LOOKUP(UCHAR_BLOCK);
uset_removeAllStrings(set);
int item_count = uset_getItemCount(set);
int item_result = 0;
for (int i = 0; i < item_count; i++) {
uc32 start = 0;
uc32 end = 0;
item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
result->Add(CharacterRange::Range(start, end), zone());
}
DCHECK_EQ(U_ZERO_ERROR, ec);
DCHECK_EQ(0, item_result);
}
uset_close(set);
return true;
} }
#undef PROPERTY_NAME_LOOKUP
#endif // V8_I18N_SUPPORT #endif // V8_I18N_SUPPORT
return false; return false;
} }
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property --harmony-unicode-regexps
// Asserts that the regular expression |re| finds a match in the string |s|.
function t(re, s) {
  var matched = re.test(s);
  assertTrue(matched);
}
// Asserts that the regular expression |re| finds no match in the string |s|.
function f(re, s) {
  var matched = re.test(s);
  assertFalse(matched);
}
// Block names are written with an "In" prefix inside \p{...}. The cases below
// spell the "ASCII" and "Basic Latin" names with spaces, with underscores and
// with no separator at all; every spelling is expected to match the input.
t(/\p{InASCII}+/u, ".");
t(/\p{In ASCII}+/u, "supercalifragilisticexpialidocious");
t(/\p{In Basic Latin}+/u, ".");
t(/\p{InBasicLatin}+/u, "supercalifragilisticexpialidocious");
t(/\p{InBasic_Latin}+/u, ".");
t(/\p{InBasic_Latin}+/u, "supercalifragilisticexpialidocious");
t(/\p{InBasicLatin}+/u, ".");
t(/\p{InBasicLatin}+/u, "supercalifragilisticexpialidocious");
// "CJK" block names: expected to match input containing CJK ideographs, and
// expected not to match hiragana or Latin-letter input.
t(/\p{In CJK}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{InCJK}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{InCJKUnifiedIdeographs}+/u, "おはようございます");
f(/\p{In CJK unified ideographs}+/u,
"Something is rotten in the state of Denmark");
// "Latin 1" and "ASCII" block names against Latin-letter, CJK and hiragana
// input.
t(/\p{InLatin_1}+/u, "Wie froh bin ich, daß ich weg bin!");
f(/\p{InASCII}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{In Latin 1}+/u, "いただきます");
// The same name with and without the "In" prefix selects a block or a script
// respectively; U+1B001 is expected to match only the script form.
t(/\p{InHiragana}/u, "いただきます");
t(/\p{Hiragana}/u, "\u{1b001}"); // This refers to the script "Hiragana".
f(/\p{InHiragana}/u, "\u{1b001}"); // This refers to the block "Hiragana".
// Both the "GreekAndCoptic" and the "Greek" spellings are expected to match
// Greek input.
t(/\p{InGreekAndCoptic}/u, "ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ");
t(/\p{InGreek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
// Invalid names are expected to throw: a bare "In" prefix, a lone "I" (with
// and without braces), and "CJK" without the "In" prefix.
assertThrows("/\\p{In}/u");
assertThrows("/\\pI/u");
assertThrows("/\\p{I}/u");
assertThrows("/\\p{CJK}/u");
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// Flags: --harmony-regexp-property // Flags: --harmony-regexp-property --harmony-unicode-regexps
function t(re, s) { assertTrue(re.test(s)); } function t(re, s) { assertTrue(re.test(s)); }
function f(re, s) { assertFalse(re.test(s)); } function f(re, s) { assertFalse(re.test(s)); }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment