Commit a8e88eaa authored by yangguo, committed by Commit bot

[regexp] implement \p{Any}, \p{Ascii}, and \p{Assigned}.

R=littledan@chromium.org, mathias@qiwi.be
BUG=v8:4743

Committed: https://crrev.com/92bfd13457c80f02be01551f4ea9a5badfe0e4c4
Review-Url: https://codereview.chromium.org/2059113002
Cr-Original-Commit-Position: refs/heads/master@{#36969}
Cr-Commit-Position: refs/heads/master@{#36974}
parent fd7080cb
......@@ -362,11 +362,11 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (FLAG_harmony_regexp_property) {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
if (!ParsePropertyClass(ranges)) {
if (!ParsePropertyClass(ranges, p == 'P')) {
return ReportError(CStrVector("Invalid property name"));
}
RegExpCharacterClass* cc =
new (zone()) RegExpCharacterClass(ranges, p == 'P');
new (zone()) RegExpCharacterClass(ranges, false);
builder->AddCharacterClass(cc);
} else {
// With /u, no identity escapes except for syntax characters
......@@ -845,6 +845,9 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
}
#ifdef V8_I18N_SUPPORT
namespace {
bool IsExactPropertyAlias(const char* property_name, UProperty property) {
const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
......@@ -875,7 +878,7 @@ bool IsExactPropertyValueAlias(const char* property_value_name,
}
bool LookupPropertyValueName(UProperty property,
const char* property_value_name,
const char* property_value_name, bool negate,
ZoneList<CharacterRange>* result, Zone* zone) {
int32_t property_value =
u_getPropertyValueEnum(property, property_value_name);
......@@ -895,6 +898,7 @@ bool LookupPropertyValueName(UProperty property,
if (success) {
uset_removeAllStrings(set);
if (negate) uset_complement(set);
int item_count = uset_getItemCount(set);
int item_result = 0;
for (int i = 0; i < item_count; i++) {
......@@ -910,7 +914,33 @@ bool LookupPropertyValueName(UProperty property,
return success;
}
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
// Returns true iff the NUL-terminated string |name| is exactly equal to the
// string literal |literal|.
//
// N is the size of the literal's array and therefore already counts the
// terminating NUL. Comparing N characters checks every character of the
// literal plus its terminator, so both longer and shorter names are rejected,
// while the bound handed to strncmp never exceeds the size of |literal|.
template <size_t N>
inline bool NameEquals(const char* name, const char (&literal)[N]) {
  return strncmp(name, literal, N) == 0;
}
// Resolves the special property names "Any", "ASCII" and "Assigned" and
// appends the matching code point ranges to |result|.
//
// - "Any":      adds CharacterRange::Everything(); when |negate| is set the
//               complement is empty, so nothing is added.
// - "ASCII":    adds [0x0, 0x7f], or [0x80, String::kMaxCodePoint] when
//               |negate| is set.
// - "Assigned": delegates to LookupPropertyValueName for the general category
//               value "Unassigned" with the negation flag inverted, and
//               returns whatever that lookup returns.
//
// Returns false, leaving |result| untouched, when |name| is none of the above.
bool LookupSpecialPropertyValueName(const char* name,
                                    ZoneList<CharacterRange>* result,
                                    bool negate, Zone* zone) {
  if (NameEquals(name, "Any")) {
    if (negate) return true;
    result->Add(CharacterRange::Everything(), zone);
    return true;
  }

  if (NameEquals(name, "ASCII")) {
    if (negate) {
      result->Add(CharacterRange::Range(0x80, String::kMaxCodePoint), zone);
    } else {
      result->Add(CharacterRange::Range(0x0, 0x7f), zone);
    }
    return true;
  }

  if (NameEquals(name, "Assigned")) {
    // "Assigned" is expressed as the complement of "Unassigned".
    const bool negate_unassigned = !negate;
    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
                                   negate_unassigned, result, zone);
  }

  return false;
}
} // anonymous namespace
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
bool negate) {
// Parse the property class as follows:
// - In \p{name}, 'name' is interpreted
// - either as a general category property value name.
......@@ -943,8 +973,12 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
if (second_part.is_empty()) {
// First attempt to interpret as general category property value name.
const char* name = first_part.ToConstVector().start();
if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result,
zone())) {
if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
result, zone())) {
return true;
}
// Interpret "Any", "ASCII", and "Assigned".
if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
return true;
}
// Then attempt to interpret as binary property name with value name 'Y'.
......@@ -952,7 +986,8 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
if (property < UCHAR_BINARY_START) return false;
if (property >= UCHAR_BINARY_LIMIT) return false;
if (!IsExactPropertyAlias(name, property)) return false;
return LookupPropertyValueName(property, "Y", result, zone());
return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
zone());
} else {
// Both property name and value name are specified. Attempt to interpret
// the property name as enumerated property.
......@@ -962,13 +997,15 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
if (property < UCHAR_INT_START) return false;
if (property >= UCHAR_INT_LIMIT) return false;
if (!IsExactPropertyAlias(property_name, property)) return false;
return LookupPropertyValueName(property, value_name, result, zone());
return LookupPropertyValueName(property, value_name, negate, result,
zone());
}
}
#else // V8_I18N_SUPPORT
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
bool negate) {
return false;
}
......@@ -1159,19 +1196,10 @@ bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
bool parse_success = false;
if (next == 'p') {
Advance(2);
parse_success = ParsePropertyClass(ranges);
parse_success = ParsePropertyClass(ranges, false);
} else if (next == 'P') {
Advance(2);
ZoneList<CharacterRange>* property_class =
new (zone()) ZoneList<CharacterRange>(2, zone());
parse_success = ParsePropertyClass(property_class);
if (parse_success) {
ZoneList<CharacterRange>* negated =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::Negate(property_class, negated, zone());
const Vector<CharacterRange> negated_vector = negated->ToVector();
ranges->AddAll(negated_vector, zone());
}
parse_success = ParsePropertyClass(ranges, true);
} else {
return false;
}
......
......@@ -174,7 +174,7 @@ class RegExpParser BASE_EMBEDDED {
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
bool ParsePropertyClass(ZoneList<CharacterRange>* result);
bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
uc32 ParseOctalLiteral();
......
......@@ -30,10 +30,10 @@ assertTrue(/\p{Ll}/iu.test("a"));
assertTrue(/\p{Ll}/iu.test("\u{118D4}"));
assertTrue(/\p{Ll}/iu.test("A"));
assertTrue(/\p{Ll}/iu.test("\u{118B4}"));
assertFalse(/\P{Ll}/iu.test("a"));
assertFalse(/\P{Ll}/iu.test("\u{118D4}"));
assertFalse(/\P{Ll}/iu.test("A"));
assertFalse(/\P{Ll}/iu.test("\u{118B4}"));
assertTrue(/\P{Ll}/iu.test("a"));
assertTrue(/\P{Ll}/iu.test("\u{118D4}"));
assertTrue(/\P{Ll}/iu.test("A"));
assertTrue(/\P{Ll}/iu.test("\u{118B4}"));
assertTrue(/\p{Lu}/u.test("A"));
assertFalse(/\P{Lu}/u.test("A"));
......@@ -48,10 +48,10 @@ assertTrue(/\p{Lu}/iu.test("a"));
assertTrue(/\p{Lu}/iu.test("\u{118D4}"));
assertTrue(/\p{Lu}/iu.test("A"));
assertTrue(/\p{Lu}/iu.test("\u{118B4}"));
assertFalse(/\P{Lu}/iu.test("a"));
assertFalse(/\P{Lu}/iu.test("\u{118D4}"));
assertFalse(/\P{Lu}/iu.test("A"));
assertFalse(/\P{Lu}/iu.test("\u{118B4}"));
assertTrue(/\P{Lu}/iu.test("a"));
assertTrue(/\P{Lu}/iu.test("\u{118D4}"));
assertTrue(/\P{Lu}/iu.test("A"));
assertTrue(/\P{Lu}/iu.test("\u{118B4}"));
assertTrue(/\p{Sm}/u.test("+"));
assertFalse(/\P{Sm}/u.test("+"));
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property --harmony-unicode-regexps
const regexp = /\P{Lu}/ui;
const regexpu = /[\0-@\[-\xBF\xD7\xDF-\xFF\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175\u0177\u017A\u017C\u017E-\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9-\u01BB\u01BD-\u01C3\u01C5\u01C6\u01C8\u01C9\u01CB\u01CC\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7\u01E9\u01EB\u01ED\u01EF\u01F0\u01F2\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242\u0247\u0249\u024B\u024D\u024F-\u036F\u0371\u0373-\u0375\u0377-\u037E\u0380-\u0385\u0387\u038B\u038D\u0390\u03A2\u03AC-\u03CE\u03D0\u03D1\u03D5-\u03D7\u03D9\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F3\u03F5\u03F6\u03F8\u03FB\u03FC\u0430-\u045F\u0461\u0463\u0465\u0467\u0469\u046B\u046D\u046F\u0471\u0473\u0475\u0477\u0479\u047B\u047D\u047F\u0481-\u0489\u048B\u048D\u048F\u0491\u0493\u0495\u0497\u0499\u049B\u049D\u049F\u04A1\u04A3\u04A5\u04A7\u04A9\u04AB\u04AD\u04AF\u04B1\u04B3\u04B5\u04B7\u04B9\u04BB\u04BD\u04BF\u04C2\u04C4\u04C6\u04C8\u04CA\u04CC\u04CE\u04CF\u04D1\u04D3\u04D5\u04D7\u04D9\u04DB\u04DD\u04DF\u04E1\u04E3\u04E5\u04E7\u04E9\u04EB\u04ED\u04EF\u04F1\u04F3\u04F5\u04F7\u04F9\u04FB\u04FD\u04FF\u0501\u0503\u0505\u0507\u0509\u050B\u050D\u050F\u0511\u0513\u0515\u0517\u0519\u051B\u051D\u051F\u0521\u0523\u0525\u0527\u0529\u052B\u052D\u052F\u0530\u0557-\u109F\u10C6\u10C8-\u10CC\u10CE-\u139F\u13F6-\u1DFF\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B\u1E1D\u1E1F\u
1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF-\u1F07\u1F10-\u1F17\u1F1E-\u1F27\u1F30-\u1F37\u1F40-\u1F47\u1F4E-\u1F58\u1F5A\u1F5C\u1F5E\u1F60-\u1F67\u1F70-\u1FB7\u1FBC-\u1FC7\u1FCC-\u1FD7\u1FDC-\u1FE7\u1FED-\u1FF7\u1FFC-\u2101\u2103-\u2106\u2108-\u210A\u210E\u210F\u2113\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u212F\u2134-\u213D\u2140-\u2144\u2146-\u2182\u2184-\u2BFF\u2C2F-\u2C5F\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7D\u2C81\u2C83\u2C85\u2C87\u2C89\u2C8B\u2C8D\u2C8F\u2C91\u2C93\u2C95\u2C97\u2C99\u2C9B\u2C9D\u2C9F\u2CA1\u2CA3\u2CA5\u2CA7\u2CA9\u2CAB\u2CAD\u2CAF\u2CB1\u2CB3\u2CB5\u2CB7\u2CB9\u2CBB\u2CBD\u2CBF\u2CC1\u2CC3\u2CC5\u2CC7\u2CC9\u2CCB\u2CCD\u2CCF\u2CD1\u2CD3\u2CD5\u2CD7\u2CD9\u2CDB\u2CDD\u2CDF\u2CE1\u2CE3-\u2CEA\u2CEC\u2CEE-\u2CF1\u2CF3-\uA63F\uA641\uA643\uA645\uA647\uA649\uA64B\uA64D\uA64F\uA651\uA653\uA655\uA657\uA659\uA65B\uA65D\uA65F\uA661\uA663\uA665\uA667\uA669\uA66B\uA66D-\uA67F\uA681\uA683\uA685\uA687\uA689\uA68B\uA68D\uA68F\uA691\uA693\uA695\uA697\uA699\uA69B-\uA721\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F-\uA778\uA77A\uA77C\uA77F\uA781\uA783\uA785\uA787-\uA78A\uA78C\uA78E\uA78F\uA791\uA793-\uA795\uA797\uA799\uA79B\uA79D\
uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AE\uA7AF\uA7B5\uA7B7-\uFF20\uFF3B-\u{103FF}\u{10428}-\u{10C7F}\u{10CB3}-\u{1189F}\u{118C0}-\u{1D3FF}\u{1D41A}-\u{1D433}\u{1D44E}-\u{1D467}\u{1D482}-\u{1D49B}\u{1D49D}\u{1D4A0}\u{1D4A1}\u{1D4A3}\u{1D4A4}\u{1D4A7}\u{1D4A8}\u{1D4AD}\u{1D4B6}-\u{1D4CF}\u{1D4EA}-\u{1D503}\u{1D506}\u{1D50B}\u{1D50C}\u{1D515}\u{1D51D}-\u{1D537}\u{1D53A}\u{1D53F}\u{1D545}\u{1D547}-\u{1D549}\u{1D551}-\u{1D56B}\u{1D586}-\u{1D59F}\u{1D5BA}-\u{1D5D3}\u{1D5EE}-\u{1D607}\u{1D622}-\u{1D63B}\u{1D656}-\u{1D66F}\u{1D68A}-\u{1D6A7}\u{1D6C1}-\u{1D6E1}\u{1D6FB}-\u{1D71B}\u{1D735}-\u{1D755}\u{1D76F}-\u{1D78F}\u{1D7A9}-\u{1D7C9}\u{1D7CB}-\u{10FFFF}]/ui;
// Walk every code point from U+0000 through U+10FFFF and check that the
// property-escape regexp and the spelled-out character class agree on it.
for (let cp = 0x0; cp <= 0x10FFFF; cp += 1) {
  const ch = String.fromCodePoint(cp);
  const viaProperty = regexp.test(ch);
  const viaExplicitClass = regexpu.test(ch);
  assertEquals(viaProperty, viaExplicitClass);
}
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property --harmony-unicode-regexps
// Asserts that the regular expression `re` matches the string `s`.
function t(re, s) {
  const matched = re.test(s);
  assertTrue(matched);
}
// Asserts that the regular expression `re` does not match the string `s`.
function f(re, s) {
  const matched = re.test(s);
  assertFalse(matched);
}
// \p{ASCII} and \P{ASCII} used as standalone escapes, against ASCII-only
// input and against input containing no ASCII characters.
t(/\p{ASCII}+/u, "abc123");
f(/\p{ASCII}+/u, "ⓐⓑⓒ①②③");
f(/\p{ASCII}+/u, "🄰🄱🄲①②③");
f(/\P{ASCII}+/u, "abcd123");
t(/\P{ASCII}+/u, "ⓐⓑⓒ①②③");
t(/\P{ASCII}+/u, "🄰🄱🄲①②③");
// The same escapes inside character classes, with and without class negation.
f(/[^\p{ASCII}]+/u, "abc123");
f(/[\p{ASCII}]+/u, "ⓐⓑⓒ①②③");
f(/[\p{ASCII}]+/u, "🄰🄱🄲①②③");
t(/[^\P{ASCII}]+/u, "abcd123");
t(/[\P{ASCII}]+/u, "ⓐⓑⓒ①②③");
f(/[^\P{ASCII}]+/u, "🄰🄱🄲①②③");
// \p{Any}: expected to match a lone surrogate as a single character and a
// valid surrogate pair as one unit.
t(/\p{Any}+/u, "🄰🄱🄲①②③");
assertEquals(["\ud800"], /\p{Any}/u.exec("\ud800\ud801"));
assertEquals(["\udc00"], /\p{Any}/u.exec("\udc00\udc01"));
assertEquals(["\ud800\udc01"], /\p{Any}/u.exec("\ud800\udc01"));
assertEquals(["\udc01"], /\p{Any}/u.exec("\udc01"));
// \P{Any} is expected to match nothing on its own; combined with \d in a class
// it still matches digits, and its negated class matches everything.
f(/\P{Any}+/u, "123");
f(/[\P{Any}]+/u, "123");
t(/[\P{Any}\d]+/u, "123");
t(/[^\P{Any}]+/u, "123");
// \p{Assigned} and \P{Assigned} as standalone escapes. These tests expect
// U+FDD0 and U+FFFFF not to be matched by \p{Assigned}.
t(/\p{Assigned}+/u, "123");
t(/\p{Assigned}+/u, "🄰🄱🄲");
f(/\p{Assigned}+/u, "\ufdd0");
f(/\p{Assigned}+/u, "\u{fffff}");
f(/\P{Assigned}+/u, "123");
f(/\P{Assigned}+/u, "🄰🄱🄲");
t(/\P{Assigned}+/u, "\ufdd0");
t(/\P{Assigned}+/u, "\u{fffff}");
// A single \P{Assigned} must not match the empty string.
f(/\P{Assigned}/u, "");
// \P{Assigned} inside character classes, including the empty-string case.
t(/[^\P{Assigned}]+/u, "123");
f(/[\P{Assigned}]+/u, "🄰🄱🄲");
f(/[^\P{Assigned}]+/u, "\ufdd0");
t(/[\P{Assigned}]+/u, "\u{fffff}");
f(/[\P{Assigned}]/u, "");
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment