Commit e39ba014 authored by yangguo, committed by Commit bot

[regexp] extend \p syntax to binary and enumerated properties.

Also make the syntax a bit less complicated and speculative.

R=littledan@chromium.org
BUG=v8:4743
LOG=N

Review URL: https://codereview.chromium.org/1845243002

Cr-Commit-Position: refs/heads/master@{#35344}
parent 84ed7609
......@@ -845,29 +845,46 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
}
#ifdef V8_I18N_SUPPORT
bool IsExactPropertyValueAlias(const char* property_name, UProperty property,
int32_t property_value) {
// Returns true iff |property_name| compares equal (strcmp) to the short name
// or to one of the long names that u_getPropertyName reports for |property|.
bool IsExactPropertyAlias(const char* property_name, UProperty property) {
  const char* short_alias = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
  if (short_alias != NULL && strcmp(property_name, short_alias) == 0) {
    return true;
  }
  // Walk the name choices upwards from U_LONG_PROPERTY_NAME; the first NULL
  // returned by u_getPropertyName ends the search.
  int choice = U_LONG_PROPERTY_NAME;
  while (true) {
    const char* long_alias = u_getPropertyName(
        property, static_cast<UPropertyNameChoice>(choice));
    if (long_alias == NULL) return false;
    if (strcmp(property_name, long_alias) == 0) return true;
    choice++;
  }
}
// Returns true iff |property_value_name| compares equal (strcmp) to the short
// name or to one of the long names that u_getPropertyValueName reports for
// the value |property_value| of |property|.
bool IsExactPropertyValueAlias(const char* property_value_name,
                               UProperty property, int32_t property_value) {
  const char* short_name =
      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
  if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
    return true;
  }
  // Long names are probed with increasing name choices, starting at
  // U_LONG_PROPERTY_NAME, until u_getPropertyValueName returns NULL.
  for (int i = 0;; i++) {
    const char* long_name = u_getPropertyValueName(
        property, property_value,
        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    if (long_name == NULL) break;
    if (strcmp(property_value_name, long_name) == 0) return true;
  }
  return false;
}
bool LookupPropertyClass(UProperty property, const char* property_name,
ZoneList<CharacterRange>* result, Zone* zone) {
int32_t property_value = u_getPropertyValueEnum(property, property_name);
bool LookupPropertyValueName(UProperty property,
const char* property_value_name,
ZoneList<CharacterRange>* result, Zone* zone) {
int32_t property_value =
u_getPropertyValueEnum(property, property_value_name);
if (property_value == UCHAR_INVALID_CODE) return false;
// We require the property name to match exactly to one of the property value
// aliases. However, u_getPropertyValueEnum uses loose matching.
if (!IsExactPropertyValueAlias(property_name, property, property_value)) {
if (!IsExactPropertyValueAlias(property_value_name, property,
property_value)) {
return false;
}
......@@ -892,49 +909,75 @@ bool LookupPropertyClass(UProperty property, const char* property_name,
uset_close(set);
return success;
}
// Parses the property class of a \pN or \p{...} escape and resolves it by
// handing |result| to LookupPropertyValueName. Returns false when the input
// ends before the closing '}' or when the name does not resolve.
// NOTE(review): assumes the parser is positioned on the character that
// follows the 'p' - confirm against the caller.
//
// The property class is parsed as follows:
// - \pN with a single-character N is equivalent to \p{N}
// - In \p{name}, 'name' is interpreted
//   - either as a general category property value name.
//   - or as a binary property name.
// - In \p{name=value}, 'name' is interpreted as an enumerated property name,
//   and 'value' is interpreted as one of the available property value names.
// - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
// - Loose matching is not applied.
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
  List<char> first_part;
  List<char> second_part;
  if (current() == '{') {
    // Parse \p{[PropertyName=]PropertyNameValue}
    for (Advance(); current() != '}' && current() != '='; Advance()) {
      if (!has_next()) return false;
      first_part.Add(static_cast<char>(current()));
    }
    if (current() == '=') {
      for (Advance(); current() != '}'; Advance()) {
        if (!has_next()) return false;
        second_part.Add(static_cast<char>(current()));
      }
      // The terminator is added even when nothing follows '=', so a
      // "name=" input leaves second_part non-empty and takes the
      // name/value branch below.
      second_part.Add(0);  // null-terminate string.
    }
  } else if (current() != kEndMarker) {
    // Parse \pN, where N is a single-character property name value.
    first_part.Add(static_cast<char>(current()));
  } else {
    return false;
  }
  Advance();
  first_part.Add(0);  // null-terminate string.

  if (second_part.is_empty()) {
    // First attempt to interpret as general category property value name.
    const char* name = first_part.ToConstVector().start();
    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result,
                                zone())) {
      return true;
    }
    // Then attempt to interpret as binary property name with value name 'Y'.
    UProperty property = u_getPropertyEnum(name);
    if (property < UCHAR_BINARY_START) return false;
    if (property >= UCHAR_BINARY_LIMIT) return false;
    if (!IsExactPropertyAlias(name, property)) return false;
    return LookupPropertyValueName(property, "Y", result, zone());
  } else {
    // Both property name and value name are specified. Attempt to interpret
    // the property name as enumerated property.
    const char* property_name = first_part.ToConstVector().start();
    const char* value_name = second_part.ToConstVector().start();
    UProperty property = u_getPropertyEnum(property_name);
    if (property < UCHAR_INT_START) return false;
    if (property >= UCHAR_INT_LIMIT) return false;
    if (!IsExactPropertyAlias(property_name, property)) return false;
    return LookupPropertyValueName(property, value_name, result, zone());
  }
}
#else // V8_I18N_SUPPORT
// Variant compiled on the #else side of V8_I18N_SUPPORT: |result| is left
// untouched and every property class is reported as unparseable.
bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result) {
return false;
}
#endif // V8_I18N_SUPPORT
bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
uc32 x = 0;
int d = HexValue(current());
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property --harmony-unicode-regexps
// t: asserts that regexp |re| matches string |s|.
function t(re, s) { assertTrue(re.test(s)); }
// f: asserts that regexp |re| does not match string |s|.
function f(re, s) { assertFalse(re.test(s)); }
// A bare name inside \p{...} / \P{...} is accepted when it names a binary
// property, by long name or by alias.
t(/\p{Bidi_Control}+/u, "\u200E");
f(/\p{Bidi_C}+/u, "On a dark desert highway, cool wind in my hair");
t(/\p{AHex}+/u, "DEADBEEF");
t(/\p{Alphabetic}+/u, "abcdefg");
t(/\P{Alphabetic}+/u, "1234");
t(/\p{White_Space}+/u, "\u00A0");
t(/\p{Uppercase}+/u, "V");
f(/\p{Lower}+/u, "U");
t(/\p{Ideo}+/u, "字");
f(/\p{Ideo}+/u, "x");
// Rejected: a bare name that is not a binary property, and a binary property
// spelled with an explicit "=value" part.
assertThrows("/\\p{Hiragana}/u");
assertThrows("/\\p{Bidi_Class}/u");
assertThrows("/\\p{Bidi_C=False}/u");
assertThrows("/\\P{Bidi_Control=Y}/u");
assertThrows("/\\p{AHex=Yes}/u");
......@@ -7,28 +7,28 @@
function t(re, s) { assertTrue(re.test(s)); }
function f(re, s) { assertFalse(re.test(s)); }
t(/\p{InASCII}+/u, ".");
t(/\p{InASCII}+/u, "supercalifragilisticexpialidocious");
t(/\p{InBasic_Latin}+/u, ".");
t(/\p{InBasic_Latin}+/u, "supercalifragilisticexpialidocious");
t(/\p{InCJK}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{InCJK_Unified_Ideographs}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{InCJK}+/u, "おはようございます");
f(/\p{InCJK_Unified_Ideographs}+/u,
t(/\p{Block=ASCII}+/u, ".");
t(/\p{Block=ASCII}+/u, "supercalifragilisticexpialidocious");
t(/\p{Block=Basic_Latin}+/u, ".");
t(/\p{Block=Basic_Latin}+/u, "supercalifragilisticexpialidocious");
t(/\p{blk=CJK}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{blk=CJK_Unified_Ideographs}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{blk=CJK}+/u, "おはようございます");
f(/\p{blk=CJK_Unified_Ideographs}+/u,
"Something is rotten in the state of Denmark");
t(/\p{InLatin_1}+/u, "Wie froh bin ich, daß ich weg bin!");
f(/\p{InLatin_1_Supplement}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{InLatin_1_Sup}+/u, "いただきます");
t(/\p{blk=Latin_1}+/u, "Wie froh bin ich, daß ich weg bin!");
f(/\p{blk=Latin_1_Supplement}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{blk=Latin_1_Sup}+/u, "いただきます");
t(/\p{InHiragana}/u, "いただきます");
t(/\p{Hiragana}/u, "\u{1b001}"); // This refers to the script "Hiragana".
f(/\p{InHiragana}/u, "\u{1b001}"); // This refers to the block "Hiragana".
t(/\p{blk=Hiragana}/u, "いただきます");
t(/\p{sc=Hiragana}/u, "\u{1b001}"); // This refers to the script "Hiragana".
f(/\p{blk=Hiragana}/u, "\u{1b001}"); // This refers to the block "Hiragana".
t(/\p{InGreek_And_Coptic}/u,
t(/\p{blk=Greek_And_Coptic}/u,
"ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ");
t(/\p{InGreek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
t(/\p{blk=Greek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
assertThrows("/\\p{In}/u");
assertThrows("/\\pI/u");
......
......@@ -9,9 +9,6 @@ assertThrows("/[\\p{garbage}]/u");
assertThrows("/[\\p{}]/u");
assertThrows("/[\\p{]/u");
assertThrows("/[\\p}]/u");
assertThrows("/[\\p{Math}]/u");
assertThrows("/[\\p{Bidi_M}]/u");
assertThrows("/[\\p{Hex}]/u");
assertTrue(/^[\p{Lu}\p{Ll}]+$/u.test("ABCabc"));
assertTrue(/^[\p{Lu}-\p{Ll}]+$/u.test("ABC-abc"));
......@@ -19,6 +16,9 @@ assertFalse(/^[\P{Lu}\p{Ll}]+$/u.test("ABCabc"));
// \P{...} (complement) used as a member of a character class.
assertTrue(/^[\P{Lu}\p{Ll}]+$/u.test("abc"));
assertTrue(/^[\P{Lu}]+$/u.test("abc123"));
assertFalse(/^[\P{Lu}]+$/u.test("XYZ"));
// Binary property names (long name or alias) used inside a character class.
assertTrue(/[\p{Math}]/u.test("+"));
assertTrue(/[\P{Bidi_M}]/u.test(" "));
assertTrue(/[\p{Hex}]/u.test("A"));
// Negated character classes containing property escapes.
assertTrue(/^[^\P{Lu}]+$/u.test("XYZ"));
assertFalse(/^[^\p{Lu}\p{Ll}]+$/u.test("abc"));
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property --harmony-unicode-regexps
// t: asserts that regexp |re| matches string |s|.
function t(re, s) { assertTrue(re.test(s)); }
// f: asserts that regexp |re| does not match string |s|.
function f(re, s) { assertFalse(re.test(s)); }
// \p{name=value}: both the property and the value may be written with a long
// name or with an alias.
t(/\p{Bidi_Class=L}+/u, "Is this the real life?");
t(/\p{bc=Left_To_Right}+/u, "Is this just fantasy?");
t(/\p{bc=AL}+/u, "السلام عليكم‎");
t(/\p{bc=Arabic_Letter}+/u, "متشرف بمعرفتك‎");
t(/\p{Line_Break=Glue}/u, "\u00A0");
t(/\p{lb=AL}/u, "~");
// Rejected: an empty name or value, and more than one '='.
assertThrows("/\\p{Block=}/u");
assertThrows("/\\p{=}/u");
assertThrows("/\\p{=L}/u");
assertThrows("/\\p{=Hiragana}/u");
assertThrows("/\\p{Block=CJK=}/u");
// Rejected property names in the name=value form.
assertThrows("/\\p{Age=V8_0}/u");
assertThrows("/\\p{General_Category=Letter}/u");
assertThrows("/\\p{gc=L}/u");
assertThrows("/\\p{General_Category_Mask=Letter}/u");
assertThrows("/\\p{gcm=L}/u");
......@@ -6,12 +6,12 @@
// "In"-prefixed block names are rejected regardless of how the block name
// itself is spelled.
assertThrows("/\\p{In CJK}/u");
assertThrows("/\\p{InCJKUnifiedIdeographs}/u");
assertThrows("/\\p{InCJK}/u");
assertThrows("/\\p{InCJK_Unified_Ideographs}/u");

assertThrows("/\\p{InCyrillic_Sup}/u");
assertThrows("/\\p{InCyrillic_Supplement}/u");
assertThrows("/\\p{InCyrillic_Supplementary}/u");
assertThrows("/\\p{InCyrillicSupplementary}/u");
assertThrows("/\\p{InCyrillic_supplementary}/u");
......@@ -25,9 +25,18 @@ assertDoesNotThrow("/\\p{Mark}/u");
assertDoesNotThrow("/\\p{Combining_Mark}/u");
assertThrows("/\\p{Combining Mark}/u");

// Script names are only accepted as a value of the Script property; a bare
// script name is rejected.
assertDoesNotThrow("/\\p{Script=Copt}/u");
assertThrows("/\\p{Coptic}/u");
assertThrows("/\\p{Qaac}/u");
assertThrows("/\\p{Egyp}/u");
assertDoesNotThrow("/\\p{Script=Egyptian_Hieroglyphs}/u");
assertThrows("/\\p{EgyptianHieroglyphs}/u");

// Property names and value names must be spelled exactly: no dropped
// underscores, no case changes, no surrounding spaces.
assertThrows("/\\p{BidiClass=LeftToRight}/u");
assertThrows("/\\p{BidiC=LeftToRight}/u");
assertThrows("/\\p{bidi_c=Left_To_Right}/u");

assertDoesNotThrow("/\\p{Block=CJK}/u");
assertThrows("/\\p{Block = CJK}/u");
assertThrows("/\\p{Block=cjk}/u");
assertThrows("/\\p{BLK=CJK}/u");
......@@ -7,33 +7,33 @@
// t: asserts that regexp |re| matches string |s|.
function t(re, s) { assertTrue(re.test(s)); }
// f: asserts that regexp |re| does not match string |s|.
function f(re, s) { assertFalse(re.test(s)); }

// Scripts are selected with the enumerated property "Script" (alias "sc") and
// an explicit value name (long name or four-letter alias).
t(/\p{Script=Common}+/u, ".");
f(/\p{Script=Common}+/u, "supercalifragilisticexpialidocious");

t(/\p{Script=Han}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{Script=Hani}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{Script=Han}+/u, "おはようございます");
f(/\p{Script=Hani}+/u, "Something is rotten in the state of Denmark");

t(/\p{Script=Latin}+/u, "Wie froh bin ich, daß ich weg bin!");
t(/\p{Script=Latn}+/u,
  "It was a bright day in April, and the clocks were striking thirteen");
f(/\p{Script=Latin}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{Script=Latn}+/u, "いただきます");

t(/\p{sc=Hiragana}/u, "いただきます");
t(/\p{sc=Hira}/u, "ありがとうございました");
f(/\p{sc=Hiragana}/u,
  "Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte");
f(/\p{sc=Hira}/u, "Call me Ishmael");

t(/\p{sc=Phoenician}/u, "\u{10900}\u{1091a}");
t(/\p{sc=Phnx}/u, "\u{1091f}\u{10916}");
f(/\p{sc=Phoenician}/u, "Arthur est un perroquet");
f(/\p{sc=Phnx}/u, "设心狠毒非良士,操卓原来一路人");

t(/\p{sc=Grek}/u, "ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ");
t(/\p{sc=Greek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
f(/\p{sc=Greek}/u, "高贤未服英雄志,屈节偏生杰士疑");
f(/\p{sc=Greek}/u,
  "Mr. Jones, of the Manor Farm, had locked the hen-houses for the night");
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment