Commit 5beb5ee7, authored by yangguo, committed by Commit bot

[regexp] implement latest spec draft for property class.

See https://github.com/mathiasbynens/es-regexp-unicode-property-escapes

Changes:
- only allow General Category, binary properties, Script, and Script_Extensions.
- implement Script_Extensions.

R=littledan@chromium.org
BUG=v8:4743

Review-Url: https://codereview.chromium.org/2502933002
Cr-Commit-Position: refs/heads/master@{#41091}
parent 4a70104a
...@@ -1083,13 +1083,19 @@ bool IsExactPropertyValueAlias(const char* property_value_name, ...@@ -1083,13 +1083,19 @@ bool IsExactPropertyValueAlias(const char* property_value_name,
bool LookupPropertyValueName(UProperty property, bool LookupPropertyValueName(UProperty property,
const char* property_value_name, bool negate, const char* property_value_name, bool negate,
ZoneList<CharacterRange>* result, Zone* zone) { ZoneList<CharacterRange>* result, Zone* zone) {
UProperty property_for_lookup = property;
if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
// For the property Script_Extensions, we have to do the property value
// name lookup as if the property is Script.
property_for_lookup = UCHAR_SCRIPT;
}
int32_t property_value = int32_t property_value =
u_getPropertyValueEnum(property, property_value_name); u_getPropertyValueEnum(property_for_lookup, property_value_name);
if (property_value == UCHAR_INVALID_CODE) return false; if (property_value == UCHAR_INVALID_CODE) return false;
// We require the property name to match exactly to one of the property value // We require the property name to match exactly to one of the property value
// aliases. However, u_getPropertyValueEnum uses loose matching. // aliases. However, u_getPropertyValueEnum uses loose matching.
if (!IsExactPropertyValueAlias(property_value_name, property, if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
property_value)) { property_value)) {
return false; return false;
} }
...@@ -1197,9 +1203,14 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, ...@@ -1197,9 +1203,14 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
const char* property_name = first_part.ToConstVector().start(); const char* property_name = first_part.ToConstVector().start();
const char* value_name = second_part.ToConstVector().start(); const char* value_name = second_part.ToConstVector().start();
UProperty property = u_getPropertyEnum(property_name); UProperty property = u_getPropertyEnum(property_name);
if (property < UCHAR_INT_START) return false;
if (property >= UCHAR_INT_LIMIT) return false;
if (!IsExactPropertyAlias(property_name, property)) return false; if (!IsExactPropertyAlias(property_name, property)) return false;
if (property == UCHAR_GENERAL_CATEGORY) {
// We want to allow aggregate value names such as "Letter".
property = UCHAR_GENERAL_CATEGORY_MASK;
} else if (property != UCHAR_SCRIPT &&
property != UCHAR_SCRIPT_EXTENSIONS) {
return false;
}
return LookupPropertyValueName(property, value_name, negate, result, return LookupPropertyValueName(property, value_name, negate, result,
zone()); zone());
} }
......
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property
// Asserts that the regular expression `re` matches the string `s`.
function t(re, s) { assertTrue(re.test(s)); }
// Asserts that the regular expression `re` does not match the string `s`.
function f(re, s) { assertFalse(re.test(s)); }
// Block property spelled with its long name: both value aliases of the same
// block are expected to match ASCII input.
t(/\p{Block=ASCII}+/u, ".");
t(/\p{Block=ASCII}+/u, "supercalifragilisticexpialidocious");
t(/\p{Block=Basic_Latin}+/u, ".");
t(/\p{Block=Basic_Latin}+/u, "supercalifragilisticexpialidocious");
// Block property spelled with the short alias "blk": expected matches (t) and
// expected non-matches (f) for the CJK and Latin-1 blocks.
t(/\p{blk=CJK}+/u, "话说天下大势,分久必合,合久必分");
t(/\p{blk=CJK_Unified_Ideographs}+/u, "吾庄后有一桃园,花开正盛");
f(/\p{blk=CJK}+/u, "おはようございます");
f(/\p{blk=CJK_Unified_Ideographs}+/u,
"Something is rotten in the state of Denmark");
t(/\p{blk=Latin_1}+/u, "Wie froh bin ich, daß ich weg bin!");
f(/\p{blk=Latin_1_Supplement}+/u, "奔腾千里荡尘埃,渡水登山紫雾开");
f(/\p{blk=Latin_1_Sup}+/u, "いただきます");
t(/\p{blk=Hiragana}/u, "いただきます");
// The same value name "Hiragana" under two different properties: U+1B001 is
// expected to match the script but not the block.
t(/\p{sc=Hiragana}/u, "\u{1b001}"); // This refers to the script "Hiragana".
f(/\p{blk=Hiragana}/u, "\u{1b001}"); // This refers to the block "Hiragana".
t(/\p{blk=Greek_And_Coptic}/u,
"ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ");
t(/\p{blk=Greek}/u, "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος");
// Unqualified or abbreviated forms are expected to be rejected. The backslash
// is doubled so that the source text handed to assertThrows contains "\p".
assertThrows("/\\p{In}/u");
assertThrows("/\\pI/u");
assertThrows("/\\p{I}/u");
assertThrows("/\\p{CJK}/u");
...@@ -4,16 +4,13 @@ ...@@ -4,16 +4,13 @@
// Flags: --harmony-regexp-property // Flags: --harmony-regexp-property
function t(re, s) { assertTrue(re.test(s)); } assertThrows("/\\p{Bidi_Class=L}+/u");
function f(re, s) { assertFalse(re.test(s)); } assertThrows("/\\p{bc=Left_To_Right}+/u");
assertThrows("/\\p{bc=AL}+/u");
assertThrows("/\\p{bc=Arabic_Letter}+/u");
t(/\p{Bidi_Class=L}+/u, "Is this the real life?"); assertThrows("/\\p{Line_Break=Glue}/u");
t(/\p{bc=Left_To_Right}+/u, "Is this just fantasy?"); assertThrows("/\\p{lb=AL}/u");
t(/\p{bc=AL}+/u, "السلام عليكم‎");
t(/\p{bc=Arabic_Letter}+/u, "متشرف بمعرفتك‎");
t(/\p{Line_Break=Glue}/u, "\u00A0");
t(/\p{lb=AL}/u, "~");
assertThrows("/\\p{Block=}/u"); assertThrows("/\\p{Block=}/u");
assertThrows("/\\p{=}/u"); assertThrows("/\\p{=}/u");
...@@ -22,7 +19,7 @@ assertThrows("/\\p{=Hiragana}/u"); ...@@ -22,7 +19,7 @@ assertThrows("/\\p{=Hiragana}/u");
assertThrows("/\\p{Block=CJK=}/u"); assertThrows("/\\p{Block=CJK=}/u");
assertThrows("/\\p{Age=V8_0}/u"); assertThrows("/\\p{Age=V8_0}/u");
assertThrows("/\\p{General_Category=Letter}/u"); assertDoesNotThrow("/\\p{General_Category=Letter}/u");
assertThrows("/\\p{gc=L}/u"); assertDoesNotThrow("/\\p{gc=L}/u");
assertThrows("/\\p{General_Category_Mask=Letter}/u"); assertThrows("/\\p{General_Category_Mask=Letter}/u");
assertThrows("/\\p{gcm=L}/u"); assertThrows("/\\p{gcm=L}/u");
...@@ -36,7 +36,7 @@ assertThrows("/\\p{BidiClass=LeftToRight}/u"); ...@@ -36,7 +36,7 @@ assertThrows("/\\p{BidiClass=LeftToRight}/u");
assertThrows("/\\p{BidiC=LeftToRight}/u"); assertThrows("/\\p{BidiC=LeftToRight}/u");
assertThrows("/\\p{bidi_c=Left_To_Right}/u"); assertThrows("/\\p{bidi_c=Left_To_Right}/u");
assertDoesNotThrow("/\\p{Block=CJK}/u"); assertThrows("/\\p{Block=CJK}/u");
assertThrows("/\\p{Block = CJK}/u"); assertThrows("/\\p{Block = CJK}/u");
assertThrows("/\\p{Block=cjk}/u"); assertThrows("/\\p{Block=cjk}/u");
assertThrows("/\\p{BLK=CJK}/u"); assertThrows("/\\p{BLK=CJK}/u");
...@@ -63,3 +63,7 @@ assertTrue(/\P{L}/u.test("\uA6EE")); ...@@ -63,3 +63,7 @@ assertTrue(/\P{L}/u.test("\uA6EE"));
assertTrue(/\p{Lowercase_Letter}/u.test("a")); assertTrue(/\p{Lowercase_Letter}/u.test("a"));
assertTrue(/\p{Math_Symbol}/u.test("+")); assertTrue(/\p{Math_Symbol}/u.test("+"));
assertTrue(/\p{gc=Ll}/u.test("a"));
assertTrue(/\p{General_Category=Math_Symbol}/u.test("+"));
assertTrue(/\p{General_Category=L}/u.test("X"));
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-property
// Each pattern below uses a property that is expected to be rejected with an
// error when the regular expression source is evaluated.
//
// The backslash must be doubled inside the string literal: a lone "\p" in a
// string collapses to "p", so the evaluated source would be /p{...}+/u. That
// presumably still throws, but only because of the stray "{" in unicode mode,
// and the property-escape check would never be exercised.
// NOTE(review): assumes assertThrows evaluates a string argument as source
// text, as the double-escaped calls elsewhere in these tests imply.

// Block, by long name and by the short alias "blk".
assertThrows("/\\p{Block=ASCII}+/u");
assertThrows("/\\p{Block=Basic_Latin}+/u");
assertThrows("/\\p{blk=CJK}+/u");
assertThrows("/\\p{blk=CJK_Unified_Ideographs}+/u");

// Other enumerated properties, by long and short names.
assertThrows("/\\p{NFKD_Quick_Check=Y}+/u");
assertThrows("/\\p{NFKD_QC=Yes}+/u");
assertThrows("/\\p{Numeric_Type=Decimal}+/u");
assertThrows("/\\p{nt=De}+/u");
assertThrows("/\\p{Bidi_Class=Arabic_Letter}+/u");
assertThrows("/\\p{Bidi_Class=AN}+/u");
assertThrows("/\\p{ccc=OV}+/u");
assertThrows("/\\p{Sentence_Break=Format}+/u");

// Unqualified or abbreviated forms.
assertThrows("/\\p{In}/u");
assertThrows("/\\pI/u");
assertThrows("/\\p{I}/u");
assertThrows("/\\p{CJK}/u");
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment