Commit fbbb9cab authored by yangguo, committed by Commit bot

[regexp] correctly parse non-BMP unicode escapes in atoms.

R=rossberg@chromium.org

Review URL: https://codereview.chromium.org/1568623004

Cr-Commit-Position: refs/heads/master@{#33207}
parent 1ab97e90
......@@ -962,6 +962,7 @@ DEFINE_BOOL(regexp_possessive_quantifier, false,
DEFINE_BOOL(trace_regexp_bytecodes, false, "trace regexp bytecode execution")
DEFINE_BOOL(trace_regexp_assembler, false,
"trace regexp macro assembler calls.")
DEFINE_BOOL(trace_regexp_parser, false, "trace regexp parsing")
//
// Logging and profiling flags
......
......@@ -193,7 +193,8 @@ Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
size_t Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, size_t length,
const byte* src, size_t* src_pos,
size_t src_length) {
static const unibrow::uchar kMaxUtf16Character = 0xffff;
static const unibrow::uchar kMaxUtf16Character =
unibrow::Utf16::kMaxNonSurrogateCharCode;
size_t i = 0;
// Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
// one character early (in the normal case), because we need to have at least
......
......@@ -417,7 +417,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
builder->AddCharacter(value);
if (value > unibrow::Utf16::kMaxNonSurrogateCharCode) {
builder->AddCharacter(unibrow::Utf16::LeadSurrogate(value));
builder->AddCharacter(unibrow::Utf16::TrailSurrogate(value));
} else {
builder->AddCharacter(static_cast<uc16>(value));
}
} else if (!unicode_) {
builder->AddCharacter('u');
} else {
......@@ -986,6 +991,11 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
} else {
DCHECK(tree != NULL);
DCHECK(result->error.is_null());
if (FLAG_trace_regexp_parser) {
OFStream os(stdout);
tree->Print(os, zone);
os << "\n";
}
result->tree = tree;
int capture_count = parser.captures_started();
result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
......
......@@ -100,13 +100,14 @@ static bool CheckParse(const char* input) {
}
static void CheckParseEq(const char* input, const char* expected) {
static void CheckParseEq(const char* input, const char* expected,
bool unicode = false) {
v8::HandleScope scope(CcTest::isolate());
Zone zone;
FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
RegExpCompileData result;
CHECK(v8::internal::RegExpParser::ParseRegExp(
CcTest::i_isolate(), &zone, &reader, false, false, &result));
CcTest::i_isolate(), &zone, &reader, false, unicode, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
std::ostringstream os;
......@@ -163,6 +164,7 @@ static MinMaxPair CheckMinMaxMatch(const char* input) {
void TestRegExpParser(bool lookbehind) {
FLAG_harmony_regexp_lookbehind = lookbehind;
FLAG_harmony_unicode_regexps = true;
CHECK_PARSE_ERROR("?");
......@@ -305,6 +307,12 @@ void TestRegExpParser(bool lookbehind) {
CheckParseEq("\\u003z", "'u003z'");
CheckParseEq("foo[z]*", "(: 'foo' (# 0 - g [z]))");
// Unicode regexps
CheckParseEq("\\u{12345}", "'\\ud808\\udf45'", true);
CheckParseEq("\\u{12345}\\u{23456}", "'\\ud808\\udf45\\ud84d\\udc56'", true);
CheckParseEq("\\u{12345}|\\u{23456}", "(| '\\ud808\\udf45' '\\ud84d\\udc56')",
true);
CHECK_SIMPLE("", false);
CHECK_SIMPLE("a", true);
CHECK_SIMPLE("a|b", false);
......
......@@ -210,3 +210,40 @@ function testRegexpHelper(r) {
helper(/foo/u);
helper(new RegExp("foo", "u"));
})();
// Non-BMP patterns.
// Every case is exercised twice: once through the RegExp constructor, where
// the string literal's \u{...} escape is resolved to the code point before the
// pattern source is parsed, and once as a regexp literal, where the pattern
// parser itself sees the \u{...} escape. In UTF-16, U+12345 is the surrogate
// pair \ud808\udf45 and U+23456 is the surrogate pair \ud84d\udc56.
// Single character atom.
assertTrue(new RegExp("\u{12345}", "u").test("\u{12345}"));
assertTrue(/\u{12345}/u.test("\u{12345}"));
// Same subject, spelled as an explicit surrogate pair.
assertTrue(new RegExp("\u{12345}", "u").test("\ud808\udf45"));
assertTrue(/\u{12345}/u.test("\ud808\udf45"));
// A lone trail surrogate is only half of the code point and must not match.
assertFalse(new RegExp("\u{12345}", "u").test("\udf45"));
assertFalse(/\u{12345}/u.test("\udf45"));
// Multi-character atom: two adjacent non-BMP code points.
assertTrue(new RegExp("\u{12345}\u{23456}", "u").test("a\u{12345}\u{23456}b"));
assertTrue(/\u{12345}\u{23456}/u.test("b\u{12345}\u{23456}c"));
// The first code point is replaced by its lone trail surrogate in the subject.
assertFalse(new RegExp("\u{12345}\u{23456}", "u").test("a\udf45\u{23456}b"));
assertFalse(/\u{12345}\u{23456}/u.test("b\udf45\u{23456}c"));
// Disjunction: the second code point sits inside a non-capturing group.
assertTrue(new RegExp("\u{12345}(?:\u{23456})", "u").test(
"a\u{12345}\u{23456}b"));
assertTrue(/\u{12345}(?:\u{23456})/u.test("b\u{12345}\u{23456}c"));
assertFalse(new RegExp("\u{12345}(?:\u{23456})", "u").test(
"a\udf45\u{23456}b"));
assertFalse(/\u{12345}(?:\u{23456})/u.test("b\udf45\u{23456}c"));
// Alternative: either code point may match on its own.
assertTrue(new RegExp("\u{12345}|\u{23456}", "u").test("a\u{12345}b"));
assertTrue(/\u{12345}|\u{23456}/u.test("b\u{23456}c"));
// The negative subjects contain only unpaired surrogate halves (a trail
// followed by a lead), so neither alternative may match.
assertFalse(new RegExp("\u{12345}|\u{23456}", "u").test("a\udf45\ud84db"));
assertFalse(/\u{12345}|\u{23456}/u.test("b\udf45\ud808c"));
// Capture: the back reference must repeat the captured code point. In the
// constructor form "\\1" is the string-literal spelling of the pattern's \1.
assertTrue(new RegExp("(\u{12345}|\u{23456}).\\1", "u").test(
"\u{12345}b\u{12345}"));
assertTrue(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{12345}"));
// The group captures U+12345 but the subject continues with U+23456.
assertFalse(new RegExp("(\u{12345}|\u{23456}).\\1", "u").test(
"\u{12345}b\u{23456}"));
assertFalse(/(\u{12345}|\u{23456}).\1/u.test("\u{12345}b\u{23456}"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment