Commit d17b4bfb authored by Yang Guo, committed by Commit Bot

Reland "[regexp] fix Latin1 ignore-case bug."

Bug: v8:6703
Change-Id: I225cd78bedf2c0c123aedd3deeb1cd6d442f7697
Reviewed-on: https://chromium-review.googlesource.com/901522
Reviewed-by: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#51114}
parent 8f96f66f
...@@ -2768,16 +2768,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) { ...@@ -2768,16 +2768,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
Vector<const uc16> quarks = elm.atom()->data(); Vector<const uc16> quarks = elm.atom()->data();
for (int j = 0; j < quarks.length(); j++) { for (int j = 0; j < quarks.length(); j++) {
uint16_t c = quarks[j]; uint16_t c = quarks[j];
if (c <= String::kMaxOneByteCharCode) continue; if (elm.atom()->ignore_case()) {
if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr); c = unibrow::Latin1::TryConvertToLatin1(c);
// Here, we need to check for characters whose upper and lower cases }
// are outside the Latin-1 range. if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); // Replace quark in case we converted to Latin-1.
// Character is outside Latin-1 completely uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
if (converted == 0) return set_replacement(nullptr); writable_quarks[j] = c;
// Convert quark to Latin-1 in place.
uint16_t* copy = const_cast<uint16_t*>(quarks.start());
copy[j] = converted;
} }
} else { } else {
DCHECK(elm.text_type() == TextElement::CHAR_CLASS); DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
...@@ -3209,10 +3206,17 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, ...@@ -3209,10 +3206,17 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
if (first_element_checked && i == 0 && j == 0) continue; if (first_element_checked && i == 0 && j == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
EmitCharacterFunction* emit_function = nullptr; EmitCharacterFunction* emit_function = nullptr;
uc16 quark = quarks[j];
if (elm.atom()->ignore_case()) {
// Everywhere else we assume that a non-Latin-1 character cannot match
// a Latin-1 character. Avoid the cases where this assumption is
// invalid by using the Latin1 equivalent instead.
quark = unibrow::Latin1::TryConvertToLatin1(quark);
}
switch (pass) { switch (pass) {
case NON_LATIN1_MATCH: case NON_LATIN1_MATCH:
DCHECK(one_byte); DCHECK(one_byte);
if (quarks[j] > String::kMaxOneByteCharCode) { if (quark > String::kMaxOneByteCharCode) {
assembler->GoTo(backtrack); assembler->GoTo(backtrack);
return; return;
} }
...@@ -3232,8 +3236,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, ...@@ -3232,8 +3236,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
if (emit_function != nullptr) { if (emit_function != nullptr) {
bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
bool bound_checked = bool bound_checked =
emit_function(isolate, compiler, quarks[j], backtrack, emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
cp_offset + j, bounds_check, preloaded); bounds_check, preloaded);
if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
} }
} }
......
...@@ -93,15 +93,11 @@ size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, ...@@ -93,15 +93,11 @@ size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
class Latin1 { class Latin1 {
public: public:
static const unsigned kMaxChar = 0xff; static const unsigned kMaxChar = 0xff;
// Returns 0 if character does not convert to single latin-1 character // Convert the character to Latin-1 case equivalent if possible.
// or if the character doesn't not convert back to latin-1 via inverse static inline uint16_t TryConvertToLatin1(uint16_t);
// operation (upper to lower, etc).
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
}; };
uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
DCHECK_GT(c, Latin1::kMaxChar);
switch (c) { switch (c) {
// These are equivalent characters in Unicode. // These are equivalent characters in Unicode.
case 0x39c: case 0x39c:
...@@ -112,7 +108,7 @@ uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { ...@@ -112,7 +108,7 @@ uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
case 0x178: case 0x178:
return 0xff; return 0xff;
} }
return 0; return c;
} }
......
...@@ -1505,7 +1505,7 @@ static uint16_t ConvertLatin1(uint16_t c) { ...@@ -1505,7 +1505,7 @@ static uint16_t ConvertLatin1(uint16_t c) {
#ifndef V8_INTL_SUPPORT #ifndef V8_INTL_SUPPORT
static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) { static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c); uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
if (expect > unibrow::Latin1::kMaxChar) expect = 0; if (expect > unibrow::Latin1::kMaxChar || expect == 0) expect = c;
CHECK_EQ(expect, test); CHECK_EQ(expect, test);
} }
...@@ -1514,7 +1514,7 @@ TEST(Latin1IgnoreCase) { ...@@ -1514,7 +1514,7 @@ TEST(Latin1IgnoreCase) {
for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) { for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
uint16_t lower = ConvertLatin1<unibrow::ToLowercase, false>(c); uint16_t lower = ConvertLatin1<unibrow::ToLowercase, false>(c);
uint16_t upper = ConvertLatin1<unibrow::ToUppercase, false>(c); uint16_t upper = ConvertLatin1<unibrow::ToUppercase, false>(c);
uint16_t test = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); uint16_t test = unibrow::Latin1::TryConvertToLatin1(c);
// Filter out all characters whose upper is not their lower or vice versa. // Filter out all characters whose upper is not their lower or vice versa.
if (lower == 0 && upper == 0) { if (lower == 0 && upper == 0) {
CheckCanonicalEquivalence(c, test); CheckCanonicalEquivalence(c, test);
......
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Case-insensitive matching of non-Latin-1 pattern characters against
// subjects that contain their Latin-1 case equivalents. Every pattern is
// checked with and without the unicode flag (`u`), and both as a bare capture
// group and under a `+` quantifier.
// U+039C (GREEK CAPITAL LETTER MU) must match U+00B5 (MICRO SIGN).
assertTrue(/(\u039C)/i.test("\xB5"));
assertTrue(/(\u039C)+/i.test("\xB5"));
assertTrue(/(\u039C)/ui.test("\xB5"));
assertTrue(/(\u039C)+/ui.test("\xB5"));
// U+03BC (GREEK SMALL LETTER MU) must match U+00B5 (MICRO SIGN).
assertTrue(/(\u03BC)/i.test("\xB5"));
assertTrue(/(\u03BC)+/i.test("\xB5"));
assertTrue(/(\u03BC)/ui.test("\xB5"));
assertTrue(/(\u03BC)+/ui.test("\xB5"));
// U+03BC must still match U+039C when the subject itself is not Latin-1.
assertTrue(/(\u03BC)/i.test("\u039C"));
assertTrue(/(\u03BC)+/i.test("\u039C"));
assertTrue(/(\u03BC)/ui.test("\u039C"));
assertTrue(/(\u03BC)+/ui.test("\u039C"));
// U+0178 (LATIN CAPITAL LETTER Y WITH DIAERESIS) must match U+00FF
// (LATIN SMALL LETTER Y WITH DIAERESIS).
assertTrue(/(\u0178)/i.test("\xFF"));
assertTrue(/(\u0178)+/i.test("\xFF"));
assertTrue(/(\u0178)/ui.test("\xFF"));
assertTrue(/(\u0178)+/ui.test("\xFF"));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment