Commit a8d59243 authored by yangguo@chromium.org

Cleanup latin-1 conversion check in regexp engine

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11880045
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13400 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 451ed2f2
...@@ -2875,23 +2875,9 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { ...@@ -2875,23 +2875,9 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (!ignore_case) return set_replacement(NULL); if (!ignore_case) return set_replacement(NULL);
// Here, we need to check for characters whose upper and lower cases // Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range. // are outside the Latin-1 range.
// TODO(dcarney): Replace this code with a simple if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
// table lookup in unibrow::Latin-1. return set_replacement(NULL);
// TODO(dcarney): Test cases!.
unibrow::uchar result;
int chars;
chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
if (chars > 1 ||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
continue;
}
chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
if (chars > 1 ||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
continue;
} }
// This character is definitely not in the Latin-1 range.
return set_replacement(NULL);
#endif #endif
} }
} else { } else {
......
...@@ -79,6 +79,36 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, ...@@ -79,6 +79,36 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
} }
// Returns true when |c| is a member of a fixed, hard-coded set of UTF-16 code
// units lying above the Latin-1 range, and false for every other value.
// NOTE(review): the set presumably lists the characters whose upper- or
// lower-case mapping yields a Latin-1 character -- confirm against the unibrow
// case-mapping tables.
//
// Precondition (asserted): c > Latin1::kMaxChar.
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
  ASSERT(c > Latin1::kMaxChar);
  // Contiguous runs of accepted code units.
  if (0x1e96 <= c && c <= 0x1e9a) return true;
  if (0xfb00 <= c && c <= 0xfb06) return true;
  // Adjacent pairs.
  if (c == 0x130 || c == 0x131) return true;
  if (c == 0x212a || c == 0x212b) return true;
  // Isolated code units.
  return c == 0x149 ||
         c == 0x178 ||
         c == 0x17f ||
         c == 0x1f0 ||
         c == 0x1e9e;
}
unsigned Utf8::Encode(char* str, uchar c, int previous) { unsigned Utf8::Encode(char* str, uchar c, int previous) {
static const int kMask = ~(1 << 6); static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) { if (c <= kMaxOneByteChar) {
......
...@@ -140,6 +140,7 @@ class Latin1 { ...@@ -140,6 +140,7 @@ class Latin1 {
#else #else
static const unsigned kMaxChar = 0xff; static const unsigned kMaxChar = 0xff;
#endif #endif
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
}; };
class Utf8 { class Utf8 {
......
...@@ -1275,3 +1275,40 @@ TEST(IsAscii) { ...@@ -1275,3 +1275,40 @@ TEST(IsAscii) {
CHECK(String::IsAscii(static_cast<char*>(NULL), 0)); CHECK(String::IsAscii(static_cast<char*>(NULL), 0));
CHECK(String::IsOneByte(static_cast<uc16*>(NULL), 0)); CHECK(String::IsOneByte(static_cast<uc16*>(NULL), 0));
} }
// Reference predicate used by the test: returns true if converting |c| to
// lower case or to upper case via unibrow produces at least one code unit
// that is <= unibrow::Latin1::kMaxChar, and false otherwise.
//
// |c| must be above the Latin-1 range (checked).
static bool CanBeConvertedToLatin1(uint16_t c) {
  CHECK(c > unibrow::Latin1::kMaxChar);
  uint32_t result[4];
  // Capacity of |result| in elements. The bound must be the element count,
  // not sizeof(result) (the size in bytes), or an overlong conversion would
  // go undetected.
  const int kResultLength =
      static_cast<int>(sizeof(result) / sizeof(result[0]));
  int chars;
  // Try the lower-case mapping first.
  chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
  if (chars > 0) {
    CHECK_LE(chars, kResultLength);
    for (int i = 0; i < chars; i++) {
      if (result[i] <= unibrow::Latin1::kMaxChar) {
        return true;
      }
    }
  }
  // Then the upper-case mapping.
  chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
  if (chars > 0) {
    CHECK_LE(chars, kResultLength);
    for (int i = 0; i < chars; i++) {
      if (result[i] <= unibrow::Latin1::kMaxChar) {
        return true;
      }
    }
  }
  return false;
}
// Exhaustively compares Latin1::NonLatin1CanBeConvertedToLatin1 against the
// reference predicate CanBeConvertedToLatin1 for every 16-bit code unit above
// unibrow::Latin1::kMaxChar.
TEST(Latin1) {
#ifndef ENABLE_LATIN_1
  // Nothing to verify when Latin-1 support is compiled out.
  // NOTE(review): the `if (true)` wrapper presumably avoids an
  // unreachable-code warning for the loop below -- confirm.
  if (true) return;
#endif
  const uint32_t kFirstCodeUnit = unibrow::Latin1::kMaxChar + 1;
  const uint32_t kLastCodeUnit = 0xffff;
  // Iterate with a wider counter so the loop ends on an explicit bound rather
  // than on 16-bit wraparound to zero.
  for (uint32_t code = kFirstCodeUnit; code <= kLastCodeUnit; code++) {
    const uint16_t c = static_cast<uint16_t>(code);
    CHECK_EQ(CanBeConvertedToLatin1(c),
             unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
  }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment