Commit 0c822b21 authored by yangguo@chromium.org

Fix some Latin-1 WebKit unit tests

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11962035
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 7f331f62
......@@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
}
// Reports whether |range| covers at least one of the code points 0x39c,
// 0x3bc or 0x178. When ENABLE_LATIN_1 is not defined the answer is always
// false.
static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
#ifdef ENABLE_LATIN_1
  // TODO(dcarney): this could be a lot more efficient.
  if (range.Contains(0x39c)) return true;
  if (range.Contains(0x3bc)) return true;
  return range.Contains(0x178);
#else
  return false;
#endif
}
#ifdef ENABLE_LATIN_1
// Scans |ranges| front to back and returns true as soon as one entry
// satisfies RangeContainsLatin1Equivalents; returns false if none does
// (including when the list is empty).
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
  int index = 0;
  while (index < ranges->length()) {
    // TODO(dcarney): this could be a lot more efficient.
    if (RangeContainsLatin1Equivalents(ranges->at(index))) {
      return true;
    }
    index++;
  }
  return false;
}
#endif
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
......@@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
return set_replacement(NULL);
}
#else
if (quarks[j] <= String::kMaxOneByteCharCode) continue;
uint16_t c = quarks[j];
if (c <= String::kMaxOneByteCharCode) continue;
if (!ignore_case) return set_replacement(NULL);
// Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range.
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
return set_replacement(NULL);
}
uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
// Character is outside Latin-1 completely
if (converted == 0) return set_replacement(NULL);
// Convert quark to Latin-1 in place.
uint16_t* copy = const_cast<uint16_t*>(quarks.start());
copy[j] = converted;
#endif
}
} else {
ASSERT(elm.type == TextElement::CHAR_CLASS);
#ifdef ENABLE_LATIN_1
// TODO(dcarney): Can this be improved?
if (ignore_case) continue;
#endif
RegExpCharacterClass* cc = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
if (!CharacterRange::IsCanonical(ranges)) {
......@@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (range_count != 0 &&
ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
#ifdef ENABLE_LATIN_1
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
#endif
return set_replacement(NULL);
}
} else {
if (range_count == 0 ||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
#ifdef ENABLE_LATIN_1
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
#endif
return set_replacement(NULL);
}
}
......@@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
Isolate* isolate = Isolate::Current();
uc16 bottom = from();
uc16 top = to();
if (is_ascii) {
if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
if (bottom > String::kMaxOneByteCharCode) return;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
......
......@@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
// Latin-1 range
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
......
......@@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
static const byte* StringCharacterPosition(String* subject, int start_index);
// Byte map of ASCII characters with a 0xff if the character is a word
// Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
static const byte word_character_map[128];
static const byte word_character_map[256];
static Address word_character_map_address() {
return const_cast<Address>(&word_character_map[0]);
......
......@@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
// Fast check for a junk value. A valid string may start from a
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
// the 'I' character ('Infinity'). All of that have codes not greater than
// '9' except 'I'.
if (data[start_pos] != 'I') {
// '9' except 'I' and &nbsp;.
if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
return isolate->heap()->nan_value();
}
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
......
......@@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
ASSERT(c > Latin1::kMaxChar);
switch (c) {
case 0x130:
case 0x131:
case 0x149:
// These are equivalent characters in unicode.
case 0x39c:
case 0x3bc:
return 0xb5;
// This is an uppercase of a Latin-1 character
// outside of Latin-1.
case 0x178:
case 0x17f:
case 0x1f0:
case 0x1e96:
case 0x1e97:
case 0x1e98:
case 0x1e99:
case 0x1e9a:
case 0x1e9e:
case 0x212a:
case 0x212b:
case 0xfb00:
case 0xfb01:
case 0xfb02:
case 0xfb03:
case 0xfb04:
case 0xfb05:
case 0xfb06:
return true;
return 0xff;
}
return false;
return 0;
}
......
......@@ -140,7 +140,10 @@ class Latin1 {
#else
static const unsigned kMaxChar = 0xff;
#endif
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
// Returns 0 if character does not convert to single latin-1 character
// or if the character does not convert back to latin-1 via inverse
// operation (upper to lower, etc).
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
class Utf8 {
......
......@@ -1277,38 +1277,60 @@ TEST(IsAscii) {
}
static bool CanBeConvertedToLatin1(uint16_t c) {
CHECK(c > unibrow::Latin1::kMaxChar);
uint32_t result[4];
#ifdef ENABLE_LATIN_1
template<typename Op, bool return_first>
static uint16_t ConvertLatin1(uint16_t c) {
uint32_t result[Op::kMaxWidth];
int chars;
chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
if (chars > 0) {
CHECK_LE(chars, static_cast<int>(sizeof(result)));
for (int i = 0; i < chars; i++) {
if (result[i] <= unibrow::Latin1::kMaxChar) {
return true;
}
}
}
chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
if (chars > 0) {
CHECK_LE(chars, static_cast<int>(sizeof(result)));
for (int i = 0; i < chars; i++) {
if (result[i] <= unibrow::Latin1::kMaxChar) {
return true;
}
}
chars = Op::Convert(c, 0, result, NULL);
if (chars == 0) return 0;
CHECK_LE(chars, static_cast<int>(sizeof(result)));
if (!return_first && chars > 1) {
return 0;
}
return false;
return result[0];
}
TEST(Latin1) {
#ifndef ENABLE_LATIN_1
if (true) return;
#endif
for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
CHECK_EQ(CanBeConvertedToLatin1(c),
unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
if (expect > unibrow::Latin1::kMaxChar) expect = 0;
CHECK_EQ(expect, test);
}
TEST(Latin1IgnoreCase) {
if (true) return;
using namespace unibrow;
for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
// Filter out all characters whose upper is not their lower or vice versa.
if (lower == 0 && upper == 0) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (lower == 0 && upper != 0) {
lower = ConvertLatin1<ToLowercase, false>(upper);
}
if (upper == 0 && lower != c) {
upper = ConvertLatin1<ToUppercase, false>(lower);
}
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (upper != c && lower != c) {
CheckCanonicalEquivalence(c, test);
continue;
}
CHECK_EQ(Min(upper, lower), test);
}
}
#endif // ENABLE_LATIN_1
......@@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
// Should have hit the branch for the following char codes:
// [A-Z], [192-222] but not 215
assertEquals((90-65+1)+(222-192-1+1), total_lo);
// Latin-1 whitespace character: a string that starts with U+00A0 followed by
// a digit must still convert to that number.
assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
// Latin-1 \W characters: U+00A3 must be treated as a non-word character, so
// the input splits into two \W\W pairs.
assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
// Latin-1 character that uppercases out of Latin-1: a case-insensitive
// pattern for U+0178 must match the Latin-1 subject character U+00FF.
assertTrue(/\u0178/i.test('\u00ff'));
// Unicode equivalence: U+039C, U+03BC and U+00B5 must all match one another
// when the ignore-case flag is set.
assertTrue(/\u039c/i.test('\u00b5'));
assertTrue(/\u039c/i.test('\u03bc'));
assertTrue(/\u00b5/i.test('\u03bc'));
// Unicode equivalence ranges: a class spanning U+039B..U+039D matches U+00B5
// only with the ignore-case flag; the negated class behaves the opposite way.
assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment