Fix some latin-1 webkit units tests

R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

Fix some latin-1 webkit units tests
R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
0c822b21 · yangguo@chromium.org · 7f331f62 · 0c822b21 · 0c822b21 · 0c822b21
Commit 0c822b21 authored Jan 21, 2013 by yangguo@chromium.org
8 changed files
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
 }
+// We need to check for the following characters: 0x39c 0x3bc 0x178.
+static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
+#ifdef ENABLE_LATIN_1
+  // TODO(dcarney): this could be a lot more efficient.
+  return range.Contains(0x39c) ||
+      range.Contains(0x3bc) || range.Contains(0x178);
+#else
+  return false;
+#endif
+}
+#ifdef ENABLE_LATIN_1
+static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
+  for (int i = 0; i < ranges->length(); i++) {
+    // TODO(dcarney): this could be a lot more efficient.
+    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
+  }
+  return false;
+}
+#endif
 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
@@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
          return set_replacement(NULL);
        }
 #else
-        if (quarks[j] <= String::kMaxOneByteCharCode) continue;
+        uint16_t c = quarks[j];
+        if (c <= String::kMaxOneByteCharCode) continue;
        if (!ignore_case) return set_replacement(NULL);
        // Here, we need to check for characters whose upper and lower cases
        // are outside the Latin-1 range.
-        if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
+        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
-          return set_replacement(NULL);
+        // Character is outside Latin-1 completely
-        }
+        if (converted == 0) return set_replacement(NULL);
+        // Convert quark to Latin-1 in place.
+        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
+        copy[j] = converted;
 #endif
      }
    } else {
      ASSERT(elm.type == TextElement::CHAR_CLASS);
-#ifdef ENABLE_LATIN_1
-      // TODO(dcarney): Can this be improved?
-      if (ignore_case) continue;
-#endif
      RegExpCharacterClass* cc = elm.data.u_char_class;
      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
      if (!CharacterRange::IsCanonical(ranges)) {
@@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
        if (range_count != 0 &&
            ranges->at(0).from() == 0 &&
            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
+#ifdef ENABLE_LATIN_1
+          // This will be handled in a later filter.
+          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+#endif
          return set_replacement(NULL);
        }
      } else {
        if (range_count == 0 ||
            ranges->at(0).from() > String::kMaxOneByteCharCode) {
+#ifdef ENABLE_LATIN_1
+          // This will be handled in a later filter.
+          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+#endif
          return set_replacement(NULL);
        }
      }
@@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
  Isolate* isolate = Isolate::Current();
  uc16 bottom = from();
  uc16 top = to();
-  if (is_ascii) {
+  if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
    if (bottom > String::kMaxOneByteCharCode) return;
    if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
  }

--- a/src/regexp-macro-assembler.cc
+++ b/src/regexp-macro-assembler.cc
@@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
+    // Latin-1 range
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
 };

--- a/src/regexp-macro-assembler.h
+++ b/src/regexp-macro-assembler.h
@@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
  static const byte* StringCharacterPosition(String* subject, int start_index);
-  // Byte map of ASCII characters with a 0xff if the character is a word
+  // Byte map of one byte characters with a 0xff if the character is a word
  // character (digit, letter or underscore) and 0x00 otherwise.
  // Used by generated RegExp code.
-  static const byte word_character_map[128];
+  static const byte word_character_map[256];
  static Address word_character_map_address() {
    return const_cast<Address>(&word_character_map[0]);

--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
      // Fast check for a junk value. A valid string may start from a
      // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
      // the 'I' character ('Infinity'). All of that have codes not greater than
-      // '9' except 'I'.
+      // '9' except 'I' and &nbsp; (0xa0).
-      if (data[start_pos] != 'I') {
+      if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
        return isolate->heap()->nan_value();
      }
    } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {

--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
 }
-bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
+uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  ASSERT(c > Latin1::kMaxChar);
  switch (c) {
-    case 0x130:
+    // These are equivalent characters in Unicode.
-    case 0x131:
+    case 0x39c:
-    case 0x149:
+    case 0x3bc:
+      return 0xb5;
+    // This is the uppercase form of a Latin-1 character (0xff); it lies
+    // outside of Latin-1.
    case 0x178:
-    case 0x17f:
+      return 0xff;
-    case 0x1f0:
-    case 0x1e96:
-    case 0x1e97:
-    case 0x1e98:
-    case 0x1e99:
-    case 0x1e9a:
-    case 0x1e9e:
-    case 0x212a:
-    case 0x212b:
-    case 0xfb00:
-    case 0xfb01:
-    case 0xfb02:
-    case 0xfb03:
-    case 0xfb04:
-    case 0xfb05:
-    case 0xfb06:
-      return true;
  }
-  return false;
+  return 0;
 }

--- a/src/unicode.h
+++ b/src/unicode.h
@@ -140,7 +140,10 @@ class Latin1 {
 #else
  static const unsigned kMaxChar = 0xff;
 #endif
-  static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
+  // Returns 0 if character does not convert to single latin-1 character
+  // or if the character does not convert back to latin-1 via inverse
+  // operation (upper to lower, etc).
+  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
 };
 class Utf8 {

--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@@ -1277,38 +1277,60 @@ TEST(IsAscii) {
 }
-static bool CanBeConvertedToLatin1(uint16_t c) {
-  CHECK(c > unibrow::Latin1::kMaxChar);
+#ifdef ENABLE_LATIN_1
-  uint32_t result[4];
+template<typename Op, bool return_first>
+static uint16_t ConvertLatin1(uint16_t c) {
+  uint32_t result[Op::kMaxWidth];
  int chars;
-  chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
+  chars = Op::Convert(c, 0, result, NULL);
-  if (chars > 0) {
+  if (chars == 0) return 0;
-    CHECK_LE(chars, static_cast<int>(sizeof(result)));
+  CHECK_LE(chars, static_cast<int>(sizeof(result)));
-    for (int i = 0; i < chars; i++) {
+  if (!return_first && chars > 1) {
-      if (result[i] <= unibrow::Latin1::kMaxChar) {
+    return 0;
-        return true;
-      }
-    }
-  }
-  chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
-  if (chars > 0) {
-    CHECK_LE(chars, static_cast<int>(sizeof(result)));
-    for (int i = 0; i < chars; i++) {
-      if (result[i] <= unibrow::Latin1::kMaxChar) {
-        return true;
-      }
-    }
  }
-  return false;
+  return result[0];
 }
-TEST(Latin1) {
+static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
-#ifndef ENABLE_LATIN_1
+  uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
-    if (true) return;
+  if (expect > unibrow::Latin1::kMaxChar) expect = 0;
-#endif
+  CHECK_EQ(expect, test);
-  for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
+}
-    CHECK_EQ(CanBeConvertedToLatin1(c),
-             unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
+TEST(Latin1IgnoreCase) {
+  if (true) return;
+  using namespace unibrow;
+  for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
+    uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
+    uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
+    uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
+    // Filter out all characters whose upper is not their lower or vice versa.
+    if (lower == 0 && upper == 0) {
+      CheckCanonicalEquivalence(c, test);
+      continue;
+    }
+    if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
+      CheckCanonicalEquivalence(c, test);
+      continue;
+    }
+    if (lower == 0 && upper != 0) {
+      lower = ConvertLatin1<ToLowercase, false>(upper);
+    }
+    if (upper == 0 && lower != c) {
+      upper = ConvertLatin1<ToUppercase, false>(lower);
+    }
+    if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
+      CheckCanonicalEquivalence(c, test);
+      continue;
+    }
+    if (upper != c && lower != c) {
+      CheckCanonicalEquivalence(c, test);
+      continue;
+    }
+    CHECK_EQ(Min(upper, lower), test);
  }
 }
+#endif  // ENABLE_LATIN_1
--- a/test/mjsunit/regress/regress-latin-1.js
+++ b/test/mjsunit/regress/regress-latin-1.js
@@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
 // Should have hit the branch for the following char codes:
 // [A-Z], [192-222] but not 215
 assertEquals((90-65+1)+(222-192-1+1), total_lo);
+// Latin-1 whitespace character
+assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
+// Latin-1 \W characters
+assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
+// Latin-1 character that uppercases out of Latin-1.
+assertTrue(/\u0178/i.test('\u00ff'));
+// Unicode equivalence
+assertTrue(/\u039c/i.test('\u00b5'));
+assertTrue(/\u039c/i.test('\u03bc'));
+assertTrue(/\u00b5/i.test('\u03bc'));
+// Unicode equivalence ranges
+assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
+assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
+assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
+assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));