Return kBadChar for longest subpart of incomplete utf-8 character.

This brings the two UTF-8 decoders (bulk + incremental) in line. Technically, either behaviour was correct, since the UTF-8 spec demands that incomplete UTF-8 be handled, but does not specify how. Unicode recommends that "the maximal subpart at that offset should be replaced by a single U+FFFD," and with this change we consistently do that. More details + spec references in the bug.

BUG=chromium:662822
Review-Url: https://codereview.chromium.org/2493143003
Cr-Commit-Position: refs/heads/master@{#41025}

Return kBadChar for longest subpart of incomplete utf-8 character.
This brings the two UTF-8 decoders (bulk + incremental) in line. Technically, either behaviour was correct, since the UTF-8 spec demands that incomplete UTF-8 be handled, but does not specify how. Unicode recommends that "the maximal subpart at that offset should be replaced by a single U+FFFD," and with this change we consistently do that. More details + spec references in the bug.

BUG=chromium:662822
Review-Url: https://codereview.chromium.org/2493143003
Cr-Commit-Position: refs/heads/master@{#41025}
fd40ebb1 · vogelheim · Commit bot · 0188c3fb · fd40ebb1 · fd40ebb1
Commit fd40ebb1 authored Nov 16, 2016 by vogelheim Committed by Commit bot Nov 16, 2016
Expand all Show whitespace changes
Inline Side-by-side

Showing with 270 additions and 300 deletions

unicode.cc src/unicode.cc +53 -72

test-parsing.cc test/cctest/test-parsing.cc +217 -228

No files found.
--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -228,80 +228,56 @@ static inline bool IsContinuationCharacter(byte chr) {
 // This method decodes an UTF-8 value according to RFC 3629.
 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  size_t length = NonASCIISequenceLength(str[0]);
-  if (length == 0 || max_length < length) {
-    *cursor += 1;
+  // Check continuation characters.
-    return kBadChar;
+  size_t max_count = std::min(length, max_length);
-  }
+  size_t count = 1;
-  if (length == 2) {
+  while (count < max_count && IsContinuationCharacter(str[count])) {
-    if (!IsContinuationCharacter(str[1])) {
+    count++;
-      *cursor += 1;
-      return kBadChar;
-    }
-    *cursor += 2;
-    return ((str[0] << 6) + str[1]) - 0x00003080;
  }
+  // Check overly long sequences & other conditions. Use length as error
+  // indicator.
  if (length == 3) {
-    switch (str[0]) {
+    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
-      case 0xE0:
+      // Overlong three-byte sequence?
-        // Overlong three-byte sequence.
+      length = 0;
-        if (str[1] < 0xA0 || str[1] > 0xBF) {
+    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
-          *cursor += 1;
+      // High and low surrogate halves?
-          return kBadChar;
+      length = 0;
-        }
-        break;
-      case 0xED:
-        // High and low surrogate halves.
-        if (str[1] < 0x80 || str[1] > 0x9F) {
-          *cursor += 1;
-          return kBadChar;
    }
-        break;
+  } else if (length == 4) {
-      default:
+    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
-        if (!IsContinuationCharacter(str[1])) {
-          *cursor += 1;
-          return kBadChar;
-        }
-    }
-    if (!IsContinuationCharacter(str[2])) {
-      *cursor += 1;
-      return kBadChar;
-    }
-    *cursor += 3;
-    return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
-  }
-  DCHECK(length == 4);
-  switch (str[0]) {
-    case 0xF0:
      // Overlong four-byte sequence.
-      if (str[1] < 0x90 || str[1] > 0xBF) {
+      length = 0;
-        *cursor += 1;
+    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
-        return kBadChar;
-      }
-      break;
-    case 0xF4:
      // Code points outside of the unicode range.
-      if (str[1] < 0x80 || str[1] > 0x8F) {
+      length = 0;
-        *cursor += 1;
-        return kBadChar;
-      }
-      break;
-    default:
-      if (!IsContinuationCharacter(str[1])) {
-        *cursor += 1;
-        return kBadChar;
    }
  }
-  if (!IsContinuationCharacter(str[2])) {
-    *cursor += 1;
+  if (count != length) {
-    return kBadChar;
+    // All invalid encodings should land here.
-  }
+    *cursor += count;
-  if (!IsContinuationCharacter(str[3])) {
-    *cursor += 1;
    return kBadChar;
  }
-  *cursor += 4;
+  // All errors have been handled, so we only have to assemble the result.
+  *cursor += length;
+  switch (length) {
+    case 1:
+      return str[0];
+    case 2:
+      return ((str[0] << 6) + str[1]) - 0x00003080;
+    case 3:
+      return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
+    case 4:
      return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
             0x03C82080;
+  }
+  UNREACHABLE();
+  return kBadChar;
 }
 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
@@ -323,9 +299,10 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
      // with one shift.
      uint8_t mask = 0x7f >> kind;
-      // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
+      // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
-      // in the bottom three.
+      // in 2nd nibble, and the value  in the bottom three. The 2nd nibble is
-      *buffer = (kind - 1) << 24 | (next & mask);
+      // intended as a counter about how many bytes are still needed.
+      *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
      return kIncomplete;
    } else {
      // No buffer, and not the start of a 1-byte char (handled at the
@@ -354,15 +331,19 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
    // We're inside of a character, as described by buffer.
    // How many bytes (excluding this one) do we still expect?
-    uint8_t count = (*buffer >> 24) - 1;
+    uint8_t bytes_expected = *buffer >> 28;
+    uint8_t bytes_left = (*buffer >> 24) & 0x0f;
+    bytes_left--;
    // Update the value.
    uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
-    if (count) {
+    if (bytes_left) {
-      *buffer = count << 24 | value;
+      *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
      return kIncomplete;
    } else {
      *buffer = 0;
-      return value;
+      bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
+                                   (bytes_expected == 3 && value < 0x800);
+      return sequence_was_too_long ? kBadChar : value;
    }
  } else {
    // Within a character, but not a continuation character? Then the

--- a/test/cctest/test-parsing.cc
+++ b/test/cctest/test-parsing.cc