[wasm] Also kBadChar is a valid utf8 character

The validation of utf8 strings in WebAssembly modules used the character kBadChar = 0xFFFD to indicate a validation error. However, this character can appear in a valid utf8 string. This CL fixes this problem by duplicating some of the code in {Utf8::CalculateValue} and inlining it directly into Utf8::Validate. Note that Utf8::Validate is used only for WebAssembly. Tests for this change are in the WebAssembly spec tests, which I will update in a separate CL. R=vogelheim@chromium.org Change-Id: I8697b9299f3e98a8eafdf193bff8bdff90efd7dc Reviewed-on: https://chromium-review.googlesource.com/509534 Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org> Commit-Queue: Andreas Haas <ahaas@chromium.org> Cr-Commit-Position: refs/heads/master@{#45476}

[wasm] Also kBadChar is a valid utf8 character
The validation of utf8 strings in WebAssembly modules used the character kBadChar = 0xFFFD to indicate a validation error. However, this character can appear in a valid utf8 string. This CL fixes this problem by duplicating some of the code in {Utf8::CalculateValue} and inlining it directly into Utf8::Validate. Note that Utf8::Validate is used only for WebAssembly. Tests for this change are in the WebAssembly spec tests, which I will update in a separate CL. R=vogelheim@chromium.org Change-Id: I8697b9299f3e98a8eafdf193bff8bdff90efd7dc Reviewed-on: https://chromium-review.googlesource.com/509534 Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org> Commit-Queue: Andreas Haas <ahaas@chromium.org> Cr-Commit-Position: refs/heads/master@{#45476}
8e0daf78 · Andreas Haas · Commit Bot · a4eb80f7 · 8e0daf78 · 8e0daf78
Commit 8e0daf78 authored May 22, 2017 by Andreas Haas Committed by Commit Bot May 23, 2017
5 changed files
--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -360,17 +360,51 @@ uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
  }
 }
-bool Utf8::Validate(const byte* bytes, size_t length) {
+bool Utf8::ValidateEncoding(const byte* bytes, size_t length) {
-  size_t cursor = 0;
+  const byte* cursor = bytes;
+  const byte* end = bytes + length;
-  // Performance optimization: Skip over single-byte values first.
+  while (cursor < end) {
-  while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {
+    // Skip over single-byte values.
+    if (*cursor <= kMaxOneByteChar) {
      ++cursor;
+      continue;
    }
-  while (cursor < length) {
+    // Get the length of the character.
-    uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);
+    size_t seq_length = NonASCIISequenceLength(*cursor);
-    if (!IsValidCharacter(c)) return false;
+    // For some invalid characters NonASCIISequenceLength returns 0.
+    if (seq_length == 0) return false;
+    const byte* char_end = cursor + seq_length;
+    // Return false if we do not have enough bytes for the character.
+    if (char_end > end) return false;
+    // Check if the bytes of the character are continuation bytes.
+    for (const byte* i = cursor + 1; i < char_end; ++i) {
+      if (!IsContinuationCharacter(*i)) return false;
+    }
+    // Check overly long sequences & other conditions.
+    if (seq_length == 3) {
+      if (cursor[0] == 0xE0 && (cursor[1] < 0xA0 || cursor[1] > 0xBF)) {
+        // Overlong three-byte sequence?
+        return false;
+      } else if (cursor[0] == 0xED && (cursor[1] < 0x80 || cursor[1] > 0x9F)) {
+        // High and low surrogate halves?
+        return false;
+      }
+    } else if (seq_length == 4) {
+      if (cursor[0] == 0xF0 && (cursor[1] < 0x90 || cursor[1] > 0xBF)) {
+        // Overlong four-byte sequence.
+        return false;
+      } else if (cursor[0] == 0xF4 && (cursor[1] < 0x80 || cursor[1] > 0x8F)) {
+        // Code points outside of the unicode range.
+        return false;
+      }
+    }
+    cursor = char_end;
  }
  return true;
 }

--- a/src/unicode.h
+++ b/src/unicode.h
@@ -166,7 +166,15 @@ class Utf8 {
  // Excludes non-characters from the set of valid code points.
  static inline bool IsValidCharacter(uchar c);
-  static bool Validate(const byte* str, size_t length);
+  // Validate if the input has a valid utf-8 encoding. Unlike JS source code
+  // this validation function will accept any unicode code point, including
+  // kBadChar and BOMs.
+  //
+  // This method checks for:
+  // - valid utf-8 encoding (e.g. no over-long encodings),
+  // - absence of surrogates,
+  // - valid code point range.
+  static bool ValidateEncoding(const byte* str, size_t length);
 };
 struct Uppercase {

--- a/src/wasm/module-decoder.cc
+++ b/src/wasm/module-decoder.cc
@@ -100,7 +100,7 @@ uint32_t consume_string(Decoder& decoder, uint32_t* length, bool validate_utf8,
  if (*length > 0) {
    decoder.consume_bytes(*length, name);
    if (decoder.ok() && validate_utf8 &&
-        !unibrow::Utf8::Validate(string_start, *length)) {
+        !unibrow::Utf8::ValidateEncoding(string_start, *length)) {
      decoder.errorf(string_start, "%s: no valid UTF-8 string", name);
    }
  }
@@ -741,7 +741,7 @@ class ModuleDecoder : public Decoder {
          // or out-of-order indexes and non-UTF8 names. You can even assign
          // to the same function multiple times (last valid one wins).
          if (inner.ok() && function_index < module_->functions.size() &&
-              unibrow::Utf8::Validate(
+              unibrow::Utf8::ValidateEncoding(
                  inner.start() + inner.GetBufferRelativeOffset(name_offset),
                  name_length)) {
            module_->functions[function_index].name_offset = name_offset;

--- a/src/wasm/wasm-objects.cc
+++ b/src/wasm/wasm-objects.cc
@@ -1137,7 +1137,7 @@ MaybeHandle<String> WasmCompiledModule::ExtractUtf8StringFromModuleBytes(
  DCHECK_GE(module_bytes->length(), offset);
  DCHECK_GE(module_bytes->length() - offset, size);
  // UTF8 validation happens at decode time.
-  DCHECK(unibrow::Utf8::Validate(
+  DCHECK(unibrow::Utf8::ValidateEncoding(
      reinterpret_cast<const byte*>(module_bytes->GetCharsAddress() + offset),
      size));
  DCHECK_GE(kMaxInt, offset);

--- a/test/mjsunit/wasm/unicode-validation.js
+++ b/test/mjsunit/wasm/unicode-validation.js
@@ -118,5 +118,4 @@ checkAll(toByteArray("\xff"), true);
 checkAll(toByteArray("\xed\xa0\x8f"), true);        // surrogate code points
 checkAll(toByteArray("\xe0\x82\x80"), true);        // overlong sequence
 checkAll(toByteArray("\xf4\x90\x80\x80"), true);    // beyond limit: U+110000
-checkAll(toByteArray("\xef\xbf\xbe"), true);        // non-character; U+FFFE
 checkAll(toByteArray("with\x00null"), false);