Fix out-of-range access in unibrow::Utf8::CalculateValue.

This code should not access bytes out of the permitted range in order to check the range of a possible UTF-8 value. Instead, the length check should occur before such checks. BUG=chromium:667260, chromium:662822 Review-Url: https://codereview.chromium.org/2520053003 Cr-Commit-Position: refs/heads/master@{#41165}

Fix out-of-range access in unibrow::Utf8::CalculateValue.
This code should not access bytes out of the permitted range in order to check the range of a possible UTF-8 value. Instead, the length check should occur before such checks. BUG=chromium:667260, chromium:662822 Review-Url: https://codereview.chromium.org/2520053003 Cr-Commit-Position: refs/heads/master@{#41165}
9d524bd3 · jbroman · Commit bot · 8c4988f7 · 9d524bd3 · 9d524bd3
Commit 9d524bd3 authored Nov 22, 2016 by jbroman Committed by Commit bot Nov 22, 2016
5 changed files
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@@ -7,10 +7,11 @@
 #include <sys/types.h>
 #include "src/globals.h"
+#include "src/utils.h"
 namespace unibrow {
-class Utf8DecoderBase {
+class V8_EXPORT_PRIVATE Utf8DecoderBase {
 public:
  // Initialization done in subclass.
  inline Utf8DecoderBase();

--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -235,35 +235,31 @@ uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
  while (count < max_count && IsContinuationCharacter(str[count])) {
    count++;
  }
+  *cursor += count;
-  // Check overly long sequences & other conditions. Use length as error
+  // There must be enough continuation characters.
-  // indicator.
+  if (count != length) return kBadChar;
+  // Check overly long sequences & other conditions.
  if (length == 3) {
    if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
      // Overlong three-byte sequence?
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
      // High and low surrogate halves?
-      length = 0;
+      return kBadChar;
    }
  } else if (length == 4) {
    if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
      // Overlong four-byte sequence.
-      length = 0;
+      return kBadChar;
    } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
      // Code points outside of the unicode range.
-      length = 0;
+      return kBadChar;
    }
  }
-  if (count != length) {
-    // All invalid encodings should land here.
-    *cursor += count;
-    return kBadChar;
-  }
  // All errors have been handled, so we only have to assemble the result.
-  *cursor += length;
  switch (length) {
    case 1:
      return str[0];

--- a/test/unittests/BUILD.gn
+++ b/test/unittests/BUILD.gn
@@ -120,6 +120,7 @@ v8_executable("unittests") {
    "source-position-table-unittest.cc",
    "test-utils.cc",
    "test-utils.h",
+    "unicode-unittest.cc",
    "value-serializer-unittest.cc",
    "wasm/asm-types-unittest.cc",
    "wasm/ast-decoder-unittest.cc",

--- a/test/unittests/unicode-unittest.cc
+++ b/test/unittests/unicode-unittest.cc
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <memory>
+#include <string>
+#include "src/unicode-decoder.h"
+#include "testing/gtest/include/gtest/gtest.h"
+namespace v8 {
+namespace internal {
+namespace {
+using Utf8Decoder = unibrow::Utf8Decoder<512>;
+void Decode(Utf8Decoder* decoder, const std::string& str) {
+  // Put the string in its own buffer on the heap to make sure that
+  // AddressSanitizer's heap-buffer-overflow logic can see what's going on.
+  std::unique_ptr<char[]> buffer(new char[str.length()]);
+  memcpy(buffer.get(), str.data(), str.length());
+  decoder->Reset(buffer.get(), str.length());
+}
+}  // namespace
+TEST(UnicodeTest, ReadOffEndOfUtf8String) {
+  Utf8Decoder decoder;
+  // Not enough continuation bytes before string ends.
+  Decode(&decoder, "\xE0");
+  Decode(&decoder, "\xED");
+  Decode(&decoder, "\xF0");
+  Decode(&decoder, "\xF4");
+}
+}  // namespace internal
+}  // namespace v8
--- a/test/unittests/unittests.gyp
+++ b/test/unittests/unittests.gyp
@@ -118,6 +118,7 @@
      'source-position-table-unittest.cc',
      'test-utils.h',
      'test-utils.cc',
+      'unicode-unittest.cc',
      'value-serializer-unittest.cc',
      'zone/segmentpool-unittest.cc',
      'zone/zone-chunk-list-unittest.cc',