Commit 8e0daf78 authored by Andreas Haas, committed by Commit Bot

[wasm] Also kBadChar is a valid utf8 character

The validation of utf8 strings in WebAssembly modules used the character
kBadChar = 0xFFFD to indicate a validation error. However, this
character can appear in a valid utf8 string. This CL fixes this problem
by duplicating some of the code in {Utf8::CalculateValue} and inlining
it directly into Utf8::Validate. Note that Utf8::Validate is used only
for WebAssembly.

Tests for this change are in the WebAssembly spec tests, which I will
update in a separate CL.

R=vogelheim@chromium.org

Change-Id: I8697b9299f3e98a8eafdf193bff8bdff90efd7dc
Reviewed-on: https://chromium-review.googlesource.com/509534
Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org>
Commit-Queue: Andreas Haas <ahaas@chromium.org>
Cr-Commit-Position: refs/heads/master@{#45476}
parent a4eb80f7
......@@ -360,17 +360,51 @@ uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
}
}
bool Utf8::Validate(const byte* bytes, size_t length) {
size_t cursor = 0;
bool Utf8::ValidateEncoding(const byte* bytes, size_t length) {
const byte* cursor = bytes;
const byte* end = bytes + length;
while (cursor < end) {
// Skip over single-byte values.
if (*cursor <= kMaxOneByteChar) {
++cursor;
continue;
}
// Performance optimization: Skip over single-byte values first.
while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {
++cursor;
}
// Get the length of the character.
size_t seq_length = NonASCIISequenceLength(*cursor);
// For some invalid characters NonASCIISequenceLength returns 0.
if (seq_length == 0) return false;
const byte* char_end = cursor + seq_length;
while (cursor < length) {
uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);
if (!IsValidCharacter(c)) return false;
// Return false if we do not have enough bytes for the character.
if (char_end > end) return false;
// Check if the bytes of the character are continuation bytes.
for (const byte* i = cursor + 1; i < char_end; ++i) {
if (!IsContinuationCharacter(*i)) return false;
}
// Check overly long sequences & other conditions.
if (seq_length == 3) {
if (cursor[0] == 0xE0 && (cursor[1] < 0xA0 || cursor[1] > 0xBF)) {
// Overlong three-byte sequence?
return false;
} else if (cursor[0] == 0xED && (cursor[1] < 0x80 || cursor[1] > 0x9F)) {
// High and low surrogate halves?
return false;
}
} else if (seq_length == 4) {
if (cursor[0] == 0xF0 && (cursor[1] < 0x90 || cursor[1] > 0xBF)) {
// Overlong four-byte sequence.
return false;
} else if (cursor[0] == 0xF4 && (cursor[1] < 0x80 || cursor[1] > 0x8F)) {
// Code points outside of the unicode range.
return false;
}
}
cursor = char_end;
}
return true;
}
......
......@@ -166,7 +166,15 @@ class Utf8 {
// Excludes non-characters from the set of valid code points.
static inline bool IsValidCharacter(uchar c);
static bool Validate(const byte* str, size_t length);
// Validates that the input is a valid utf-8 encoding. Unlike the validation
// of JS source code, this function accepts any unicode code point, including
// kBadChar and BOMs.
//
// This method checks for:
// - valid utf-8 encoding (e.g. no over-long encodings),
// - absence of surrogates,
// - valid code point range.
static bool ValidateEncoding(const byte* str, size_t length);
};
struct Uppercase {
......
......@@ -100,7 +100,7 @@ uint32_t consume_string(Decoder& decoder, uint32_t* length, bool validate_utf8,
if (*length > 0) {
decoder.consume_bytes(*length, name);
if (decoder.ok() && validate_utf8 &&
!unibrow::Utf8::Validate(string_start, *length)) {
!unibrow::Utf8::ValidateEncoding(string_start, *length)) {
decoder.errorf(string_start, "%s: no valid UTF-8 string", name);
}
}
......@@ -741,7 +741,7 @@ class ModuleDecoder : public Decoder {
// or out-of-order indexes and non-UTF8 names. You can even assign
// to the same function multiple times (last valid one wins).
if (inner.ok() && function_index < module_->functions.size() &&
unibrow::Utf8::Validate(
unibrow::Utf8::ValidateEncoding(
inner.start() + inner.GetBufferRelativeOffset(name_offset),
name_length)) {
module_->functions[function_index].name_offset = name_offset;
......
......@@ -1137,7 +1137,7 @@ MaybeHandle<String> WasmCompiledModule::ExtractUtf8StringFromModuleBytes(
DCHECK_GE(module_bytes->length(), offset);
DCHECK_GE(module_bytes->length() - offset, size);
// UTF8 validation happens at decode time.
DCHECK(unibrow::Utf8::Validate(
DCHECK(unibrow::Utf8::ValidateEncoding(
reinterpret_cast<const byte*>(module_bytes->GetCharsAddress() + offset),
size));
DCHECK_GE(kMaxInt, offset);
......
......@@ -118,5 +118,4 @@ checkAll(toByteArray("\xff"), true);
checkAll(toByteArray("\xed\xa0\x8f"), true); // surrogate code points
checkAll(toByteArray("\xe0\x82\x80"), true); // overlong sequence
checkAll(toByteArray("\xf4\x90\x80\x80"), true); // beyond limit: U+110000
checkAll(toByteArray("\xef\xbf\xbe"), true); // non-character; U+FFFE
checkAll(toByteArray("with\x00null"), false);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment