Commit 44932c16 authored by Andy Wingo, committed by V8 LUCI CQ

[stringrefs] Parse the string literals section

Bug: v8:12868

Also adds wtf8.cc, wtf8.h to src/wasm, to implement WTF-8 validation and
possibly other utilities.  Also fixes a bug when parsing the string
literals section; I had misunderstood the way the unordered/ordered
sections mechanism worked.

Change-Id: I3c4205e0872379a69575f84ba33e0090a9d8d656
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652789
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80674}
parent 09b7a8b8
......@@ -2547,6 +2547,8 @@ filegroup(
"src/wasm/wasm-subtyping.h",
"src/wasm/wasm-tier.h",
"src/wasm/wasm-value.h",
"src/wasm/wtf8.cc",
"src/wasm/wtf8.h",
],
"//conditions:default": [],
}),
......
......@@ -3608,6 +3608,7 @@ v8_header_set("v8_internal_headers") {
"src/wasm/wasm-subtyping.h",
"src/wasm/wasm-tier.h",
"src/wasm/wasm-value.h",
"src/wasm/wtf8.h",
]
}
......@@ -4664,6 +4665,7 @@ v8_source_set("v8_base_without_compiler") {
"src/wasm/wasm-result.cc",
"src/wasm/wasm-serialization.cc",
"src/wasm/wasm-subtyping.cc",
"src/wasm/wtf8.cc",
]
}
......
......@@ -22,6 +22,7 @@
#include "src/wasm/wasm-engine.h"
#include "src/wasm/wasm-limits.h"
#include "src/wasm/wasm-opcodes-inl.h"
#include "src/wasm/wtf8.h"
namespace v8 {
namespace internal {
......@@ -116,9 +117,11 @@ bool validate_utf8(Decoder* decoder, WireBytesRef string) {
string.length());
}
// Selects which encoding check, if any, consume_string() applies to the bytes
// it has consumed: kNone performs no check, kUtf8 requires valid UTF-8, and
// kWtf8 requires valid WTF-8.
enum class StringValidation { kNone, kUtf8, kWtf8 };
// Reads a length-prefixed string, checking that it is within bounds. Returns
// the offset of the string, and the length as an out parameter.
WireBytesRef consume_string(Decoder* decoder, bool validate_utf8,
WireBytesRef consume_string(Decoder* decoder, StringValidation validation,
const char* name) {
uint32_t length = decoder->consume_u32v("string length");
uint32_t offset = decoder->pc_offset();
......@@ -126,17 +129,33 @@ WireBytesRef consume_string(Decoder* decoder, bool validate_utf8,
// Consume bytes before validation to guarantee that the string is not oob.
if (length > 0) {
decoder->consume_bytes(length, name);
if (decoder->ok() && validate_utf8 &&
!unibrow::Utf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid UTF-8 string", name);
if (decoder->ok()) {
switch (validation) {
case StringValidation::kNone:
break;
case StringValidation::kUtf8:
if (!unibrow::Utf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid UTF-8 string", name);
}
break;
case StringValidation::kWtf8:
if (!Wtf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid WTF-8 string", name);
}
break;
}
}
}
return {offset, decoder->failed() ? 0 : length};
}
// Convenience wrapper around consume_string() for the common case: reads a
// length-prefixed string called {name} from {decoder} and requests UTF-8
// validation of its bytes.
WireBytesRef consume_utf8_string(Decoder* decoder, const char* name) {
  constexpr StringValidation kValidation = StringValidation::kUtf8;
  return consume_string(decoder, kValidation, name);
}
namespace {
SectionCode IdentifyUnknownSectionInternal(Decoder* decoder) {
WireBytesRef string = consume_string(decoder, true, "section name");
WireBytesRef string = consume_utf8_string(decoder, "section name");
if (decoder->failed()) {
return kUnknownSectionCode;
}
......@@ -422,19 +441,18 @@ class ModuleDecoderImpl : public Decoder {
break;
case kTagSectionCode: {
if (!CheckUnorderedSection(section_code)) return;
SectionCode next = enabled_features_.has_stringref()
? kStringRefSectionCode
: kGlobalSectionCode;
if (!CheckSectionOrder(section_code, kMemorySectionCode, next)) {
if (!CheckSectionOrder(section_code, kMemorySectionCode,
kGlobalSectionCode)) {
return;
}
break;
}
case kStringRefSectionCode: {
// TODO(12868): If there's a tag section, assert that we're after the
// tag section.
if (!CheckUnorderedSection(section_code)) return;
SectionCode prev =
enabled_features_.has_eh() ? kTagSectionCode : kMemorySectionCode;
if (!CheckSectionOrder(section_code, prev, kGlobalSectionCode)) {
if (!CheckSectionOrder(section_code, kMemorySectionCode,
kGlobalSectionCode)) {
return;
}
break;
......@@ -812,8 +830,8 @@ class ModuleDecoderImpl : public Decoder {
});
WasmImport* import = &module_->import_table.back();
const byte* pos = pc_;
import->module_name = consume_string(this, true, "module name");
import->field_name = consume_string(this, true, "field name");
import->module_name = consume_utf8_string(this, "module name");
import->field_name = consume_utf8_string(this, "field name");
import->kind =
static_cast<ImportExportKindCode>(consume_u8("import kind"));
switch (import->kind) {
......@@ -999,7 +1017,7 @@ class ModuleDecoderImpl : public Decoder {
});
WasmExport* exp = &module_->export_table.back();
exp->name = consume_string(this, true, "field name");
exp->name = consume_utf8_string(this, "field name");
const byte* pos = pc();
exp->kind = static_cast<ImportExportKindCode>(consume_u8("export kind"));
......@@ -1250,7 +1268,8 @@ class ModuleDecoderImpl : public Decoder {
// Decode module name, ignore the rest.
// Function and local names will be decoded when needed.
if (name_type == NameSectionKindCode::kModuleCode) {
WireBytesRef name = consume_string(&inner, false, "module name");
WireBytesRef name =
consume_string(&inner, StringValidation::kNone, "module name");
if (inner.ok() && validate_utf8(&inner, name)) {
module_->name = name;
}
......@@ -1265,7 +1284,7 @@ class ModuleDecoderImpl : public Decoder {
void DecodeSourceMappingURLSection() {
Decoder inner(start_, pc_, end_, buffer_offset_);
WireBytesRef url = wasm::consume_string(&inner, true, "module name");
WireBytesRef url = wasm::consume_utf8_string(&inner, "module name");
if (inner.ok() &&
module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) {
module_->debug_symbols = {WasmDebugSymbols::Type::SourceMap, url};
......@@ -1277,7 +1296,7 @@ class ModuleDecoderImpl : public Decoder {
void DecodeExternalDebugInfoSection() {
Decoder inner(start_, pc_, end_, buffer_offset_);
WireBytesRef url =
wasm::consume_string(&inner, true, "external symbol file");
wasm::consume_utf8_string(&inner, "external symbol file");
// If there is an explicit source map, prefer it over DWARF info.
if (inner.ok() &&
module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) {
......@@ -1475,7 +1494,23 @@ class ModuleDecoderImpl : public Decoder {
}
}
void DecodeStringRefSection() { UNIMPLEMENTED(); }
void DecodeStringRefSection() {
uint32_t deferred = consume_count("deferred string literal count",
kV8MaxWasmStringLiterals);
if (deferred) {
errorf(pc(), "Invalid deferred string literal count %u (expected 0)",
deferred);
}
uint32_t immediate = consume_count("string literal count",
kV8MaxWasmStringLiterals - deferred);
for (uint32_t i = 0; ok() && i < immediate; ++i) {
TRACE("DecodeStringLiteral[%d] module+%d\n", i,
static_cast<int>(pc_ - start_));
WireBytesRef pos =
wasm::consume_string(this, StringValidation::kWtf8, "string literal");
module_->stringref_literals.emplace_back(pos);
}
}
bool CheckMismatchedCounts() {
// The declared vs. defined function count is normally checked when
......@@ -2527,7 +2562,8 @@ void DecodeFunctionNames(const byte* module_start, const byte* module_end,
for (; decoder.ok() && functions_count > 0; --functions_count) {
uint32_t function_index = decoder.consume_u32v("function index");
WireBytesRef name = consume_string(&decoder, false, "function name");
WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "function name");
// Be lenient with errors in the name section: Ignore non-UTF8 names.
// You can even assign to the same function multiple times (last valid
......@@ -2561,7 +2597,8 @@ NameMap DecodeNameMap(base::Vector<const uint8_t> module_bytes,
uint32_t count = decoder.consume_u32v("names count");
for (uint32_t i = 0; i < count; i++) {
uint32_t index = decoder.consume_u32v("index");
WireBytesRef name = consume_string(&decoder, false, "name");
WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
if (!decoder.ok()) break;
if (index > kMaxInt) continue;
if (!validate_utf8(&decoder, name)) continue;
......@@ -2598,7 +2635,8 @@ IndirectNameMap DecodeIndirectNameMap(base::Vector<const uint8_t> module_bytes,
uint32_t inner_count = decoder.consume_u32v("inner count");
for (uint32_t k = 0; k < inner_count; ++k) {
uint32_t inner_index = decoder.consume_u32v("inner index");
WireBytesRef name = consume_string(&decoder, false, "name");
WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
if (!decoder.ok()) break;
if (inner_index > kMaxInt) continue;
// Ignore non-utf8 names.
......
......@@ -60,6 +60,9 @@ constexpr size_t kV8MaxWasmStructFields = 999;
constexpr uint32_t kV8MaxRttSubtypingDepth = 31;
constexpr size_t kV8MaxWasmArrayInitLength = 10000;
// Stringref proposal. This limit is not standardized yet.
constexpr size_t kV8MaxWasmStringLiterals = 1000000;
static_assert(kV8MaxWasmTableSize <= 4294967295, // 2^32 - 1
"v8 should not exceed WebAssembly's non-web embedding limits");
static_assert(kV8MaxWasmTableInitEntries <= kV8MaxWasmTableSize,
......
......@@ -196,8 +196,8 @@ struct WasmTag {
// Static representation of a wasm literal stringref.
struct WasmStringRefLiteral {
explicit WasmStringRefLiteral(uint32_t offset) : offset(offset) {}
uint32_t offset; // Offset into string literals table.
explicit WasmStringRefLiteral(const WireBytesRef& source) : source(source) {}
WireBytesRef source; // start offset in the module bytes.
};
// Static representation of a wasm data segment.
......
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/wasm/wtf8.h"
#include "src/strings/unicode.h"
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
namespace v8 {
namespace internal {
namespace wasm {
bool Wtf8::ValidateEncoding(const byte* bytes, size_t length) {
auto state = GeneralizedUtf8DfaDecoder::kAccept;
uint32_t current = 0;
uint32_t previous = 0;
for (size_t i = 0; i < length; i++) {
GeneralizedUtf8DfaDecoder::Decode(bytes[i], &state, &current);
if (state == GeneralizedUtf8DfaDecoder::kReject) return false;
if (state == GeneralizedUtf8DfaDecoder::kAccept) {
if (unibrow::Utf16::IsTrailSurrogate(current) &&
unibrow::Utf16::IsLeadSurrogate(previous)) {
return false;
}
previous = current;
current = 0;
}
}
return state == GeneralizedUtf8DfaDecoder::kAccept;
}
} // namespace wasm
} // namespace internal
} // namespace v8
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif // !V8_ENABLE_WEBASSEMBLY
#ifndef V8_WASM_WTF8_H_
#define V8_WASM_WTF8_H_
#include <cinttypes>
#include <cstdarg>
#include <memory>
#include "src/strings/unicode.h"
namespace v8 {
namespace internal {
namespace wasm {
using byte = unibrow::byte;
// Holder for WTF-8 related utilities; currently only encoding validation.
class Wtf8 {
public:
// Validate that the input has a valid WTF-8 encoding.
//
// This method checks for:
// - valid utf-8 encoding (e.g. no over-long encodings),
// - absence of surrogate pairs,
// - valid code point range.
//
// In terms of the WTF-8 specification (https://simonsapin.github.io/wtf-8/),
// this function checks for a valid "generalized UTF-8" sequence, with the
// additional constraint that surrogate pairs are not allowed.
//
// Returns true if the {length} bytes starting at {str} pass these checks and
// false otherwise. Empty input (length 0) is valid. Only a lead surrogate
// immediately followed by a trail surrogate counts as a surrogate pair. Input
// that leaves the decoder in a non-accepting state at its end (presumably a
// truncated multi-byte sequence) is rejected.
static bool ValidateEncoding(const byte* str, size_t length);
};
} // namespace wasm
} // namespace internal
} // namespace v8
#endif // V8_WASM_WTF8_H_
......@@ -21,8 +21,7 @@ function assertInvalid(fn, message) {
`WebAssembly.Module(): ${message}`);
}
// TODO(wingo): Enable when we start parsing string literal sections.
// assertValid(builder => builder.addLiteralStringRef("foo"));
assertValid(builder => builder.addLiteralStringRef("foo"));
for (let [name, code] of [['string', kWasmStringRef],
['stringview_wtf8', kWasmStringViewWtf8],
......@@ -86,12 +85,11 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32],
kGCPrefix, kExprStringNewWtf16, 0
]);
// TODO(wingo): Enable when we start parting string literal sections.
// builder.addFunction("string.const", kSig_w_v)
// .addLiteralStringRef("foo")
// .addBody([
// kGCPrefix, kExprStringConst, 0
// ]);
builder.addLiteralStringRef("foo");
builder.addFunction("string.const", kSig_w_v)
.addBody([
kGCPrefix, kExprStringConst, 0
]);
builder.addFunction("string.measure_utf8", kSig_i_w)
.addBody([
......@@ -244,6 +242,16 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32],
assertTrue(WebAssembly.validate(builder.toBuffer()));
})();
assertInvalid(
builder => {
builder.addFunction("string.const/bad-index", kSig_w_v)
.addBody([
kGCPrefix, kExprStringConst, 0
]);
},
"Compiling function #0:\"string.const/bad-index\" failed: " +
"Invalid string literal index: 0 @+26");
assertInvalid(
builder => {
builder.addFunction("string.new_wtf8/no-mem", kSig_w_ii)
......
......@@ -1945,6 +1945,7 @@ class WasmModuleBuilder {
if (wasm.stringrefs.length > 0) {
if (debug) print('emitting stringrefs @ ' + binary.length);
binary.emit_section(kStringRefSectionCode, section => {
section.emit_u32v(0);
section.emit_u32v(wasm.stringrefs.length);
for (let str of wasm.stringrefs) {
section.emit_string(str);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment