Commit 44932c16 authored by Andy Wingo, committed by V8 LUCI CQ

[stringrefs] Parse the string literals section

Bug: v8:12868

Also adds wtf8.cc, wtf8.h to src/wasm, to implement WTF-8 validation and
possibly other utilities.  Also fixes a bug when parsing the string
literals section; I had misunderstood the way the unordered/ordered
sections mechanism worked.

Change-Id: I3c4205e0872379a69575f84ba33e0090a9d8d656
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652789
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80674}
parent 09b7a8b8
...@@ -2547,6 +2547,8 @@ filegroup( ...@@ -2547,6 +2547,8 @@ filegroup(
"src/wasm/wasm-subtyping.h", "src/wasm/wasm-subtyping.h",
"src/wasm/wasm-tier.h", "src/wasm/wasm-tier.h",
"src/wasm/wasm-value.h", "src/wasm/wasm-value.h",
"src/wasm/wtf8.cc",
"src/wasm/wtf8.h",
], ],
"//conditions:default": [], "//conditions:default": [],
}), }),
......
...@@ -3608,6 +3608,7 @@ v8_header_set("v8_internal_headers") { ...@@ -3608,6 +3608,7 @@ v8_header_set("v8_internal_headers") {
"src/wasm/wasm-subtyping.h", "src/wasm/wasm-subtyping.h",
"src/wasm/wasm-tier.h", "src/wasm/wasm-tier.h",
"src/wasm/wasm-value.h", "src/wasm/wasm-value.h",
"src/wasm/wtf8.h",
] ]
} }
...@@ -4664,6 +4665,7 @@ v8_source_set("v8_base_without_compiler") { ...@@ -4664,6 +4665,7 @@ v8_source_set("v8_base_without_compiler") {
"src/wasm/wasm-result.cc", "src/wasm/wasm-result.cc",
"src/wasm/wasm-serialization.cc", "src/wasm/wasm-serialization.cc",
"src/wasm/wasm-subtyping.cc", "src/wasm/wasm-subtyping.cc",
"src/wasm/wtf8.cc",
] ]
} }
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "src/wasm/wasm-engine.h" #include "src/wasm/wasm-engine.h"
#include "src/wasm/wasm-limits.h" #include "src/wasm/wasm-limits.h"
#include "src/wasm/wasm-opcodes-inl.h" #include "src/wasm/wasm-opcodes-inl.h"
#include "src/wasm/wtf8.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -116,9 +117,11 @@ bool validate_utf8(Decoder* decoder, WireBytesRef string) { ...@@ -116,9 +117,11 @@ bool validate_utf8(Decoder* decoder, WireBytesRef string) {
string.length()); string.length());
} }
enum class StringValidation { kNone, kUtf8, kWtf8 };
// Reads a length-prefixed string, checking that it is within bounds. Returns // Reads a length-prefixed string, checking that it is within bounds. Returns
// the offset of the string, and the length as an out parameter. // the offset of the string, and the length as an out parameter.
WireBytesRef consume_string(Decoder* decoder, bool validate_utf8, WireBytesRef consume_string(Decoder* decoder, StringValidation validation,
const char* name) { const char* name) {
uint32_t length = decoder->consume_u32v("string length"); uint32_t length = decoder->consume_u32v("string length");
uint32_t offset = decoder->pc_offset(); uint32_t offset = decoder->pc_offset();
...@@ -126,17 +129,33 @@ WireBytesRef consume_string(Decoder* decoder, bool validate_utf8, ...@@ -126,17 +129,33 @@ WireBytesRef consume_string(Decoder* decoder, bool validate_utf8,
// Consume bytes before validation to guarantee that the string is not oob. // Consume bytes before validation to guarantee that the string is not oob.
if (length > 0) { if (length > 0) {
decoder->consume_bytes(length, name); decoder->consume_bytes(length, name);
if (decoder->ok() && validate_utf8 && if (decoder->ok()) {
!unibrow::Utf8::ValidateEncoding(string_start, length)) { switch (validation) {
decoder->errorf(string_start, "%s: no valid UTF-8 string", name); case StringValidation::kNone:
break;
case StringValidation::kUtf8:
if (!unibrow::Utf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid UTF-8 string", name);
}
break;
case StringValidation::kWtf8:
if (!Wtf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid WTF-8 string", name);
}
break;
}
} }
} }
return {offset, decoder->failed() ? 0 : length}; return {offset, decoder->failed() ? 0 : length};
} }
WireBytesRef consume_utf8_string(Decoder* decoder, const char* name) {
return consume_string(decoder, StringValidation::kUtf8, name);
}
namespace { namespace {
SectionCode IdentifyUnknownSectionInternal(Decoder* decoder) { SectionCode IdentifyUnknownSectionInternal(Decoder* decoder) {
WireBytesRef string = consume_string(decoder, true, "section name"); WireBytesRef string = consume_utf8_string(decoder, "section name");
if (decoder->failed()) { if (decoder->failed()) {
return kUnknownSectionCode; return kUnknownSectionCode;
} }
...@@ -422,19 +441,18 @@ class ModuleDecoderImpl : public Decoder { ...@@ -422,19 +441,18 @@ class ModuleDecoderImpl : public Decoder {
break; break;
case kTagSectionCode: { case kTagSectionCode: {
if (!CheckUnorderedSection(section_code)) return; if (!CheckUnorderedSection(section_code)) return;
SectionCode next = enabled_features_.has_stringref() if (!CheckSectionOrder(section_code, kMemorySectionCode,
? kStringRefSectionCode kGlobalSectionCode)) {
: kGlobalSectionCode;
if (!CheckSectionOrder(section_code, kMemorySectionCode, next)) {
return; return;
} }
break; break;
} }
case kStringRefSectionCode: { case kStringRefSectionCode: {
// TODO(12868): If there's a tag section, assert that we're after the
// tag section.
if (!CheckUnorderedSection(section_code)) return; if (!CheckUnorderedSection(section_code)) return;
SectionCode prev = if (!CheckSectionOrder(section_code, kMemorySectionCode,
enabled_features_.has_eh() ? kTagSectionCode : kMemorySectionCode; kGlobalSectionCode)) {
if (!CheckSectionOrder(section_code, prev, kGlobalSectionCode)) {
return; return;
} }
break; break;
...@@ -812,8 +830,8 @@ class ModuleDecoderImpl : public Decoder { ...@@ -812,8 +830,8 @@ class ModuleDecoderImpl : public Decoder {
}); });
WasmImport* import = &module_->import_table.back(); WasmImport* import = &module_->import_table.back();
const byte* pos = pc_; const byte* pos = pc_;
import->module_name = consume_string(this, true, "module name"); import->module_name = consume_utf8_string(this, "module name");
import->field_name = consume_string(this, true, "field name"); import->field_name = consume_utf8_string(this, "field name");
import->kind = import->kind =
static_cast<ImportExportKindCode>(consume_u8("import kind")); static_cast<ImportExportKindCode>(consume_u8("import kind"));
switch (import->kind) { switch (import->kind) {
...@@ -999,7 +1017,7 @@ class ModuleDecoderImpl : public Decoder { ...@@ -999,7 +1017,7 @@ class ModuleDecoderImpl : public Decoder {
}); });
WasmExport* exp = &module_->export_table.back(); WasmExport* exp = &module_->export_table.back();
exp->name = consume_string(this, true, "field name"); exp->name = consume_utf8_string(this, "field name");
const byte* pos = pc(); const byte* pos = pc();
exp->kind = static_cast<ImportExportKindCode>(consume_u8("export kind")); exp->kind = static_cast<ImportExportKindCode>(consume_u8("export kind"));
...@@ -1250,7 +1268,8 @@ class ModuleDecoderImpl : public Decoder { ...@@ -1250,7 +1268,8 @@ class ModuleDecoderImpl : public Decoder {
// Decode module name, ignore the rest. // Decode module name, ignore the rest.
// Function and local names will be decoded when needed. // Function and local names will be decoded when needed.
if (name_type == NameSectionKindCode::kModuleCode) { if (name_type == NameSectionKindCode::kModuleCode) {
WireBytesRef name = consume_string(&inner, false, "module name"); WireBytesRef name =
consume_string(&inner, StringValidation::kNone, "module name");
if (inner.ok() && validate_utf8(&inner, name)) { if (inner.ok() && validate_utf8(&inner, name)) {
module_->name = name; module_->name = name;
} }
...@@ -1265,7 +1284,7 @@ class ModuleDecoderImpl : public Decoder { ...@@ -1265,7 +1284,7 @@ class ModuleDecoderImpl : public Decoder {
void DecodeSourceMappingURLSection() { void DecodeSourceMappingURLSection() {
Decoder inner(start_, pc_, end_, buffer_offset_); Decoder inner(start_, pc_, end_, buffer_offset_);
WireBytesRef url = wasm::consume_string(&inner, true, "module name"); WireBytesRef url = wasm::consume_utf8_string(&inner, "module name");
if (inner.ok() && if (inner.ok() &&
module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) { module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) {
module_->debug_symbols = {WasmDebugSymbols::Type::SourceMap, url}; module_->debug_symbols = {WasmDebugSymbols::Type::SourceMap, url};
...@@ -1277,7 +1296,7 @@ class ModuleDecoderImpl : public Decoder { ...@@ -1277,7 +1296,7 @@ class ModuleDecoderImpl : public Decoder {
void DecodeExternalDebugInfoSection() { void DecodeExternalDebugInfoSection() {
Decoder inner(start_, pc_, end_, buffer_offset_); Decoder inner(start_, pc_, end_, buffer_offset_);
WireBytesRef url = WireBytesRef url =
wasm::consume_string(&inner, true, "external symbol file"); wasm::consume_utf8_string(&inner, "external symbol file");
// If there is an explicit source map, prefer it over DWARF info. // If there is an explicit source map, prefer it over DWARF info.
if (inner.ok() && if (inner.ok() &&
module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) { module_->debug_symbols.type != WasmDebugSymbols::Type::SourceMap) {
...@@ -1475,7 +1494,23 @@ class ModuleDecoderImpl : public Decoder { ...@@ -1475,7 +1494,23 @@ class ModuleDecoderImpl : public Decoder {
} }
} }
void DecodeStringRefSection() { UNIMPLEMENTED(); } void DecodeStringRefSection() {
uint32_t deferred = consume_count("deferred string literal count",
kV8MaxWasmStringLiterals);
if (deferred) {
errorf(pc(), "Invalid deferred string literal count %u (expected 0)",
deferred);
}
uint32_t immediate = consume_count("string literal count",
kV8MaxWasmStringLiterals - deferred);
for (uint32_t i = 0; ok() && i < immediate; ++i) {
TRACE("DecodeStringLiteral[%d] module+%d\n", i,
static_cast<int>(pc_ - start_));
WireBytesRef pos =
wasm::consume_string(this, StringValidation::kWtf8, "string literal");
module_->stringref_literals.emplace_back(pos);
}
}
bool CheckMismatchedCounts() { bool CheckMismatchedCounts() {
// The declared vs. defined function count is normally checked when // The declared vs. defined function count is normally checked when
...@@ -2527,7 +2562,8 @@ void DecodeFunctionNames(const byte* module_start, const byte* module_end, ...@@ -2527,7 +2562,8 @@ void DecodeFunctionNames(const byte* module_start, const byte* module_end,
for (; decoder.ok() && functions_count > 0; --functions_count) { for (; decoder.ok() && functions_count > 0; --functions_count) {
uint32_t function_index = decoder.consume_u32v("function index"); uint32_t function_index = decoder.consume_u32v("function index");
WireBytesRef name = consume_string(&decoder, false, "function name"); WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "function name");
// Be lenient with errors in the name section: Ignore non-UTF8 names. // Be lenient with errors in the name section: Ignore non-UTF8 names.
// You can even assign to the same function multiple times (last valid // You can even assign to the same function multiple times (last valid
...@@ -2561,7 +2597,8 @@ NameMap DecodeNameMap(base::Vector<const uint8_t> module_bytes, ...@@ -2561,7 +2597,8 @@ NameMap DecodeNameMap(base::Vector<const uint8_t> module_bytes,
uint32_t count = decoder.consume_u32v("names count"); uint32_t count = decoder.consume_u32v("names count");
for (uint32_t i = 0; i < count; i++) { for (uint32_t i = 0; i < count; i++) {
uint32_t index = decoder.consume_u32v("index"); uint32_t index = decoder.consume_u32v("index");
WireBytesRef name = consume_string(&decoder, false, "name"); WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
if (!decoder.ok()) break; if (!decoder.ok()) break;
if (index > kMaxInt) continue; if (index > kMaxInt) continue;
if (!validate_utf8(&decoder, name)) continue; if (!validate_utf8(&decoder, name)) continue;
...@@ -2598,7 +2635,8 @@ IndirectNameMap DecodeIndirectNameMap(base::Vector<const uint8_t> module_bytes, ...@@ -2598,7 +2635,8 @@ IndirectNameMap DecodeIndirectNameMap(base::Vector<const uint8_t> module_bytes,
uint32_t inner_count = decoder.consume_u32v("inner count"); uint32_t inner_count = decoder.consume_u32v("inner count");
for (uint32_t k = 0; k < inner_count; ++k) { for (uint32_t k = 0; k < inner_count; ++k) {
uint32_t inner_index = decoder.consume_u32v("inner index"); uint32_t inner_index = decoder.consume_u32v("inner index");
WireBytesRef name = consume_string(&decoder, false, "name"); WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
if (!decoder.ok()) break; if (!decoder.ok()) break;
if (inner_index > kMaxInt) continue; if (inner_index > kMaxInt) continue;
// Ignore non-utf8 names. // Ignore non-utf8 names.
......
...@@ -60,6 +60,9 @@ constexpr size_t kV8MaxWasmStructFields = 999; ...@@ -60,6 +60,9 @@ constexpr size_t kV8MaxWasmStructFields = 999;
constexpr uint32_t kV8MaxRttSubtypingDepth = 31; constexpr uint32_t kV8MaxRttSubtypingDepth = 31;
constexpr size_t kV8MaxWasmArrayInitLength = 10000; constexpr size_t kV8MaxWasmArrayInitLength = 10000;
// Stringref proposal. This limit is not standardized yet.
constexpr size_t kV8MaxWasmStringLiterals = 1000000;
static_assert(kV8MaxWasmTableSize <= 4294967295, // 2^32 - 1 static_assert(kV8MaxWasmTableSize <= 4294967295, // 2^32 - 1
"v8 should not exceed WebAssembly's non-web embedding limits"); "v8 should not exceed WebAssembly's non-web embedding limits");
static_assert(kV8MaxWasmTableInitEntries <= kV8MaxWasmTableSize, static_assert(kV8MaxWasmTableInitEntries <= kV8MaxWasmTableSize,
......
...@@ -196,8 +196,8 @@ struct WasmTag { ...@@ -196,8 +196,8 @@ struct WasmTag {
// Static representation of a wasm literal stringref. // Static representation of a wasm literal stringref.
struct WasmStringRefLiteral { struct WasmStringRefLiteral {
explicit WasmStringRefLiteral(uint32_t offset) : offset(offset) {} explicit WasmStringRefLiteral(const WireBytesRef& source) : source(source) {}
uint32_t offset; // Offset into string literals table. WireBytesRef source; // start offset in the module bytes.
}; };
// Static representation of a wasm data segment. // Static representation of a wasm data segment.
......
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/wasm/wtf8.h"
#include "src/strings/unicode.h"
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
namespace v8 {
namespace internal {
namespace wasm {
bool Wtf8::ValidateEncoding(const byte* bytes, size_t length) {
auto state = GeneralizedUtf8DfaDecoder::kAccept;
uint32_t current = 0;
uint32_t previous = 0;
for (size_t i = 0; i < length; i++) {
GeneralizedUtf8DfaDecoder::Decode(bytes[i], &state, &current);
if (state == GeneralizedUtf8DfaDecoder::kReject) return false;
if (state == GeneralizedUtf8DfaDecoder::kAccept) {
if (unibrow::Utf16::IsTrailSurrogate(current) &&
unibrow::Utf16::IsLeadSurrogate(previous)) {
return false;
}
previous = current;
current = 0;
}
}
return state == GeneralizedUtf8DfaDecoder::kAccept;
}
} // namespace wasm
} // namespace internal
} // namespace v8
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif // !V8_ENABLE_WEBASSEMBLY
#ifndef V8_WASM_WTF8_H_
#define V8_WASM_WTF8_H_
#include <cinttypes>
#include <cstdarg>
#include <memory>
#include "src/strings/unicode.h"
namespace v8 {
namespace internal {
namespace wasm {
using byte = unibrow::byte;
// Static helpers for working with WTF-8 encoded byte sequences.
class Wtf8 {
public:
// Validate that the input has a valid WTF-8 encoding.
//
// This method checks for:
// - valid UTF-8 encoding (e.g. no over-long encodings),
// - absence of surrogate pairs (a lead surrogate immediately followed by a
//   trail surrogate),
// - valid code point range.
//
// In terms of the WTF-8 specification (https://simonsapin.github.io/wtf-8/),
// this function checks for a valid "generalized UTF-8" sequence, with the
// additional constraint that surrogate pairs are not allowed.
//
// |str| points to the first byte to check and |length| is the number of
// bytes to check. Returns true if the bytes pass all of the checks above and
// false otherwise; a |length| of 0 yields true.
static bool ValidateEncoding(const byte* str, size_t length);
};
} // namespace wasm
} // namespace internal
} // namespace v8
#endif // V8_WASM_WTF8_H_
...@@ -21,8 +21,7 @@ function assertInvalid(fn, message) { ...@@ -21,8 +21,7 @@ function assertInvalid(fn, message) {
`WebAssembly.Module(): ${message}`); `WebAssembly.Module(): ${message}`);
} }
// TODO(wingo): Enable when we start parsing string literal sections. assertValid(builder => builder.addLiteralStringRef("foo"));
// assertValid(builder => builder.addLiteralStringRef("foo"));
for (let [name, code] of [['string', kWasmStringRef], for (let [name, code] of [['string', kWasmStringRef],
['stringview_wtf8', kWasmStringViewWtf8], ['stringview_wtf8', kWasmStringViewWtf8],
...@@ -86,12 +85,11 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32], ...@@ -86,12 +85,11 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32],
kGCPrefix, kExprStringNewWtf16, 0 kGCPrefix, kExprStringNewWtf16, 0
]); ]);
// TODO(wingo): Enable when we start parting string literal sections. builder.addLiteralStringRef("foo");
// builder.addFunction("string.const", kSig_w_v) builder.addFunction("string.const", kSig_w_v)
// .addLiteralStringRef("foo") .addBody([
// .addBody([ kGCPrefix, kExprStringConst, 0
// kGCPrefix, kExprStringConst, 0 ]);
// ]);
builder.addFunction("string.measure_utf8", kSig_i_w) builder.addFunction("string.measure_utf8", kSig_i_w)
.addBody([ .addBody([
...@@ -244,6 +242,16 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32], ...@@ -244,6 +242,16 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32],
assertTrue(WebAssembly.validate(builder.toBuffer())); assertTrue(WebAssembly.validate(builder.toBuffer()));
})(); })();
assertInvalid(
builder => {
builder.addFunction("string.const/bad-index", kSig_w_v)
.addBody([
kGCPrefix, kExprStringConst, 0
]);
},
"Compiling function #0:\"string.const/bad-index\" failed: " +
"Invalid string literal index: 0 @+26");
assertInvalid( assertInvalid(
builder => { builder => {
builder.addFunction("string.new_wtf8/no-mem", kSig_w_ii) builder.addFunction("string.new_wtf8/no-mem", kSig_w_ii)
......
...@@ -1945,6 +1945,7 @@ class WasmModuleBuilder { ...@@ -1945,6 +1945,7 @@ class WasmModuleBuilder {
if (wasm.stringrefs.length > 0) { if (wasm.stringrefs.length > 0) {
if (debug) print('emitting stringrefs @ ' + binary.length); if (debug) print('emitting stringrefs @ ' + binary.length);
binary.emit_section(kStringRefSectionCode, section => { binary.emit_section(kStringRefSectionCode, section => {
section.emit_u32v(0);
section.emit_u32v(wasm.stringrefs.length); section.emit_u32v(wasm.stringrefs.length);
for (let str of wasm.stringrefs) { for (let str of wasm.stringrefs) {
section.emit_string(str); section.emit_string(str);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment