Commit 995063d1 authored by Andy Wingo's avatar Andy Wingo Committed by V8 LUCI CQ

[stringrefs] Fix array-to-string instruction

To make an array from a string, we were passing an interior pointer to
the array's contents to NewStringFromUtf8 -- but the allocation could
cause that interior pointer to become invalid.

As part of the fix, also start to refactor to put the notion of UTF-8
variants closer to unibrow.  This will allow us to unify more
UTF-8/WTF-8/sloppy UTF-8 code paths.

Bug: v8:12868, v8:13028, v8:13036
Change-Id: I61f53a6882b5ebe1d0b38594412cb35ff0e28a94
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3742701
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: 's avatarJakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#81536}
parent 9d69f6f7
......@@ -812,8 +812,11 @@ builtin WasmStringNewWtf8Array(
try {
if (array.length < end) goto OffsetOutOfRange;
if (end < start) goto OffsetOutOfRange;
tail runtime::WasmStringNewWtf8Array(
// TODO(13036): Make it a tail call, once stack maps are fixed, or
// make it a compile error to be a tail call.
const result = runtime::WasmStringNewWtf8Array(
context, policy, array, SmiFromUint32(start), SmiFromUint32(end));
return result;
} label OffsetOutOfRange deferred {
const error = MessageTemplate::kWasmTrapArrayOutOfBounds;
runtime::ThrowWasmError(context, SmiConstant(error));
......
......@@ -702,24 +702,40 @@ MaybeHandle<String> Factory::NewStringFromOneByte(
}
namespace {
void ThrowInvalidEncodedStringBytes(Isolate* isolate, MessageTemplate message) {
#if V8_ENABLE_WEBASSEMBLY
DCHECK(message == MessageTemplate::kWasmTrapStringInvalidWtf8 ||
message == MessageTemplate::kWasmTrapStringInvalidUtf8);
Handle<JSObject> error_obj = isolate->factory()->NewWasmRuntimeError(message);
JSObject::AddProperty(isolate, error_obj,
isolate->factory()->wasm_uncatchable_symbol(),
isolate->factory()->true_value(), NONE);
isolate->Throw(*error_obj);
#else
// The default in JS-land is to use Utf8Variant::kLossyUtf8, which never
// throws an error, so if there is no WebAssembly compiled in we'll never get
// here.
UNREACHABLE();
#endif // V8_ENABLE_WEBASSEMBLY
}
template <typename Decoder, typename Handler>
MaybeHandle<String> NewStringFromBytes(Isolate* isolate,
const base::Vector<const uint8_t>& data,
template <typename Decoder, typename PeekBytes>
MaybeHandle<String> NewStringFromBytes(Isolate* isolate, PeekBytes peek_bytes,
AllocationType allocation,
Handler throw_invalid) {
Decoder decoder(data);
MessageTemplate message) {
Decoder decoder(peek_bytes());
if (decoder.is_invalid()) {
throw_invalid();
DCHECK(isolate->has_pending_exception());
ThrowInvalidEncodedStringBytes(isolate, message);
return MaybeHandle<String>();
}
if (decoder.utf16_length() == 0) return isolate->factory()->empty_string();
if (decoder.is_one_byte()) {
if (data.size() == 1) {
return isolate->factory()->LookupSingleCharacterStringFromCode(data[0]);
if (decoder.utf16_length() == 1) {
uint8_t codepoint;
decoder.Decode(&codepoint, peek_bytes());
return isolate->factory()->LookupSingleCharacterStringFromCode(codepoint);
}
// Allocate string.
Handle<SeqOneByteString> result;
......@@ -729,7 +745,7 @@ MaybeHandle<String> NewStringFromBytes(Isolate* isolate,
String);
DisallowGarbageCollection no_gc;
decoder.Decode(result->GetChars(no_gc), data);
decoder.Decode(result->GetChars(no_gc), peek_bytes());
return result;
}
......@@ -741,46 +757,62 @@ MaybeHandle<String> NewStringFromBytes(Isolate* isolate,
String);
DisallowGarbageCollection no_gc;
decoder.Decode(result->GetChars(no_gc), data);
decoder.Decode(result->GetChars(no_gc), peek_bytes());
return result;
}
template <typename PeekBytes>
MaybeHandle<String> NewStringFromUtf8Variant(Isolate* isolate,
PeekBytes peek_bytes,
unibrow::Utf8Variant utf8_variant,
AllocationType allocation) {
switch (utf8_variant) {
case unibrow::Utf8Variant::kLossyUtf8:
return NewStringFromBytes<Utf8Decoder>(isolate, peek_bytes, allocation,
MessageTemplate::kNone);
#if V8_ENABLE_WEBASSEMBLY
case unibrow::Utf8Variant::kUtf8:
return NewStringFromBytes<StrictUtf8Decoder>(
isolate, peek_bytes, allocation,
MessageTemplate::kWasmTrapStringInvalidUtf8);
case unibrow::Utf8Variant::kWtf8:
return NewStringFromBytes<Wtf8Decoder>(
isolate, peek_bytes, allocation,
MessageTemplate::kWasmTrapStringInvalidWtf8);
#endif
}
}
} // namespace
MaybeHandle<String> Factory::NewStringFromUtf8(
const base::Vector<const char>& string, AllocationType allocation) {
auto handler = [&]() { UNREACHABLE(); };
return NewStringFromBytes<Utf8Decoder>(
isolate(), base::Vector<const uint8_t>::cast(string), allocation,
handler);
const base::Vector<const uint8_t>& string,
unibrow::Utf8Variant utf8_variant, AllocationType allocation) {
auto peek_bytes = [&]() -> base::Vector<const uint8_t> { return string; };
return NewStringFromUtf8Variant(isolate(), peek_bytes, utf8_variant,
allocation);
}
#if V8_ENABLE_WEBASSEMBLY
MaybeHandle<String> Factory::NewStringFromWtf8(
const base::Vector<const uint8_t>& string, AllocationType allocation) {
auto handler = [&]() {
Handle<JSObject> error_obj =
NewWasmRuntimeError(MessageTemplate::kWasmTrapStringInvalidWtf8);
JSObject::AddProperty(isolate(), error_obj, wasm_uncatchable_symbol(),
true_value(), NONE);
isolate()->Throw(*error_obj);
};
return NewStringFromBytes<Wtf8Decoder>(isolate(), string, allocation,
handler);
MaybeHandle<String> Factory::NewStringFromUtf8(
const base::Vector<const char>& string, AllocationType allocation) {
return NewStringFromUtf8(base::Vector<const uint8_t>::cast(string),
unibrow::Utf8Variant::kLossyUtf8, allocation);
}
MaybeHandle<String> Factory::NewStringFromStrictUtf8(
const base::Vector<const uint8_t>& string, AllocationType allocation) {
auto handler = [&]() {
Handle<JSObject> error_obj =
NewWasmRuntimeError(MessageTemplate::kWasmTrapStringInvalidUtf8);
JSObject::AddProperty(isolate(), error_obj, wasm_uncatchable_symbol(),
true_value(), NONE);
isolate()->Throw(*error_obj);
#if V8_ENABLE_WEBASSEMBLY
MaybeHandle<String> Factory::NewStringFromUtf8(
Handle<WasmArray> array, uint32_t start, uint32_t end,
unibrow::Utf8Variant utf8_variant, AllocationType allocation) {
DCHECK_EQ(sizeof(uint8_t), array->type()->element_type().value_kind_size());
DCHECK_LE(start, end);
DCHECK_LE(end, array->length());
auto peek_bytes = [&]() -> base::Vector<const uint8_t> {
const uint8_t* contents =
reinterpret_cast<const uint8_t*>(array->ElementAddress(0));
return {contents + start, end - start};
};
return NewStringFromBytes<StrictUtf8Decoder>(
isolate(), base::Vector<const uint8_t>::cast(string), allocation,
handler);
return NewStringFromUtf8Variant(isolate(), peek_bytes, utf8_variant,
allocation);
}
#endif // V8_ENABLE_WEBASSEMBLY
......
......@@ -24,6 +24,10 @@
#include "src/objects/shared-function-info.h"
#include "src/objects/string.h"
namespace unibrow {
enum class Utf8Variant : uint8_t;
}
namespace v8 {
namespace internal {
......@@ -257,18 +261,14 @@ class V8_EXPORT_PRIVATE Factory : public FactoryBase<Factory> {
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromUtf8(
const base::Vector<const char>& str,
AllocationType allocation = AllocationType::kYoung);
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromUtf8(
const base::Vector<const uint8_t>& str, unibrow::Utf8Variant utf8_variant,
AllocationType allocation = AllocationType::kYoung);
#if V8_ENABLE_WEBASSEMBLY
// The WTF-8 encoding is just like UTF-8 except that it can also represent
// isolated surrogate codepoints. It can represent all strings that
// JavaScript's strings can.
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromWtf8(
const base::Vector<const uint8_t>& str,
AllocationType allocation = AllocationType::kYoung);
// The NewStringFromUtf8 function will replace any decoding error with U+FFFD
// (the replacement character). This function will trap instead.
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromStrictUtf8(
const base::Vector<const uint8_t>& str,
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromUtf8(
Handle<WasmArray> array, uint32_t begin, uint32_t end,
unibrow::Utf8Variant utf8_variant,
AllocationType allocation = AllocationType::kYoung);
#endif // V8_ENABLE_WEBASSEMBLY
......
......@@ -858,27 +858,16 @@ RUNTIME_FUNCTION(Runtime_WasmCreateResumePromise) {
}
namespace {
Object StringFromWtf8(Isolate* isolate, wasm::StringRefWtf8Policy policy,
const base::Vector<const uint8_t> bytes) {
// TODO(12868): Override any exception with an uncatchable-by-wasm trap.
Handle<String> result;
unibrow::Utf8Variant Utf8VariantFromWtf8Policy(
wasm::StringRefWtf8Policy policy) {
switch (policy) {
case wasm::kWtf8PolicyReject:
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromStrictUtf8(bytes));
break;
return unibrow::Utf8Variant::kUtf8;
case wasm::kWtf8PolicyAccept:
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromWtf8(bytes));
break;
case wasm::kWtf8PolicyReplace: {
auto string = base::Vector<const char>::cast(bytes);
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromUtf8(string));
break;
}
return unibrow::Utf8Variant::kWtf8;
case wasm::kWtf8PolicyReplace:
return unibrow::Utf8Variant::kLossyUtf8;
}
return *result;
}
} // namespace
......@@ -899,6 +888,7 @@ RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8) {
DCHECK(policy_value <= wasm::kLastWtf8Policy);
auto policy = static_cast<wasm::StringRefWtf8Policy>(policy_value);
auto utf8_variant = Utf8VariantFromWtf8Policy(policy);
uint64_t mem_size = instance->memory_size();
if (!base::IsInBounds<uint64_t>(offset, size, mem_size)) {
......@@ -907,7 +897,8 @@ RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8) {
const base::Vector<const uint8_t> bytes{instance->memory_start() + offset,
size};
return StringFromWtf8(isolate, policy, bytes);
RETURN_RESULT_OR_FAILURE(
isolate, isolate->factory()->NewStringFromUtf8(bytes, utf8_variant));
}
RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8Array) {
......@@ -921,14 +912,10 @@ RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8Array) {
DCHECK(policy_value <= wasm::kLastWtf8Policy);
auto policy = static_cast<wasm::StringRefWtf8Policy>(policy_value);
auto utf8_variant = Utf8VariantFromWtf8Policy(policy);
DCHECK_EQ(sizeof(uint8_t), array->type()->element_type().value_kind_size());
const void* src = ArrayElementAddress(array, start, sizeof(uint8_t));
DCHECK_LE(start, end);
DCHECK_LE(end, array->length());
const base::Vector<const uint8_t> bytes{static_cast<const uint8_t*>(src),
end - start};
return StringFromWtf8(isolate, policy, bytes);
RETURN_RESULT_OR_FAILURE(isolate, isolate->factory()->NewStringFromUtf8(
array, start, end, utf8_variant));
}
RUNTIME_FUNCTION(Runtime_WasmStringNewWtf16) {
......@@ -955,12 +942,9 @@ RUNTIME_FUNCTION(Runtime_WasmStringNewWtf16) {
const byte* bytes = instance->memory_start() + offset;
const base::uc16* codeunits = reinterpret_cast<const base::uc16*>(bytes);
// TODO(12868): Override any exception with an uncatchable-by-wasm trap.
Handle<String> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result,
isolate->factory()->NewStringFromTwoByteLittleEndian(
{codeunits, size_in_codeunits}));
return *result;
RETURN_RESULT_OR_FAILURE(isolate,
isolate->factory()->NewStringFromTwoByteLittleEndian(
{codeunits, size_in_codeunits}));
}
// Returns the new string if the operation succeeds. Otherwise traps.
......@@ -982,12 +966,10 @@ RUNTIME_FUNCTION(Runtime_WasmStringConst) {
const base::Vector<const uint8_t> string_bytes =
module_bytes.SubVector(literal.source.offset(),
literal.source.offset() + literal.source.length());
// TODO(12868): Override any exception with an uncatchable-by-wasm trap?
// TODO(12868): No need to re-validate WTF-8. Also, result should be cached.
Handle<String> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromWtf8(string_bytes));
return *result;
return *isolate->factory()
->NewStringFromUtf8(string_bytes, unibrow::Utf8Variant::kWtf8)
.ToHandleChecked();
}
namespace {
......
......@@ -151,6 +151,19 @@ class Latin1 {
}
};
enum class Utf8Variant : uint8_t {
kLossyUtf8, // Lossy UTF-8: Any byte sequence can be decoded without
// error, replacing invalid UTF-8 with the replacement
// character (U+FFFD). Any sequence of codepoints can be
// encoded without error, replacing surrogates with U+FFFD.
#if V8_ENABLE_WEBASSEMBLY
kUtf8, // UTF-8. Decoding an invalid byte sequence or encoding a
// surrogate codepoint signals an error.
kWtf8, // WTF-8: like UTF-8, but allows isolated (but not paired)
// surrogate codepoints to be encoded and decoded.
#endif
};
class V8_EXPORT_PRIVATE Utf8 {
public:
using State = Utf8DfaDecoder::State;
......
......@@ -149,7 +149,9 @@ void ConstantExpressionInterface::StringConst(
module_bytes.SubVector(literal.source.offset(),
literal.source.offset() + literal.source.length());
Handle<String> string =
isolate_->factory()->NewStringFromWtf8(string_bytes).ToHandleChecked();
isolate_->factory()
->NewStringFromUtf8(string_bytes, unibrow::Utf8Variant::kWtf8)
.ToHandleChecked();
result->runtime_value = WasmValue(string, kWasmStringRef);
}
......
......@@ -106,12 +106,10 @@ inline bool validate_utf8(Decoder* decoder, WireBytesRef string) {
string.length());
}
enum class StringValidation { kNone, kUtf8, kWtf8 };
// Reads a length-prefixed string, checking that it is within bounds. Returns
// the offset of the string, and the length as an out parameter.
inline WireBytesRef consume_string(Decoder* decoder,
StringValidation validation,
unibrow::Utf8Variant grammar,
const char* name) {
uint32_t length = decoder->consume_u32v("string length");
uint32_t offset = decoder->pc_offset();
......@@ -120,15 +118,15 @@ inline WireBytesRef consume_string(Decoder* decoder,
if (length > 0) {
decoder->consume_bytes(length, name);
if (decoder->ok()) {
switch (validation) {
case StringValidation::kNone:
switch (grammar) {
case unibrow::Utf8Variant::kLossyUtf8:
break;
case StringValidation::kUtf8:
case unibrow::Utf8Variant::kUtf8:
if (!unibrow::Utf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid UTF-8 string", name);
}
break;
case StringValidation::kWtf8:
case unibrow::Utf8Variant::kWtf8:
if (!unibrow::Wtf8::ValidateEncoding(string_start, length)) {
decoder->errorf(string_start, "%s: no valid WTF-8 string", name);
}
......@@ -140,7 +138,7 @@ inline WireBytesRef consume_string(Decoder* decoder,
}
inline WireBytesRef consume_utf8_string(Decoder* decoder, const char* name) {
return consume_string(decoder, StringValidation::kUtf8, name);
return consume_string(decoder, unibrow::Utf8Variant::kUtf8, name);
}
inline SectionCode IdentifyUnknownSectionInternal(Decoder* decoder) {
......@@ -1278,8 +1276,8 @@ class ModuleDecoderImpl : public Decoder {
// Decode module name, ignore the rest.
// Function and local names will be decoded when needed.
if (name_type == NameSectionKindCode::kModuleCode) {
WireBytesRef name =
consume_string(&inner, StringValidation::kNone, "module name");
WireBytesRef name = consume_string(
&inner, unibrow::Utf8Variant::kLossyUtf8, "module name");
if (inner.ok() && validate_utf8(&inner, name)) {
module_->name = name;
}
......@@ -1516,8 +1514,9 @@ class ModuleDecoderImpl : public Decoder {
for (uint32_t i = 0; ok() && i < immediate; ++i) {
TRACE("DecodeStringLiteral[%d] module+%d\n", i,
static_cast<int>(pc_ - start_));
WireBytesRef pos =
wasm::consume_string(this, StringValidation::kWtf8, "string literal");
// TODO(12868): Throw if the string's utf-16 length > String::kMaxLength.
WireBytesRef pos = wasm::consume_string(this, unibrow::Utf8Variant::kWtf8,
"string literal");
module_->stringref_literals.emplace_back(pos);
}
}
......
......@@ -298,7 +298,7 @@ void DecodeNameMap(NameMap& target, Decoder& decoder,
for (uint32_t i = 0; i < count; i++) {
uint32_t index = decoder.consume_u32v("index");
WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
consume_string(&decoder, unibrow::Utf8Variant::kLossyUtf8, "name");
if (!decoder.ok()) break;
if (index > NameMap::kMaxKey) continue;
if (empty_names == kSkipEmptyNames && name.is_empty()) continue;
......@@ -318,7 +318,7 @@ void DecodeIndirectNameMap(IndirectNameMap& target, Decoder& decoder) {
for (uint32_t k = 0; k < inner_count; ++k) {
uint32_t inner_index = decoder.consume_u32v("inner index");
WireBytesRef name =
consume_string(&decoder, StringValidation::kNone, "name");
consume_string(&decoder, unibrow::Utf8Variant::kLossyUtf8, "name");
if (!decoder.ok()) break;
if (inner_index > NameMap::kMaxKey) continue;
if (name.is_empty()) continue; // Empty names are useless.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment