Commit e4941131, authored by Andy Wingo, committed by V8 LUCI CQ

[stringrefs] Implement string.new_wtf8

Bug: v8:12868

Also adds the equivalent of Utf8Decoder, but for WTF-8.

Change-Id: I1548a44b0aea912cdd429eb85be4dfc606355cad
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3660257
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Andy Wingo <wingo@igalia.com>
Cr-Commit-Position: refs/heads/main@{#80750}
parent 2864a436
...@@ -38,6 +38,8 @@ extern runtime WasmArrayCopy( ...@@ -38,6 +38,8 @@ extern runtime WasmArrayCopy(
Context, WasmArray, Smi, WasmArray, Smi, Smi): JSAny; Context, WasmArray, Smi, WasmArray, Smi, Smi): JSAny;
extern runtime WasmArrayInitFromData( extern runtime WasmArrayInitFromData(
Context, WasmInstanceObject, Smi, Smi, Smi, Map): Object; Context, WasmInstanceObject, Smi, Smi, Smi, Map): Object;
extern runtime WasmStringNewWtf8(
Context, WasmInstanceObject, Smi, Number, Number): String;
} }
namespace unsafe { namespace unsafe {
...@@ -798,4 +800,12 @@ transitioning javascript builtin ExperimentalWasmConvertStringToArray( ...@@ -798,4 +800,12 @@ transitioning javascript builtin ExperimentalWasmConvertStringToArray(
ThrowTypeError(MessageTemplate::kInvalidArgument); ThrowTypeError(MessageTemplate::kInvalidArgument);
} }
} }
// Builtin behind string.new_wtf8: converts the raw uint32 operands into tagged
// values and tail-calls the WasmStringNewWtf8 runtime function with the
// current instance and its context.
builtin WasmStringNewWtf8(
    memory: uint32, offset: uint32, size: uint32): String {
  const instance: WasmInstanceObject = LoadInstanceFromFrame();
  const context = LoadContextFromInstance(instance);
  // The memory index travels as a Smi; offset and size as Numbers.
  const memoryIndex: Smi = SmiFromUint32(memory);
  const offsetNumber: Number = WasmUint32ToNumber(offset);
  const sizeNumber: Number = WasmUint32ToNumber(size);
  tail runtime::WasmStringNewWtf8(
      context, instance, memoryIndex, offsetNumber, sizeNumber);
}
} }
...@@ -643,6 +643,7 @@ namespace internal { ...@@ -643,6 +643,7 @@ namespace internal {
T(WasmTrapIllegalCast, "illegal cast") \ T(WasmTrapIllegalCast, "illegal cast") \
T(WasmTrapArrayOutOfBounds, "array element access out of bounds") \ T(WasmTrapArrayOutOfBounds, "array element access out of bounds") \
T(WasmTrapArrayTooLarge, "requested new array is too large") \ T(WasmTrapArrayTooLarge, "requested new array is too large") \
T(WasmTrapStringInvalidWtf8, "invalid WTF-8 string") \
T(WasmExceptionError, "wasm exception") \ T(WasmExceptionError, "wasm exception") \
/* Asm.js validation related */ \ /* Asm.js validation related */ \
T(AsmJsInvalid, "Invalid asm.js: %") \ T(AsmJsInvalid, "Invalid asm.js: %") \
......
...@@ -5670,6 +5670,12 @@ void WasmGraphBuilder::ArrayCopy(Node* dst_array, Node* dst_index, ...@@ -5670,6 +5670,12 @@ void WasmGraphBuilder::ArrayCopy(Node* dst_array, Node* dst_index,
gasm_->Bind(&skip); gasm_->Bind(&skip);
} }
// Emits a call to the WasmStringNewWtf8 builtin. |memory| is the memory index
// and is passed as a uint32 constant node; |offset| and |size| are forwarded
// unchanged. Returns the node produced by the builtin call.
Node* WasmGraphBuilder::StringNewWtf8(uint32_t memory, Node* offset,
                                      Node* size) {
  Node* memory_index = gasm_->Uint32Constant(memory);
  return gasm_->CallBuiltin(Builtin::kWasmStringNewWtf8, Operator::kNoDeopt,
                            memory_index, offset, size);
}
// 1 bit V8 Smi tag, 31 bits V8 Smi shift, 1 bit i31ref high-bit truncation. // 1 bit V8 Smi tag, 31 bits V8 Smi shift, 1 bit i31ref high-bit truncation.
constexpr int kI31To32BitSmiShift = 33; constexpr int kI31To32BitSmiShift = 33;
......
...@@ -536,6 +536,7 @@ class WasmGraphBuilder { ...@@ -536,6 +536,7 @@ class WasmGraphBuilder {
void BrOnI31(Node* object, Node* rtt, WasmTypeCheckConfig config, void BrOnI31(Node* object, Node* rtt, WasmTypeCheckConfig config,
Node** match_control, Node** match_effect, Node** match_control, Node** match_effect,
Node** no_match_control, Node** no_match_effect); Node** no_match_control, Node** no_match_effect);
Node* StringNewWtf8(uint32_t memory, Node* offset, Node* size);
bool has_simd() const { return has_simd_; } bool has_simd() const { return has_simd_; }
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "src/wasm/wasm-objects.h" #include "src/wasm/wasm-objects.h"
#include "src/wasm/wasm-subtyping.h" #include "src/wasm/wasm-subtyping.h"
#include "src/wasm/wasm-value.h" #include "src/wasm/wasm-value.h"
#include "src/wasm/wtf8.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -825,5 +826,55 @@ RUNTIME_FUNCTION(Runtime_WasmCreateResumePromise) { ...@@ -825,5 +826,55 @@ RUNTIME_FUNCTION(Runtime_WasmCreateResumePromise) {
return *result; return *result;
} }
// Creates a string from the WTF-8 bytes found at [offset, offset + size) in
// the instance's memory. On success the new string is returned. If the range
// is out of bounds or the bytes are not valid WTF-8, a wasm error is thrown
// and the result of ThrowWasmError is returned instead.
//
// Arguments: 0 = instance, 1 = memory index (Smi), 2 = offset, 3 = size.
RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8) {
  ClearThreadInWasmScope flag_scope(isolate);
  DCHECK_EQ(4, args.length());
  HandleScope scope(isolate);
  Handle<WasmInstanceObject> instance = args.at<WasmInstanceObject>(0);
  const uint32_t memory_index = args.positive_smi_value_at(1);
  const uint32_t byte_offset = NumberToUint32(args[2]);
  const uint32_t byte_length = NumberToUint32(args[3]);

  // The memory index is only checked in debug builds and is otherwise unused.
  DCHECK_EQ(memory_index, 0);
  USE(memory_index);

  // Bounds check is done in 64 bits so that offset + size cannot wrap.
  const uint64_t memory_size = instance->memory_size();
  const bool in_bounds =
      base::IsInBounds<uint64_t>(byte_offset, byte_length, memory_size);
  if (!in_bounds) {
    return ThrowWasmError(isolate, MessageTemplate::kWasmTrapMemOutOfBounds);
  }

  // The decoder's constructor validates the bytes and computes the length and
  // width of the resulting string.
  const base::Vector<const uint8_t> wtf8{
      instance->memory_start() + byte_offset, byte_length};
  wasm::Wtf8Decoder decoder(wtf8);
  if (!decoder.is_valid()) {
    return ThrowWasmError(isolate, MessageTemplate::kWasmTrapStringInvalidWtf8);
  }

  auto* factory = isolate->factory();
  const int length = decoder.utf16_length();
  if (length == 0) return *factory->empty_string();

  if (!decoder.is_one_byte()) {
    Handle<SeqTwoByteString> two_byte;
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, two_byte,
                                       factory->NewRawTwoByteString(length));
    // GetChars hands out a raw pointer into the string, hence the no-GC scope.
    DisallowGarbageCollection no_gc;
    decoder.Decode(two_byte->GetChars(no_gc), wtf8);
    return *two_byte;
  }

  // A one-byte input goes through the single-character string lookup instead
  // of allocating a fresh string.
  if (byte_length == 1) {
    return *factory->LookupSingleCharacterStringFromCode(wtf8[0]);
  }

  Handle<SeqOneByteString> one_byte;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, one_byte,
                                     factory->NewRawOneByteString(length));
  DisallowGarbageCollection no_gc;
  decoder.Decode(one_byte->GetChars(no_gc), wtf8);
  return *one_byte;
}
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
...@@ -611,7 +611,8 @@ namespace internal { ...@@ -611,7 +611,8 @@ namespace internal {
F(WasmArrayInitFromData, 5, 1) \ F(WasmArrayInitFromData, 5, 1) \
F(WasmAllocateContinuation, 1, 1) \ F(WasmAllocateContinuation, 1, 1) \
F(WasmSyncStackLimit, 0, 1) \ F(WasmSyncStackLimit, 0, 1) \
F(WasmCreateResumePromise, 2, 1) F(WasmCreateResumePromise, 2, 1) \
F(WasmStringNewWtf8, 4, 1)
#define FOR_EACH_INTRINSIC_WASM_TEST(F, I) \ #define FOR_EACH_INTRINSIC_WASM_TEST(F, I) \
F(DeserializeWasmModule, 2, 1) \ F(DeserializeWasmModule, 2, 1) \
......
...@@ -5983,8 +5983,26 @@ class LiftoffCompiler { ...@@ -5983,8 +5983,26 @@ class LiftoffCompiler {
void StringNewWtf8(FullDecoder* decoder, void StringNewWtf8(FullDecoder* decoder,
const MemoryIndexImmediate<validate>& imm, const MemoryIndexImmediate<validate>& imm,
const Value& index, const Value& bytes, Value* result) { const Value& offset, const Value& size, Value* result) {
UNIMPLEMENTED(); LiftoffRegList pinned;
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
__ LoadConstant(memory_reg, WasmValue(static_cast<int32_t>(imm.index)));
LiftoffAssembler::VarState memory_var(kI32, memory_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringNewWtf8,
MakeSig::Returns(kRef).Params(kI32, kI32, kI32),
{
memory_var,
__ cache_state()->stack_state.end()[-2], // offset
__ cache_state()->stack_state.end()[-1] // size
},
decoder->position());
__ cache_state()->stack_state.pop_back(2);
RegisterDebugSideTableEntry(decoder, DebugSideTableBuilder::kDidSpill);
LiftoffRegister result_reg(kReturnRegister0);
__ PushRegister(kRef, result_reg);
} }
void StringNewWtf16(FullDecoder* decoder, void StringNewWtf16(FullDecoder* decoder,
......
...@@ -1132,7 +1132,7 @@ struct ControlBase : public PcForErrors<validate> { ...@@ -1132,7 +1132,7 @@ struct ControlBase : public PcForErrors<validate> {
F(BrOnNonArray, const Value& object, Value* value_on_fallthrough, \ F(BrOnNonArray, const Value& object, Value* value_on_fallthrough, \
uint32_t br_depth) \ uint32_t br_depth) \
F(StringNewWtf8, const MemoryIndexImmediate<validate>& imm, \ F(StringNewWtf8, const MemoryIndexImmediate<validate>& imm, \
const Value& index, const Value& bytes, Value* result) \ const Value& offset, const Value& size, Value* result) \
F(StringNewWtf16, const MemoryIndexImmediate<validate>& imm, \ F(StringNewWtf16, const MemoryIndexImmediate<validate>& imm, \
const Value& index, const Value& codeunits, Value* result) \ const Value& index, const Value& codeunits, Value* result) \
F(StringConst, const StringConstImmediate<validate>& imm, Value* result) \ F(StringConst, const StringConstImmediate<validate>& imm, Value* result) \
...@@ -5148,10 +5148,10 @@ class WasmFullDecoder : public WasmDecoder<validate, decoding_mode> { ...@@ -5148,10 +5148,10 @@ class WasmFullDecoder : public WasmDecoder<validate, decoding_mode> {
MemoryIndexImmediate<validate> imm(this, this->pc_ + opcode_length); MemoryIndexImmediate<validate> imm(this, this->pc_ + opcode_length);
if (!this->Validate(this->pc_ + opcode_length, imm)) return 0; if (!this->Validate(this->pc_ + opcode_length, imm)) return 0;
ValueType addr_type = this->module_->is_memory64 ? kWasmI64 : kWasmI32; ValueType addr_type = this->module_->is_memory64 ? kWasmI64 : kWasmI32;
Value addr = Peek(1, 0, addr_type); Value offset = Peek(1, 0, addr_type);
Value bytes = Peek(0, 1, kWasmI32); Value size = Peek(0, 1, kWasmI32);
Value result = CreateValue(kWasmStringRef); Value result = CreateValue(kWasmStringRef);
CALL_INTERFACE_IF_OK_AND_REACHABLE(StringNewWtf8, imm, addr, bytes, CALL_INTERFACE_IF_OK_AND_REACHABLE(StringNewWtf8, imm, offset, size,
&result); &result);
Drop(2); Drop(2);
Push(result); Push(result);
......
...@@ -1343,8 +1343,8 @@ class WasmGraphBuildingInterface { ...@@ -1343,8 +1343,8 @@ class WasmGraphBuildingInterface {
void StringNewWtf8(FullDecoder* decoder, void StringNewWtf8(FullDecoder* decoder,
const MemoryIndexImmediate<validate>& imm, const MemoryIndexImmediate<validate>& imm,
const Value& index, const Value& bytes, Value* result) { const Value& offset, const Value& size, Value* result) {
UNIMPLEMENTED(); result->node = builder_->StringNewWtf8(imm.index, offset.node, size.node);
} }
void StringNewWtf16(FullDecoder* decoder, void StringNewWtf16(FullDecoder* decoder,
......
...@@ -126,7 +126,8 @@ struct WasmModule; ...@@ -126,7 +126,8 @@ struct WasmModule;
V(WasmAllocateStructWithRtt) \ V(WasmAllocateStructWithRtt) \
V(WasmSubtypeCheck) \ V(WasmSubtypeCheck) \
V(WasmOnStackReplace) \ V(WasmOnStackReplace) \
V(WasmSuspend) V(WasmSuspend) \
V(WasmStringNewWtf8)
// Sorted, disjoint and non-overlapping memory regions. A region is of the // Sorted, disjoint and non-overlapping memory regions. A region is of the
// form [start, end). So there's no [start, end), [end, other_end), // form [start, end). So there's no [start, end), [end, other_end),
......
...@@ -4,8 +4,10 @@ ...@@ -4,8 +4,10 @@
#include "src/wasm/wtf8.h" #include "src/wasm/wtf8.h"
#include "src/strings/unicode-decoder.h"
#include "src/strings/unicode.h" #include "src/strings/unicode.h"
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h" #include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
#include "src/utils/memcopy.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -30,6 +32,75 @@ bool Wtf8::ValidateEncoding(const byte* bytes, size_t length) { ...@@ -30,6 +32,75 @@ bool Wtf8::ValidateEncoding(const byte* bytes, size_t length) {
return state == GeneralizedUtf8DfaDecoder::kAccept; return state == GeneralizedUtf8DfaDecoder::kAccept;
} }
// Scans |data| once to classify it: records whether it is valid WTF-8, which
// string width it needs (ASCII, Latin-1 or UTF-16) and how many UTF-16 code
// units it decodes to. The leading ASCII run is measured with NonAsciiStart
// and only the remainder is fed through the DFA.
Wtf8Decoder::Wtf8Decoder(const base::Vector<const uint8_t>& data)
    : encoding_(Encoding::kAscii),
      non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
      utf16_length_(non_ascii_start_) {
  // Entirely ASCII: the initial values above already describe the input.
  if (non_ascii_start_ == data.length()) return;

  using Dfa = GeneralizedUtf8DfaDecoder;
  bool fits_latin1 = true;
  auto dfa_state = Dfa::kAccept;
  uint32_t codepoint = 0;
  uint32_t last_codepoint = 0;
  const size_t end = data.size();
  for (size_t pos = non_ascii_start_; pos < end; ++pos) {
    Dfa::Decode(data[pos], &dfa_state, &codepoint);
    if (dfa_state < Dfa::kAccept) {
      DCHECK_EQ(dfa_state, Dfa::kReject);
      encoding_ = Encoding::kInvalid;
      return;
    }
    // Still in the middle of a multi-byte sequence.
    if (dfa_state != Dfa::kAccept) continue;

    // A lead surrogate immediately followed by a trail surrogate is treated
    // as invalid input.
    const bool surrogate_pair =
        unibrow::Utf16::IsTrailSurrogate(codepoint) &&
        unibrow::Utf16::IsLeadSurrogate(last_codepoint);
    if (surrogate_pair) {
      encoding_ = Encoding::kInvalid;
      return;
    }

    if (codepoint > unibrow::Latin1::kMaxChar) fits_latin1 = false;
    // Code points above kMaxNonSurrogateCharCode take two UTF-16 code units.
    utf16_length_ +=
        codepoint > unibrow::Utf16::kMaxNonSurrogateCharCode ? 2 : 1;
    last_codepoint = codepoint;
    codepoint = 0;
  }

  // Input that stops partway through a sequence is invalid as well.
  if (dfa_state != Dfa::kAccept) {
    encoding_ = Encoding::kInvalid;
  } else if (fits_latin1) {
    encoding_ = Encoding::kLatin1;
  } else {
    encoding_ = Encoding::kUtf16;
  }
}
// Decodes |data| into |out|, one code unit of type Char at a time. Requires
// is_valid(). |data| is presumably the same bytes the decoder was constructed
// with, since the stored non_ascii_start_ is reused here. NOTE(review): the
// caller appears responsible for sizing |out| to utf16_length() units; confirm.
template <typename Char>
void Wtf8Decoder::Decode(Char* out, const base::Vector<const uint8_t>& data) {
  DCHECK(is_valid());

  // The first non_ascii_start_ bytes are copied across one-to-one.
  CopyChars(out, data.begin(), non_ascii_start_);
  Char* cursor = out + non_ascii_start_;

  using Dfa = GeneralizedUtf8DfaDecoder;
  auto dfa_state = Dfa::kAccept;
  uint32_t codepoint = 0;
  const size_t end = data.size();
  for (size_t pos = non_ascii_start_; pos < end; ++pos) {
    Dfa::Decode(data[pos], &dfa_state, &codepoint);
    if (dfa_state != Dfa::kAccept) continue;

    // One-byte output never splits a code point; two-byte output splits the
    // ones above kMaxNonSurrogateCharCode into a lead/trail surrogate pair.
    const bool single_unit =
        sizeof(Char) == 1 ||
        codepoint <= unibrow::Utf16::kMaxNonSurrogateCharCode;
    if (single_unit) {
      *cursor++ = static_cast<Char>(codepoint);
    } else {
      *cursor++ = unibrow::Utf16::LeadSurrogate(codepoint);
      *cursor++ = unibrow::Utf16::TrailSurrogate(codepoint);
    }
    codepoint = 0;
  }
  DCHECK_EQ(dfa_state, Dfa::kAccept);
}
// Explicit instantiations of Decode for 8-bit and 16-bit output code units.
// The template is defined in this file rather than in the header, so these
// are the instantiations available to code that only sees the declaration.
template void Wtf8Decoder::Decode(uint8_t* out,
const base::Vector<const uint8_t>& data);
template void Wtf8Decoder::Decode(uint16_t* out,
const base::Vector<const uint8_t>& data);
} // namespace wasm } // namespace wasm
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <cstdarg> #include <cstdarg>
#include <memory> #include <memory>
#include "src/base/vector.h"
#include "src/strings/unicode.h" #include "src/strings/unicode.h"
namespace v8 { namespace v8 {
...@@ -36,6 +37,36 @@ class Wtf8 { ...@@ -36,6 +37,36 @@ class Wtf8 {
static bool ValidateEncoding(const byte* str, size_t length); static bool ValidateEncoding(const byte* str, size_t length);
}; };
// WTF-8 counterpart of Utf8Decoder. Where Utf8Decoder replaces invalid
// sequences with U+FFFD, this decoder reports them through a dedicated
// Encoding::kInvalid state instead.
class Wtf8Decoder {
 public:
  // Declaration order matters: every encoding up to and including kLatin1
  // fits in a one-byte string.
  enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16, kInvalid };

  // Classifies |data|; query the result through the accessors below.
  explicit Wtf8Decoder(const base::Vector<const uint8_t>& data);

  bool is_valid() const { return !(encoding_ == Encoding::kInvalid); }
  bool is_ascii() const { return encoding_ == Encoding::kAscii; }
  bool is_one_byte() const {
    return encoding_ == Encoding::kAscii || encoding_ == Encoding::kLatin1;
  }

  // Number of UTF-16 code units the input decodes to. Valid input only.
  int utf16_length() const {
    DCHECK(is_valid());
    return utf16_length_;
  }

  // Value computed by NonAsciiStart over the input. Valid input only.
  int non_ascii_start() const {
    DCHECK(is_valid());
    return non_ascii_start_;
  }

  // Writes the decoded code units for |data| to |out|.
  template <typename Char>
  V8_EXPORT_PRIVATE void Decode(Char* out,
                                const base::Vector<const uint8_t>& data);

 private:
  Encoding encoding_;
  int non_ascii_start_;
  int utf16_length_;
};
} // namespace wasm } // namespace wasm
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
......
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --experimental-wasm-stringref
d8.file.execute("test/mjsunit/wasm/wasm-module-builder.js");
// Signature (i32, i32) -> stringref, used by the exported test function below.
let kSig_w_ii = makeSig([kWasmI32, kWasmI32], [kWasmStringRef]);
// Encodes a JS string as an array of WTF-8 byte values. Lone surrogates are
// encoded like any other three-byte code point.
function encodeWtf8(str) {
  // Builds a continuation byte from the six bits of |cp| starting at |shift|.
  const continuation = (cp, shift) => 0x80 | ((cp >> shift) & 0x3f);
  const out = [];
  // Iterating a string yields whole code points: a surrogate pair arrives as
  // one element, an unpaired surrogate arrives on its own.
  for (const ch of str) {
    const codepoint = ch.codePointAt(0);
    if (codepoint <= 0x7f) {
      out.push(codepoint);
    } else if (codepoint <= 0x7ff) {
      out.push(0xc0 | (codepoint >> 6), continuation(codepoint, 0));
    } else if (codepoint <= 0xffff) {
      out.push(0xe0 | (codepoint >> 12),
               continuation(codepoint, 6),
               continuation(codepoint, 0));
    } else if (codepoint <= 0x10ffff) {
      out.push(0xf0 | (codepoint >> 18),
               continuation(codepoint, 12),
               continuation(codepoint, 6),
               continuation(codepoint, 0));
    } else {
      throw new Error("bad codepoint " + codepoint);
    }
  }
  return out;
}
// Builds the byte contents of the test data segment together with an index
// of where each test input lives in it.
//
// Returns { valid, invalid, data }:
//   valid:   maps each well-formed source string to { offset, length } of
//            its WTF-8 encoding within |data|.
//   invalid: maps a label string to { offset, length } of an ill-formed byte
//            sequence within |data|; each character of the label is stored
//            as one raw byte.
//   data:    Uint8Array holding all sequences back to back.
function makeWtf8TestDataSegment() {
  let data = [];
  let valid = {};
  let invalid = {};

  for (let str of ['',
                   'ascii',
                   'latin \xa9 1',
                   'two \ucccc byte',
                   'surrogate \ud800\udc000 pair',
                   'isolated \ud800 leading',
                   'isolated \udc00 trailing']) {
    let bytes = encodeWtf8(str);
    valid[str] = { offset: data.length, length: bytes.length };
    for (let byte of bytes) {
      data.push(byte);
    }
  }

  for (let bytes of ['trailing high byte \xa9',
                     'interstitial high \xa9 byte',
                     'invalid \xc0 byte',
                     // 0xd0 is a lead byte, not a continuation byte, so this
                     // is a malformed three-byte sequence.
                     'invalid three-byte \xed\xd0\x80',
                     // U+D800 (ed a0 80) directly followed by U+DC00
                     // (ed b0 80): a surrogate pair encoded as two separate
                     // three-byte sequences, which must be rejected.
                     'surrogate \xed\xa0\x80\xed\xb0\x80 pair']) {
    invalid[bytes] = { offset: data.length, length: bytes.length };
    for (let i = 0; i < bytes.length; i++) {
      data.push(bytes.charCodeAt(i));
    }
  }

  return { valid, invalid, data: Uint8Array.from(data) };
}
// Instantiates a module whose single export wraps string.new_wtf8 over
// memory 0, then checks every entry of the test data segment: valid entries
// must decode to their source string, invalid entries must trap.
(function TestStringNewWtf8() {
  const builder = new WasmModuleBuilder();
  builder.addMemory(1, undefined, false, false);

  const segment = makeWtf8TestDataSegment();
  builder.addDataSegment(0, segment.data);

  // (offset, length) -> stringref
  builder.addFunction("string_new_wtf8", kSig_w_ii)
    .exportAs("string_new_wtf8")
    .addBody([
      kExprLocalGet, 0, kExprLocalGet, 1,
      kGCPrefix, kExprStringNewWtf8, 0
    ]);

  const string_new_wtf8 = builder.instantiate().exports.string_new_wtf8;

  Object.entries(segment.valid).forEach(([expected, {offset, length}]) => {
    assertEquals(expected, string_new_wtf8(offset, length));
  });
  Object.entries(segment.invalid).forEach(([label, {offset, length}]) => {
    assertThrows(() => string_new_wtf8(offset, length),
                 WebAssembly.RuntimeError, "invalid WTF-8 string");
  });
})();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment