Commit e8610ab8 authored by Andy Wingo, committed by V8 LUCI CQ

[stringrefs] Implement stringview_wtf8.encode

Bug: v8:12868
Change-Id: I714fffec248114a7ff61479f122a7df538e8e8d5
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3755140
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#81666}
parent 902759b8
......@@ -55,6 +55,8 @@ extern runtime WasmStringEncodeWtf8Array(
extern runtime WasmStringEncodeWtf16(
Context, WasmInstanceObject, Smi, String, Number, Smi, Smi): JSAny;
extern runtime WasmStringAsWtf8(Context, String): ByteArray;
extern runtime WasmStringViewWtf8Encode(
Context, WasmInstanceObject, Smi, ByteArray, Number, Number, Number): JSAny;
}
namespace unsafe {
......@@ -941,7 +943,7 @@ macro IsWtf8CodepointStart(view: ByteArray, pos: uint32): bool {
// doesn't start with 0b10xxxxxx.
return (view.bytes[Convert<uintptr>(pos)] & 0xc0) != 0x80;
}
builtin WasmWtf8PositionTreatment(view: ByteArray, pos: uint32): uint32 {
macro AlignWtf8PositionForward(view: ByteArray, pos: uint32): uint32 {
const length = Unsigned(SmiToInt32(view.length));
if (pos >= length) return length;
......@@ -958,20 +960,50 @@ builtin WasmWtf8PositionTreatment(view: ByteArray, pos: uint32): uint32 {
return pos + 3;
}
macro AlignWtf8PositionBackward(view: ByteArray, pos: uint32): uint32 {
// Snap pos backward onto a codepoint boundary of the WTF-8 view.
//
// Return the highest offset that starts a codepoint which is not
// greater than pos. Preconditions: pos in [0, view.length), view
// contains well-formed WTF-8.
//
// Probe pos, pos - 1 and pos - 2 in turn; the first offset whose byte is
// not a continuation byte is the answer.
if (IsWtf8CodepointStart(view, pos)) return pos;
if (IsWtf8CodepointStart(view, pos - 1)) return pos - 1;
if (IsWtf8CodepointStart(view, pos - 2)) return pos - 2;
// pos - 3 is returned without being probed: this relies on the
// well-formedness precondition (an encoded codepoint is at most four bytes
// long, so at most three continuation bytes can follow a codepoint start).
return pos - 3;
}
// Advances a position within a WTF-8 view by up to `bytes` bytes and returns
// the resulting position, always aligned to a codepoint boundary (or equal to
// the view length).
//
//   view:  byte array holding the WTF-8 contents of the view.
//   pos:   starting position; first aligned forward to a codepoint boundary.
//   bytes: maximum number of bytes to advance by.
builtin WasmStringViewWtf8Advance(
    view: ByteArray, pos: uint32, bytes: uint32): uint32 {
  // Normalize the starting position first; a zero-byte advance just reports
  // the normalized position.
  const clampedPos = AlignWtf8PositionForward(view, pos);
  if (bytes == 0) return clampedPos;
  const length = Unsigned(SmiToInt32(view.length));
  // Compare against the remaining byte count instead of forming
  // clampedPos + bytes first, so a very large `bytes` saturates at the end of
  // the view. NOTE(review): assumes AlignWtf8PositionForward never returns
  // more than length -- confirm against the full macro body.
  if (bytes >= length - clampedPos) return length;
  // clampedPos + bytes is now strictly inside the view; pull it back to the
  // start of the codepoint that contains it.
  return AlignWtf8PositionBackward(view, clampedPos + bytes);
}
// Two-value result returned by the WasmStringViewWtf8Encode builtin.
struct NewPositionAndBytesWritten {
// View position at which the encoded range ended.
newPosition: uintptr;
// Size in bytes of the encoded range (end position minus start position).
bytesWritten: uintptr;
}
// Builtin for stringview_wtf8.encode. Computes the codepoint-aligned byte
// range of `view` that begins at `pos` and spans at most `bytes` bytes, asks
// the runtime to write that range to linear memory at `addr` under the given
// surrogate `policy`, and returns the end position together with the size of
// the range.
builtin WasmStringViewWtf8Encode(
addr: uint32, pos: uint32, bytes: uint32, view: ByteArray, memory: Smi,
policy: Smi): NewPositionAndBytesWritten {
// Advancing by zero bytes only normalizes pos; advancing from there by
// `bytes` yields the aligned end of the range.
const start = WasmStringViewWtf8Advance(view, pos, 0);
const end = WasmStringViewWtf8Advance(view, start, bytes);
const instance = LoadInstanceFromFrame();
const context = LoadContextFromInstance(instance);
// kMaxArgs in code-assembler.cc:CallRunTimeImpl is currently limited
// to 6 arguments when calling a runtime function. Throw away the
// memory argument for now; when we need multi-memory we can bump
// kMaxArgs.
dcheck(memory == SmiFromInt32(0));
// Always call out to run-time, to catch invalid addr.
// The runtime function's return value is not used here.
runtime::WasmStringViewWtf8Encode(
context, instance, policy, view, WasmUint32ToNumber(addr),
WasmUint32ToNumber(start), WasmUint32ToNumber(end));
return NewPositionAndBytesWritten{
newPosition: Convert<uintptr>(end),
bytesWritten: Convert<uintptr>(end - start)
};
}
transitioning builtin WasmStringViewWtf16GetCodeUnit(
string: String, offset: uint32): uint32 {
......
......@@ -5913,6 +5913,20 @@ Node* WasmGraphBuilder::StringViewWtf8Advance(Node* view,
Operator::kNoDeopt, view, pos, bytes);
}
// Emits the graph nodes for stringview_wtf8.encode: an optional null
// assertion on |view|, a call to the WasmStringViewWtf8Encode builtin, and
// projections that split the builtin's two results into |*next_pos| and
// |*bytes_written|.
void WasmGraphBuilder::StringViewWtf8Encode(
    uint32_t memory, wasm::StringRefWtf8Policy policy, Node* view,
    CheckForNull null_check, Node* addr, Node* pos, Node* bytes,
    Node** next_pos, Node** bytes_written, wasm::WasmCodePosition position) {
  // Only insert the null assertion when the caller asked for one.
  Node* checked_view =
      null_check == kWithNullCheck ? AssertNotNull(view, position) : view;

  // The memory index and policy are passed to the builtin as Smi constants.
  Node* result = gasm_->CallBuiltin(
      Builtin::kWasmStringViewWtf8Encode, Operator::kNoDeopt, addr, pos, bytes,
      checked_view, gasm_->SmiConstant(memory), gasm_->SmiConstant(policy));

  // Projection 0 is the next position, projection 1 the byte count.
  *next_pos = gasm_->Projection(0, result);
  *bytes_written = gasm_->Projection(1, result);
}
Node* WasmGraphBuilder::StringViewWtf16GetCodeUnit(
Node* string, CheckForNull null_check, Node* offset,
wasm::WasmCodePosition position) {
......
......@@ -571,6 +571,11 @@ class WasmGraphBuilder {
wasm::WasmCodePosition position);
Node* StringViewWtf8Advance(Node* view, CheckForNull null_check, Node* pos,
Node* bytes, wasm::WasmCodePosition position);
// Builds the call for stringview_wtf8.encode. The two results are returned
// through the out-parameters |next_pos| and |bytes_written|.
void StringViewWtf8Encode(uint32_t memory, wasm::StringRefWtf8Policy policy,
Node* view, CheckForNull null_check, Node* addr,
Node* pos, Node* bytes, Node** next_pos,
Node** bytes_written,
wasm::WasmCodePosition position);
Node* StringViewWtf16GetCodeUnit(Node* string, CheckForNull null_check,
Node* offset,
wasm::WasmCodePosition position);
......
......@@ -1235,5 +1235,54 @@ RUNTIME_FUNCTION(Runtime_WasmStringAsWtf8) {
return *array;
}
// Runtime half of stringview_wtf8.encode: copies bytes [start, end) of a
// WTF-8 byte array into the instance's memory at addr.
//
// Arguments, in order: instance, policy (Smi), WTF-8 byte array, addr, start,
// end (the last three are Numbers converted to uint32).
// Throws kWasmTrapMemOutOfBounds if the destination range does not fit in the
// instance's memory, and kWasmTrapStringIsolatedSurrogate if the policy is
// kWtf8PolicyReject and the range contains an encoded surrogate.
RUNTIME_FUNCTION(Runtime_WasmStringViewWtf8Encode) {
ClearThreadInWasmScope flag_scope(isolate);
DCHECK_EQ(6, args.length());
HandleScope scope(isolate);
WasmInstanceObject instance = WasmInstanceObject::cast(args[0]);
uint32_t policy_value = args.positive_smi_value_at(1);
Handle<ByteArray> array(ByteArray::cast(args[2]), isolate);
uint32_t addr = NumberToUint32(args[3]);
uint32_t start = NumberToUint32(args[4]);
uint32_t end = NumberToUint32(args[5]);
// The caller is expected to pass an in-range policy and a well-ordered,
// in-bounds source range; these are only checked in debug builds.
DCHECK(policy_value <= wasm::kLastWtf8Policy);
DCHECK_LE(start, end);
DCHECK(base::IsInBounds<size_t>(start, end - start, array->length()));
auto policy = static_cast<wasm::StringRefWtf8Policy>(policy_value);
size_t length = end - start;
// The destination bounds check runs even when length is zero.
if (!base::IsInBounds<size_t>(addr, length, instance.memory_size())) {
return ThrowWasmError(isolate, MessageTemplate::kWasmTrapMemOutOfBounds);
}
byte* memory_start = reinterpret_cast<byte*>(instance.memory_start());
const byte* src =
reinterpret_cast<const byte*>(array->GetDataStartAddress() + start);
byte* dst = memory_start + addr;
// Offsets (relative to src) of encoded surrogates. Left empty for
// kWtf8PolicyAccept, which makes the replacement loop below a no-op.
std::vector<size_t> surrogates;
if (policy != wasm::kWtf8PolicyAccept) {
unibrow::Wtf8::ScanForSurrogates({src, length}, &surrogates);
// Reject before copying, so that a trapping encode leaves memory untouched.
if (policy == wasm::kWtf8PolicyReject && !surrogates.empty()) {
return ThrowWasmError(isolate,
MessageTemplate::kWasmTrapStringIsolatedSurrogate);
}
}
MemCopy(dst, src, length);
// Replace policy: overwrite each copied surrogate in place with the encoding
// of kBadChar. NOTE(review): this presumes the replacement encodes to the
// same number of bytes as the surrogate, since the copy length is not
// adjusted -- confirm.
for (size_t surrogate : surrogates) {
DCHECK_LT(surrogate, length);
DCHECK_EQ(policy, wasm::kWtf8PolicyReplace);
unibrow::Utf8::Encode(reinterpret_cast<char*>(dst + surrogate),
unibrow::Utf8::kBadChar, 0, false);
}
// Unused.
return Smi(0);
}
} // namespace internal
} // namespace v8
......@@ -619,7 +619,8 @@ namespace internal {
F(WasmStringEncodeWtf8, 5, 1) \
F(WasmStringEncodeWtf16, 6, 1) \
F(WasmStringEncodeWtf8Array, 4, 1) \
F(WasmStringAsWtf8, 1, 1)
F(WasmStringAsWtf8, 1, 1) \
F(WasmStringViewWtf8Encode, 6, 1)
#define FOR_EACH_INTRINSIC_WASM_TEST(F, I) \
F(DeserializeWasmModule, 2, 1) \
......
......@@ -5,8 +5,12 @@
// This file was generated at 2014-10-08 15:25:47.940335
#include "src/strings/unicode.h"
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "src/strings/unicode-inl.h"
#if V8_ENABLE_WEBASSEMBLY
......@@ -255,6 +259,30 @@ bool Wtf8::ValidateEncoding(const byte* bytes, size_t length) {
}
return state == State::kAccept;
}
// Appends to |surrogate_offsets| the byte offset, relative to the start of
// |wtf8|, of every encoded surrogate codepoint found in |wtf8|. Existing
// entries of |surrogate_offsets| are left in place.
//
// Precondition: valid WTF-8.
void Wtf8::ScanForSurrogates(const v8::base::Vector<const byte>& wtf8,
                             std::vector<size_t>* surrogate_offsets) {
  // A surrogate codepoint is encoded in a three-byte sequence:
  //
  //   0xED [0xA0,0xBF] [0x80,0xBF]
  //
  // If the first byte is 0xED, you already have a 50% chance of the value being
  // a surrogate; you just have to check the second byte.  (There are
  // three-byte non-surrogates starting with 0xED whose second byte is in
  // [0x80,0x9F].)  Could speed this up with SWAR; most likely case is that no
  // byte in the array is 0xED.
  const byte kWtf8SurrogateFirstByte = 0xED;
  const byte kWtf8SurrogateSecondByteHighBit = 0x20;
  // Only visit offsets at which a complete three-byte sequence still fits.
  // For valid WTF-8 this finds exactly the same offsets as scanning to the
  // end, but it guarantees that wtf8[i + 1] is never read out of bounds and
  // that every reported offset has three bytes available, which callers that
  // overwrite the surrogate in place depend on.
  for (size_t i = 0; i + 2 < wtf8.size(); i++) {
    if (wtf8[i] == kWtf8SurrogateFirstByte &&
        (wtf8[i + 1] & kWtf8SurrogateSecondByteHighBit)) {
      // Record the byte offset of the encoded surrogate.
      surrogate_offsets->push_back(i);
    }
  }
}
#endif // V8_ENABLE_WEBASSEMBLY
// Uppercase: point.category == 'Lu'
......
......@@ -6,7 +6,9 @@
#define V8_STRINGS_UNICODE_H_
#include <sys/types.h>
#include "src/base/bit-field.h"
#include "src/base/vector.h"
#include "src/common/globals.h"
#include "src/third_party/utf8-decoder/utf8-decoder.h"
/**
......@@ -230,6 +232,9 @@ class V8_EXPORT_PRIVATE Wtf8 {
// this function checks for a valid "generalized UTF-8" sequence, with the
// additional constraint that surrogate pairs are not allowed.
static bool ValidateEncoding(const byte* str, size_t length);
// Appends the byte offset of each encoded surrogate codepoint found in |wtf8|
// to |surrogate_offsets|. Precondition: |wtf8| is valid WTF-8.
static void ScanForSurrogates(const v8::base::Vector<const byte>& wtf8,
std::vector<size_t>* surrogate_offsets);
};
#endif // V8_ENABLE_WEBASSEMBLY
......
......@@ -6662,7 +6662,49 @@ class LiftoffCompiler {
const Value& view, const Value& addr,
const Value& pos, const Value& bytes,
Value* next_pos, Value* bytes_written) {
UNIMPLEMENTED();
LiftoffRegList pinned;
LiftoffAssembler::VarState& bytes_var =
__ cache_state()->stack_state.end()[-1];
LiftoffAssembler::VarState& pos_var =
__ cache_state()->stack_state.end()[-2];
LiftoffAssembler::VarState& addr_var =
__ cache_state()->stack_state.end()[-3];
LiftoffRegister view_reg = pinned.set(
__ LoadToRegister(__ cache_state()->stack_state.end()[-4], pinned));
MaybeEmitNullCheck(decoder, view_reg.gp(), pinned, view.type);
LiftoffAssembler::VarState view_var(kRef, view_reg, 0);
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(memory_reg, imm.memory.index);
LiftoffAssembler::VarState memory_var(kSmiKind, memory_reg, 0);
LiftoffRegister policy_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(policy_reg, static_cast<int32_t>(imm.policy.value));
LiftoffAssembler::VarState policy_var(kSmiKind, policy_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringViewWtf8Encode,
MakeSig::Returns(kI32, kI32)
.Params(kI32, kI32, kI32, kRef, kSmiKind, kSmiKind),
{
addr_var,
pos_var,
bytes_var,
view_var,
memory_var,
policy_var,
},
decoder->position());
__ DropValues(4);
RegisterDebugSideTableEntry(decoder, DebugSideTableBuilder::kDidSpill);
LiftoffRegister next_pos_reg(kReturnRegister0);
__ PushRegister(kI32, next_pos_reg);
LiftoffRegister bytes_written_reg(kReturnRegister1);
__ PushRegister(kI32, bytes_written_reg);
}
void StringViewWtf8Slice(FullDecoder* decoder, const Value& view,
......
......@@ -1509,7 +1509,10 @@ class WasmGraphBuildingInterface {
const Value& view, const Value& addr,
const Value& pos, const Value& bytes,
Value* next_pos, Value* bytes_written) {
UNIMPLEMENTED();
builder_->StringViewWtf8Encode(
imm.memory.index, imm.policy.value, view.node, NullCheckFor(view.type),
addr.node, pos.node, bytes.node, &next_pos->node, &bytes_written->node,
decoder->position());
}
void StringViewWtf8Slice(FullDecoder* decoder, const Value& view,
......
......@@ -141,7 +141,8 @@ struct WasmModule;
V(WasmStringEncodeWtf8Array) \
V(WasmStringEncodeWtf16Array) \
V(WasmStringAsWtf8) \
V(WasmStringViewWtf8Advance)
V(WasmStringViewWtf8Advance) \
V(WasmStringViewWtf8Encode)
// Sorted, disjoint and non-overlapping memory regions. A region is of the
// form [start, end). So there's no [start, end), [end, other_end),
......
......@@ -14,6 +14,8 @@ let kSig_i_wii = makeSig([kWasmStringRef, kWasmI32, kWasmI32], [kWasmI32]);
let kSig_i_ww = makeSig([kWasmStringRef, kWasmStringRef], [kWasmI32]);
let kSig_i_wiii = makeSig([kWasmStringRef, kWasmI32, kWasmI32, kWasmI32],
[kWasmI32]);
let kSig_ii_wiii = makeSig([kWasmStringRef, kWasmI32, kWasmI32, kWasmI32],
[kWasmI32, kWasmI32]);
let kSig_w_wii = makeSig([kWasmStringRef, kWasmI32, kWasmI32],
[kWasmStringRef]);
let kSig_w_ww = makeSig([kWasmStringRef, kWasmStringRef], [kWasmStringRef]);
......@@ -45,11 +47,50 @@ function encodeWtf8(str) {
return out;
}
// Decode the valid WTF-8 bytes in [start, end) of `wtf8` back into the
// string they encode. Both bounds must fall on codepoint boundaries.
function decodeWtf8(wtf8, start, end) {
  const pieces = [];
  let cursor = start;
  while (cursor < end) {
    const lead = wtf8[cursor];
    if ((lead & 0xC0) == 0x80) {
      // Weak sanity check of the precondition (valid WTF-8, boundaries on
      // codepoints): decoding must never begin on a continuation byte.
      throw new Error('invalid wtf8');
    }
    // The lead byte fixes both the number of continuation bytes and the
    // payload bits it contributes itself.
    let trailing;
    let codepoint;
    if (lead <= 0x7F) {
      trailing = 0;
      codepoint = lead;
    } else if (lead <= 0xDF) {
      trailing = 1;
      codepoint = lead & 0x1f;
    } else if (lead <= 0xEF) {
      trailing = 2;
      codepoint = lead & 0x0f;
    } else {
      trailing = 3;
      codepoint = lead & 0x07;
    }
    // Each continuation byte contributes its low six bits.
    for (let k = 1; k <= trailing; k++) {
      codepoint = (codepoint << 6) | (wtf8[cursor + k] & 0x3f);
    }
    cursor += trailing + 1;
    pieces.push(String.fromCodePoint(codepoint));
  }
  // Decoding must stop exactly on the requested end boundary.
  assertEquals(cursor, end);
  return pieces.join('');
}
let interestingStrings = ['',
'ascii',
'latin \xa9 1',
'two \ucccc byte',
'surrogate \ud800\udc000 pair',
'surrogate \ud800\udc00 pair',
'isolated \ud800 leading',
'isolated \udc00 trailing',
'\ud800 isolated leading at beginning',
......@@ -783,6 +824,8 @@ function makeWtf16TestDataSegment() {
(function TestStringViewWtf8() {
let builder = new WasmModuleBuilder();
builder.addMemory(1, undefined, true /* exported */, false);
builder.addFunction("advance", kSig_i_wii)
.exportFunc()
.addBody([
......@@ -802,6 +845,32 @@ function makeWtf16TestDataSegment() {
kGCPrefix, kExprStringViewWtf8Advance
]);
for (let [name, policy] of Object.entries({utf8: kWtf8PolicyReject,
wtf8: kWtf8PolicyAccept,
replace: kWtf8PolicyReplace})) {
builder.addFunction(`encode_${name}`, kSig_ii_wiii)
.exportFunc()
.addBody([
kExprLocalGet, 0,
kGCPrefix, kExprStringAsWtf8,
kExprLocalGet, 1,
kExprLocalGet, 2,
kExprLocalGet, 3,
kGCPrefix, kExprStringViewWtf8Encode, 0, policy
]);
}
builder.addFunction("encode_null", kSig_v_v)
.exportFunc()
.addBody([
kExprRefNull, kStringViewWtf8Code,
kExprI32Const, 0,
kExprI32Const, 0,
kExprI32Const, 0,
kGCPrefix, kExprStringViewWtf8Encode, 0, kWtf8PolicyAccept,
kExprDrop,
kExprDrop
]);
function Wtf8StartsCodepoint(wtf8, offset) {
return (wtf8[offset] & 0xc0) != 0x80;
}
......@@ -821,6 +890,7 @@ function makeWtf16TestDataSegment() {
}
let instance = builder.instantiate();
let memory = new Uint8Array(instance.exports.memory.buffer);
for (let pos = 0; pos < "ascii".length; pos++) {
assertEquals(pos + 1, instance.exports.advance("ascii", pos, 1));
......@@ -836,8 +906,8 @@ function makeWtf16TestDataSegment() {
instance.exports.advance(str, 0, wtf8.length + 1));
assertEquals(wtf8.length,
instance.exports.advance(str, wtf8.length + 1, 0));
for (let pos = 0; pos < wtf8.length; pos++) {
for (let bytes = 0; bytes < wtf8.length - pos; bytes++) {
for (let pos = 0; pos <= wtf8.length; pos++) {
for (let bytes = 0; bytes <= wtf8.length - pos; bytes++) {
assertEquals(
CodepointStart(wtf8, Wtf8PositionTreatment(wtf8, pos) + bytes),
instance.exports.advance(str, pos, bytes));
......@@ -845,6 +915,87 @@ function makeWtf16TestDataSegment() {
}
}
// Runs the exported `encode_${variant}` function on `str` for the view range
// given by (start, length) at several memory offsets, and checks that it
// returns the expected [next position, bytes written] pair and writes exactly
// the WTF-8 encoding of `slice`, leaving the surrounding memory zeroed. Also
// checks that a destination extending past the end of memory traps without
// writing anything.
function checkEncoding(variant, str, slice, start, length) {
  const wholeEncoding = encodeWtf8(str);
  const sliceEncoding = encodeWtf8(slice);
  // Number of bytes on either side of a written region that must stay zero.
  const kSlop = 64;

  const zeroRange = (low, high) => {
    for (let i = low; i < high; i++) {
      memory[i] = 0;
    }
  };
  const expectZeroRange = (low, high) => {
    for (let i = low; i < high; i++) {
      assertEquals(0, memory[i]);
    }
  };
  const expectMemoryAt = (offset, expected) => {
    expectZeroRange(Math.max(0, offset - kSlop), offset);
    for (let i = 0; i < expected.length; i++) {
      assertEquals(expected[i], memory[offset + i]);
    }
    const tail = offset + expected.length;
    expectZeroRange(tail, Math.min(memory.length, tail + kSlop));
  };

  const encode = instance.exports[`encode_${variant}`];
  const expectedStart = Wtf8PositionTreatment(wholeEncoding, start);
  const expectedEnd =
      CodepointStart(wholeEncoding, expectedStart + sliceEncoding.length);

  // Highest offset at which the whole slice still fits in memory.
  const lastFit = memory.length - sliceEncoding.length;
  for (const offset of [0, 42, lastFit]) {
    assertArrayEquals([expectedEnd, expectedEnd - expectedStart],
                      encode(str, offset, start, length));
    expectMemoryAt(offset, sliceEncoding);
    // Restore the all-zero state for the next iteration.
    zeroRange(offset, offset + sliceEncoding.length);
  }

  // Two bytes past the last fitting offset must trap and write nothing.
  assertThrows(() => encode(str, lastFit + 2, start, length),
               WebAssembly.RuntimeError, "memory access out of bounds");
  expectMemoryAt(lastFit - 2, []);
}
checkEncoding('utf8', "fox", "f", 0, 1);
checkEncoding('utf8', "fox", "fo", 0, 2);
checkEncoding('utf8', "fox", "fox", 0, 3);
checkEncoding('utf8', "fox", "fox", 0, 300);
checkEncoding('utf8', "fox", "", 1, 0);
checkEncoding('utf8', "fox", "o", 1, 1);
checkEncoding('utf8', "fox", "ox", 1, 2);
checkEncoding('utf8', "fox", "ox", 1, 200);
checkEncoding('utf8', "fox", "", 2, 0);
checkEncoding('utf8', "fox", "x", 2, 1);
checkEncoding('utf8', "fox", "x", 2, 2);
checkEncoding('utf8', "fox", "", 3, 0);
checkEncoding('utf8', "fox", "", 3, 1_000_000_000);
checkEncoding('utf8', "fox", "", 1_000_000_000, 1_000_000_000);
checkEncoding('utf8', "fox", "", 100, 100);
for (let str of interestingStrings) {
let wtf8 = encodeWtf8(str);
for (let pos = 0; pos <= wtf8.length; pos++) {
for (let bytes = 0; bytes <= wtf8.length - pos; bytes++) {
let start = Wtf8PositionTreatment(wtf8, pos);
let end = CodepointStart(wtf8, start + bytes);
let expected = decodeWtf8(wtf8, start, end);
checkEncoding('wtf8', str, expected, pos, bytes);
if (HasIsolatedSurrogate(expected)) {
assertThrows(() => instance.exports.encode_utf8(str, 0, pos, bytes),
WebAssembly.RuntimeError,
"Failed to encode string as UTF-8: " +
"contains unpaired surrogate");
checkEncoding('replace', str,
ReplaceIsolatedSurrogates(expected), pos, bytes);
} else {
checkEncoding('utf8', str, expected, pos, bytes);
checkEncoding('replace', str, expected, pos, bytes);
}
}
}
}
assertThrows(() => instance.exports.advance_null(),
WebAssembly.RuntimeError, "dereferencing a null pointer");
assertThrows(() => instance.exports.encode_null(),
WebAssembly.RuntimeError, "dereferencing a null pointer");
})();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment