Commit 06594a8d authored by Ng Zhi An's avatar Ng Zhi An Committed by Commit Bot

Reland "[wasm-simd][liftoff] Add S128 case for stack movements"

This relands commit 5cfe053e.

Original change's description:
> [wasm-simd][liftoff] Add S128 case for stack movements
>
> The two cases we are fixing here are Construct and
> LoadCallerFrameSlot, which are closely related.
>
> Construct is called during PrepareCall, where we build up
> LiftoffStackSlots when we need to move an arg from caller's stack frame
> into callee's stack frame. LoadCallerFrameSlot is the parallel to
> this, called in ProcessParameter during decoding of the callee's
> function body.
>
> In most cases, Construct needs a new case to handle kWasmS128, and calls
> the relevant assembler to push a s128 onto the stack.
>
> ARM64 requires 16-byte alignment of sp, so we need to Claim the right
> number of kXRegSize slots first, which requires
> us traversing the list of slots to figure out how many s128 values there
> are. This is a straightforward way to fix this, if efficiency is a
> problem, we can change LiftOffStackSlots::Add to sum up the slot sizes.
>
> On IA32, pushing s128 values will require 4 calls to push. Instead, we
> use a sub and two movdqu, which will generate less code in most cases.
>
> On x64, there is no 128-bit push, so we call push twice.
>
> Bug: v8:9909
> Change-Id: I3af35b8462ea9c3b9b2d90800c37d11b5e95be59
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2015945
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Reviewed-by: Clemens Backes <clemensb@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#65956}

Bug: v8:9909
Change-Id: Icdaead289abe13faf75bb9e049929f7fd7c59a08
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2036760
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: 's avatarClemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66119}
parent 868e4e19
......@@ -601,6 +601,13 @@ void LiftoffAssembler::LoadCallerFrameSlot(LiftoffRegister dst,
case kWasmF64:
vldr(dst.fp(), src);
break;
case kWasmS128: {
UseScratchRegisterScope temps(this);
Register addr = liftoff::CalculateActualAddress(this, &temps, src.rn(),
no_reg, src.offset());
vld1(Neon8, NeonListOperand(dst.low_fp(), 2), NeonMemOperand(addr));
break;
}
default:
UNREACHABLE();
}
......@@ -1736,6 +1743,16 @@ void LiftoffStackSlots::Construct() {
asm_->vldr(scratch, liftoff::GetStackSlot(slot.src_offset_));
asm_->vpush(scratch);
} break;
case kWasmS128: {
MemOperand mem_op = liftoff::GetStackSlot(slot.src_offset_);
UseScratchRegisterScope temps(asm_);
Register addr = liftoff::CalculateActualAddress(
asm_, &temps, mem_op.rn(), no_reg, mem_op.offset());
QwNeonRegister scratch = temps.AcquireQ();
asm_->vld1(Neon8, NeonListOperand(scratch), NeonMemOperand(addr));
asm_->vpush(scratch);
break;
}
default:
UNREACHABLE();
}
......@@ -1757,6 +1774,9 @@ void LiftoffStackSlots::Construct() {
case kWasmF64:
asm_->vpush(src.reg().fp());
break;
case kWasmS128:
asm_->vpush(liftoff::GetSimd128Register(src.reg().low_fp()));
break;
default:
UNREACHABLE();
}
......
......@@ -1186,12 +1186,15 @@ void LiftoffAssembler::DeallocateStackSlot(uint32_t size) {
void LiftoffAssembler::DebugBreak() { debug("DebugBreak", 0, BREAK); }
void LiftoffStackSlots::Construct() {
size_t slot_count = slots_.size();
size_t num_slots = 0;
for (auto& slot : slots_) {
num_slots += slot.src_.type() == kWasmS128 ? 2 : 1;
}
// The stack pointer is required to be quadword aligned.
asm_->Claim(RoundUp(slot_count, 2));
size_t slot_index = 0;
asm_->Claim(RoundUp(num_slots, 2));
size_t poke_offset = num_slots * kXRegSize;
for (auto& slot : slots_) {
size_t poke_offset = (slot_count - slot_index - 1) * kXRegSize;
poke_offset -= slot.src_.type() == kWasmS128 ? kXRegSize * 2 : kXRegSize;
switch (slot.src_.loc()) {
case LiftoffAssembler::VarState::kStack: {
UseScratchRegisterScope temps(asm_);
......@@ -1218,7 +1221,6 @@ void LiftoffStackSlots::Construct() {
}
break;
}
slot_index++;
}
}
......
......@@ -54,6 +54,9 @@ inline void Load(LiftoffAssembler* assm, LiftoffRegister dst, Register base,
case kWasmF64:
assm->movsd(dst.fp(), src);
break;
case kWasmS128:
assm->movdqu(dst.fp(), src);
break;
default:
UNREACHABLE();
}
......@@ -98,6 +101,10 @@ inline void push(LiftoffAssembler* assm, LiftoffRegister reg, ValueType type) {
assm->AllocateStackSpace(sizeof(double));
assm->movsd(Operand(esp, 0), reg.fp());
break;
case kWasmS128:
assm->AllocateStackSpace(sizeof(double) * 2);
assm->movdqu(Operand(esp, 0), reg.fp());
break;
default:
UNREACHABLE();
}
......@@ -2067,6 +2074,15 @@ void LiftoffStackSlots::Construct() {
const LiftoffAssembler::VarState& src = slot.src_;
switch (src.loc()) {
case LiftoffAssembler::VarState::kStack:
// The combination of AllocateStackSpace and 2 movdqu is usually smaller
// in code size than doing 4 pushes.
if (src.type() == kWasmS128) {
asm_->AllocateStackSpace(sizeof(double) * 2);
asm_->movdqu(liftoff::kScratchDoubleReg,
liftoff::GetStackSlot(slot.src_offset_));
asm_->movdqu(Operand(esp, 0), liftoff::kScratchDoubleReg);
break;
}
if (src.type() == kWasmF64) {
DCHECK_EQ(kLowWord, slot.half_);
asm_->push(liftoff::GetHalfStackSlot(slot.src_offset_, kHighWord));
......
......@@ -71,6 +71,9 @@ inline void Load(LiftoffAssembler* assm, LiftoffRegister dst, Operand src,
case kWasmF64:
assm->Movsd(dst.fp(), src);
break;
case kWasmS128:
assm->Movdqu(dst.fp(), src);
break;
default:
UNREACHABLE();
}
......@@ -110,6 +113,10 @@ inline void push(LiftoffAssembler* assm, LiftoffRegister reg, ValueType type) {
assm->AllocateStackSpace(kSystemPointerSize);
assm->Movsd(Operand(rsp, 0), reg.fp());
break;
case kWasmS128:
assm->AllocateStackSpace(kSystemPointerSize * 2);
assm->Movdqu(Operand(rsp, 0), reg.fp());
break;
default:
UNREACHABLE();
}
......@@ -1926,6 +1933,11 @@ void LiftoffStackSlots::Construct() {
// extended.
asm_->movl(kScratchRegister, liftoff::GetStackSlot(slot.src_offset_));
asm_->pushq(kScratchRegister);
} else if (src.type() == kWasmS128) {
// Since offsets are subtracted from sp, we need a smaller offset to
// push the top of a s128 value.
asm_->pushq(liftoff::GetStackSlot(slot.src_offset_ - 8));
asm_->pushq(liftoff::GetStackSlot(slot.src_offset_));
} else {
// For all other types, just push the whole (8-byte) stack slot.
// This is also ok for f32 values (even though we copy 4 uninitialized
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --experimental-wasm-simd
load('test/mjsunit/wasm/wasm-module-builder.js');
// This test case tries to exercise SIMD stack to stack movements by creating
// a function that has many parameters.
(function() {
const builder = new WasmModuleBuilder();
// At this point we have limited support for SIMD operations, but we can load
// and store s128 values. So this memory will be used for reading and writing
// s128 values and we will assert expected values from JS.
builder.addImportedMemory('m', 'imported_mem', 1, 2);
builder.addType(makeSig(new Array(18).fill(kWasmS128), []));
builder.addFunction(undefined, makeSig([], []))
.addLocals({s128_count: 9})
.addBodyWithEnd([
// These will all be args to the callee.
// Load first arg from memory, this was written with values from JS.
kExprI32Const, 0, // i32.const
kSimdPrefix, kExprS128LoadMem, 0, 0, // s128.load
kExprLocalGet, 0, // local.get
kExprLocalGet, 1, // local.get
kExprLocalGet, 2, // local.get
kExprLocalGet, 3, // local.get
kExprLocalGet, 4, // local.get
kExprLocalGet, 5, // local.get
kExprLocalGet, 6, // local.get
kExprLocalGet, 7, // local.get
kExprLocalGet, 8, // local.get
kExprLocalGet, 0, // local.get
kExprLocalGet, 1, // local.get
kExprLocalGet, 2, // local.get
kExprLocalGet, 3, // local.get
kExprLocalGet, 4, // local.get
kExprLocalGet, 5, // local.get
kExprLocalGet, 6, // local.get
// Load last s128 from memory, this was written with values from JS.
kExprI32Const, 16, // i32.const
kSimdPrefix, kExprS128LoadMem, 0, 0, // s128.load
kExprCallFunction, 0x01, // call
kExprEnd, // end
]);
builder.addFunction(undefined, 0 /* sig */).addBodyWithEnd([
kExprI32Const, 32, // i32.const
kExprLocalGet, 0, // local.get
kSimdPrefix, kExprS128StoreMem, 0, 0, // s128.store
kExprI32Const, 48, // i32.const
kExprLocalGet, 17, // local.get
kSimdPrefix, kExprS128StoreMem, 0, 0, // s128.store
kExprEnd, // end
]);
builder.addExport('main', 0);
var memory = new WebAssembly.Memory({initial: 1, maximum: 2});
const instance = builder.instantiate({m: {imported_mem: memory}});
// We write sentinel values to two s128 values at the start of the memory.
// Function 1 will read these values from memory, and pass them as the first
// and last arg to function 2. Function 2 then write these values to memory
// after these two s128 values.
const arr = new Uint32Array(memory.buffer);
for (let i = 0; i < 8; i++) {
arr[0] = i * 2;
}
instance.exports.main();
for (let i = 0; i < 8; i++) {
assertEquals(arr[i], arr[i + 8]);
}
})();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment