Commit 5cfe053e authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][liftoff] Add S128 case for stack movements

The two cases we are fixing here are Construct and
LoadCallerFrameSlot, which are closely related.

Construct is called during PrepareCall, where we build up
LiftoffStackSlots when we need to move an arg from caller's stack frame
into callee's stack frame. LoadCallerFrameSlot is the parallel to
this, called in ProcessParameter during decoding of the callee's
function body.

In most cases, Construct needs a new case to handle kWasmS128, and calls
the relevant assembler to push a s128 onto the stack.

ARM64 requires 16-byte alignment of sp, so we need to Claim the right
number of kXRegSize slots first, which requires
us traversing the list of slots to figure out how many s128 values there
are. This is a straightforward way to fix this; if efficiency becomes a
problem, we can change LiftoffStackSlots::Add to sum up the slot sizes.

On IA32, pushing s128 values will require 4 calls to push. Instead, we
use a sub and two movdqu, which will generate less code in most cases.

On x64, there is no 128-bit push, so we call push twice.

Bug: v8:9909
Change-Id: I3af35b8462ea9c3b9b2d90800c37d11b5e95be59
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2015945
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65956}
parent a8d059d9
......@@ -577,6 +577,13 @@ void LiftoffAssembler::LoadCallerFrameSlot(LiftoffRegister dst,
case kWasmF64:
vldr(dst.fp(), src);
break;
case kWasmS128: {
UseScratchRegisterScope temps(this);
Register addr = liftoff::CalculateActualAddress(this, &temps, src.rn(),
no_reg, src.offset());
vld1(Neon8, NeonListOperand(dst.low_fp(), 2), NeonMemOperand(addr));
break;
}
default:
UNREACHABLE();
}
......@@ -1707,6 +1714,16 @@ void LiftoffStackSlots::Construct() {
asm_->vldr(scratch, liftoff::GetStackSlot(slot.src_offset_));
asm_->vpush(scratch);
} break;
case kWasmS128: {
MemOperand mem_op = liftoff::GetStackSlot(slot.src_offset_);
UseScratchRegisterScope temps(asm_);
Register addr = liftoff::CalculateActualAddress(
asm_, &temps, mem_op.rn(), no_reg, mem_op.offset());
QwNeonRegister scratch = temps.AcquireQ();
asm_->vld1(Neon8, NeonListOperand(scratch), NeonMemOperand(addr));
asm_->vpush(scratch);
break;
}
default:
UNREACHABLE();
}
......@@ -1728,6 +1745,9 @@ void LiftoffStackSlots::Construct() {
case kWasmF64:
asm_->vpush(src.reg().fp());
break;
case kWasmS128:
asm_->vpush(liftoff::GetSimd128Register(src.reg().low_fp()));
break;
default:
UNREACHABLE();
}
......
......@@ -1157,12 +1157,15 @@ void LiftoffAssembler::DeallocateStackSlot(uint32_t size) {
void LiftoffAssembler::DebugBreak() { debug("DebugBreak", 0, BREAK); }
void LiftoffStackSlots::Construct() {
size_t slot_count = slots_.size();
size_t num_slots = 0;
for (auto& slot : slots_) {
num_slots += slot.src_.type() == kWasmS128 ? 2 : 1;
}
// The stack pointer is required to be quadword aligned.
asm_->Claim(RoundUp(slot_count, 2));
size_t slot_index = 0;
asm_->Claim(RoundUp(num_slots, 2));
size_t poke_offset = num_slots * kXRegSize;
for (auto& slot : slots_) {
size_t poke_offset = (slot_count - slot_index - 1) * kXRegSize;
poke_offset -= slot.src_.type() == kWasmS128 ? kXRegSize * 2 : kXRegSize;
switch (slot.src_.loc()) {
case LiftoffAssembler::VarState::kStack: {
UseScratchRegisterScope temps(asm_);
......@@ -1189,7 +1192,6 @@ void LiftoffStackSlots::Construct() {
}
break;
}
slot_index++;
}
}
......
......@@ -54,6 +54,9 @@ inline void Load(LiftoffAssembler* assm, LiftoffRegister dst, Register base,
case kWasmF64:
assm->movsd(dst.fp(), src);
break;
case kWasmS128:
assm->movdqu(dst.fp(), src);
break;
default:
UNREACHABLE();
}
......@@ -98,6 +101,10 @@ inline void push(LiftoffAssembler* assm, LiftoffRegister reg, ValueType type) {
assm->AllocateStackSpace(sizeof(double));
assm->movsd(Operand(esp, 0), reg.fp());
break;
case kWasmS128:
assm->AllocateStackSpace(sizeof(double) * 2);
assm->movdqu(Operand(esp, 0), reg.fp());
break;
default:
UNREACHABLE();
}
......@@ -2043,6 +2050,15 @@ void LiftoffStackSlots::Construct() {
const LiftoffAssembler::VarState& src = slot.src_;
switch (src.loc()) {
case LiftoffAssembler::VarState::kStack:
// The combination of AllocateStackSpace and 2 movdqu is usually smaller
// in code size than doing 4 pushes.
if (src.type() == kWasmS128) {
asm_->AllocateStackSpace(sizeof(double) * 2);
asm_->movdqu(liftoff::kScratchDoubleReg,
liftoff::GetStackSlot(slot.src_offset_));
asm_->movdqu(Operand(esp, 0), liftoff::kScratchDoubleReg);
break;
}
if (src.type() == kWasmF64) {
DCHECK_EQ(kLowWord, slot.half_);
asm_->push(liftoff::GetHalfStackSlot(slot.src_offset_, kHighWord));
......
......@@ -71,6 +71,9 @@ inline void Load(LiftoffAssembler* assm, LiftoffRegister dst, Operand src,
case kWasmF64:
assm->Movsd(dst.fp(), src);
break;
case kWasmS128:
assm->Movdqu(dst.fp(), src);
break;
default:
UNREACHABLE();
}
......@@ -110,6 +113,10 @@ inline void push(LiftoffAssembler* assm, LiftoffRegister reg, ValueType type) {
assm->AllocateStackSpace(kSystemPointerSize);
assm->Movsd(Operand(rsp, 0), reg.fp());
break;
case kWasmS128:
assm->AllocateStackSpace(kSystemPointerSize * 2);
assm->Movdqu(Operand(rsp, 0), reg.fp());
break;
default:
UNREACHABLE();
}
......@@ -1764,6 +1771,11 @@ void LiftoffStackSlots::Construct() {
// extended.
asm_->movl(kScratchRegister, liftoff::GetStackSlot(slot.src_offset_));
asm_->pushq(kScratchRegister);
} else if (src.type() == kWasmS128) {
// Since offsets are subtracted from sp, we need a smaller offset to
// push the top of a s128 value.
asm_->pushq(liftoff::GetStackSlot(slot.src_offset_ - 8));
asm_->pushq(liftoff::GetStackSlot(slot.src_offset_));
} else {
// For all other types, just push the whole (8-byte) stack slot.
// This is also ok for f32 values (even though we copy 4 uninitialized
......
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --experimental-wasm-simd
load('test/mjsunit/wasm/wasm-module-builder.js');

// This test case tries to exercise SIMD stack-to-stack movements by creating
// a function that has many s128 parameters, forcing some of them to be
// passed on the stack.
(function() {
const builder = new WasmModuleBuilder();
// At this point we have limited support for SIMD operations, but we can load
// and store s128 values. So this memory will be used for reading and writing
// s128 values, and we will assert expected values from JS.
builder.addImportedMemory('m', 'imported_mem', 1, 2);
// Signature taking 18 s128 parameters and returning nothing (type index 0).
builder.addType(makeSig(new Array(18).fill(kWasmS128), []));
// Function 0 (exported as 'main'): builds 18 s128 args and calls function 1.
builder.addFunction(undefined, makeSig([], []))
    .addLocals({s128_count: 9})
    .addBodyWithEnd([
      // These will all be args to the callee.
      // Load first arg from memory, this was written with values from JS.
      kExprI32Const, 0,
      kSimdPrefix, kExprS128LoadMem, 0, 0,
      // 16 middle args come from the (zero-initialized) locals.
      kExprLocalGet, 0,
      kExprLocalGet, 1,
      kExprLocalGet, 2,
      kExprLocalGet, 3,
      kExprLocalGet, 4,
      kExprLocalGet, 5,
      kExprLocalGet, 6,
      kExprLocalGet, 7,
      kExprLocalGet, 8,
      kExprLocalGet, 0,
      kExprLocalGet, 1,
      kExprLocalGet, 2,
      kExprLocalGet, 3,
      kExprLocalGet, 4,
      kExprLocalGet, 5,
      kExprLocalGet, 6,
      // Load last s128 from memory, this was written with values from JS.
      kExprI32Const, 16,
      kSimdPrefix, kExprS128LoadMem, 0, 0,
      kExprCallFunction, 0x01,
      kExprEnd,
    ]);
// Function 1: stores its first and last s128 params back to memory at
// offsets 32 and 48, just past the two input values.
builder.addFunction(undefined, 0 /* sig */)
    .addBodyWithEnd([
      kExprI32Const, 32,
      kExprLocalGet, 0,
      kSimdPrefix, kExprS128StoreMem, 0, 0,
      kExprI32Const, 48,
      kExprLocalGet, 17,
      kSimdPrefix, kExprS128StoreMem, 0, 0,
      kExprEnd,
    ]);
builder.addExport('main', 0);
var memory = new WebAssembly.Memory({initial: 1, maximum: 2});
const instance = builder.instantiate({m: {imported_mem: memory}});
// We write sentinel values to two s128 values (8 u32 words) at the start of
// the memory. Function 0 will read these values from memory and pass them as
// the first and last arg to function 1. Function 1 then writes these values
// to memory just after the two input s128 values.
const arr = new Uint32Array(memory.buffer);
for (let i = 0; i < 8; i++) {
  // Fix: write a distinct sentinel into each word (the original wrote only
  // arr[0] each iteration, leaving words 1-7 zero and weakening the check).
  arr[i] = i * 2;
}
instance.exports.main();
// The copied-back words (arr[8..15]) must match the sentinels (arr[0..7]).
for (let i = 0; i < 8; i++) {
  assertEquals(arr[i], arr[i + 8]);
}
})();
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment