Commit 10d4418f authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Optimize i64x2.shr_s

Use logical shifts to emulate arithmetic shifts: first add a bias of 2^63
so every signed value becomes unsigned, shift logically, then subtract the
equally shifted bias.
Details are in code comments for SharedTurboAssembler::I64x2ShrS.
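
As a sanity check of the algebra (illustrative only, not part of this
change; the helper name is invented), the same identity on a single
scalar lane in plain C++:

  #include <cstdint>

  // Arithmetic right shift emulated with logical shifts; assumes two's
  // complement and 0 <= shift < 64.
  int64_t ArithShrViaLogical(int64_t value, int shift) {
    const uint64_t bias = uint64_t{1} << 63;                // 2^63
    uint64_t biased = static_cast<uint64_t>(value) ^ bias;  // value + 2^63 (only the top bit flips)
    return static_cast<int64_t>((biased >> shift) - (bias >> shift));
  }
  // e.g. ArithShrViaLogical(-16, 2) == -4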

Also refactor ia32 (which was already using this algorithm) to use the
shared macro-assembler function, and convert the Liftoff implementations
as well.

Bug: v8:12058
Change-Id: Ia1fd5fe5a9a0b7a7f31c426d4112256c8bf7021b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3083291
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76209}
parent 67a565c0
@@ -18,6 +18,30 @@
namespace v8 {
namespace internal {
void SharedTurboAssembler::Move(Register dst, Register src) {
// Helper to paper over the different assembler function names.
if (dst != src) {
#if V8_TARGET_ARCH_IA32
mov(dst, src);
#elif V8_TARGET_ARCH_X64
movq(dst, src);
#else
#error Unsupported target architecture.
#endif
}
}
void SharedTurboAssembler::And(Register dst, Immediate src) {
// Helper to paper over the different assembler function names.
#if V8_TARGET_ARCH_IA32
and_(dst, src);
#elif V8_TARGET_ARCH_X64
andq(dst, src);
#else
#error Unsupported target architecture.
#endif
}
void SharedTurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
@@ -496,6 +520,67 @@ void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
}
}
void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
uint8_t shift, XMMRegister xmm_tmp) {
DCHECK_GT(64, shift);
DCHECK_NE(xmm_tmp, dst);
DCHECK_NE(xmm_tmp, src);
// Use logical right shift to emulate arithmetic right shifts:
// Given:
// signed >> c
// == (signed + 2^63 - 2^63) >> c
// == ((signed + 2^63) >> c) - (2^63 >> c)
// ^^^^^^^^^
// xmm_tmp
// signed + 2^63 is an unsigned number, so we can use logical right shifts.
// xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
Pcmpeqd(xmm_tmp, xmm_tmp);
Psllq(xmm_tmp, byte{63});
if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
Movapd(dst, src);
src = dst;
}
// Add a bias of 2^63 to convert signed to unsigned.
// Since only highest bit changes, use pxor instead of paddq.
Pxor(dst, src, xmm_tmp);
// Logically shift both value and bias.
Psrlq(dst, shift);
Psrlq(xmm_tmp, shift);
// Subtract shifted bias to convert back to signed value.
Psubq(dst, xmm_tmp);
}
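// --- Illustrative sketch, not V8 code (added for this write-up) ------------
// The constant-shift sequence above expressed with SSE2 intrinsics, to make
// the algorithm easier to follow. Function name and signature are invented.
#include <emmintrin.h>  // SSE2
__m128i I64x2ShrS_Sketch(__m128i src, int shift /* constant, 0..63 */) {
  __m128i ones = _mm_cmpeq_epi32(src, src);         // all-ones (pcmpeqd)
  __m128i bias = _mm_slli_epi64(ones, 63);          // 2^63 in each lane (psllq)
  __m128i biased = _mm_xor_si128(src, bias);        // signed + 2^63 (pxor, not paddq)
  __m128i shifted = _mm_srli_epi64(biased, shift);  // logical shift of biased value (psrlq)
  __m128i bias_sh = _mm_srli_epi64(bias, shift);    // logical shift of bias
  return _mm_sub_epi64(shifted, bias_sh);           // subtract shifted bias (psubq)
}
// ----------------------------------------------------------------------------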
void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
Register shift, XMMRegister xmm_tmp,
XMMRegister xmm_shift,
Register tmp_shift) {
DCHECK_NE(xmm_tmp, dst);
DCHECK_NE(xmm_tmp, src);
DCHECK_NE(xmm_shift, dst);
DCHECK_NE(xmm_shift, src);
// tmp_shift can alias shift since we don't use shift after masking it.
// See I64x2ShrS with constant shift for explanation of this algorithm.
Pcmpeqd(xmm_tmp, xmm_tmp);
Psllq(xmm_tmp, byte{63});
// Shift modulo 64.
Move(tmp_shift, shift);
And(tmp_shift, Immediate(0x3F));
Movd(xmm_shift, tmp_shift);
if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
Movapd(dst, src);
src = dst;
}
Pxor(dst, src, xmm_tmp);
Psrlq(dst, xmm_shift);
Psrlq(xmm_tmp, xmm_shift);
Psubq(dst, xmm_tmp);
}
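// --- Illustrative sketch, not V8 code (added for this write-up) ------------
// The register-shift variant above in SSE2 intrinsics: mask the count modulo
// 64, move it into an XMM register (movd), then use the XMM-count form of
// psrlq on both the biased value and the bias. Names are invented.
#include <emmintrin.h>  // SSE2
__m128i I64x2ShrS_Var_Sketch(__m128i src, int shift) {
  __m128i ones = _mm_cmpeq_epi32(src, src);
  __m128i bias = _mm_slli_epi64(ones, 63);          // 2^63 in each lane
  __m128i count = _mm_cvtsi32_si128(shift & 0x3F);  // shift modulo 64, in an XMM register
  __m128i biased = _mm_xor_si128(src, bias);        // signed + 2^63
  __m128i shifted = _mm_srl_epi64(biased, count);   // psrlq with an XMM count
  __m128i bias_sh = _mm_srl_epi64(bias, count);
  return _mm_sub_epi64(shifted, bias_sh);           // back to signed
}
// ----------------------------------------------------------------------------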
// 1. Unpack src0, src1 into even-number elements of scratch.
// 2. Unpack src1, src0 into even-number elements of dst.
// 3. Multiply 1. with 2.
......
@@ -33,6 +33,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
public:
using TurboAssemblerBase::TurboAssemblerBase;
// Move if registers are not identical.
void Move(Register dst, Register src);
void And(Register dst, Immediate src);
void Movapd(XMMRegister dst, XMMRegister src);
template <typename Dst, typename Src>
@@ -315,6 +319,11 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
XMMRegister scratch);
void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
XMMRegister scratch);
void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift,
XMMRegister xmm_tmp);
void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
XMMRegister xmm_tmp, XMMRegister xmm_shift,
Register tmp_shift);
void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
......
@@ -2083,22 +2083,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32I64x2ShrS: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
XMMRegister tmp = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
Operand shift = i.InputOperand(1);
// Take shift value modulo 64.
__ and_(shift, Immediate(63));
__ Movd(tmp, shift);
// Set up a mask [0x80000000,0,0x80000000,0].
__ Pcmpeqb(tmp2, tmp2);
__ Psllq(tmp2, tmp2, byte{63});
__ Psrlq(tmp2, tmp2, tmp);
__ Psrlq(dst, src, tmp);
__ Pxor(dst, tmp2);
__ Psubq(dst, tmp2);
if (HasImmediateInput(instr, 1)) {
__ I64x2ShrS(dst, src, i.InputInt6(1), kScratchDoubleReg);
} else {
__ I64x2ShrS(dst, src, i.InputRegister(1), kScratchDoubleReg,
i.TempSimd128Register(0), i.TempRegister(1));
}
break;
}
case kIA32I64x2Add: {
......
@@ -2417,16 +2417,16 @@ void InstructionSelector::VisitI64x2Neg(Node* node) {
void InstructionSelector::VisitI64x2ShrS(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
if (IsSupported(AVX)) {
Emit(kIA32I64x2ShrS, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)), g.Use(node->InputAt(1)),
arraysize(temps), temps);
InstructionOperand dst =
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
if (g.CanBeImmediate(node->InputAt(1))) {
Emit(kIA32I64x2ShrS, dst, g.UseRegister(node->InputAt(0)),
g.UseImmediate(node->InputAt(1)));
} else {
Emit(kIA32I64x2ShrS, g.DefineSameAsFirst(node),
g.UseUniqueRegister(node->InputAt(0)), g.Use(node->InputAt(1)),
arraysize(temps), temps);
InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
Emit(kIA32I64x2ShrS, dst, g.UseUniqueRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
}
}
......
@@ -2954,21 +2954,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I64x2ShrS: {
// TODO(zhin): there is vpsraq but requires AVX512
// ShrS on each quadword one at a time
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
Register tmp = i.ToRegister(instr->TempAt(0));
// Modulo 64 not required as sarq_cl will mask cl to 6 bits.
// lower quadword
__ Pextrq(tmp, src, int8_t{0x0});
__ sarq_cl(tmp);
__ Pinsrq(dst, tmp, uint8_t{0x0});
// upper quadword
__ Pextrq(tmp, src, int8_t{0x1});
__ sarq_cl(tmp);
__ Pinsrq(dst, tmp, uint8_t{0x1});
if (HasImmediateInput(instr, 1)) {
__ I64x2ShrS(dst, src, i.InputInt6(1), kScratchDoubleReg);
} else {
__ I64x2ShrS(dst, src, i.InputRegister(1), kScratchDoubleReg,
i.TempSimd128Register(0), kScratchRegister);
}
break;
}
case kX64I64x2Add: {
......
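
As the TODO in the kX64I64x2ShrS case notes, AVX-512VL adds a native 64-bit
arithmetic shift (vpsraq), which would replace the emulation entirely. A
hypothetical intrinsics equivalent, assuming AVX512F+AVX512VL are available
(not part of this change):

  #include <immintrin.h>

  __m128i I64x2ShrS_AVX512_Sketch(__m128i src, int shift) {
    // vpsraq with the count in an XMM register; both quadwords are shifted.
    return _mm_sra_epi64(src, _mm_cvtsi32_si128(shift & 0x3F));
  }
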
@@ -3298,11 +3298,17 @@ void InstructionSelector::VisitI64x2Neg(Node* node) {
void InstructionSelector::VisitI64x2ShrS(Node* node) {
X64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempRegister()};
// Use fixed to rcx, to use sarq_cl in codegen.
Emit(kX64I64x2ShrS, g.DefineSameAsFirst(node),
g.UseUniqueRegister(node->InputAt(0)), g.UseFixed(node->InputAt(1), rcx),
arraysize(temps), temps);
InstructionOperand dst =
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
if (g.CanBeImmediate(node->InputAt(1))) {
Emit(kX64I64x2ShrS, dst, g.UseRegister(node->InputAt(0)),
g.UseImmediate(node->InputAt(1)));
} else {
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kX64I64x2ShrS, dst, g.UseUniqueRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
}
}
void InstructionSelector::VisitI64x2Mul(Node* node) {
......
@@ -9,6 +9,7 @@
#include "src/codegen/assembler.h"
#include "src/heap/memory-chunk.h"
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/baseline/liftoff-register.h"
#include "src/wasm/simd-shuffle.h"
#include "src/wasm/value-type.h"
#include "src/wasm/wasm-objects.h"
@@ -3903,47 +3904,19 @@ void LiftoffAssembler::emit_i64x2_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
XMMRegister shift = liftoff::kScratchDoubleReg;
XMMRegister tmp =
GetUnusedRegister(RegClass::kFpReg, LiftoffRegList::ForRegs(dst, lhs))
.fp();
Register scratch =
GetUnusedRegister(RegClass::kGpReg, LiftoffRegList::ForRegs(rhs)).gp();
// Take shift value modulo 64.
and_(rhs.gp(), Immediate(63));
Movd(shift, rhs.gp());
// Set up a mask [0x80000000,0,0x80000000,0].
Pcmpeqb(tmp, tmp);
Psllq(tmp, tmp, byte{63});
Psrlq(tmp, tmp, shift);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrlq(dst.fp(), lhs.fp(), shift);
} else {
if (dst != lhs) {
movaps(dst.fp(), lhs.fp());
}
psrlq(dst.fp(), shift);
}
Pxor(dst.fp(), tmp);
Psubq(dst.fp(), tmp);
I64x2ShrS(dst.fp(), lhs.fp(), rhs.gp(), liftoff::kScratchDoubleReg, tmp,
scratch);
}
void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
XMMRegister tmp = liftoff::kScratchDoubleReg;
byte shift = rhs & 63;
// Set up a mask [0x80000000,0,0x80000000,0].
Pcmpeqb(tmp, tmp);
Psllq(tmp, tmp, byte{63});
Psrlq(tmp, tmp, shift);
liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlq, &Assembler::psrlq, 6>(
this, dst, lhs, rhs);
Pxor(dst.fp(), tmp);
Psubq(dst.fp(), tmp);
I64x2ShrS(dst.fp(), lhs.fp(), rhs & 0x3F, liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i64x2_shr_u(LiftoffRegister dst,
......
@@ -2358,38 +2358,6 @@ void EmitI8x16Shr(LiftoffAssembler* assm, LiftoffRegister dst,
}
}
// Can be used by both the immediate and register version of the shifts. psraq
// is only available in AVX512, so we can't use it yet.
template <typename ShiftOperand>
void EmitI64x2ShrS(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister lhs, ShiftOperand rhs,
bool shift_is_rcx = false) {
bool restore_rcx = false;
Register backup = kScratchRegister2;
if (!shift_is_rcx) {
if (assm->cache_state()->is_used(LiftoffRegister(rcx))) {
restore_rcx = true;
assm->movq(backup, rcx);
}
assm->movl(rcx, rhs);
}
Register tmp = kScratchRegister;
assm->Pextrq(tmp, lhs.fp(), int8_t{0x0});
assm->sarq_cl(tmp);
assm->Pinsrq(dst.fp(), tmp, uint8_t{0x0});
assm->Pextrq(tmp, lhs.fp(), int8_t{0x1});
assm->sarq_cl(tmp);
assm->Pinsrq(dst.fp(), tmp, uint8_t{0x1});
// restore rcx.
if (restore_rcx) {
assm->movq(rcx, backup);
}
}
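// --- Illustrative sketch, not V8 code (added for this write-up) ------------
// The helper removed above shifted each quadword separately through a
// general-purpose register. In intrinsics terms that strategy is roughly the
// following (assumes SSE4.1/x64 and an arithmetic >> on signed values; names
// are invented).
#include <smmintrin.h>  // SSE4.1
#include <cstdint>
__m128i I64x2ShrS_PerLane_Sketch(__m128i src, int shift) {
  int64_t lo = _mm_extract_epi64(src, 0) >> (shift & 0x3F);  // pextrq + sarq
  int64_t hi = _mm_extract_epi64(src, 1) >> (shift & 0x3F);
  __m128i dst = _mm_insert_epi64(src, lo, 0);                // pinsrq
  return _mm_insert_epi64(dst, hi, 1);
}
// ----------------------------------------------------------------------------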
inline void EmitAnyTrue(LiftoffAssembler* assm, LiftoffRegister dst,
LiftoffRegister src) {
assm->xorq(dst.gp(), dst.gp());
@@ -3495,13 +3463,13 @@ void LiftoffAssembler::emit_i64x2_shli(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitI64x2ShrS(this, dst, lhs, rhs.gp(),
/*shift_is_rcx=*/rhs.gp() == rcx);
I64x2ShrS(dst.fp(), lhs.fp(), rhs.gp(), kScratchDoubleReg,
liftoff::kScratchDoubleReg2, kScratchRegister);
}
void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst,
LiftoffRegister lhs, int32_t rhs) {
liftoff::EmitI64x2ShrS(this, dst, lhs, Immediate(rhs));
I64x2ShrS(dst.fp(), lhs.fp(), rhs & 0x3F, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i64x2_shr_u(LiftoffRegister dst,
......