Commit 39fb4e14 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share and optimize i16x8.splat

Change i16x8.splat to use Punpcklqdq instead of Pshufd as the final step
to move low 32 bits to all lanes.

Move this implementation to shared-macro-assembler and use it
everywhere.

Bug: v8:11589,v8:12090
Change-Id: I968b1dca5a262e4e67875caea18c5c09828cb33a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3092558
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76353}
parent bb12c48a
......@@ -16,6 +16,14 @@
#error Unsupported target architecture.
#endif
// Operand on IA32 can be a wrapper for a single register, in which case they
// should call I8x16Splat with |src| being Register.
#if V8_TARGET_ARCH_IA32
#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
#else
#define DCHECK_OPERAND_IS_NOT_REG(op)
#endif
namespace v8 {
namespace internal {
......@@ -277,11 +285,7 @@ void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
XMMRegister scratch) {
#if V8_TARGET_ARCH_IA32
// Operand on IA32 can be a wrapper for a single register, in which case they
// should call I8x16Splat |src| being Register.
DCHECK(!src.is_reg_only());
#endif
DCHECK_OPERAND_IS_NOT_REG(src);
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
vpbroadcastb(dst, src);
......@@ -411,6 +415,34 @@ void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
Packuswb(dst, tmp2);
}
// Shared pre-AVX2 tail of i16x8.splat: broadcasts the low 16 bits of |src|
// (either a Register or an Operand, selected via the Op template parameter)
// to all eight 16-bit lanes of |dst|.
template <typename Op>
void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
// Callers dispatch to vpbroadcastw when AVX2 is available, so this helper
// must only be reached on the non-AVX2 path.
DCHECK(!CpuFeatures::IsSupported(AVX2));
// Move the 32-bit scalar into the low dword of dst.
Movd(dst, src);
// Shuffle-immediate 0x0 replicates word 0 across the low four words.
Pshuflw(dst, dst, uint8_t{0x0});
// Duplicate the low quadword into the high quadword, completing the splat
// (one unpack instead of a full Pshufd lane shuffle).
Punpcklqdq(dst, dst);
}
// i16x8.splat with a general-purpose register source: broadcasts the low
// 16 bits of |src| to all eight lanes of |dst|.
void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
// vpbroadcastw takes an XMM or memory source, not a GPR, so move the
// scalar into dst first and then broadcast its word 0.
Movd(dst, src);
vpbroadcastw(dst, dst);
} else {
I16x8SplatPreAvx2(dst, src);
}
}
// i16x8.splat with a memory (Operand) source: broadcasts the low 16 bits of
// the value at |src| to all eight lanes of |dst|.
void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
// On IA32 an Operand may merely wrap a register; such callers must use the
// Register overload instead. The macro expands to nothing on other targets.
DCHECK_OPERAND_IS_NOT_REG(src);
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
// AVX2 can broadcast a 16-bit memory operand directly, no staging Movd.
vpbroadcastw(dst, src);
} else {
I16x8SplatPreAvx2(dst, src);
}
}
void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool is_signed) {
......@@ -886,3 +918,5 @@ void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
} // namespace internal
} // namespace v8
#undef DCHECK_OPERAND_IS_NOT_REG
......@@ -315,6 +315,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
XMMRegister tmp2);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I16x8Splat(XMMRegister dst, Register src);
void I16x8Splat(XMMRegister dst, Operand src);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scrat, bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
......@@ -357,6 +359,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
private:
template <typename Op>
void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
template <typename Op>
void I16x8SplatPreAvx2(XMMRegister dst, Op src);
};
} // namespace internal
} // namespace v8
......
......@@ -2697,10 +2697,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I16x8Splat: {
XMMRegister dst = i.OutputSimd128Register();
__ Movd(dst, i.InputOperand(0));
__ Pshuflw(dst, dst, uint8_t{0x0});
__ Pshufd(dst, dst, uint8_t{0x0});
if (instr->InputAt(0)->IsRegister()) {
__ I16x8Splat(i.OutputSimd128Register(), i.InputRegister(0));
} else {
__ I16x8Splat(i.OutputSimd128Register(), i.InputOperand(0));
}
break;
}
case kIA32I16x8ExtractLaneS: {
......
......@@ -3199,12 +3199,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kX64I16x8Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
__ Movd(dst, i.InputRegister(0));
__ I16x8Splat(dst, i.InputRegister(0));
} else {
__ Movd(dst, i.InputOperand(0));
__ I16x8Splat(dst, i.InputOperand(0));
}
__ Pshuflw(dst, dst, uint8_t{0x0});
__ Pshufd(dst, dst, uint8_t{0x0});
break;
}
case kX64I16x8ExtractLaneS: {
......
......@@ -2922,9 +2922,7 @@ void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
// i16x8.splat: broadcast the low 16 bits of the general-purpose register
// |src| to all eight lanes of the SIMD register |dst|.
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
// Delegate entirely to the shared macro-assembler implementation (which
// also selects vpbroadcastw when AVX2 is available). The hand-rolled
// Movd/Pshuflw/Pshufd sequence that preceded this call recomputed dst only
// to have it fully overwritten by I16x8Splat, so it is removed as dead
// work.
I16x8Splat(dst.fp(), src.gp());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
......
......@@ -2507,9 +2507,7 @@ void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
// i16x8.splat: broadcast the low 16 bits of the general-purpose register
// |src| to all eight lanes of the SIMD register |dst|.
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
// The shared I16x8Splat recomputes dst entirely from src.gp(), which made
// the preceding Movd/Pshuflw/Pshufd sequence redundant dead work; only the
// delegating call is kept.
I16x8Splat(dst.fp(), src.gp());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.