Commit 39fb4e14 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share and optimize i16x8.splat

Change i16x8.splat to use Punpcklqdq instead of Pshufd as the final
step. After Pshuflw has replicated the value across the low 64 bits,
Punpcklqdq simply duplicates that quadword into the high half; it does
the same job as Pshufd here with a one-byte-shorter encoding (no
immediate byte). See the sketch below.

Move this implementation into the shared macro-assembler
(SharedTurboAssembler) and use it from the IA32 and x64 TurboFan code
generators and from Liftoff.

Bug: v8:11589,v8:12090
Change-Id: I968b1dca5a262e4e67875caea18c5c09828cb33a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3092558
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76353}
parent bb12c48a
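
Illustration (not part of this change): a minimal, stand-alone C++ sketch using SSE2 intrinsics, assuming a host compiler with <emmintrin.h>. It checks that the previous final step (Pshufd) and the new one (Punpcklqdq) produce the same full 8x16 splat once Pshuflw has replicated the value across the low 64 bits.

// Illustration only, not V8 code: both lowering tails yield _mm_set1_epi16(v).
#include <emmintrin.h>  // SSE2
#include <cassert>
#include <cstdint>

static __m128i SplatOldTail(uint16_t v) {
  __m128i x = _mm_cvtsi32_si128(v);  // movd: value in the low 32 bits
  x = _mm_shufflelo_epi16(x, 0);     // pshuflw 0: splat across the low 4 words
  return _mm_shuffle_epi32(x, 0);    // pshufd 0: broadcast the low 32 bits
}

static __m128i SplatNewTail(uint16_t v) {
  __m128i x = _mm_cvtsi32_si128(v);  // movd
  x = _mm_shufflelo_epi16(x, 0);     // pshuflw 0
  return _mm_unpacklo_epi64(x, x);   // punpcklqdq: duplicate the low 64 bits
}

int main() {
  for (uint32_t v = 0; v <= 0xFFFF; v += 0x0FFF) {
    __m128i expected = _mm_set1_epi16(static_cast<int16_t>(v));
    assert(_mm_movemask_epi8(_mm_cmpeq_epi16(SplatOldTail(v), expected)) == 0xFFFF);
    assert(_mm_movemask_epi8(_mm_cmpeq_epi16(SplatNewTail(v), expected)) == 0xFFFF);
  }
  return 0;
}
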
@@ -16,6 +16,14 @@
 #error Unsupported target architecture.
 #endif
 
+// Operand on IA32 can be a wrapper for a single register, in which case they
+// should call I8x16Splat |src| being Register.
+#if V8_TARGET_ARCH_IA32
+#define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
+#else
+#define DCHECK_OPERAND_IS_NOT_REG(op)
+#endif
+
 namespace v8 {
 namespace internal {
@@ -277,11 +285,7 @@ void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
 void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
                                       XMMRegister scratch) {
-#if V8_TARGET_ARCH_IA32
-  // Operand on IA32 can be a wrapper for a single register, in which case they
-  // should call I8x16Splat |src| being Register.
-  DCHECK(!src.is_reg_only());
-#endif
+  DCHECK_OPERAND_IS_NOT_REG(src);
   if (CpuFeatures::IsSupported(AVX2)) {
     CpuFeatureScope avx2_scope(this, AVX2);
     vpbroadcastb(dst, src);
@@ -411,6 +415,34 @@ void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
   Packuswb(dst, tmp2);
 }
 
+template <typename Op>
+void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
+  DCHECK(!CpuFeatures::IsSupported(AVX2));
+  Movd(dst, src);
+  Pshuflw(dst, dst, uint8_t{0x0});
+  Punpcklqdq(dst, dst);
+}
+
+void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    Movd(dst, src);
+    vpbroadcastw(dst, dst);
+  } else {
+    I16x8SplatPreAvx2(dst, src);
+  }
+}
+
+void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
+  DCHECK_OPERAND_IS_NOT_REG(src);
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    vpbroadcastw(dst, src);
+  } else {
+    I16x8SplatPreAvx2(dst, src);
+  }
+}
+
 void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
                                           XMMRegister src2, XMMRegister scratch,
                                           bool is_signed) {
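
The shared helper above also adds an AVX2 fast path (vpbroadcastw). As a rough stand-alone illustration (not V8 code), assuming a host with AVX2 and <immintrin.h>, the single word broadcast is equivalent to the three-instruction pre-AVX2 sequence:

// Illustration only; compile with AVX2 enabled (e.g. -mavx2).
#include <immintrin.h>
#include <cassert>
#include <cstdint>

static __m128i SplatAvx2(uint16_t v) {
  return _mm_broadcastw_epi16(_mm_cvtsi32_si128(v));  // movd + vpbroadcastw
}

static __m128i SplatPreAvx2(uint16_t v) {
  __m128i x = _mm_cvtsi32_si128(v);  // movd
  x = _mm_shufflelo_epi16(x, 0);     // pshuflw 0
  return _mm_unpacklo_epi64(x, x);   // punpcklqdq
}

int main() {
  __m128i a = SplatAvx2(0xABCD);
  __m128i b = SplatPreAvx2(0xABCD);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi16(a, b)) == 0xFFFF);
  return 0;
}
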
@@ -886,3 +918,5 @@ void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
 }  // namespace internal
 }  // namespace v8
+
+#undef DCHECK_OPERAND_IS_NOT_REG
@@ -315,6 +315,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
                  XMMRegister tmp2);
   void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
                  Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
+  void I16x8Splat(XMMRegister dst, Register src);
+  void I16x8Splat(XMMRegister dst, Operand src);
   void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                       XMMRegister scrat, bool is_signed);
   void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
@@ -357,6 +359,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  private:
   template <typename Op>
   void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
+  template <typename Op>
+  void I16x8SplatPreAvx2(XMMRegister dst, Op src);
 };
 
 }  // namespace internal
 }  // namespace v8
@@ -2697,10 +2697,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32I16x8Splat: {
-      XMMRegister dst = i.OutputSimd128Register();
-      __ Movd(dst, i.InputOperand(0));
-      __ Pshuflw(dst, dst, uint8_t{0x0});
-      __ Pshufd(dst, dst, uint8_t{0x0});
+      if (instr->InputAt(0)->IsRegister()) {
+        __ I16x8Splat(i.OutputSimd128Register(), i.InputRegister(0));
+      } else {
+        __ I16x8Splat(i.OutputSimd128Register(), i.InputOperand(0));
+      }
       break;
     }
     case kIA32I16x8ExtractLaneS: {
@@ -3199,12 +3199,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kX64I16x8Splat: {
       XMMRegister dst = i.OutputSimd128Register();
       if (HasRegisterInput(instr, 0)) {
-        __ Movd(dst, i.InputRegister(0));
+        __ I16x8Splat(dst, i.InputRegister(0));
       } else {
-        __ Movd(dst, i.InputOperand(0));
+        __ I16x8Splat(dst, i.InputOperand(0));
       }
-      __ Pshuflw(dst, dst, uint8_t{0x0});
-      __ Pshufd(dst, dst, uint8_t{0x0});
       break;
     }
     case kX64I16x8ExtractLaneS: {
@@ -2922,9 +2922,7 @@ void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
 void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
-  Pshufd(dst.fp(), dst.fp(), uint8_t{0});
+  I16x8Splat(dst.fp(), src.gp());
 }
 
 void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
@@ -2507,9 +2507,7 @@ void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
 void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                         LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pshuflw(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
-  Pshufd(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
+  I16x8Splat(dst.fp(), src.gp());
 }
 
 void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,