Commit bb12c48a authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share i8x16.splat implementation

The optimal implementation is in TurboFan x64 codegen; move it into the
shared-macro-assembler and have TurboFan ia32 and Liftoff use it. The
optimal implementation takes AVX2 support into account.
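
For context, a condensed sketch of the shared lowering follows (the
authoritative version is in the shared-macro-assembler hunk below); the
comments note the instruction sequence each path emits:

// Sketch only; see the real I8x16Splat in the diff below. With AVX2 the splat
// is a movd into an XMM register followed by vpbroadcastb. Without AVX2 it is
// movd plus a pshufb with an all-zero shuffle mask, so every destination lane
// reads byte 0. For example, splatting 0x2A:
//   movd dst, src          -> dst = {2A 00 00 ... 00}
//   xorps scratch, scratch
//   pshufb dst, scratch    -> dst = {2A 2A 2A ... 2A}
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
                                      XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX2)) {
    CpuFeatureScope avx2_scope(this, AVX2);
    Movd(scratch, src);
    vpbroadcastb(dst, scratch);  // broadcast byte 0 of scratch to all 16 lanes
  } else {
    CpuFeatureScope ssse3_scope(this, SSSE3);
    Movd(dst, src);
    Xorps(scratch, scratch);
    Pshufb(dst, scratch);
  }
}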

We add a couple of AVX2 instructions to ia32 in sse-instr.h. Not all of
them are used yet, but follow-up patches will use them, so we add support
(including disassembly and tests) in this change.

Drive-by cleanup in test-disasm-x64.cc to merge 2 AVX2 test sections.

Bug: v8:11589
Change-Id: I1c8d7deb0f8bb70b29e7a680e5dbcfb09ca5505b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3092555
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76352}
parent 9b772187
@@ -1790,6 +1790,19 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE4_AVX_RM_INSTRUCTION)
#undef DECLARE_SSE4_AVX_RM_INSTRUCTION
// AVX2 instructions
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) \
void instr(XMMRegister dst, XMMRegister src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
} \
void instr(XMMRegister dst, Operand src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
}
AVX2_BROADCAST_LIST(AVX2_INSTRUCTION)
#undef AVX2_INSTRUCTION
// Prefetch src position into cache level.
// Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
// non-temporal
......
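As an illustration (not part of the diff), plugging the AVX2_BROADCAST_LIST
entry V(vpbroadcastb, 66, 0F, 38, 78) into the AVX2_INSTRUCTION macro above
yields roughly these assembler methods:

// Hand-expanded for vpbroadcastb; the other broadcasts follow the same shape.
void vpbroadcastb(XMMRegister dst, XMMRegister src) {
  vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
}
void vpbroadcastb(XMMRegister dst, Operand src) {
  vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
}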
@@ -102,4 +102,10 @@
V(pmovzxdq, 66, 0F, 38, 35) \
V(ptest, 66, 0F, 38, 17)
// These require AVX2, and we only define the VEX-128 versions.
#define AVX2_BROADCAST_LIST(V) \
V(vpbroadcastd, 66, 0F, 38, 58) \
V(vpbroadcastb, 66, 0F, 38, 78) \
V(vpbroadcastw, 66, 0F, 38, 79)
#endif // V8_CODEGEN_IA32_SSE_INSTR_H_
@@ -254,6 +254,42 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
  }
}
template <typename Op>
void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
XMMRegister scratch) {
DCHECK(!CpuFeatures::IsSupported(AVX2));
CpuFeatureScope ssse3_scope(this, SSSE3);
Movd(dst, src);
Xorps(scratch, scratch);
Pshufb(dst, scratch);
}
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
XMMRegister scratch) {
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
Movd(scratch, src);
vpbroadcastb(dst, scratch);
} else {
I8x16SplatPreAvx2(dst, src, scratch);
}
}
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
XMMRegister scratch) {
#if V8_TARGET_ARCH_IA32
// On IA32, an Operand can be a wrapper for a single register; in that case
// callers should use the I8x16Splat overload that takes |src| as a Register.
DCHECK(!src.is_reg_only());
#endif
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope avx2_scope(this, AVX2);
vpbroadcastb(dst, src);
} else {
I8x16SplatPreAvx2(dst, src, scratch);
}
}
void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
                                    uint8_t src2, Register tmp1,
                                    XMMRegister tmp2) {
......
@@ -223,6 +223,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pmullw, pmullw)
AVX_OP(Pmuludq, pmuludq)
AVX_OP(Por, por)
AVX_OP(Pshufb, pshufb)
AVX_OP(Pshufd, pshufd)
AVX_OP(Pshufhw, pshufhw)
AVX_OP(Pshuflw, pshuflw)
@@ -300,6 +301,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void F32x4Splat(XMMRegister dst, DoubleRegister src);
void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
              XMMRegister tmp2);
void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,
@@ -350,6 +353,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
// Requires dst == mask when AVX is not supported.
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                XMMRegister src2, XMMRegister scratch);
private:
template <typename Op>
void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
};
} // namespace internal
} // namespace v8
......
@@ -3009,10 +3009,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kIA32I8x16Splat: {
-      XMMRegister dst = i.OutputSimd128Register();
-      __ Movd(dst, i.InputOperand(0));
-      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(dst, kScratchDoubleReg);
+      if (instr->InputAt(0)->IsRegister()) {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputRegister(0),
+                      kScratchDoubleReg);
+      } else {
+        __ I8x16Splat(i.OutputSimd128Register(), i.InputOperand(0),
+                      kScratchDoubleReg);
+      }
      break;
    }
    case kIA32I8x16ExtractLaneS: {
......
@@ -3405,25 +3405,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64I8x16Splat: {
      XMMRegister dst = i.OutputSimd128Register();
-      if (CpuFeatures::IsSupported(AVX2)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        CpuFeatureScope avx2_scope(tasm(), AVX2);
-        if (HasRegisterInput(instr, 0)) {
-          __ vmovd(kScratchDoubleReg, i.InputRegister(0));
-          __ vpbroadcastb(dst, kScratchDoubleReg);
-        } else {
-          __ vpbroadcastb(dst, i.InputOperand(0));
-        }
-      } else {
-        if (HasRegisterInput(instr, 0)) {
-          __ Movd(dst, i.InputRegister(0));
-        } else {
-          __ Movd(dst, i.InputOperand(0));
-        }
-        __ Xorps(kScratchDoubleReg, kScratchDoubleReg);
-        __ Pshufb(dst, kScratchDoubleReg);
-      }
+      if (HasRegisterInput(instr, 0)) {
+        __ I8x16Splat(dst, i.InputRegister(0), kScratchDoubleReg);
+      } else {
+        __ I8x16Splat(dst, i.InputOperand(0), kScratchDoubleReg);
+      }
      break;
    }
    case kX64Pextrb: {
......
@@ -786,6 +786,15 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
SSSE3_UNOP_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
SSE4_RM_INSTRUCTION_LIST(DECLARE_SSE_AVX_RM_DIS_CASE)
#undef DECLARE_SSE_AVX_RM_DIS_CASE
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code) \
case 0x##code: \
AppendToBuffer("" #instruction " %s,", NameOfXMMRegister(regop)); \
current += PrintRightXMMOperand(current); \
break;
AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
#undef DISASSEMBLE_AVX2_BROADCAST
default:
  UnimplementedInstruction();
}
......
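For reference (again not part of the diff), the DISASSEMBLE_AVX2_BROADCAST
macro above expands, for the vpbroadcastb entry (opcode 78), to roughly:

// Hand-expanded disassembler case for vpbroadcastb.
case 0x78:
  AppendToBuffer("vpbroadcastb %s,", NameOfXMMRegister(regop));
  current += PrintRightXMMOperand(current);
  break;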
@@ -2917,9 +2917,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
-  Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), liftoff::kScratchDoubleReg);
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
......
@@ -2502,9 +2502,7 @@ void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
-  Movd(dst.fp(), src.gp());
-  Pxor(kScratchDoubleReg, kScratchDoubleReg);
-  Pshufb(dst.fp(), kScratchDoubleReg);
+  I8x16Splat(dst.fp(), src.gp(), kScratchDoubleReg);
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
......
@@ -865,6 +865,18 @@ TEST(DisasmIa320) {
  }
}
// AVX2 instructions.
{
if (CpuFeatures::IsSupported(AVX2)) {
CpuFeatureScope scope(&assm, AVX2);
#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
notUsed4) \
__ instruction(xmm0, xmm1); \
__ instruction(xmm0, Operand(ebx, ecx, times_4, 10000));
AVX2_BROADCAST_LIST(EMIT_AVX2_BROADCAST)
}
}
// FMA3 instruction
{
if (CpuFeatures::IsSupported(FMA3)) {
......
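Likewise, the EMIT_AVX2_BROADCAST macro in the ia32 test expands each
AVX2_BROADCAST_LIST entry into a register form and a memory form; for the
first entry it emits roughly:

// Hand-expanded for V(vpbroadcastd, 66, 0F, 38, 58).
__ vpbroadcastd(xmm0, xmm1);
__ vpbroadcastd(xmm0, Operand(ebx, ecx, times_4, 10000));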
@@ -869,13 +869,6 @@ TEST(DisasmX64) {
    if (CpuFeatures::IsSupported(AVX2)) {
      CpuFeatureScope scope(&assm, AVX2);
      __ vbroadcastss(xmm1, xmm2);
-    }
-  }
-
-  // AVX2 instructions.
-  {
-    if (CpuFeatures::IsSupported(AVX2)) {
-      CpuFeatureScope scope(&assm, AVX2);
#define EMIT_AVX2_BROADCAST(instruction, notUsed1, notUsed2, notUsed3, \
                            notUsed4) \
  __ instruction(xmm0, xmm1); \
......