Commit acf0f469 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share and optimize load 8, 16, 32 splat

Move optimized implementation (accounts for AVX2) into
shared-macro-assembler, and use it everywhere.

Drive-by fix in liftoff-assembler-ia32.h to use the Movss and Movsd
macro-assembler functions so that they emit AVX when supported.

Bug: v8:11589
Change-Id: Ibc4f2709d323d5b835bcac175a32b422d47d3355
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3095008
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76372}
parent 268a1608
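The gist of the optimization: with AVX2 a load-splat is a single broadcast instruction (vpbroadcastb/vpbroadcastw/vbroadcastss), while older CPUs need a scalar load into lane 0 followed by a shuffle. As a rough illustration only (not V8 code), the sketch below renders the 8-bit case in C++ intrinsics; the function name load8_splat is made up, and compile-time #ifdef dispatch stands in for the runtime CpuFeatures check the shared helpers actually perform.

#include <immintrin.h>
#include <cstdint>

// Broadcast one byte from memory into all 16 lanes of a 128-bit vector.
__m128i load8_splat(const uint8_t* p) {
#ifdef __AVX2__
  // AVX2: a single broadcast performs both the load and the splat.
  return _mm_set1_epi8(static_cast<char>(*p));  // compiles to vpbroadcastb
#else
  // Pre-AVX2 fallback (assumes SSSE3): load the byte into lane 0, then
  // shuffle it into every lane with an all-zero pshufb control mask.
  __m128i v = _mm_cvtsi32_si128(*p);
  return _mm_shuffle_epi8(v, _mm_setzero_si128());
#endif
}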
@@ -916,6 +916,63 @@ void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
   }
 }
 
+void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
+                                          XMMRegister scratch) {
+  // The trap handler uses the current pc to create a landing pad, so that it
+  // can determine if a trap occurred in Wasm code due to an OOB load. Make
+  // sure the first instruction in each case below is the one that loads.
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    vpbroadcastb(dst, src);
+  } else if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    // Avoid dependency on previous value of dst.
+    vpinsrb(dst, scratch, src, uint8_t{0});
+    vpxor(scratch, scratch, scratch);
+    vpshufb(dst, dst, scratch);
+  } else {
+    CpuFeatureScope sse4_scope(this, SSE4_1);
+    CpuFeatureScope ssse3_scope(this, SSSE3);
+    pinsrb(dst, src, uint8_t{0});
+    xorps(scratch, scratch);
+    pshufb(dst, scratch);
+  }
+}
+
+void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
+                                           XMMRegister scratch) {
+  // The trap handler uses the current pc to create a landing pad, so that it
+  // can determine if a trap occurred in Wasm code due to an OOB load. Make
+  // sure the first instruction in each case below is the one that loads.
+  if (CpuFeatures::IsSupported(AVX2)) {
+    CpuFeatureScope avx2_scope(this, AVX2);
+    vpbroadcastw(dst, src);
+  } else if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    // Avoid dependency on previous value of dst.
+    vpinsrw(dst, scratch, src, uint8_t{0});
+    vpshuflw(dst, dst, uint8_t{0});
+    vpunpcklqdq(dst, dst, dst);
+  } else {
+    pinsrw(dst, src, uint8_t{0});
+    pshuflw(dst, dst, uint8_t{0});
+    movlhps(dst, dst);
+  }
+}
+
+void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
+  // The trap handler uses the current pc to create a landing pad, so that it
+  // can determine if a trap occurred in Wasm code due to an OOB load. Make
+  // sure the first instruction in each case below is the one that loads.
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vbroadcastss(dst, src);
+  } else {
+    movss(dst, src);
+    shufps(dst, dst, byte{0});
+  }
+}
+
 }  // namespace internal
 }  // namespace v8
...
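For reference, the pre-AVX 16-bit fallback above (pinsrw, pshuflw, then a 64-bit duplicate) maps to plain SSE2 intrinsics roughly as follows. This is a sketch, not V8 code, and load16_splat is an illustrative name.

#include <emmintrin.h>
#include <cstdint>

// Broadcast one 16-bit value from memory into all 8 lanes (SSE2 only).
__m128i load16_splat(const uint16_t* p) {
  __m128i v = _mm_insert_epi16(_mm_setzero_si128(), *p, 0);  // pinsrw, lane 0
  v = _mm_shufflelo_epi16(v, 0);    // pshuflw: splat lane 0 across low 4 lanes
  return _mm_unpacklo_epi64(v, v);  // duplicate low 64 bits into the high half
}

The helper's SSE path ends with movlhps instead of punpcklqdq; both copy the low 64 bits into the high half, movlhps just has a shorter encoding.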
@@ -217,6 +217,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pavgw, pavgw)
   AVX_OP(Pcmpgtb, pcmpgtb)
   AVX_OP(Pcmpeqd, pcmpeqd)
+  AVX_OP(Pinsrw, pinsrw)
   AVX_OP(Pmaxub, pmaxub)
   AVX_OP(Pminub, pminub)
   AVX_OP(Pmovmskb, pmovmskb)
@@ -278,6 +279,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSE4_1(Pblendw, pblendw)
   AVX_OP_SSE4_1(Pextrb, pextrb)
   AVX_OP_SSE4_1(Pextrw, pextrw)
+  AVX_OP_SSE4_1(Pinsrb, pinsrb)
   AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
   AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
   AVX_OP_SSE4_1(Pminsb, pminsb)
@@ -355,6 +357,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   // Requires dst == mask when AVX is not supported.
   void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                   XMMRegister src2, XMMRegister scratch);
+  void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
+  void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
+  void S128Load32Splat(XMMRegister dst, Operand src);
 
  private:
  template <typename Op>
...
@@ -1256,14 +1256,15 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void pmovmskb(Register dst, XMMRegister src);
 
+  void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
+  void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
+
   // SSE 4.1 instruction
   void insertps(XMMRegister dst, XMMRegister src, byte imm8);
   void insertps(XMMRegister dst, Operand src, byte imm8);
   void pextrq(Register dst, XMMRegister src, int8_t imm8);
   void pinsrb(XMMRegister dst, Register src, uint8_t imm8);
   void pinsrb(XMMRegister dst, Operand src, uint8_t imm8);
-  void pinsrw(XMMRegister dst, Register src, uint8_t imm8);
-  void pinsrw(XMMRegister dst, Operand src, uint8_t imm8);
   void pinsrd(XMMRegister dst, Register src, uint8_t imm8);
   void pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
   void pinsrq(XMMRegister dst, Register src, uint8_t imm8);
...
@@ -3442,20 +3442,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32S128Load8Splat: {
-      __ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
-      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      __ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
+                        kScratchDoubleReg);
       break;
     }
     case kIA32S128Load16Splat: {
-      __ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
-      __ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
-                 uint8_t{0});
-      __ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
+      __ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
+                         kScratchDoubleReg);
       break;
     }
     case kIA32S128Load32Splat: {
-      __ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
+      __ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kIA32S128Load64Splat: {
...
@@ -3723,40 +3723,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64S128Load8Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      XMMRegister dst = i.OutputSimd128Register();
-      if (CpuFeatures::IsSupported(AVX2)) {
-        CpuFeatureScope avx2_scope(tasm(), AVX2);
-        __ vpbroadcastb(dst, i.MemoryOperand());
-      } else {
-        __ Pinsrb(dst, dst, i.MemoryOperand(), 0);
-        __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
-        __ Pshufb(dst, kScratchDoubleReg);
-      }
+      __ S128Load8Splat(i.OutputSimd128Register(), i.MemoryOperand(),
+                        kScratchDoubleReg);
       break;
     }
     case kX64S128Load16Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      XMMRegister dst = i.OutputSimd128Register();
-      if (CpuFeatures::IsSupported(AVX2)) {
-        CpuFeatureScope avx2_scope(tasm(), AVX2);
-        __ vpbroadcastw(dst, i.MemoryOperand());
-      } else {
-        __ Pinsrw(dst, dst, i.MemoryOperand(), 0);
-        __ Pshuflw(dst, dst, uint8_t{0});
-        __ Punpcklqdq(dst, dst);
-      }
+      __ S128Load16Splat(i.OutputSimd128Register(), i.MemoryOperand(),
+                         kScratchDoubleReg);
       break;
     }
     case kX64S128Load32Splat: {
       EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        __ vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
-      } else {
-        __ movss(i.OutputSimd128Register(), i.MemoryOperand());
-        __ shufps(i.OutputSimd128Register(), i.OutputSimd128Register(),
-                  byte{0});
-      }
+      __ S128Load32Splat(i.OutputSimd128Register(), i.MemoryOperand());
       break;
     }
     case kX64S128Load64Splat: {
...
@@ -2775,23 +2775,19 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
     }
   } else if (transform == LoadTransformationKind::kZeroExtend) {
     if (memtype == MachineType::Int32()) {
-      movss(dst.fp(), src_op);
+      Movss(dst.fp(), src_op);
     } else {
       DCHECK_EQ(MachineType::Int64(), memtype);
-      movsd(dst.fp(), src_op);
+      Movsd(dst.fp(), src_op);
     }
   } else {
     DCHECK_EQ(LoadTransformationKind::kSplat, transform);
     if (memtype == MachineType::Int8()) {
-      Pinsrb(dst.fp(), src_op, 0);
-      Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
-      Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
+      S128Load8Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
     } else if (memtype == MachineType::Int16()) {
-      Pinsrw(dst.fp(), src_op, 0);
-      Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
-      Punpcklqdq(dst.fp(), dst.fp());
+      S128Load16Splat(dst.fp(), src_op, liftoff::kScratchDoubleReg);
     } else if (memtype == MachineType::Int32()) {
-      Vbroadcastss(dst.fp(), src_op);
+      S128Load32Splat(dst.fp(), src_op);
     } else if (memtype == MachineType::Int64()) {
       Movddup(dst.fp(), src_op);
     }
...
@@ -2391,21 +2391,11 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
   } else {
     DCHECK_EQ(LoadTransformationKind::kSplat, transform);
     if (memtype == MachineType::Int8()) {
-      Pinsrb(dst.fp(), dst.fp(), src_op, 0);
-      Pxor(kScratchDoubleReg, kScratchDoubleReg);
-      Pshufb(dst.fp(), kScratchDoubleReg);
+      S128Load8Splat(dst.fp(), src_op, kScratchDoubleReg);
     } else if (memtype == MachineType::Int16()) {
-      Pinsrw(dst.fp(), dst.fp(), src_op, 0);
-      Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
-      Punpcklqdq(dst.fp(), dst.fp());
+      S128Load16Splat(dst.fp(), src_op, kScratchDoubleReg);
     } else if (memtype == MachineType::Int32()) {
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(this, AVX);
-        vbroadcastss(dst.fp(), src_op);
-      } else {
-        movss(dst.fp(), src_op);
-        shufps(dst.fp(), dst.fp(), byte{0});
-      }
+      S128Load32Splat(dst.fp(), src_op);
     } else if (memtype == MachineType::Int64()) {
       Movddup(dst.fp(), src_op);
     }
...