Commit 64758c63 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-relaxed-simd] Optimize Qfma/Qfms

Do not require that dst == src1; this leaves more flexibility for the
operands. In the macro-assembler we check whether dst aliases any of the
input operands, then use vfmadd231/vfmadd132/vfmadd213 as appropriate.

Bug: v8:11659
Change-Id: I3644f5e0e75bd047d4e5f5b52d4234e54d329d15
Fixed: v8:11659
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3131370
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76732}
parent 06751aba
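
For reference, the 231/132/213 suffixes the message refers to name the operand order of the fused multiply-add. A scalar model of their semantics in C++ (illustrative only, not part of the change):

#include <cmath>

// Scalar model of the x86 FMA3 operand forms. For vfmaddXYZ a, b, c the
// digits give the operand order: result = opX * opY + opZ, stored into a.
void fmadd132(double& a, double b, double c) { a = std::fma(a, c, b); }  // a = a*c + b
void fmadd213(double& a, double b, double c) { a = std::fma(b, a, c); }  // a = b*a + c
void fmadd231(double& a, double b, double c) { a = std::fma(b, c, a); }  // a = b*c + a

Because each form overwrites a different operand, dst aliasing src1, src2, or src3 can each be served without a scratch move, which is exactly what the new macro-assembler helpers below exploit.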
src/codegen/x64/fma-instr.h
@@ -30,9 +30,17 @@
  V(vfnmsub132ss, LIG, 66, 0F, 38, W0, 9f) \
  V(vfnmsub213ss, LIG, 66, 0F, 38, W0, af) \
  V(vfnmsub231ss, LIG, 66, 0F, 38, W0, bf) \
+ V(vfmadd132ps, L128, 66, 0F, 38, W0, 98) \
+ V(vfmadd213ps, L128, 66, 0F, 38, W0, a8) \
  V(vfmadd231ps, L128, 66, 0F, 38, W0, b8) \
+ V(vfnmadd132ps, L128, 66, 0F, 38, W0, 9c) \
+ V(vfnmadd213ps, L128, 66, 0F, 38, W0, ac) \
  V(vfnmadd231ps, L128, 66, 0F, 38, W0, bc) \
+ V(vfmadd132pd, L128, 66, 0F, 38, W1, 98) \
+ V(vfmadd213pd, L128, 66, 0F, 38, W1, a8) \
  V(vfmadd231pd, L128, 66, 0F, 38, W1, b8) \
+ V(vfnmadd132pd, L128, 66, 0F, 38, W1, 9c) \
+ V(vfnmadd213pd, L128, 66, 0F, 38, W1, ac) \
  V(vfnmadd231pd, L128, 66, 0F, 38, W1, bc)
#endif // V8_CODEGEN_X64_FMA_INSTR_H_
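
The V(...) rows form an X-macro table: each entry records the mnemonic together with its VEX encoding fields (VEX.L, mandatory prefix, opcode-map escape bytes, VEX.W, opcode byte), and the assembler, disassembler, and tests can each expand the same list with their own macro. A self-contained sketch of the pattern; the FMA_PS_LIST/DECLARE_FMA names here are illustrative, not V8's actual expansion:

#include <cstdio>

// Hypothetical stand-in for the table above: one row per instruction,
// carrying (mnemonic, VEX.L, prefix, escape bytes, VEX.W, opcode).
#define FMA_PS_LIST(V)                     \
  V(vfmadd132ps, L128, 66, 0F, 38, W0, 98) \
  V(vfmadd213ps, L128, 66, 0F, 38, W0, a8) \
  V(vfmadd231ps, L128, 66, 0F, 38, W0, b8)

// Stamp out one function per row; here we only echo the opcode, where a
// real assembler would emit the VEX-encoded bytes.
#define DECLARE_FMA(name, len, pfx, esc1, esc2, w, opcode) \
  void name() { std::printf(#name ": opcode 0x" #opcode "\n"); }
FMA_PS_LIST(DECLARE_FMA)
#undef DECLARE_FMA

int main() {
  vfmadd132ps();  // prints "vfmadd132ps: opcode 0x98"
  return 0;
}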
src/codegen/x64/macro-assembler-x64.cc
@@ -859,6 +859,99 @@ void TurboAssembler::Movq(Register dst, XMMRegister src) {
  }
}

// Helper macro to define the qfma macro-assembler ops. It takes care of
// every possible case of register aliasing to minimize the number of
// instructions emitted. Computes dst = src1 + src2 * src3.
#define QFMA(ps_or_pd)                                  \
  if (CpuFeatures::IsSupported(FMA3)) {                 \
    CpuFeatureScope fma3_scope(this, FMA3);             \
    if (dst == src1) {                                  \
      /* 231 form: dst = src2 * src3 + dst */           \
      vfmadd231##ps_or_pd(dst, src2, src3);             \
    } else if (dst == src2) {                           \
      /* 132 form: dst = dst * src3 + src1 */           \
      vfmadd132##ps_or_pd(dst, src1, src3);             \
    } else if (dst == src3) {                           \
      /* 213 form: dst = src2 * dst + src1 */           \
      vfmadd213##ps_or_pd(dst, src2, src1);             \
    } else {                                            \
      /* No aliasing: materialize src1 in dst first. */ \
      vmovups(dst, src1);                               \
      vfmadd231##ps_or_pd(dst, src2, src3);             \
    }                                                   \
  } else if (CpuFeatures::IsSupported(AVX)) {           \
    CpuFeatureScope avx_scope(this, AVX);               \
    vmul##ps_or_pd(tmp, src2, src3);                    \
    vadd##ps_or_pd(dst, src1, tmp);                     \
  } else {                                              \
    if (dst == src1) {                                  \
      movaps(tmp, src2);                                \
      mul##ps_or_pd(tmp, src3);                         \
      add##ps_or_pd(dst, tmp);                          \
    } else if (dst == src2) {                           \
      DCHECK_NE(src2, src1);                            \
      mul##ps_or_pd(src2, src3);                        \
      add##ps_or_pd(src2, src1);                        \
    } else if (dst == src3) {                           \
      DCHECK_NE(src3, src1);                            \
      mul##ps_or_pd(src3, src2);                        \
      add##ps_or_pd(src3, src1);                        \
    } else {                                            \
      movaps(dst, src2);                                \
      mul##ps_or_pd(dst, src3);                         \
      add##ps_or_pd(dst, src1);                         \
    }                                                   \
  }
// Helper macro to define the qfms macro-assembler ops. It takes care of
// every possible case of register aliasing to minimize the number of
// instructions emitted. Computes dst = src1 - src2 * src3.
#define QFMS(ps_or_pd)                                                    \
  if (CpuFeatures::IsSupported(FMA3)) {                                   \
    CpuFeatureScope fma3_scope(this, FMA3);                               \
    if (dst == src1) {                                                    \
      /* 231 form: dst = -(src2 * src3) + dst */                          \
      vfnmadd231##ps_or_pd(dst, src2, src3);                              \
    } else if (dst == src2) {                                             \
      /* 132 form: dst = -(dst * src3) + src1 */                          \
      vfnmadd132##ps_or_pd(dst, src1, src3);                              \
    } else if (dst == src3) {                                             \
      /* 213 form: dst = -(src2 * dst) + src1 */                          \
      vfnmadd213##ps_or_pd(dst, src2, src1);                              \
    } else {                                                              \
      /* No aliasing: materialize src1 in dst first. */                   \
      vmovups(dst, src1);                                                 \
      vfnmadd231##ps_or_pd(dst, src2, src3);                              \
    }                                                                     \
  } else if (CpuFeatures::IsSupported(AVX)) {                             \
    CpuFeatureScope avx_scope(this, AVX);                                 \
    vmul##ps_or_pd(tmp, src2, src3);                                      \
    vsub##ps_or_pd(dst, src1, tmp);                                       \
  } else {                                                                \
    /* Non-commutative: compute src2 * src3 into tmp, subtract from src1. \
     */                                                                   \
    movaps(tmp, src2);                                                    \
    mul##ps_or_pd(tmp, src3);                                             \
    if (dst != src1) {                                                    \
      movaps(dst, src1);                                                  \
    }                                                                     \
    sub##ps_or_pd(dst, tmp);                                              \
  }
void TurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
                               XMMRegister src2, XMMRegister src3,
                               XMMRegister tmp) {
  QFMA(ps)
}

void TurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
                               XMMRegister src2, XMMRegister src3,
                               XMMRegister tmp) {
  QFMS(ps)
}

void TurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
                               XMMRegister src2, XMMRegister src3,
                               XMMRegister tmp) {
  QFMA(pd)
}

void TurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,
                               XMMRegister src2, XMMRegister src3,
                               XMMRegister tmp) {
  QFMS(pd)
}
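
Note that the three code paths above are not bit-for-bit identical: FMA3 rounds once, while the AVX and SSE fallbacks round the product and the sum separately. Wasm relaxed-simd deliberately permits this implementation-defined divergence. A minimal scalar illustration in standard C++ (not V8 code):

#include <cmath>
#include <cstdio>

int main() {
  // qfma(src1, src2, src3) computes src1 + src2 * src3. A fused multiply-add
  // does not round the product before the add, so its result can differ in
  // the last bit from the separate mul-then-add fallback.
  double src2 = 1.0 + 0x1p-29, src3 = 1.0 + 0x1p-29, src1 = -1.0;
  std::printf("mul+add: %a\n", src1 + src2 * src3);          // two roundings
  std::printf("fma:     %a\n", std::fma(src2, src3, src1));  // one rounding
  return 0;
}

On an IEEE-754 double platform this prints 0x1p-28 for the mul+add path and 0x1.00000004p-28 for the fused path.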
#undef QFMA
#undef QFMS

void TurboAssembler::Movdqa(XMMRegister dst, Operand src) {
  // See comments in Movdqa(XMMRegister, XMMRegister).
  if (CpuFeatures::IsSupported(AVX)) {
......
src/codegen/x64/macro-assembler-x64.h
@@ -83,6 +83,15 @@ class V8_EXPORT_PRIVATE TurboAssembler
  void Movq(XMMRegister dst, Register src);
  void Movq(Register dst, XMMRegister src);

  void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);

  void PushReturnAddressFrom(Register src) { pushq(src); }
  void PopReturnAddressTo(Register dst) { popq(dst); }
......
src/compiler/backend/x64/code-generator-x64.cc
@@ -2681,27 +2681,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64F64x2Qfma: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                       i.InputSimd128Register(2));
-      } else {
-        __ Movapd(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulpd(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Addpd(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F64x2Qfma(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
      break;
    }
    case kX64F64x2Qfms: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfnmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                        i.InputSimd128Register(2));
-      } else {
-        __ Movapd(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulpd(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Subpd(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F64x2Qfms(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
      break;
    }
    case kX64F64x2ConvertLowI32x4S: {
@@ -2884,27 +2872,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64F32x4Qfma: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                       i.InputSimd128Register(2));
-      } else {
-        __ Movaps(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulps(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Addps(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F32x4Qfma(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
      break;
    }
    case kX64F32x4Qfms: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfnmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                        i.InputSimd128Register(2));
-      } else {
-        __ Movaps(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulps(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Subps(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F32x4Qfms(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
      break;
    }
    case kX64F32x4Pmin: {
......
src/compiler/backend/x64/instruction-selector-x64.cc
@@ -3306,12 +3306,11 @@ void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
       g.UseRegister(node->InputAt(0)));
}

-#define VISIT_SIMD_QFMOP(Opcode)                                           \
-  void InstructionSelector::Visit##Opcode(Node* node) {                    \
-    X64OperandGenerator g(this);                                           \
-    Emit(kX64##Opcode, g.DefineSameAsFirst(node),                          \
-         g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)), \
-         g.UseRegister(node->InputAt(2)));                                 \
+#define VISIT_SIMD_QFMOP(Opcode)                                             \
+  void InstructionSelector::Visit##Opcode(Node* node) {                      \
+    X64OperandGenerator g(this);                                             \
+    Emit(kX64##Opcode, g.UseRegister(node), g.UseRegister(node->InputAt(0)), \
+         g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));  \
  }
VISIT_SIMD_QFMOP(F64x2Qfma)
VISIT_SIMD_QFMOP(F64x2Qfms)
......
src/diagnostics/x64/disasm-x64.cc
@@ -893,11 +893,21 @@ int DisassemblerX64::AVXInstruction(byte* data) {
AppendToBuffer("vbroadcastss %s,", NameOfAVXRegister(regop));
current += PrintRightAVXOperand(current);
break;
case 0x98:
AppendToBuffer("vfmadd132p%c %s,%s,", float_size_code(),
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
break;
case 0x99:
AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(),
NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
current += PrintRightAVXOperand(current);
break;
case 0xA8:
AppendToBuffer("vfmadd213p%c %s,%s,", float_size_code(),
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
break;
case 0xA9:
AppendToBuffer("vfmadd213s%c %s,%s,", float_size_code(),
NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
@@ -918,11 +928,21 @@ int DisassemblerX64::AVXInstruction(byte* data) {
                     NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
      current += PrintRightAVXOperand(current);
      break;
+    case 0x9C:
+      AppendToBuffer("vfnmadd132p%c %s,%s,", float_size_code(),
+                     NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+      current += PrintRightXMMOperand(current);
+      break;
    case 0xAB:
      AppendToBuffer("vfmsub213s%c %s,%s,", float_size_code(),
                     NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
      current += PrintRightAVXOperand(current);
      break;
+    case 0xAC:
+      AppendToBuffer("vfnmadd213p%c %s,%s,", float_size_code(),
+                     NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+      current += PrintRightXMMOperand(current);
+      break;
    case 0xBB:
      AppendToBuffer("vfmsub231s%c %s,%s,", float_size_code(),
                     NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
......
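
Given the format strings above, the newly decodable packed forms disassemble to text like the following (hypothetical register choices):

vfmadd132ps xmm1,xmm2,xmm3
vfnmadd213pd xmm4,xmm5,xmm6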