Commit 64758c63 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-relaxed-simd] Optimize Qfma/Qfms

Do not require that dst == src1; this leaves more flexibility for the
operands. We check in the macro-assembler whether dst aliases any of the
input operands, then use vfmadd231/vfmadd132/vfmadd213 as appropriate.

Bug: v8:11659
Change-Id: I3644f5e0e75bd047d4e5f5b52d4234e54d329d15
Fixed: v8:11659
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3131370
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76732}
parent 06751aba
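
For reference, the three FMA3 form suffixes encode which operand is
overwritten, which is what lets the macro-assembler pick a form to match
the aliasing instead of forcing dst == src1 (Intel SDM semantics, packed
double shown):

  vfmadd132pd dst, a, b   ; dst = dst * b + a
  vfmadd213pd dst, a, b   ; dst = a * dst + b
  vfmadd231pd dst, a, b   ; dst = a * b + dst

The vfnmadd variants negate the product (e.g. vfnmadd231pd computes
dst = -(a * b) + dst), which is what the qfms helpers below rely on.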
src/codegen/x64/fma-instr.h
@@ -30,9 +30,17 @@
   V(vfnmsub132ss, LIG, 66, 0F, 38, W0, 9f) \
   V(vfnmsub213ss, LIG, 66, 0F, 38, W0, af) \
   V(vfnmsub231ss, LIG, 66, 0F, 38, W0, bf) \
+  V(vfmadd132ps, L128, 66, 0F, 38, W0, 98) \
+  V(vfmadd213ps, L128, 66, 0F, 38, W0, a8) \
   V(vfmadd231ps, L128, 66, 0F, 38, W0, b8) \
+  V(vfnmadd132ps, L128, 66, 0F, 38, W0, 9c) \
+  V(vfnmadd213ps, L128, 66, 0F, 38, W0, ac) \
   V(vfnmadd231ps, L128, 66, 0F, 38, W0, bc) \
+  V(vfmadd132pd, L128, 66, 0F, 38, W1, 98) \
+  V(vfmadd213pd, L128, 66, 0F, 38, W1, a8) \
   V(vfmadd231pd, L128, 66, 0F, 38, W1, b8) \
+  V(vfnmadd132pd, L128, 66, 0F, 38, W1, 9c) \
+  V(vfnmadd213pd, L128, 66, 0F, 38, W1, ac) \
   V(vfnmadd231pd, L128, 66, 0F, 38, W1, bc)

 #endif  // V8_CODEGEN_X64_FMA_INSTR_H_
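
Reading the rows: each V(...) entry supplies the fields needed to encode
the VEX prefix: mnemonic, vector length (L128 for these packed 128-bit
forms; the scalar entries use LIG, length-ignored), the mandatory 66
prefix byte, the 0F 38 opcode escape, the W bit (W0 single precision,
W1 double), and the opcode byte.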
src/codegen/x64/macro-assembler-x64.cc
@@ -859,6 +859,99 @@ void TurboAssembler::Movq(Register dst, XMMRegister src) {
   }
 }

+// Helper macro to define qfma macro-assembler. This takes care of every
+// possible case of register aliasing to minimize the number of instructions.
+#define QFMA(ps_or_pd)                          \
+  if (CpuFeatures::IsSupported(FMA3)) {         \
+    CpuFeatureScope fma3_scope(this, FMA3);     \
+    if (dst == src1) {                          \
+      vfmadd231##ps_or_pd(dst, src2, src3);     \
+    } else if (dst == src2) {                   \
+      vfmadd132##ps_or_pd(dst, src1, src3);     \
+    } else if (dst == src3) {                   \
+      vfmadd213##ps_or_pd(dst, src2, src1);     \
+    } else {                                    \
+      vmovups(dst, src1);                       \
+      vfmadd231##ps_or_pd(dst, src2, src3);     \
+    }                                           \
+  } else if (CpuFeatures::IsSupported(AVX)) {   \
+    CpuFeatureScope avx_scope(this, AVX);       \
+    vmul##ps_or_pd(tmp, src2, src3);            \
+    vadd##ps_or_pd(dst, src1, tmp);             \
+  } else {                                      \
+    if (dst == src1) {                          \
+      movaps(tmp, src2);                        \
+      mul##ps_or_pd(tmp, src3);                 \
+      add##ps_or_pd(dst, tmp);                  \
+    } else if (dst == src2) {                   \
+      DCHECK_NE(src2, src1);                    \
+      mul##ps_or_pd(src2, src3);                \
+      add##ps_or_pd(src2, src1);                \
+    } else if (dst == src3) {                   \
+      DCHECK_NE(src3, src1);                    \
+      mul##ps_or_pd(src3, src2);                \
+      add##ps_or_pd(src3, src1);                \
+    } else {                                    \
+      movaps(dst, src2);                        \
+      mul##ps_or_pd(dst, src3);                 \
+      add##ps_or_pd(dst, src1);                 \
+    }                                           \
+  }
+
+// Helper macro to define qfms macro-assembler. This takes care of every
+// possible case of register aliasing to minimize the number of instructions.
+#define QFMS(ps_or_pd)                          \
+  if (CpuFeatures::IsSupported(FMA3)) {         \
+    CpuFeatureScope fma3_scope(this, FMA3);     \
+    if (dst == src1) {                          \
+      vfnmadd231##ps_or_pd(dst, src2, src3);    \
+    } else if (dst == src2) {                   \
+      vfnmadd132##ps_or_pd(dst, src1, src3);    \
+    } else if (dst == src3) {                   \
+      vfnmadd213##ps_or_pd(dst, src2, src1);    \
+    } else {                                    \
+      vmovups(dst, src1);                       \
+      vfnmadd231##ps_or_pd(dst, src2, src3);    \
+    }                                           \
+  } else if (CpuFeatures::IsSupported(AVX)) {   \
+    CpuFeatureScope avx_scope(this, AVX);       \
+    vmul##ps_or_pd(tmp, src2, src3);            \
+    vsub##ps_or_pd(dst, src1, tmp);             \
+  } else {                                      \
+    movaps(tmp, src2);                          \
+    mul##ps_or_pd(tmp, src3);                   \
+    if (dst != src1) {                          \
+      movaps(dst, src1);                        \
+    }                                           \
+    sub##ps_or_pd(dst, tmp);                    \
+  }
+
+void TurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
+                               XMMRegister src2, XMMRegister src3,
+                               XMMRegister tmp) {
+  QFMA(ps)
+}
+
+void TurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
+                               XMMRegister src2, XMMRegister src3,
+                               XMMRegister tmp) {
+  QFMS(ps)
+}
+
+void TurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
+                               XMMRegister src2, XMMRegister src3,
+                               XMMRegister tmp) {
+  QFMA(pd);
+}
+
+void TurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,
+                               XMMRegister src2, XMMRegister src3,
+                               XMMRegister tmp) {
+  QFMS(pd);
+}
+
+#undef QFMA
+#undef QFMS
+
 void TurboAssembler::Movdqa(XMMRegister dst, Operand src) {
   // See comments in Movdqa(XMMRegister, XMMRegister).
   if (CpuFeatures::IsSupported(AVX)) {
...
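
Per lane, the qfma helpers compute src1 + src2 * src3 and the qfms
helpers compute src1 - src2 * src3. A minimal scalar sketch of one lane
(the helper names here are illustrative, not V8 code). Note the FMA3
path rounds once, while the AVX/SSE fallbacks round the product and the
sum separately, a difference the relaxed-simd proposal deliberately
permits:

// Scalar model of one lane of F64x2Qfma/F64x2Qfms (illustrative only).
#include <cmath>
#include <cstdio>

// FMA3 path: fused multiply-add, single rounding.
double QfmaLaneFused(double src1, double src2, double src3) {
  return std::fma(src2, src3, src1);  // src1 + src2 * src3
}

// AVX/SSE fallback path: separate multiply and add, two roundings
// (assuming the compiler does not contract the expression into an fma).
double QfmaLaneUnfused(double src1, double src2, double src3) {
  double product = src2 * src3;  // first rounding
  return src1 + product;         // second rounding
}

// Qfms negates the product: src1 - src2 * src3.
double QfmsLaneFused(double src1, double src2, double src3) {
  return std::fma(-src2, src3, src1);
}

int main() {
  // Both paths agree here; they can differ in the last bit when the
  // intermediate product is not exactly representable.
  std::printf("%g %g\n", QfmaLaneFused(1.0, 2.0, 3.0),
              QfmsLaneFused(1.0, 2.0, 3.0));  // prints: 7 -5
  return 0;
}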
src/codegen/x64/macro-assembler-x64.h
@@ -83,6 +83,15 @@ class V8_EXPORT_PRIVATE TurboAssembler
   void Movq(XMMRegister dst, Register src);
   void Movq(Register dst, XMMRegister src);

+  void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                 XMMRegister src3, XMMRegister tmp);
+  void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                 XMMRegister src3, XMMRegister tmp);
+  void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                 XMMRegister src3, XMMRegister tmp);
+  void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                 XMMRegister src3, XMMRegister tmp);
+
   void PushReturnAddressFrom(Register src) { pushq(src); }
   void PopReturnAddressTo(Register dst) { popq(dst); }
...
src/compiler/backend/x64/code-generator-x64.cc
@@ -2681,27 +2681,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F64x2Qfma: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                       i.InputSimd128Register(2));
-      } else {
-        __ Movapd(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulpd(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Addpd(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F64x2Qfma(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
       break;
     }
     case kX64F64x2Qfms: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfnmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                        i.InputSimd128Register(2));
-      } else {
-        __ Movapd(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulpd(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Subpd(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F64x2Qfms(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
       break;
     }
     case kX64F64x2ConvertLowI32x4S: {
@@ -2884,27 +2872,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F32x4Qfma: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                       i.InputSimd128Register(2));
-      } else {
-        __ Movaps(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulps(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Addps(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F32x4Qfma(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
       break;
     }
     case kX64F32x4Qfms: {
-      if (CpuFeatures::IsSupported(FMA3)) {
-        CpuFeatureScope fma3_scope(tasm(), FMA3);
-        __ vfnmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
-                        i.InputSimd128Register(2));
-      } else {
-        __ Movaps(kScratchDoubleReg, i.InputSimd128Register(2));
-        __ Mulps(kScratchDoubleReg, i.InputSimd128Register(1));
-        __ Subps(i.OutputSimd128Register(), kScratchDoubleReg);
-      }
+      __ F32x4Qfms(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                   i.InputSimd128Register(1), i.InputSimd128Register(2),
+                   kScratchDoubleReg);
       break;
     }
     case kX64F32x4Pmin: {
...
src/compiler/backend/x64/instruction-selector-x64.cc
@@ -3306,12 +3306,11 @@ void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
                 g.UseRegister(node->InputAt(0)));
 }

 #define VISIT_SIMD_QFMOP(Opcode)                                             \
   void InstructionSelector::Visit##Opcode(Node* node) {                      \
     X64OperandGenerator g(this);                                             \
-    Emit(kX64##Opcode, g.DefineSameAsFirst(node),                            \
-         g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),   \
-         g.UseRegister(node->InputAt(2)));                                   \
+    Emit(kX64##Opcode, g.UseRegister(node), g.UseRegister(node->InputAt(0)), \
+         g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));  \
   }

 VISIT_SIMD_QFMOP(F64x2Qfma)
 VISIT_SIMD_QFMOP(F64x2Qfms)
...
src/diagnostics/x64/disasm-x64.cc
@@ -893,11 +893,21 @@ int DisassemblerX64::AVXInstruction(byte* data) {
         AppendToBuffer("vbroadcastss %s,", NameOfAVXRegister(regop));
         current += PrintRightAVXOperand(current);
         break;
+      case 0x98:
+        AppendToBuffer("vfmadd132p%c %s,%s,", float_size_code(),
+                       NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
       case 0x99:
         AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(),
                        NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
         current += PrintRightAVXOperand(current);
         break;
+      case 0xA8:
+        AppendToBuffer("vfmadd213p%c %s,%s,", float_size_code(),
+                       NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
       case 0xA9:
         AppendToBuffer("vfmadd213s%c %s,%s,", float_size_code(),
                        NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
@@ -918,11 +928,21 @@ int DisassemblerX64::AVXInstruction(byte* data) {
                        NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
         current += PrintRightAVXOperand(current);
         break;
+      case 0x9C:
+        AppendToBuffer("vfnmadd132p%c %s,%s,", float_size_code(),
+                       NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
       case 0xAB:
         AppendToBuffer("vfmsub213s%c %s,%s,", float_size_code(),
                        NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
         current += PrintRightAVXOperand(current);
         break;
+      case 0xAC:
+        AppendToBuffer("vfnmadd213p%c %s,%s,", float_size_code(),
+                       NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
+        current += PrintRightXMMOperand(current);
+        break;
       case 0xBB:
         AppendToBuffer("vfmsub231s%c %s,%s,", float_size_code(),
                        NameOfAVXRegister(regop), NameOfAVXRegister(vvvv));
...
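
With these cases added, the packed FMA forms disassemble in the same
style as the existing scalar ones; for example, an encoding of
vfmadd132ps should now print along the lines of
"vfmadd132ps xmm1,xmm2,xmm3" (register names here are illustrative and
depend on the encoded operands).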