Commit 20945ef7 authored by Ambroise Vincent, committed by Commit Bot

[turbofan] Add SIMD multiply-add/sub on arm64

Fold distinct MUL and ADD (or SUB) instructions into a single MLA (or
MLS) instruction, mirroring what is being done for general purpose
registers.

SIMD wasm only uses the vectorized ADD and MUL instructions on quad
vectors (NEON Q), so only those cases are handled.

SIMD wasm only uses MUL by vectors, not by elements so there is no need
to check for an addition and shift reduction.

Change-Id: If07191dde9fb1dc37a5de27187800c15cc4325ea
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2184239
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/master@{#67770}
parent 69b46896
......@@ -1845,6 +1845,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(0).V##FORMAT(), \
i.InputSimd128Register(1).V##FORMAT()); \
break;
// Case-emitter for "destructive" SIMD binops (Mla/Mls/Fmla/Fmls/Bsl): the
// instruction accumulates into / overwrites its destination register, so the
// register allocator must have placed input 0 in the output register — the
// DCHECK_EQ verifies that constraint before the two source operands
// (inputs 1 and 2) are combined into dst.
#define SIMD_DESTRUCTIVE_BINOP_CASE(Op, Instr, FORMAT) \
case Op: { \
VRegister dst = i.OutputSimd128Register().V##FORMAT(); \
DCHECK_EQ(dst, i.InputSimd128Register(0).V##FORMAT()); \
__ Instr(dst, i.InputSimd128Register(1).V##FORMAT(), \
i.InputSimd128Register(2).V##FORMAT()); \
break; \
}
case kArm64F64x2Splat: {
__ Dup(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).D(), 0);
......@@ -1891,18 +1899,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(0).V2D());
break;
}
case kArm64F64x2Qfma: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ Fmla(i.OutputSimd128Register().V2D(), i.InputSimd128Register(1).V2D(),
i.InputSimd128Register(2).V2D());
break;
}
case kArm64F64x2Qfms: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ Fmls(i.OutputSimd128Register().V2D(), i.InputSimd128Register(1).V2D(),
i.InputSimd128Register(2).V2D());
break;
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfma, Fmla, 2D);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F64x2Qfms, Fmls, 2D);
case kArm64F32x4Splat: {
__ Dup(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).S(), 0);
break;
......@@ -1953,18 +1951,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(0).V4S());
break;
}
case kArm64F32x4Qfma: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ Fmla(i.OutputSimd128Register().V4S(), i.InputSimd128Register(1).V4S(),
i.InputSimd128Register(2).V4S());
break;
}
case kArm64F32x4Qfms: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ Fmls(i.OutputSimd128Register().V4S(), i.InputSimd128Register(1).V4S(),
i.InputSimd128Register(2).V4S());
break;
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfma, Fmla, 4S);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64F32x4Qfms, Fmls, 4S);
case kArm64I64x2Splat: {
__ Dup(i.OutputSimd128Register().V2D(), i.InputRegister64(0));
break;
......@@ -2103,6 +2091,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I32x4AddHoriz, Addp, 4S);
SIMD_BINOP_CASE(kArm64I32x4Sub, Sub, 4S);
SIMD_BINOP_CASE(kArm64I32x4Mul, Mul, 4S);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I32x4Mla, Mla, 4S);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I32x4Mls, Mls, 4S);
SIMD_BINOP_CASE(kArm64I32x4MinS, Smin, 4S);
SIMD_BINOP_CASE(kArm64I32x4MaxS, Smax, 4S);
SIMD_BINOP_CASE(kArm64I32x4Eq, Cmeq, 4S);
......@@ -2196,6 +2186,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I16x8Sub, Sub, 8H);
SIMD_BINOP_CASE(kArm64I16x8SubSaturateS, Sqsub, 8H);
SIMD_BINOP_CASE(kArm64I16x8Mul, Mul, 8H);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I16x8Mla, Mla, 8H);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I16x8Mls, Mls, 8H);
SIMD_BINOP_CASE(kArm64I16x8MinS, Smin, 8H);
SIMD_BINOP_CASE(kArm64I16x8MaxS, Smax, 8H);
SIMD_BINOP_CASE(kArm64I16x8Eq, Cmeq, 8H);
......@@ -2309,6 +2301,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I8x16Sub, Sub, 16B);
SIMD_BINOP_CASE(kArm64I8x16SubSaturateS, Sqsub, 16B);
SIMD_BINOP_CASE(kArm64I8x16Mul, Mul, 16B);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I8x16Mla, Mla, 16B);
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64I8x16Mls, Mls, 16B);
SIMD_BINOP_CASE(kArm64I8x16MinS, Smin, 16B);
SIMD_BINOP_CASE(kArm64I8x16MaxS, Smax, 16B);
SIMD_BINOP_CASE(kArm64I8x16Eq, Cmeq, 16B);
......@@ -2393,13 +2387,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kArm64S128Select: {
VRegister dst = i.OutputSimd128Register().V16B();
DCHECK_EQ(dst, i.InputSimd128Register(0).V16B());
__ Bsl(dst, i.InputSimd128Register(1).V16B(),
i.InputSimd128Register(2).V16B());
break;
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64S128Select, Bsl, 16B);
SIMD_BINOP_CASE(kArm64S128AndNot, Bic, 16B);
case kArm64S32x4Shuffle: {
Simd128Register dst = i.OutputSimd128Register().V4S(),
......@@ -2574,6 +2562,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
#undef SIMD_UNOP_CASE
#undef SIMD_WIDENING_UNOP_CASE
#undef SIMD_BINOP_CASE
#undef SIMD_DESTRUCTIVE_BINOP_CASE
#undef SIMD_REDUCE_OP_CASE
#undef ASSEMBLE_SIMD_SHIFT_LEFT
#undef ASSEMBLE_SIMD_SHIFT_RIGHT
......
......@@ -238,6 +238,8 @@ namespace compiler {
V(Arm64I32x4AddHoriz) \
V(Arm64I32x4Sub) \
V(Arm64I32x4Mul) \
V(Arm64I32x4Mla) \
V(Arm64I32x4Mls) \
V(Arm64I32x4MinS) \
V(Arm64I32x4MaxS) \
V(Arm64I32x4Eq) \
......@@ -270,6 +272,8 @@ namespace compiler {
V(Arm64I16x8Sub) \
V(Arm64I16x8SubSaturateS) \
V(Arm64I16x8Mul) \
V(Arm64I16x8Mla) \
V(Arm64I16x8Mls) \
V(Arm64I16x8MinS) \
V(Arm64I16x8MaxS) \
V(Arm64I16x8Eq) \
......@@ -302,6 +306,8 @@ namespace compiler {
V(Arm64I8x16Sub) \
V(Arm64I8x16SubSaturateS) \
V(Arm64I8x16Mul) \
V(Arm64I8x16Mla) \
V(Arm64I8x16Mls) \
V(Arm64I8x16MinS) \
V(Arm64I8x16MaxS) \
V(Arm64I8x16Eq) \
......
......@@ -208,6 +208,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I32x4AddHoriz:
case kArm64I32x4Sub:
case kArm64I32x4Mul:
case kArm64I32x4Mla:
case kArm64I32x4Mls:
case kArm64I32x4MinS:
case kArm64I32x4MaxS:
case kArm64I32x4Eq:
......@@ -240,6 +242,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I16x8Sub:
case kArm64I16x8SubSaturateS:
case kArm64I16x8Mul:
case kArm64I16x8Mla:
case kArm64I16x8Mls:
case kArm64I16x8MinS:
case kArm64I16x8MaxS:
case kArm64I16x8Eq:
......@@ -272,6 +276,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I8x16Sub:
case kArm64I8x16SubSaturateS:
case kArm64I8x16Mul:
case kArm64I8x16Mla:
case kArm64I8x16Mls:
case kArm64I8x16MinS:
case kArm64I8x16MaxS:
case kArm64I8x16Eq:
......
......@@ -3237,9 +3237,7 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I64x2GeS, kArm64I64x2GeS) \
V(I64x2GtU, kArm64I64x2GtU) \
V(I64x2GeU, kArm64I64x2GeU) \
V(I32x4Add, kArm64I32x4Add) \
V(I32x4AddHoriz, kArm64I32x4AddHoriz) \
V(I32x4Sub, kArm64I32x4Sub) \
V(I32x4Mul, kArm64I32x4Mul) \
V(I32x4MinS, kArm64I32x4MinS) \
V(I32x4MaxS, kArm64I32x4MaxS) \
......@@ -3252,10 +3250,8 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I32x4GtU, kArm64I32x4GtU) \
V(I32x4GeU, kArm64I32x4GeU) \
V(I16x8SConvertI32x4, kArm64I16x8SConvertI32x4) \
V(I16x8Add, kArm64I16x8Add) \
V(I16x8AddSaturateS, kArm64I16x8AddSaturateS) \
V(I16x8AddHoriz, kArm64I16x8AddHoriz) \
V(I16x8Sub, kArm64I16x8Sub) \
V(I16x8SubSaturateS, kArm64I16x8SubSaturateS) \
V(I16x8Mul, kArm64I16x8Mul) \
V(I16x8MinS, kArm64I16x8MinS) \
......@@ -3273,9 +3269,7 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I16x8GeU, kArm64I16x8GeU) \
V(I16x8RoundingAverageU, kArm64I16x8RoundingAverageU) \
V(I8x16SConvertI16x8, kArm64I8x16SConvertI16x8) \
V(I8x16Add, kArm64I8x16Add) \
V(I8x16AddSaturateS, kArm64I8x16AddSaturateS) \
V(I8x16Sub, kArm64I8x16Sub) \
V(I8x16SubSaturateS, kArm64I8x16SubSaturateS) \
V(I8x16Mul, kArm64I8x16Mul) \
V(I8x16MinS, kArm64I8x16MinS) \
......@@ -3363,6 +3357,52 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
arraysize(temps), temps);
}
// Instruction-selection visitor for integer SIMD Add nodes: when either
// operand is a same-type Mul that this Add can cover (CanCover), fold the
// pair into a single multiply-accumulate (Mla). Mla is destructive, so the
// accumulator operand is passed first and tied to the output via
// DefineSameAsFirst; the Mul's two factors follow as inputs 1 and 2.
// Falls back to a plain vector Add when no fold applies.
// NOTE(review): comments inside the macro stay /* */ because // would
// swallow the line-continuation backslash.
#define VISIT_SIMD_ADD(Type) \
void InstructionSelector::Visit##Type##Add(Node* node) { \
Arm64OperandGenerator g(this); \
Node* left = node->InputAt(0); \
Node* right = node->InputAt(1); \
/* Select Mla(z, x, y) for Add(Mul(x, y), z). */ \
if (left->opcode() == IrOpcode::k##Type##Mul && CanCover(node, left)) { \
Emit(kArm64##Type##Mla, g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0)), g.UseRegister(left->InputAt(1))); \
return; \
} \
/* Select Mla(z, x, y) for Add(z, Mul(x, y)). */ \
if (right->opcode() == IrOpcode::k##Type##Mul && CanCover(node, right)) { \
Emit(kArm64##Type##Mla, g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0)), \
g.UseRegister(right->InputAt(1))); \
return; \
} \
VisitRRR(this, kArm64##Type##Add, node); \
}
// Instantiate for the three integer lane shapes wasm SIMD multiplies on
// (I64x2 has no vector Mul/Mla on NEON, so it is not listed).
VISIT_SIMD_ADD(I32x4)
VISIT_SIMD_ADD(I16x8)
VISIT_SIMD_ADD(I8x16)
#undef VISIT_SIMD_ADD
// Instruction-selection visitor for integer SIMD Sub nodes: folds
// Sub(z, Mul(x, y)) into a single multiply-subtract (Mls). Unlike the Add
// case, only the right operand is checked — subtraction is not commutative,
// so Sub(Mul(x, y), z) has no Mls form. Mls is destructive: the minuend z is
// tied to the output via DefineSameAsFirst, with the Mul's factors as
// inputs 1 and 2. Falls back to a plain vector Sub otherwise.
#define VISIT_SIMD_SUB(Type) \
void InstructionSelector::Visit##Type##Sub(Node* node) { \
Arm64OperandGenerator g(this); \
Node* left = node->InputAt(0); \
Node* right = node->InputAt(1); \
/* Select Mls(z, x, y) for Sub(z, Mul(x, y)). */ \
if (right->opcode() == IrOpcode::k##Type##Mul && CanCover(node, right)) { \
Emit(kArm64##Type##Mls, g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0)), \
g.UseRegister(right->InputAt(1))); \
return; \
} \
VisitRRR(this, kArm64##Type##Sub, node); \
}
// Same three lane shapes as VISIT_SIMD_ADD above.
VISIT_SIMD_SUB(I32x4)
VISIT_SIMD_SUB(I16x8)
VISIT_SIMD_SUB(I8x16)
#undef VISIT_SIMD_SUB
void InstructionSelector::VisitS128Select(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64S128Select, g.DefineSameAsFirst(node),
......
......@@ -2043,9 +2043,9 @@ struct MulDPInst {
Node* (RawMachineAssembler::*mul_constructor)(Node*, Node*);
Node* (RawMachineAssembler::*add_constructor)(Node*, Node*);
Node* (RawMachineAssembler::*sub_constructor)(Node*, Node*);
ArchOpcode add_arch_opcode;
ArchOpcode sub_arch_opcode;
ArchOpcode neg_arch_opcode;
ArchOpcode multiply_add_arch_opcode;
ArchOpcode multiply_sub_arch_opcode;
ArchOpcode multiply_neg_arch_opcode;
MachineType machine_type;
};
......@@ -2077,7 +2077,7 @@ TEST_P(InstructionSelectorIntDPWithIntMulTest, AddWithMul) {
m.Return((m.*mdpi.add_constructor)(m.Parameter(0), n));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.add_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(mdpi.multiply_add_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
......@@ -2087,7 +2087,7 @@ TEST_P(InstructionSelectorIntDPWithIntMulTest, AddWithMul) {
m.Return((m.*mdpi.add_constructor)(n, m.Parameter(2)));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.add_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(mdpi.multiply_add_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
......@@ -2103,7 +2103,7 @@ TEST_P(InstructionSelectorIntDPWithIntMulTest, SubWithMul) {
m.Return((m.*mdpi.sub_constructor)(m.Parameter(0), n));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.sub_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(mdpi.multiply_sub_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
......@@ -2120,7 +2120,7 @@ TEST_P(InstructionSelectorIntDPWithIntMulTest, NegativeMul) {
m.Return((m.*mdpi.mul_constructor)(n, m.Parameter(1)));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.neg_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(mdpi.multiply_neg_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(2U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
......@@ -2131,7 +2131,7 @@ TEST_P(InstructionSelectorIntDPWithIntMulTest, NegativeMul) {
m.Return((m.*mdpi.mul_constructor)(m.Parameter(0), n));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.neg_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(mdpi.multiply_neg_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(2U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
......@@ -2141,6 +2141,85 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorIntDPWithIntMulTest,
::testing::ValuesIn(kMulDPInstructions));
namespace {
// Parameter record for the SIMD multiply-accumulate selection tests below:
// pairs a SIMD Mul operator with its Add/Sub partners and the fused arch
// opcodes (Mla/Mls) the instruction selector is expected to emit.
struct SIMDMulDPInst {
// Human-readable name, printed by gtest via operator<< below.
const char* mul_constructor_name;
// Pointers-to-member yielding the machine operators under test.
const Operator* (MachineOperatorBuilder::*mul_operator)(void);
const Operator* (MachineOperatorBuilder::*add_operator)(void);
const Operator* (MachineOperatorBuilder::*sub_operator)(void);
// Expected fused opcodes for Add(Mul, z) / Sub(z, Mul).
ArchOpcode multiply_add_arch_opcode;
ArchOpcode multiply_sub_arch_opcode;
MachineType machine_type;
};
// Lets gtest label each parameterized-test instance with the Mul name.
std::ostream& operator<<(std::ostream& os, const SIMDMulDPInst& inst) {
return os << inst.mul_constructor_name;
}
} // namespace
// One entry per integer lane shape that gets the Mla/Mls fold (I32x4, I16x8,
// I8x16); all operate on Simd128 values.
static const SIMDMulDPInst kSIMDMulDPInstructions[] = {
{"I32x4Mul", &MachineOperatorBuilder::I32x4Mul,
&MachineOperatorBuilder::I32x4Add, &MachineOperatorBuilder::I32x4Sub,
kArm64I32x4Mla, kArm64I32x4Mls, MachineType::Simd128()},
{"I16x8Mul", &MachineOperatorBuilder::I16x8Mul,
&MachineOperatorBuilder::I16x8Add, &MachineOperatorBuilder::I16x8Sub,
kArm64I16x8Mla, kArm64I16x8Mls, MachineType::Simd128()},
{"I8x16Mul", &MachineOperatorBuilder::I8x16Mul,
&MachineOperatorBuilder::I8x16Add, &MachineOperatorBuilder::I8x16Sub,
kArm64I8x16Mla, kArm64I8x16Mls, MachineType::Simd128()}};
// Parameterized fixture driven by the table above.
using InstructionSelectorSIMDDPWithSIMDMulTest =
InstructionSelectorTestWithParam<SIMDMulDPInst>;
// Checks that Add(z, Mul(x, y)) and Add(Mul(x, y), z) each select to a single
// 3-input Mla instruction (both operand orders, since Add is commutative).
TEST_P(InstructionSelectorSIMDDPWithSIMDMulTest, AddWithMul) {
const SIMDMulDPInst mdpi = GetParam();
const MachineType type = mdpi.machine_type;
{
// Mul on the right operand of the Add.
StreamBuilder m(this, type, type, type, type);
Node* n = m.AddNode((m.machine()->*mdpi.mul_operator)(), m.Parameter(1),
m.Parameter(2));
m.Return(m.AddNode((m.machine()->*mdpi.add_operator)(), m.Parameter(0), n));
Stream s = m.Build();
// Exactly one instruction: the Mul was folded away.
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.multiply_add_arch_opcode, s[0]->arch_opcode());
// Accumulator + two factors.
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
{
// Mul on the left operand of the Add.
StreamBuilder m(this, type, type, type, type);
Node* n = m.AddNode((m.machine()->*mdpi.mul_operator)(), m.Parameter(0),
m.Parameter(1));
m.Return(m.AddNode((m.machine()->*mdpi.add_operator)(), n, m.Parameter(2)));
Stream s = m.Build();
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.multiply_add_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
}
// Checks that Sub(z, Mul(x, y)) selects to a single 3-input Mls instruction.
// Only this operand order is tested: Sub(Mul(x, y), z) has no Mls fold.
TEST_P(InstructionSelectorSIMDDPWithSIMDMulTest, SubWithMul) {
const SIMDMulDPInst mdpi = GetParam();
const MachineType type = mdpi.machine_type;
{
StreamBuilder m(this, type, type, type, type);
Node* n = m.AddNode((m.machine()->*mdpi.mul_operator)(), m.Parameter(1),
m.Parameter(2));
m.Return(m.AddNode((m.machine()->*mdpi.sub_operator)(), m.Parameter(0), n));
Stream s = m.Build();
// One fused instruction with minuend + two factors as inputs.
ASSERT_EQ(1U, s.size());
EXPECT_EQ(mdpi.multiply_sub_arch_opcode, s[0]->arch_opcode());
EXPECT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
}
// Runs AddWithMul/SubWithMul once per entry in kSIMDMulDPInstructions.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDDPWithSIMDMulTest,
::testing::ValuesIn(kSIMDMulDPInstructions));
TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
// x * (2^k + 1) -> x + (x << k)
TRACED_FORRANGE(int32_t, k, 1, 30) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment