Commit 5667bfe5 authored by Daan de Graaf, committed by V8 LUCI CQ

[wasm-simd][arm] Fuse extadd_pairwise and add SIMD on arm.

Bug: v8:11546
Change-Id: I40bc4b5e3c813edba4a194b086b63e19d1231e29
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2956729
Commit-Queue: Daan de Graaf <daagra@google.com>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Mythri Alle <mythria@chromium.org>
Cr-Commit-Position: refs/heads/master@{#75416}
parent 31391ab8
......@@ -4043,6 +4043,8 @@ enum UnaryOp {
VTRN,
VRECPE,
VRSQRTE,
VPADAL_S,
VPADAL_U,
VPADDL_S,
VPADDL_U,
VCEQ0,
......@@ -4116,6 +4118,12 @@ static Instr EncodeNeonUnaryOp(UnaryOp op, NeonRegType reg_type, NeonSize size,
// Only support floating point.
op_encoding = 0x3 * B16 | 0xB * B7;
break;
case VPADAL_S:
op_encoding = 0xC * B7;
break;
case VPADAL_U:
op_encoding = 0xD * B7;
break;
case VPADDL_S:
op_encoding = 0x4 * B7;
break;
......@@ -5013,6 +5021,14 @@ void Assembler::vtrn(NeonSize size, QwNeonRegister src1, QwNeonRegister src2) {
emit(EncodeNeonUnaryOp(VTRN, NEON_Q, size, src1.code(), src2.code()));
}
void Assembler::vpadal(NeonDataType dt, QwNeonRegister dst,
                       QwNeonRegister src) {
  DCHECK(IsEnabled(NEON));
  // vpadal.<dt> Qd, Qm -- SIMD Vector Pairwise Add and Accumulate Long.
  // Pairwise-widens the lanes of Qm and accumulates the sums into Qd.
  const auto op = NeonU(dt) ? VPADAL_U : VPADAL_S;
  emit(EncodeNeonUnaryOp(op, NEON_Q, NeonDataTypeToSize(dt), dst.code(),
                         src.code()));
}
void Assembler::vpaddl(NeonDataType dt, QwNeonRegister dst,
QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
......
......@@ -920,6 +920,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
void vpadal(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src);
void vpaddl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src);
void vqrdmulh(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
......
......@@ -1848,6 +1848,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1).high());
break;
}
case kArmVpadal: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
auto dt = static_cast<NeonDataType>(MiscField::decode(instr->opcode()));
__ vpadal(dt, i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kArmVpaddl: {
auto dt = static_cast<NeonDataType>(MiscField::decode(instr->opcode()));
__ vpaddl(dt, i.OutputSimd128Register(), i.InputSimd128Register(0));
......
......@@ -113,6 +113,7 @@ namespace compiler {
V(ArmVld1S128) \
V(ArmVst1S128) \
V(ArmVcnt) \
V(ArmVpadal) \
V(ArmVpaddl) \
V(ArmFloat32Max) \
V(ArmFloat64Max) \
......
......@@ -106,6 +106,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmVmovF64U32U32:
case kArmVmovU32U32F64:
case kArmVcnt:
case kArmVpadal:
case kArmVpaddl:
case kArmFloat32Max:
case kArmFloat64Max:
......
......@@ -2632,7 +2632,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(F32x4Le, kArmF32x4Le) \
V(I64x2Add, kArmI64x2Add) \
V(I64x2Sub, kArmI64x2Sub) \
V(I32x4Add, kArmI32x4Add) \
V(I32x4Sub, kArmI32x4Sub) \
V(I32x4Mul, kArmI32x4Mul) \
V(I32x4MinS, kArmI32x4MinS) \
......@@ -2650,7 +2649,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I32x4GtU, kArmI32x4GtU) \
V(I32x4GeU, kArmI32x4GeU) \
V(I16x8SConvertI32x4, kArmI16x8SConvertI32x4) \
V(I16x8Add, kArmI16x8Add) \
V(I16x8AddSatS, kArmI16x8AddSatS) \
V(I16x8Sub, kArmI16x8Sub) \
V(I16x8SubSatS, kArmI16x8SubSatS) \
......@@ -2780,6 +2778,50 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
#undef SIMD_BINOP_LIST
// Attempts to fuse `add(extadd_pairwise(x), y)` (or the mirrored operand
// order) into a single kArmVpadal instruction: NEON's vpadal both
// pairwise-widens its source and accumulates into its destination, so the
// separate pairwise node can be folded away when CanCover() permits covering
// it at this use. Each of the four branches handles one combination of
// operand position (left/right) and signedness (S/U). Falls back to the
// plain vector add when no fusable pairwise input is found.
// NOTE(review): comments cannot be placed inside the macro body itself --
// line splicing (translation phase 2) runs before comment removal, so a
// `//` comment ending in `\` would splice the next line into the comment.
#define VISIT_SIMD_ADD(Type, PairwiseType, NeonWidth) \
void InstructionSelector::Visit##Type##Add(Node* node) { \
ArmOperandGenerator g(this); \
Node* left = node->InputAt(0); \
Node* right = node->InputAt(1); \
if (left->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, left)) { \
Emit(kArmVpadal | MiscField::encode(NeonS##NeonWidth), \
g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0))); \
return; \
} \
if (left->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##U && \
CanCover(node, left)) { \
Emit(kArmVpadal | MiscField::encode(NeonU##NeonWidth), \
g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0))); \
return; \
} \
if (right->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, right)) { \
Emit(kArmVpadal | MiscField::encode(NeonS##NeonWidth), \
g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0))); \
return; \
} \
if (right->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##U && \
CanCover(node, right)) { \
Emit(kArmVpadal | MiscField::encode(NeonU##NeonWidth), \
g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0))); \
return; \
} \
VisitRRR(this, kArm##Type##Add, node); \
}
// I16x8Add can fuse I8x16 pairwise extends; I32x4Add can fuse I16x8 ones.
VISIT_SIMD_ADD(I16x8, I8x16, 8)
VISIT_SIMD_ADD(I32x4, I16x8, 16)
#undef VISIT_SIMD_ADD
void InstructionSelector::VisitI64x2SplatI32Pair(Node* node) {
ArmOperandGenerator g(this);
InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
......
......@@ -2276,6 +2276,10 @@ void Decoder::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
out_buffer_pos_ +=
base::SNPrintF(out_buffer_ + out_buffer_pos_, "vrev%d.%d q%d, q%d",
op, esize, Vd, Vm);
} else if (opc1 == 0 && opc2 == 0b1100) {
Format(instr, q ? "vpadal.s'size2 'Qd, 'Qm" : "vpadal.s'size2 'Dd, 'Dm");
} else if (opc1 == 0 && opc2 == 0b1101) {
Format(instr, q ? "vpadal.u'size2 'Qd, 'Qm" : "vpadal.u'size2 'Dd, 'Dm");
} else if (opc1 == 0 && opc2 == 0b0100) {
Format(instr, q ? "vpaddl.s'size2 'Qd, 'Qm" : "vpaddl.s'size2 'Dd, 'Dm");
} else if (opc1 == 0 && opc2 == 0b0101) {
......
......@@ -4303,6 +4303,21 @@ void PairwiseAddLong(Simulator* simulator, int Vd, int Vm) {
simulator->set_neon_register<WideType, SIZE>(Vd, dst);
}
template <typename NarrowType, typename WideType, int SIZE = kSimd128Size>
void PairwiseAddAccumulateLong(Simulator* simulator, int Vd, int Vm) {
  // Simulates NEON vpadal: widen adjacent lane pairs of Vm and accumulate
  // each pair's sum into the corresponding (twice-as-wide) lane of Vd.
  DCHECK_EQ(sizeof(WideType), 2 * sizeof(NarrowType));
  static constexpr int kNarrowElems = SIZE / sizeof(NarrowType);
  static constexpr int kWideElems = SIZE / sizeof(WideType);
  NarrowType narrow[kNarrowElems];
  WideType acc[kWideElems];
  simulator->get_neon_register<NarrowType, SIZE>(Vm, narrow);
  simulator->get_neon_register<WideType, SIZE>(Vd, acc);
  for (int i = 0; i < kWideElems; i++) {
    const WideType lo = WideType{narrow[2 * i]};
    const WideType hi = WideType{narrow[2 * i + 1]};
    acc[i] += lo + hi;
  }
  simulator->set_neon_register<WideType, SIZE>(Vd, acc);
}
template <typename NarrowType, typename WideType>
void MultiplyLong(Simulator* simulator, int Vd, int Vn, int Vm) {
DCHECK_EQ(sizeof(WideType), 2 * sizeof(NarrowType));
......@@ -4484,6 +4499,31 @@ void Simulator::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
case Neon64:
UNREACHABLE();
}
} else if (opc1 == 0 && (opc2 == 0b1100 || opc2 == 0b1101)) {
DCHECK_EQ(1, instr->Bit(6)); // Only support Q regs.
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int is_signed = instr->Bit(7) == 0;
// vpadal Qd, Qm
switch (size) {
case Neon8:
is_signed
? PairwiseAddAccumulateLong<int8_t, int16_t>(this, Vd, Vm)
: PairwiseAddAccumulateLong<uint8_t, uint16_t>(this, Vd, Vm);
break;
case Neon16:
is_signed
? PairwiseAddAccumulateLong<int16_t, int32_t>(this, Vd, Vm)
: PairwiseAddAccumulateLong<uint16_t, uint32_t>(this, Vd, Vm);
break;
case Neon32:
is_signed
? PairwiseAddAccumulateLong<int32_t, int64_t>(this, Vd, Vm)
: PairwiseAddAccumulateLong<uint32_t, uint64_t>(this, Vd, Vm);
break;
case Neon64:
UNREACHABLE();
}
} else if (size == 0 && opc1 == 0b10 && opc2 == 0) {
if (instr->Bit(6) == 0) {
// vswp Dd, Dm.
......
......@@ -1283,6 +1283,8 @@ TEST(15) {
uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
uint32_t vpadd_i8[2], vpadd_i16[2], vpadd_i32[2];
uint32_t vpadal_s8[4], vpadal_s16[4], vpadal_s32[4];
uint32_t vpadal_u8[4], vpadal_u16[4], vpadal_u32[4];
uint32_t vpaddl_s8[4], vpaddl_s16[4], vpaddl_s32[4];
uint32_t vpaddl_u8[4], vpaddl_u16[4], vpaddl_u32[4];
uint32_t vpmin_s8[2], vpmin_u16[2], vpmin_s32[2];
......@@ -1648,6 +1650,47 @@ TEST(15) {
__ vpadd(Neon32, d0, d0, d2);
__ vstr(d0, r0, offsetof(T, vpadd_i32));
// vpadal signed.
__ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4);
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonS8, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_s8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonS16, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_s16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonS32, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_s32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vpadal unsigned.
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonU8, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_u8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonU16, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0x01));
__ vdup(Neon8, q2, r4);
__ vpadal(NeonU32, q2, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpadal_u32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vpaddl signed.
__ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4);
......@@ -2255,6 +2298,14 @@ TEST(15) {
CHECK_EQ_32X2(vpadd_i16, 0x0C0C0606u, 0x06060606u);
CHECK_EQ_32X2(vpadd_i32, 0x12120C0Cu, 0x06060606u);
CHECK_EQ_32X4(vpadal_s8, 0x30003, 0x30003, 0x30003, 0x30003);
CHECK_EQ_32X4(vpadal_s16, 0x1000403, 0x1000403, 0x1000403, 0x1000403);
CHECK_EQ_32X4(vpadal_s32, 0x4040403, 0x1010100, 0x4040403, 0x1010100);
CHECK_EQ_32X4(vpadal_u8, 0x2030203, 0x2030203, 0x2030203, 0x2030203);
CHECK_EQ_32X4(vpadal_u16, 0x1020403, 0x1020403, 0x1020403, 0x1020403);
CHECK_EQ_32X4(vpadal_u32, 0x4040403, 0x1010102, 0x4040403, 0x1010102);
CHECK_EQ_32X4(vpaddl_s8, 0xFF02FF02, 0xFF02FF02, 0xFF02FF02, 0xFF02FF02);
CHECK_EQ_32X4(vpaddl_s16, 0xFFFF0302, 0xFFFF0302, 0xFFFF0302, 0xFFFF0302);
CHECK_EQ_32X4(vpaddl_s32, 0x03030302, 0xFFFFFFFF, 0x03030302, 0xFFFFFFFF);
......
......@@ -1171,6 +1171,18 @@ TEST(Neon) {
"f2110b12 vpadd.i16 d0, d1, d2");
COMPARE(vpadd(Neon32, d0, d1, d2),
"f2210b12 vpadd.i32 d0, d1, d2");
COMPARE(vpadal(NeonS8, q0, q1),
"f3b00642 vpadal.s8 q0, q1");
COMPARE(vpadal(NeonS16, q0, q1),
"f3b40642 vpadal.s16 q0, q1");
COMPARE(vpadal(NeonS32, q0, q1),
"f3b80642 vpadal.s32 q0, q1");
COMPARE(vpadal(NeonU8, q14, q15),
"f3f0c6ee vpadal.u8 q14, q15");
COMPARE(vpadal(NeonU16, q14, q15),
"f3f4c6ee vpadal.u16 q14, q15");
COMPARE(vpadal(NeonU32, q14, q15),
"f3f8c6ee vpadal.u32 q14, q15");
COMPARE(vpaddl(NeonS8, q0, q1),
"f3b00242 vpaddl.s8 q0, q1");
COMPARE(vpaddl(NeonS16, q0, q1),
......
......@@ -3224,6 +3224,72 @@ TEST_F(InstructionSelectorTest, Float64Neg) {
ASSERT_EQ(1U, s[0]->OutputCount());
EXPECT_EQ(s.ToVreg(n), s.ToVreg(s[0]->Output()));
}
// Which operand of the SIMD add is the pairwise-add node under test.
enum PairwiseAddSide { LEFT, RIGHT };

// Prints a PairwiseAddSide so parameterized-test failures are readable.
std::ostream& operator<<(std::ostream& os, const PairwiseAddSide& side) {
  switch (side) {
    case LEFT:
      return os << "LEFT";
    case RIGHT:
      return os << "RIGHT";
  }
  // Defensive: control must not fall off the end of a non-void function
  // (undefined behavior) if an out-of-range value is ever streamed.
  return os;
}
// One parameterization of the add + pairwise-add fusion test below.
struct AddWithPairwiseAddSideAndWidth {
  PairwiseAddSide side;  // which add operand is the pairwise-add node
  int32_t width;         // result lane width in bits: 16 (i16x8) or 32 (i32x4)
  bool isSigned;         // signed vs. unsigned pairwise extension
};
// Prints a test parameterization so gtest failure output is readable.
std::ostream& operator<<(std::ostream& os,
                         const AddWithPairwiseAddSideAndWidth& sw) {
  os << "{ side: " << sw.side;
  os << ", width: " << sw.width;
  os << ", isSigned: " << sw.isSigned;
  os << " }";
  return os;
}
using InstructionSelectorAddWithPairwiseAddTest =
InstructionSelectorTestWithParam<AddWithPairwiseAddSideAndWidth>;
TEST_P(InstructionSelectorAddWithPairwiseAddTest, AddWithPairwiseAdd) {
  const AddWithPairwiseAddSideAndWidth params = GetParam();
  const MachineType type = MachineType::Simd128();
  StreamBuilder m(this, type, type, type, type);
  Node* const a = m.Parameter(0);
  Node* const b = m.Parameter(1);

  // Pick the extending pairwise-add operator matching width/signedness.
  const Operator* pairwise_op;
  if (params.width == 32) {
    pairwise_op = params.isSigned ? m.machine()->I32x4ExtAddPairwiseI16x8S()
                                  : m.machine()->I32x4ExtAddPairwiseI16x8U();
  } else {
    pairwise_op = params.isSigned ? m.machine()->I16x8ExtAddPairwiseI8x16S()
                                  : m.machine()->I16x8ExtAddPairwiseI8x16U();
  }
  Node* const pairwise_add = m.AddNode(pairwise_op, a);

  const Operator* add_op =
      params.width == 32 ? m.machine()->I32x4Add() : m.machine()->I16x8Add();
  Node* const add = params.side == LEFT ? m.AddNode(add_op, pairwise_add, b)
                                        : m.AddNode(add_op, b, pairwise_add);
  m.Return(add);
  Stream s = m.Build();

  // The add and the pairwise add should be fused into a single Vpadal.
  ASSERT_EQ(1U, s.size());
  EXPECT_EQ(kArmVpadal, s[0]->arch_opcode());
  EXPECT_EQ(2U, s[0]->InputCount());
  EXPECT_EQ(1U, s[0]->OutputCount());
}
// Every combination of operand side (LEFT/RIGHT), lane width (16/32) and
// signedness, so both macro instantiations and all four fusion branches in
// the instruction selector are exercised.
const AddWithPairwiseAddSideAndWidth kAddWithPairAddTestCases[] = {
{LEFT, 16, true}, {RIGHT, 16, true}, {LEFT, 32, true},
{RIGHT, 32, true}, {LEFT, 16, false}, {RIGHT, 16, false},
{LEFT, 32, false}, {RIGHT, 32, false}};
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorAddWithPairwiseAddTest,
::testing::ValuesIn(kAddWithPairAddTestCases));
} // namespace compiler
} // namespace internal
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment