Commit cc682a66 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][arm] Prototype extended pairwise addition

Add the new instruction vpaddl for signed and unsigned integers, with
assembler, disassembler, and simulator support, plus tests.

The signed and unsigned vpaddl opcodes are separate UnaryOp enumerators
because the helper EncodeNeonUnaryOp only takes a NeonSize (not a
NeonDataType). I considered changing the signature, but none of the
other instructions use a NeonDataType, so it seemed unnecessary.
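
For reference, a minimal scalar model of what vpaddl computes (an
illustration, not V8 code): each destination lane is the widened sum of
two adjacent source lanes, so the addition itself cannot overflow.

    // vpaddl.s8 Qd, Qm modeled on 16 x i8 -> 8 x i16.
    void VpaddlS8(const int8_t src[16], int16_t dst[8]) {
      for (int i = 0; i < 8; i++) {
        // Widen before adding: worst case (-128) + (-128) = -256 fits in i16.
        dst[i] = int16_t{src[2 * i]} + int16_t{src[2 * i + 1]};
      }
    }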

Bug: v8:11086
Change-Id: I5e6694ae407779c1fd3604c5a40ca0a1b6ce061b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2578233
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71671}
parent be7d3ebc
@@ -3991,7 +3991,9 @@ enum UnaryOp {
   VREV64,
   VTRN,
   VRECPE,
-  VRSQRTE
+  VRSQRTE,
+  VPADDL_S,
+  VPADDL_U
 };

 // Encoding helper for "Advanced SIMD two registers misc" decode group. See ARM
@@ -4060,6 +4062,12 @@ static Instr EncodeNeonUnaryOp(UnaryOp op, NeonRegType reg_type, NeonSize size,
       // Only support floating point.
       op_encoding = 0x3 * B16 | 0xB * B7;
       break;
+    case VPADDL_S:
+      op_encoding = 0x4 * B7;
+      break;
+    case VPADDL_U:
+      op_encoding = 0x5 * B7;
+      break;
     default:
       UNREACHABLE();
   }
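
As a sanity check on the op_encoding values above, here is a sketch (not
V8 code) that assembles vpaddl.s8 for Q registers from the ARM "Advanced
SIMD two registers, misc" fields; the helper name and the hard-coded base
pattern are assumptions, but the result matches the disassembler test
added below.

    // Hypothetical helper; the base pattern fixes size bits 19:18 to 00
    // (8-bit lanes) and the signed-vpaddl opcode bits (0x4 * B7).
    uint32_t AssembleVpaddlS8Q(int vd, int vm) {
      uint32_t d = (vd >> 4) & 1, m = (vm >> 4) & 1;
      return 0xF3B00200u | d << 22 | (vd & 0xF) << 12 |
             1 << 6 /* Q */ | m << 5 | (vm & 0xF);
    }
    // AssembleVpaddlS8Q(0, 2) == 0xF3B00242, i.e. "vpaddl.s8 q0, q1"
    // (q1 occupies d2/d3, so the m-register field is 2).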
@@ -4911,6 +4919,14 @@ void Assembler::vtrn(NeonSize size, QwNeonRegister src1, QwNeonRegister src2) {
   emit(EncodeNeonUnaryOp(VTRN, NEON_Q, size, src1.code(), src2.code()));
 }

+void Assembler::vpaddl(NeonDataType dt, QwNeonRegister dst,
+                       QwNeonRegister src) {
+  DCHECK(IsEnabled(NEON));
+  // vpaddl.<dt>(Qd, Qm) SIMD Vector Pairwise Add Long.
+  emit(EncodeNeonUnaryOp(NeonU(dt) ? VPADDL_U : VPADDL_S, NEON_Q,
+                         NeonDataTypeToSize(dt), dst.code(), src.code()));
+}
+
 // Encode NEON vtbl / vtbx instruction.
 static Instr EncodeNeonVTB(DwVfpRegister dst, const NeonListOperand& list,
                            DwVfpRegister index, bool vtbx) {
......
@@ -916,6 +916,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
              DwVfpRegister src2);
+  void vpaddl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src);
+
   // ARMv8 rounding instructions (NEON).
   void vrintm(NeonDataType dt, const QwNeonRegister dst,
               const QwNeonRegister src);
......
@@ -1874,6 +1874,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                i.InputSimd128Register(1).high());
       break;
     }
+    case kArmVpaddl: {
+      auto dt = static_cast<NeonDataType>(MiscField::decode(instr->opcode()));
+      __ vpaddl(dt, i.OutputSimd128Register(), i.InputSimd128Register(0));
+      break;
+    }
     case kArmF64x2Splat: {
       Simd128Register dst = i.OutputSimd128Register();
       DoubleRegister src = i.InputDoubleRegister(0);
......
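
The NeonDataType reaches the case above through the opcode's MiscField,
which the instruction selector packs at selection time. A minimal model of
that round trip (the bit position is hypothetical; V8's real MiscField is
a BitField template over the instruction code):

    constexpr uint32_t MiscEncode(uint32_t v) { return v << 22; }
    constexpr uint32_t MiscDecode(uint32_t code) { return code >> 22; }
    // kArmVpaddl | MiscEncode(NeonS16) at selection time; MiscDecode in the
    // code generator recovers NeonS16, and the assembler emits vpaddl.s16.
    static_assert(MiscDecode(MiscEncode(5)) == 5, "dt survives the opcode");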
@@ -112,6 +112,7 @@ namespace compiler {
   V(ArmVst1F64)    \
   V(ArmVld1S128)   \
   V(ArmVst1S128)   \
+  V(ArmVpaddl)     \
   V(ArmFloat32Max) \
   V(ArmFloat64Max) \
   V(ArmFloat32Min) \
......
@@ -105,6 +105,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kArmVmovHighF64U32:
     case kArmVmovF64U32U32:
     case kArmVmovU32U32F64:
+    case kArmVpaddl:
     case kArmFloat32Max:
     case kArmFloat64Max:
     case kArmFloat32Min:
......
@@ -80,7 +80,8 @@ class ArmOperandGenerator : public OperandGenerator {
 namespace {

-void VisitRR(InstructionSelector* selector, ArchOpcode opcode, Node* node) {
+void VisitRR(InstructionSelector* selector, InstructionCode opcode,
+             Node* node) {
   ArmOperandGenerator g(selector);
   selector->Emit(opcode, g.DefineAsRegister(node),
                  g.UseRegister(node->InputAt(0)));
@@ -3121,6 +3122,16 @@ EXT_MUL_LIST(VISIT_EXT_MUL)
 #undef VISIT_EXT_MUL
 #undef EXT_MUL_LIST

+#define VISIT_EXTADD_PAIRWISE(OPCODE, NEONSIZE)                    \
+  void InstructionSelector::Visit##OPCODE(Node* node) {            \
+    VisitRR(this, kArmVpaddl | MiscField::encode(NEONSIZE), node); \
+  }
+VISIT_EXTADD_PAIRWISE(I16x8ExtAddPairwiseI8x16S, NeonS8)
+VISIT_EXTADD_PAIRWISE(I16x8ExtAddPairwiseI8x16U, NeonU8)
+VISIT_EXTADD_PAIRWISE(I32x4ExtAddPairwiseI16x8S, NeonS16)
+VISIT_EXTADD_PAIRWISE(I32x4ExtAddPairwiseI16x8U, NeonU16)
+#undef VISIT_EXTADD_PAIRWISE
+
 void InstructionSelector::VisitTruncateFloat32ToInt32(Node* node) {
   ArmOperandGenerator g(this);
......
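
For readers unfamiliar with the macro style above: each
VISIT_EXTADD_PAIRWISE instantiation expands to an ordinary visitor. The
NeonS16 line, for example, is equivalent to:

    void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
      VisitRR(this, kArmVpaddl | MiscField::encode(NeonS16), node);
    }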
@@ -2801,6 +2801,14 @@ void InstructionSelector::VisitI64x2UConvertI32x4High(Node* node) {
 // TODO(v8:11002) Prototype i8x16.popcnt.
 void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }

+// TODO(v8:11168): Prototyping prefetch.
+void InstructionSelector::VisitPrefetchTemporal(Node* node) { UNIMPLEMENTED(); }
+void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
+  UNIMPLEMENTED();
+}
+#endif  // !V8_TARGET_ARCH_ARM64
+
+#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
 // TODO(v8:11086) Prototype extended pairwise add.
 void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
   UNIMPLEMENTED();
@@ -2814,13 +2822,7 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
 void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
   UNIMPLEMENTED();
 }
-
-// TODO(v8:11168): Prototyping prefetch.
-void InstructionSelector::VisitPrefetchTemporal(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
-  UNIMPLEMENTED();
-}
-#endif  // !V8_TARGET_ARCH_ARM64
+#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM

 #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
     !V8_TARGET_ARCH_ARM
......
@@ -2262,6 +2262,10 @@ void Decoder::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
       // vrev<op>.<esize> Qd, Qm.
       out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
                                   "vrev%d.%d q%d, q%d", op, esize, Vd, Vm);
+    } else if (opc1 == 0 && opc2 == 0b0100) {
+      Format(instr, q ? "vpaddl.s'size2 'Qd, 'Qm" : "vpaddl.s'size2 'Dd, 'Dm");
+    } else if (opc1 == 0 && opc2 == 0b0101) {
+      Format(instr, q ? "vpaddl.u'size2 'Qd, 'Qm" : "vpaddl.u'size2 'Dd, 'Dm");
     } else if (size == 0 && opc1 == 0b10 && opc2 == 0) {
       Format(instr, q ? "vswp 'Qd, 'Qm" : "vswp 'Dd, 'Dm");
     } else if (opc1 == 0 && opc2 == 0b1011) {
......
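
A note on the disassembler format strings above: 'size2 is assumed here to
print the source lane width taken from the instruction's size field (bits
19:18 in this encoding group), roughly:

    // Assumed expansion of 'size2 (the real handler lives in the decoder's
    // format-option dispatch): size 0b00 -> 8, 0b01 -> 16, 0b10 -> 32.
    int lane_width = 8 << instr->Bits(19, 18);  // "vpaddl.s8" when size == 0.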
@@ -4288,6 +4288,20 @@ void PairwiseAdd(Simulator* simulator, int Vd, int Vm, int Vn) {
   simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
 }

+template <typename NarrowType, typename WideType, int SIZE = kSimd128Size>
+void PairwiseAddLong(Simulator* simulator, int Vd, int Vm) {
+  DCHECK_EQ(sizeof(WideType), 2 * sizeof(NarrowType));
+  static constexpr int kSElems = SIZE / sizeof(NarrowType);
+  static constexpr int kTElems = SIZE / sizeof(WideType);
+  NarrowType src[kSElems];
+  WideType dst[kTElems];
+  simulator->get_neon_register<NarrowType, SIZE>(Vm, src);
+  for (int i = 0; i < kTElems; i++) {
+    dst[i] = WideType{src[i * 2]} + WideType{src[i * 2 + 1]};
+  }
+  simulator->set_neon_register<WideType, SIZE>(Vd, dst);
+}
+
 template <typename T, int SIZE = kSimd128Size>
 void RoundingAverageUnsigned(Simulator* simulator, int Vd, int Vm, int Vn) {
   static_assert(std::is_unsigned<T>::value,
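
A standalone illustration of the helper's arithmetic (not simulator code):
widening before adding is what makes the pairwise sums overflow-free.

    // Mirrors PairwiseAddLong<int16_t, int32_t> on 4 x i16 -> 2 x i32.
    int16_t src[4] = {30000, 30000, -2, 5};
    int32_t dst[2];
    for (int i = 0; i < 2; i++) {
      dst[i] = int32_t{src[2 * i]} + int32_t{src[2 * i + 1]};
    }
    // dst == {60000, 3}; 30000 + 30000 would have wrapped in int16_t.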
@@ -4454,6 +4468,28 @@ void Simulator::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
         UNREACHABLE();
         break;
     }
+  } else if (opc1 == 0 && (opc2 == 0b0100 || opc2 == 0b0101)) {
+    DCHECK_EQ(1, instr->Bit(6));  // Only support Q regs.
+    int Vd = instr->VFPDRegValue(kSimd128Precision);
+    int Vm = instr->VFPMRegValue(kSimd128Precision);
+    int is_signed = instr->Bit(7) == 0;
+    // vpaddl Qd, Qm.
+    switch (size) {
+      case Neon8:
+        is_signed ? PairwiseAddLong<int8_t, int16_t>(this, Vd, Vm)
+                  : PairwiseAddLong<uint8_t, uint16_t>(this, Vd, Vm);
+        break;
+      case Neon16:
+        is_signed ? PairwiseAddLong<int16_t, int32_t>(this, Vd, Vm)
+                  : PairwiseAddLong<uint16_t, uint32_t>(this, Vd, Vm);
+        break;
+      case Neon32:
+        is_signed ? PairwiseAddLong<int32_t, int64_t>(this, Vd, Vm)
+                  : PairwiseAddLong<uint32_t, uint64_t>(this, Vd, Vm);
+        break;
+      case Neon64:
+        UNREACHABLE();
+    }
   } else if (size == 0 && opc1 == 0b10 && opc2 == 0) {
     if (instr->Bit(6) == 0) {
       // vswp Dd, Dm.
......
@@ -1282,6 +1282,8 @@ TEST(15) {
     uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
     uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
     uint32_t vpadd_i8[2], vpadd_i16[2], vpadd_i32[2];
+    uint32_t vpaddl_s8[4], vpaddl_s16[4], vpaddl_s32[4];
+    uint32_t vpaddl_u8[4], vpaddl_u16[4], vpaddl_u32[4];
     uint32_t vpmin_s8[2], vpmin_u16[2], vpmin_s32[2];
     uint32_t vpmax_s8[2], vpmax_u16[2], vpmax_s32[2];
     uint32_t vadd8[4], vadd16[4], vadd32[4];
@@ -1645,6 +1647,35 @@ TEST(15) {
   __ vpadd(Neon32, d0, d0, d2);
   __ vstr(d0, r0, offsetof(T, vpadd_i32));

+  // vpaddl signed.
+  __ mov(r4, Operand(0x81));
+  __ vdup(Neon8, q0, r4);
+  __ vpaddl(NeonS8, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_s8))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+  __ vpaddl(NeonS16, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_s16))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+  __ vpaddl(NeonS32, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_s32))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+  // vpaddl unsigned.
+  __ vpaddl(NeonU8, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_u8))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+  __ vpaddl(NeonU16, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_u16))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
+  __ vpaddl(NeonU32, q2, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vpaddl_u32))));
+  __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
+
   // vpmin/vpmax integer.
   __ mov(r4, Operand(0x03));
   __ vdup(Neon16, q0, r4);
@@ -2197,6 +2228,15 @@ TEST(15) {
   CHECK_EQ_32X2(vpadd_i8, 0x03030303u, 0x06060606u);
   CHECK_EQ_32X2(vpadd_i16, 0x0C0C0606u, 0x06060606u);
   CHECK_EQ_32X2(vpadd_i32, 0x12120C0Cu, 0x06060606u);
+  CHECK_EQ_32X4(vpaddl_s8, 0xFF02FF02, 0xFF02FF02, 0xFF02FF02, 0xFF02FF02);
+  CHECK_EQ_32X4(vpaddl_s16, 0xFFFF0302, 0xFFFF0302, 0xFFFF0302, 0xFFFF0302);
+  CHECK_EQ_32X4(vpaddl_s32, 0x03030302, 0xFFFFFFFF, 0x03030302, 0xFFFFFFFF);
+  CHECK_EQ_32X4(vpaddl_u8, 0x01020102, 0x01020102, 0x01020102, 0x01020102);
+  CHECK_EQ_32X4(vpaddl_u16, 0x00010302, 0x00010302, 0x00010302, 0x00010302);
+  CHECK_EQ_32X4(vpaddl_u32, 0x03030302, 0x00000001, 0x03030302, 0x00000001);
   CHECK_EQ_32X2(vpmin_s8, 0x00000000u, 0x03030303u);
   CHECK_EQ_32X2(vpmax_s8, 0x03030303u, 0x03030303u);
   // [0, ffff, 0, ffff] and [ffff, ffff]
......
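
The expected vpaddl words in the checks above follow directly from the
0x81 splat; the signed-8 case worked out in plain C++ (a hand check, not
test code):

    int8_t lane = static_cast<int8_t>(0x81);                   // -127
    uint16_t pair = static_cast<uint16_t>(int16_t{lane} * 2);  // -254 == 0xFF02
    uint32_t word = uint32_t{pair} << 16 | pair;               // 0xFF02FF02
    // Unsigned: 0x81 + 0x81 == 0x102, hence the 0x01020102 words for vpaddl_u8.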
@@ -1165,6 +1165,18 @@ TEST(Neon) {
             "f2110b12 vpadd.i16 d0, d1, d2");
     COMPARE(vpadd(Neon32, d0, d1, d2),
             "f2210b12 vpadd.i32 d0, d1, d2");
+    COMPARE(vpaddl(NeonS8, q0, q1),
+            "f3b00242 vpaddl.s8 q0, q1");
+    COMPARE(vpaddl(NeonS16, q0, q1),
+            "f3b40242 vpaddl.s16 q0, q1");
+    COMPARE(vpaddl(NeonS32, q0, q1),
+            "f3b80242 vpaddl.s32 q0, q1");
+    COMPARE(vpaddl(NeonU8, q14, q15),
+            "f3f0c2ee vpaddl.u8 q14, q15");
+    COMPARE(vpaddl(NeonU16, q14, q15),
+            "f3f4c2ee vpaddl.u16 q14, q15");
+    COMPARE(vpaddl(NeonU32, q14, q15),
+            "f3f8c2ee vpaddl.u32 q14, q15");
     COMPARE(vpmax(NeonS8, d0, d1, d2),
             "f2010a02 vpmax.s8 d0, d1, d2");
     COMPARE(vpmin(NeonU16, d1, d2, d8),
......
@@ -1880,7 +1880,7 @@ WASM_SIMD_TEST(S128Not) {
                [](int32_t x) { return ~x; });
 }

-#if V8_TARGET_ARCH_ARM64
+#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
 // TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u}
 template <typename Narrow, typename Wide>
 void RunExtAddPairwiseTest(TestExecutionTier execution_tier,
@@ -1929,7 +1929,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) {
                          kExprI16x8ExtAddPairwiseI8x16U,
                          kExprI8x16Splat);
 }
-#endif  // V8_TARGET_ARCH_ARM64
+#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM

 void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
                        WasmOpcode opcode, Int32BinOp expected_op) {
......