Commit 231bc86c authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][arm] Prototype i8x16.popcnt

This is the same as the original implementation in https://crrev.com/c/2567534
which was speculatively reverted due to flaky tests. Since then, there have
been some changes to fix those tests, so trying to get this in again.

Bug: v8:11002
Change-Id: I5bd0f63d3aec4cf6db403b35737f8b695b0f4e37
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589063
Reviewed-by: Bill Budge <bbudge@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71746}
parent fd45d38a
......@@ -3995,6 +3995,7 @@ enum UnaryOp {
VPADDL_S,
VPADDL_U,
VCLT0,
VCNT
};
// Encoding helper for "Advanced SIMD two registers misc" decode group. See ARM
......@@ -4073,6 +4074,9 @@ static Instr EncodeNeonUnaryOp(UnaryOp op, NeonRegType reg_type, NeonSize size,
// Only support signed integers.
op_encoding = 0x1 * B16 | 0x4 * B7;
break;
case VCNT:
op_encoding = 0xA * B7;
break;
}
int vd, d;
NeonSplitCode(reg_type, dst_code, &vd, &d, &op_encoding);
......@@ -4939,6 +4943,13 @@ void Assembler::vpaddl(NeonDataType dt, QwNeonRegister dst,
NeonDataTypeToSize(dt), dst.code(), src.code()));
}
void Assembler::vcnt(QwNeonRegister dst, QwNeonRegister src) {
  // vcnt.8 Qd, Qm -- SIMD Vector Count Set Bits.
  // Emitted in the quadword form with 8-bit elements (NEON_Q / Neon8 are
  // passed to the unary-op encoder below).
  // Reference: ARM DDI 0487F.b, F6-5094.
  DCHECK(IsEnabled(NEON));
  const Instr encoded =
      EncodeNeonUnaryOp(VCNT, NEON_Q, Neon8, dst.code(), src.code());
  emit(encoded);
}
// Encode NEON vtbl / vtbx instruction.
static Instr EncodeNeonVTB(DwVfpRegister dst, const NeonListOperand& list,
DwVfpRegister index, bool vtbx) {
......
......@@ -972,6 +972,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vtbx(DwVfpRegister dst, const NeonListOperand& list,
DwVfpRegister index);
// NEON vcnt (Vector Count Set Bits) on quadword registers: dst = vcnt(src).
void vcnt(QwNeonRegister dst, QwNeonRegister src);
// Pseudo instructions
// Different nop operations are used by the code generator to detect certain
......
......@@ -1639,6 +1639,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputDoubleRegister(0));
DCHECK_EQ(LeaveCC, i.OutputSBit());
break;
case kArmVcnt: {
__ vcnt(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kArmLdrb:
__ ldrb(i.OutputRegister(), i.InputOffset());
DCHECK_EQ(LeaveCC, i.OutputSBit());
......
......@@ -112,6 +112,7 @@ namespace compiler {
V(ArmVst1F64) \
V(ArmVld1S128) \
V(ArmVst1S128) \
V(ArmVcnt) \
V(ArmVpaddl) \
V(ArmFloat32Max) \
V(ArmFloat64Max) \
......
......@@ -105,6 +105,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmVmovHighF64U32:
case kArmVmovF64U32U32:
case kArmVmovU32U32F64:
case kArmVcnt:
case kArmVpaddl:
case kArmFloat32Max:
case kArmFloat64Max:
......
......@@ -2621,6 +2621,7 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I16x8Abs, kArmI16x8Abs) \
V(I8x16Neg, kArmI8x16Neg) \
V(I8x16Abs, kArmI8x16Abs) \
V(I8x16Popcnt, kArmVcnt) \
V(S128Not, kArmS128Not) \
V(V32x4AnyTrue, kArmV32x4AnyTrue) \
V(V32x4AllTrue, kArmV32x4AllTrue) \
......
......@@ -2802,9 +2802,6 @@ void InstructionSelector::VisitI64x2UConvertI32x4High(Node* node) {
UNIMPLEMENTED();
}
// TODO(v8:11002) Prototype i8x16.popcnt.
void InstructionSelector::VisitI8x16Popcnt(Node* node) {
  // Placeholder: no instruction selection is provided here yet.
  UNIMPLEMENTED();
}
// TODO(v8:11168): Prototyping prefetch.
void InstructionSelector::VisitPrefetchTemporal(Node* node) {
  // Placeholder: no instruction selection is provided here yet.
  UNIMPLEMENTED();
}
void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
......@@ -2812,6 +2809,11 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
// TODO(v8:11002) Prototype i8x16.popcnt.
void InstructionSelector::VisitI8x16Popcnt(Node* node) {
  // Placeholder for targets other than ARM64 and ARM (see the enclosing
  // preprocessor guard).
  UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// TODO(v8:11086) Prototype extended pairwise add.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
......
......@@ -2268,6 +2268,9 @@ void Decoder::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
Format(instr, q ? "vpaddl.u'size2 'Qd, 'Qm" : "vpaddl.u'size2 'Dd, 'Dm");
} else if (size == 0 && opc1 == 0b10 && opc2 == 0) {
Format(instr, q ? "vswp 'Qd, 'Qm" : "vswp 'Dd, 'Dm");
} else if (opc1 == 0 && opc2 == 0b1010) {
DCHECK_EQ(0, size);
Format(instr, q ? "vcnt.8 'Qd, 'Qm" : "vcnt.8 'Dd, 'Dm");
} else if (opc1 == 0 && opc2 == 0b1011) {
Format(instr, "vmvn 'Qd, 'Qm");
} else if (opc1 == 0b01 && opc2 == 0b0100) {
......
......@@ -4510,6 +4510,17 @@ void Simulator::DecodeAdvancedSIMDTwoOrThreeRegisters(Instruction* instr) {
set_neon_register(vm, dval);
set_neon_register(vd, mval);
}
} else if (opc1 == 0 && opc2 == 0b1010) {
// vcnt Qd, Qm.
DCHECK_EQ(0, size);
int vd = instr->VFPDRegValue(q ? kSimd128Precision : kDoublePrecision);
int vm = instr->VFPMRegValue(q ? kSimd128Precision : kDoublePrecision);
uint8_t q_data[16];
get_neon_register(vm, q_data);
for (int i = 0; i < 16; i++) {
q_data[i] = base::bits::CountPopulation(q_data[i]);
}
set_neon_register(vd, q_data);
} else if (opc1 == 0 && opc2 == 0b1011) {
// vmvn Qd, Qm.
int vd = instr->VFPDRegValue(kSimd128Precision);
......
......@@ -1351,6 +1351,8 @@ TEST(Neon) {
"f3b1fa45 vtbx.8 d15, {d1, d2, d3}, d5");
COMPARE(vtbx(d15, NeonListOperand(d1, 4), d5),
"f3b1fb45 vtbx.8 d15, {d1, d2, d3, d4}, d5");
COMPARE(vcnt(q1, q2),
"f3b02544 vcnt.8 q1, q2");
}
VERIFY_RUN();
......
......@@ -2553,7 +2553,7 @@ WASM_SIMD_TEST(I8x16Abs) {
RunI8x16UnOpTest(execution_tier, lower_simd, kExprI8x16Abs, Abs);
}
#if V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
// TODO(v8:11002) Prototype i8x16.popcnt.
WASM_SIMD_TEST_NO_LOWERING(I8x16Popcnt) {
FLAG_SCOPE(wasm_simd_post_mvp);
......@@ -2576,7 +2576,7 @@ WASM_SIMD_TEST_NO_LOWERING(I8x16Popcnt) {
}
}
}
#endif // V8_TARGET_ARCH_ARM64
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
// Tests both signed and unsigned conversion from I16x8 (packing).
WASM_SIMD_TEST(I8x16ConvertI16x8) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment