Commit 34916c4a authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm][arm64] Optimize i32.popcnt and i64.popcnt

TurboFan currently calls into runtime for these two instructions, but
there is a better 4-instruction lowering that Liftoff already uses. Move
this into macro-assembler so we can share this across both compilers. We
name this PopcntHelper because there isn't a Cnt on ARM64 that works on
Word32/Word64.

Bug: v8:12071
Change-Id: I182bf466b76cbad985d8c5b8ddae0f4352f71cd2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3087812
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76236}
parent 3fd8025d
...@@ -3575,6 +3575,16 @@ void TurboAssembler::StoreReturnAddressInWasmExitFrame(Label* return_location) { ...@@ -3575,6 +3575,16 @@ void TurboAssembler::StoreReturnAddressInWasmExitFrame(Label* return_location) {
} }
#endif // V8_ENABLE_WEBASSEMBLY #endif // V8_ENABLE_WEBASSEMBLY
// Emits the shared lowering for i32.popcnt / i64.popcnt: the value in |src| is
// moved into a scratch vector register, processed there with Cnt and Addv, and
// the result is moved back into |dst|.
//
// |src| selects the width: a 32-bit register goes through the S view of the
// scratch register, anything else through the D view. |dst| is read back
// through that same view.
// NOTE(review): presumably |dst| must have the same width as |src| - confirm
// against Fmov's requirements.
void TurboAssembler::PopcntHelper(Register dst, Register src) {
  UseScratchRegisterScope scope(this);
  VRegister vec = scope.AcquireV(kFormat8B);

  // Pick the scalar view of the scratch register that matches |src|.
  const bool is_word32 = src.Is32Bits();
  VRegister scalar_view = is_word32 ? vec.S() : vec.D();

  // General-purpose -> vector register.
  Fmov(scalar_view, src);
  // Count on the 8B-formatted scratch register, in place.
  Cnt(vec, vec);
  // Reduce the lanes into the B view of the same register.
  Addv(vec.B(), vec);
  // Vector -> general-purpose register, using the view chosen above.
  Fmov(dst, scalar_view);
}
void TurboAssembler::I64x2BitMask(Register dst, VRegister src) { void TurboAssembler::I64x2BitMask(Register dst, VRegister src) {
ASM_CODE_COMMENT(this); ASM_CODE_COMMENT(this);
UseScratchRegisterScope scope(this); UseScratchRegisterScope scope(this);
......
...@@ -1390,9 +1390,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -1390,9 +1390,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void StoreReturnAddressInWasmExitFrame(Label* return_location); void StoreReturnAddressInWasmExitFrame(Label* return_location);
#endif // V8_ENABLE_WEBASSEMBLY #endif // V8_ENABLE_WEBASSEMBLY
// Wasm SIMD helpers. These instructions don't have direct lowering to native // Wasm helpers. These instructions don't have direct lowering
// instructions. These helpers allow us to define the optimal code sequence, // to native instructions. These helpers allow us to define the optimal code
// and be used in both TurboFan and Liftoff. // sequence, and be used in both TurboFan and Liftoff.
void PopcntHelper(Register dst, Register src);
void I64x2BitMask(Register dst, VRegister src); void I64x2BitMask(Register dst, VRegister src);
void I64x2AllTrue(Register dst, VRegister src); void I64x2AllTrue(Register dst, VRegister src);
......
...@@ -1551,6 +1551,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -1551,6 +1551,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kArm64Cmn32: case kArm64Cmn32:
__ Cmn(i.InputOrZeroRegister32(0), i.InputOperand2_32(1)); __ Cmn(i.InputOrZeroRegister32(0), i.InputOperand2_32(1));
break; break;
case kArm64Cnt32: {
__ PopcntHelper(i.OutputRegister32(), i.InputRegister32(0));
break;
}
case kArm64Cnt64: {
__ PopcntHelper(i.OutputRegister64(), i.InputRegister64(0));
break;
}
case kArm64Cnt: { case kArm64Cnt: {
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode)); VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
__ Cnt(i.OutputSimd128Register().Format(f), __ Cnt(i.OutputSimd128Register().Format(f),
......
...@@ -25,6 +25,8 @@ namespace compiler { ...@@ -25,6 +25,8 @@ namespace compiler {
V(Arm64Cmn) \ V(Arm64Cmn) \
V(Arm64Cmn32) \ V(Arm64Cmn32) \
V(Arm64Cnt) \ V(Arm64Cnt) \
V(Arm64Cnt32) \
V(Arm64Cnt64) \
V(Arm64Tst) \ V(Arm64Tst) \
V(Arm64Tst32) \ V(Arm64Tst32) \
V(Arm64Or) \ V(Arm64Or) \
......
...@@ -26,6 +26,8 @@ int InstructionScheduler::GetTargetInstructionFlags( ...@@ -26,6 +26,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64Cmn: case kArm64Cmn:
case kArm64Cmn32: case kArm64Cmn32:
case kArm64Cnt: case kArm64Cnt:
case kArm64Cnt32:
case kArm64Cnt64:
case kArm64Tst: case kArm64Tst:
case kArm64Tst32: case kArm64Tst32:
case kArm64Or: case kArm64Or:
......
...@@ -1433,6 +1433,8 @@ void InstructionSelector::VisitWord64Ror(Node* node) { ...@@ -1433,6 +1433,8 @@ void InstructionSelector::VisitWord64Ror(Node* node) {
#define RR_OP_LIST(V) \ #define RR_OP_LIST(V) \
V(Word64Clz, kArm64Clz) \ V(Word64Clz, kArm64Clz) \
V(Word32Clz, kArm64Clz32) \ V(Word32Clz, kArm64Clz32) \
V(Word32Popcnt, kArm64Cnt32) \
V(Word64Popcnt, kArm64Cnt64) \
V(Word32ReverseBits, kArm64Rbit32) \ V(Word32ReverseBits, kArm64Rbit32) \
V(Word64ReverseBits, kArm64Rbit) \ V(Word64ReverseBits, kArm64Rbit) \
V(Word32ReverseBytes, kArm64Rev32) \ V(Word32ReverseBytes, kArm64Rev32) \
...@@ -1523,10 +1525,6 @@ void InstructionSelector::VisitWord32Ctz(Node* node) { UNREACHABLE(); } ...@@ -1523,10 +1525,6 @@ void InstructionSelector::VisitWord32Ctz(Node* node) { UNREACHABLE(); }
void InstructionSelector::VisitWord64Ctz(Node* node) { UNREACHABLE(); } void InstructionSelector::VisitWord64Ctz(Node* node) { UNREACHABLE(); }
void InstructionSelector::VisitWord32Popcnt(Node* node) { UNREACHABLE(); }
void InstructionSelector::VisitWord64Popcnt(Node* node) { UNREACHABLE(); }
void InstructionSelector::VisitInt32Add(Node* node) { void InstructionSelector::VisitInt32Add(Node* node) {
Arm64OperandGenerator g(this); Arm64OperandGenerator g(this);
Int32BinopMatcher m(node); Int32BinopMatcher m(node);
...@@ -4102,6 +4100,8 @@ InstructionSelector::SupportedMachineOperatorFlags() { ...@@ -4102,6 +4100,8 @@ InstructionSelector::SupportedMachineOperatorFlags() {
MachineOperatorBuilder::kFloat64RoundTiesAway | MachineOperatorBuilder::kFloat64RoundTiesAway |
MachineOperatorBuilder::kFloat32RoundTiesEven | MachineOperatorBuilder::kFloat32RoundTiesEven |
MachineOperatorBuilder::kFloat64RoundTiesEven | MachineOperatorBuilder::kFloat64RoundTiesEven |
MachineOperatorBuilder::kWord32Popcnt |
MachineOperatorBuilder::kWord64Popcnt |
MachineOperatorBuilder::kWord32ShiftIsSafe | MachineOperatorBuilder::kWord32ShiftIsSafe |
MachineOperatorBuilder::kInt32DivIsSafe | MachineOperatorBuilder::kInt32DivIsSafe |
MachineOperatorBuilder::kUint32DivIsSafe | MachineOperatorBuilder::kUint32DivIsSafe |
......
...@@ -1173,12 +1173,7 @@ void LiftoffAssembler::emit_i32_ctz(Register dst, Register src) { ...@@ -1173,12 +1173,7 @@ void LiftoffAssembler::emit_i32_ctz(Register dst, Register src) {
} }
bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) { bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) {
UseScratchRegisterScope temps(this); PopcntHelper(dst.W(), src.W());
VRegister scratch = temps.AcquireV(kFormat8B);
Fmov(scratch.S(), src.W());
Cnt(scratch, scratch);
Addv(scratch.B(), scratch);
Fmov(dst.W(), scratch.S());
return true; return true;
} }
...@@ -1193,12 +1188,7 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) { ...@@ -1193,12 +1188,7 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst, bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
UseScratchRegisterScope temps(this); PopcntHelper(dst.gp().X(), src.gp().X());
VRegister scratch = temps.AcquireV(kFormat8B);
Fmov(scratch.D(), src.gp().X());
Cnt(scratch, scratch);
Addv(scratch.B(), scratch);
Fmov(dst.gp().X(), scratch.D());
return true; return true;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment