Commit d710756a authored by Clemens Backes, committed by Commit Bot

[Liftoff] Implement i64 popcnt

This is the last missing instruction from the MVP. This CL adds
support for ia32, x64, arm, and arm64.
For CPUs that do not support the POPCNT instruction, a fallback
implementation in C is used.

R=jkummerow@chromium.org

Bug: v8:9919
Change-Id: Ie7a79a46e91726e15379b9a21b59775bbf5de556
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1895569
Commit-Queue: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64764}
parent 6c0825aa
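Editor's note: the C fallback mentioned in the message is not part of this diff; it is reached through ExternalReference::wasm_word64_popcnt in the compiler hunk below. A minimal sketch of what such a fallback computes (the function name and by-value signature are assumptions for illustration; V8's external-reference wrappers typically pass operands through a stack buffer):

```cpp
#include <cstdint>

// Hypothetical stand-in for the C fallback: Kernighan's loop, clearing the
// lowest set bit until none remain. Returns i32; callers zero-extend to i64.
extern "C" uint32_t Word64PopcntFallback(uint64_t x) {
  uint32_t count = 0;
  while (x != 0) {
    x &= x - 1;  // clears exactly one set bit per iteration
    ++count;
  }
  return count;
}
```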
@@ -762,30 +762,36 @@ void LiftoffAssembler::emit_i32_ctz(Register dst, Register src) {
   clz(dst, dst);
 }
 
-bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) {
-  {
-    UseScratchRegisterScope temps(this);
-    LiftoffRegList pinned = LiftoffRegList::ForRegs(dst);
-    Register scratch = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
-    Register scratch_2 = GetUnusedRegister(kGpReg, pinned).gp();
-    // x = x - ((x & (0x55555555 << 1)) >> 1)
-    and_(scratch, src, Operand(0xaaaaaaaa));
-    sub(dst, src, Operand(scratch, LSR, 1));
-    // x = (x & 0x33333333) + ((x & (0x33333333 << 2)) >> 2)
-    mov(scratch, Operand(0x33333333));
-    and_(scratch_2, dst, Operand(scratch, LSL, 2));
-    and_(scratch, dst, scratch);
-    add(dst, scratch, Operand(scratch_2, LSR, 2));
-  }
+namespace liftoff {
+inline void GeneratePopCnt(Assembler* assm, Register dst, Register src,
+                           Register scratch1, Register scratch2) {
+  DCHECK(!AreAliased(dst, scratch1, scratch2));
+  if (src == scratch1) std::swap(scratch1, scratch2);
+  // x = x - ((x & (0x55555555 << 1)) >> 1)
+  assm->and_(scratch1, src, Operand(0xaaaaaaaa));
+  assm->sub(dst, src, Operand(scratch1, LSR, 1));
+  // x = (x & 0x33333333) + ((x & (0x33333333 << 2)) >> 2)
+  assm->mov(scratch1, Operand(0x33333333));
+  assm->and_(scratch2, dst, Operand(scratch1, LSL, 2));
+  assm->and_(scratch1, dst, scratch1);
+  assm->add(dst, scratch1, Operand(scratch2, LSR, 2));
   // x = (x + (x >> 4)) & 0x0F0F0F0F
-  add(dst, dst, Operand(dst, LSR, 4));
-  and_(dst, dst, Operand(0x0f0f0f0f));
+  assm->add(dst, dst, Operand(dst, LSR, 4));
+  assm->and_(dst, dst, Operand(0x0f0f0f0f));
   // x = x + (x >> 8)
-  add(dst, dst, Operand(dst, LSR, 8));
+  assm->add(dst, dst, Operand(dst, LSR, 8));
   // x = x + (x >> 16)
-  add(dst, dst, Operand(dst, LSR, 16));
+  assm->add(dst, dst, Operand(dst, LSR, 16));
   // x = x & 0x3F
-  and_(dst, dst, Operand(0x3f));
+  assm->and_(dst, dst, Operand(0x3f));
+}
+}  // namespace liftoff
+
+bool LiftoffAssembler::emit_i32_popcnt(Register dst, Register src) {
+  LiftoffRegList pinned = LiftoffRegList::ForRegs(dst);
+  Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
+  Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp();
+  liftoff::GeneratePopCnt(this, dst, src, scratch1, scratch2);
   return true;
 }
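The comments inside GeneratePopCnt describe a standard SWAR popcount. As a cross-check, here is the same sequence in plain C++ (a sketch for illustration; the emitted ARM code folds the shifts into the shifted operands of and/sub/add):

```cpp
#include <cstdint>

// Parallel bit count: 1-bit -> 2-bit -> 4-bit partial sums, then fold.
uint32_t SwarPopCnt32(uint32_t x) {
  x = x - ((x & 0xaaaaaaaa) >> 1);                 // 2-bit counts
  x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);  // 4-bit counts
  x = (x + (x >> 4)) & 0x0f0f0f0f;                 // 8-bit counts
  x = x + (x >> 8);                                // 16-bit counts
  x = x + (x >> 16);                               // full count
  return x & 0x3f;                                 // at most 32, fits in 6 bits
}
```

For example, SwarPopCnt32(0xf0f00001) yields 9.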
@@ -1001,6 +1007,23 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
   mov(dst.high_gp(), Operand(0));  // High word of result is always 0.
 }
 
+bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
+                                       LiftoffRegister src) {
+  // Produce partial popcnts in the two dst registers, making sure not to
+  // overwrite the second src register before using it.
+  Register src1 = src.high_gp() == dst.low_gp() ? src.high_gp() : src.low_gp();
+  Register src2 = src.high_gp() == dst.low_gp() ? src.low_gp() : src.high_gp();
+  LiftoffRegList pinned = LiftoffRegList::ForRegs(dst, src2);
+  Register scratch1 = pinned.set(GetUnusedRegister(kGpReg, pinned)).gp();
+  Register scratch2 = GetUnusedRegister(kGpReg, pinned).gp();
+  liftoff::GeneratePopCnt(this, dst.low_gp(), src1, scratch1, scratch2);
+  liftoff::GeneratePopCnt(this, dst.high_gp(), src2, scratch1, scratch2);
+  // Now add the two into the lower dst reg and clear the higher dst reg.
+  add(dst.low_gp(), dst.low_gp(), dst.high_gp());
+  mov(dst.high_gp(), Operand(0));
+  return true;
+}
+
 bool LiftoffAssembler::emit_f32_ceil(DoubleRegister dst, DoubleRegister src) {
   if (CpuFeatures::IsSupported(ARMv8)) {
     CpuFeatureScope scope(this, ARMv8);
......
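The src1/src2 selection above handles aliasing between the register pairs: if dst.low_gp() is the same register as src.high_gp(), computing the low partial popcount first would clobber the high source word, so the aliased half is routed through the first GeneratePopCnt call. The arithmetic being emitted is just this (scalar sketch, reusing the SWAR routine sketched after the previous hunk):

```cpp
#include <cstdint>

uint32_t PopCnt32(uint32_t x);  // any 32-bit popcount, e.g. SwarPopCnt32

// i64 popcount on a 32-bit target: the two halves' popcounts, summed into
// the low word; the high word of the result is always zero (count <= 64).
uint64_t PopCnt64(uint64_t x) {
  return PopCnt32(static_cast<uint32_t>(x)) +
         PopCnt32(static_cast<uint32_t>(x >> 32));
}
```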
@@ -588,6 +588,17 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
   Clz(dst.gp().X(), dst.gp().X());
 }
 
+bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
+                                       LiftoffRegister src) {
+  UseScratchRegisterScope temps(this);
+  VRegister scratch = temps.AcquireV(kFormat8B);
+  Fmov(scratch.D(), src.gp().X());
+  Cnt(scratch, scratch);
+  Addv(scratch.B(), scratch);
+  Fmov(dst.gp().X(), scratch.D());
+  return true;
+}
+
 void LiftoffAssembler::emit_i32_divs(Register dst, Register lhs, Register rhs,
                                      Label* trap_div_by_zero,
                                      Label* trap_div_unrepresentable) {
......
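The arm64 version leans on NEON instead of bit tricks: CNT counts bits per byte lane, ADDV sums the lanes, and FMOV moves the value between the general-purpose and vector register files. The same idea with intrinsics (a sketch, AArch64 only):

```cpp
#include <arm_neon.h>
#include <cstdint>

// 64-bit popcount via NEON, mirroring the Fmov/Cnt/Addv/Fmov sequence.
uint64_t NeonPopCnt64(uint64_t x) {
  uint8x8_t v = vcreate_u8(x);  // move x into a D register, viewed as 8 bytes
  v = vcnt_u8(v);               // CNT: per-byte popcount
  return vaddv_u8(v);           // ADDV: horizontal sum of the 8 byte counts
}
```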
@@ -1083,6 +1083,21 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
   xor_(dst.high_gp(), dst.high_gp());  // High word of result is always 0.
 }
 
+bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
+                                       LiftoffRegister src) {
+  if (!CpuFeatures::IsSupported(POPCNT)) return false;
+  CpuFeatureScope scope(this, POPCNT);
+  // Produce partial popcnts in the two dst registers.
+  Register src1 = src.high_gp() == dst.low_gp() ? src.high_gp() : src.low_gp();
+  Register src2 = src.high_gp() == dst.low_gp() ? src.low_gp() : src.high_gp();
+  popcnt(dst.low_gp(), src1);
+  popcnt(dst.high_gp(), src2);
+  // Add the two into the lower dst reg, clear the higher dst reg.
+  add(dst.low_gp(), dst.high_gp());
+  xor_(dst.high_gp(), dst.high_gp());
+  return true;
+}
+
 void LiftoffAssembler::emit_i32_to_intptr(Register dst, Register src) {
   // This is a nop on ia32.
 }
......
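On ia32 the same two-halves decomposition runs through the hardware instruction when available: two 32-bit POPCNTs, summed into the low half of the register pair, with the high half cleared via xor_. With compiler intrinsics this is (sketch; requires SSE4.2/POPCNT, e.g. built with -mpopcnt):

```cpp
#include <nmmintrin.h>  // _mm_popcnt_u32
#include <cstdint>

// The ia32 fast path in scalar form: popcnt(lo) + popcnt(hi); the high
// half of the i64 result pair is zero.
uint64_t PopCnt64TwoPopcnts(uint32_t lo, uint32_t hi) {
  return static_cast<uint64_t>(_mm_popcnt_u32(lo) + _mm_popcnt_u32(hi));
}
```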
@@ -462,6 +462,7 @@ class LiftoffAssembler : public TurboAssembler {
   // i64 unops.
   inline void emit_i64_clz(LiftoffRegister dst, LiftoffRegister src);
   inline void emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src);
+  inline bool emit_i64_popcnt(LiftoffRegister dst, LiftoffRegister src);
   inline void emit_i32_to_intptr(Register dst, Register src);
......
@@ -839,8 +839,20 @@ class LiftoffCompiler {
         });
         break;
       case kExprI64Popcnt:
-        return unsupported(decoder, kComplexOperation,
-                           WasmOpcodes::OpcodeName(opcode));
+        EmitUnOp<kWasmI64, kWasmI64>(
+            [=](LiftoffRegister dst, LiftoffRegister src) {
+              if (__ emit_i64_popcnt(dst, src)) return;
+              // The C function returns i32. We will zero-extend later.
+              ValueType sig_i_l_reps[] = {kWasmI32, kWasmI64};
+              FunctionSig sig_i_l(1, 1, sig_i_l_reps);
+              LiftoffRegister c_call_dst = kNeedI64RegPair ? dst.low() : dst;
+              GenerateCCall(&c_call_dst, &sig_i_l, kWasmStmt, &src,
+                            ExternalReference::wasm_word64_popcnt());
+              // Now zero-extend the result to i64.
+              __ emit_type_conversion(kExprI64UConvertI32, dst, c_call_dst,
+                                      nullptr);
+            });
+        break;
       case kExprI32SConvertSatF32:
       case kExprI32UConvertSatF32:
       case kExprI32SConvertSatF64:
......
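Two details in the lambda above are easy to miss: on 32-bit targets the i64 destination is a register pair, so the C call writes only the low half (kNeedI64RegPair ? dst.low() : dst), and the i32 result must then be widened without sign extension, since a bit count is never negative. A scalar model of just that widening step (names hypothetical, not V8 code):

```cpp
#include <cstdint>

// Model of the pair handling on 32-bit targets: the C call produces an i32
// in the low register; zero-extending to i64 then just clears the high
// register of the pair (the kExprI64UConvertI32 step above).
struct RegPair { uint32_t low, high; };

RegPair WidenPopcntResult(uint32_t c_call_result) {
  return RegPair{c_call_result, 0};
}
```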
@@ -903,6 +903,14 @@ void LiftoffAssembler::emit_i64_ctz(LiftoffRegister dst, LiftoffRegister src) {
   Tzcntq(dst.gp(), src.gp());
 }
 
+bool LiftoffAssembler::emit_i64_popcnt(LiftoffRegister dst,
+                                       LiftoffRegister src) {
+  if (!CpuFeatures::IsSupported(POPCNT)) return false;
+  CpuFeatureScope scope(this, POPCNT);
+  popcntq(dst.gp(), src.gp());
+  return true;
+}
+
 void LiftoffAssembler::emit_i32_to_intptr(Register dst, Register src) {
   movsxlq(dst, src);
 }
......
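x64 needs none of the pair juggling: a single POPCNTQ handles the full 64-bit operation, which is why the emitter is one instruction plus the feature check (returning false still routes older CPUs to the C fallback). The intrinsic equivalent (sketch; x86-64 with the POPCNT feature):

```cpp
#include <nmmintrin.h>  // _mm_popcnt_u64
#include <cstdint>

// Single-instruction path: POPCNTQ dst, src.
uint64_t PopCnt64X64(uint64_t x) {
  return static_cast<uint64_t>(_mm_popcnt_u64(x));
}
```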