Commit 9f37d0cd authored by Ng Zhi An, committed by Commit Bot

Reland "[wasm-simd][x64][liftoff] Implement i8x16.popcnt"

This is a reland of 00babf07

No changes from the original patch; the previous CQ failures were
likely a Mac infra issue: https://chromium-review.googlesource.com/c/2682521.

Original change's description:
> [wasm-simd][x64][liftoff] Implement i8x16.popcnt
>
> Extract i8x16.popcnt implementation into a macro-assembler function, and
> reuse it in Liftoff.
>
> Bug: v8:11002
> Change-Id: I86b2f5322c799d44f584cac28c70e0e393bf114f
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676280
> Reviewed-by: Clemens Backes <clemensb@chromium.org>
> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#72565}

Bug: v8:11002
Change-Id: Ic8bcbdb3444865805d8d2af3669ccb4a05c4426d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2682507
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72614}
parent e8ad04c8
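
For context, the SSE/AVX fast path in the hunk below implements the classic nibble-lookup popcount: each byte is split into its low and high nibble, each nibble indexes a 16-entry bit-count table via PSHUFB, and the two lookups are summed per byte. A minimal scalar model of that algorithm (illustrative only: the names here are invented, and it handles one byte where the vector code handles sixteen at once):

    #include <cstdint>

    // Scalar model of the PSHUFB-based popcount on the SSE/AVX paths.
    // kNibblePopcnt plays the role of wasm_i8x16_popcnt_mask: the bit
    // count of every 4-bit value 0..15.
    constexpr uint8_t kNibblePopcnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                           1, 2, 2, 3, 2, 3, 3, 4};

    uint8_t PopcntByte(uint8_t x) {
      uint8_t lo = x & 0x0f;  // vpand with the 0x0f splat
      uint8_t hi = x >> 4;    // vpandn, then vpsrlw by 4
      return kNibblePopcnt[lo] + kNibblePopcnt[hi];  // two vpshufb + vpaddb
    }
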
src/codegen/x64/macro-assembler-x64.cc
@@ -2299,6 +2299,64 @@ void TurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
}
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp) {
DCHECK_NE(dst, tmp);
DCHECK_NE(src, tmp);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
vpandn(kScratchDoubleReg, tmp, src);
vpand(dst, tmp, src);
vmovdqa(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
vpshufb(dst, tmp, dst);
vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
xorps(tmp, tmp);
pavgb(tmp, src);
if (dst != src) {
movaps(dst, src);
}
andps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
psubb(dst, tmp);
Operand splat_0x33 = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33());
movaps(tmp, dst);
andps(dst, splat_0x33);
psrlw(tmp, 2);
andps(tmp, splat_0x33);
paddb(dst, tmp);
movaps(tmp, dst);
psrlw(dst, 4);
paddb(dst, tmp);
andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
movaps(tmp, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
Operand mask = ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
Move(kScratchDoubleReg, tmp);
andps(tmp, src);
andnps(kScratchDoubleReg, src);
psrlw(kScratchDoubleReg, 4);
movaps(dst, mask);
pshufb(dst, tmp);
movaps(tmp, mask);
pshufb(tmp, kScratchDoubleReg);
paddb(dst, tmp);
}
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));
......
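
The ATOM fallback above avoids PSHUFB entirely and instead follows the textbook SWAR (divide-and-conquer) popcount: sum adjacent bits into 2-bit fields, then 4-bit fields, then a full byte. A scalar model of that bit trick (again one byte instead of sixteen; the vector code has no per-byte shift instruction, so it substitutes psrlw word shifts plus the 0x55/0x33/0x0f byte masks, and a pavgb against a zeroed register in place of the first shift):

    #include <cstdint>

    // Scalar model of the divide-and-conquer popcount on the ATOM path.
    // Each step doubles the field width while summing adjacent counts.
    uint8_t PopcntByteSwar(uint8_t x) {
      x = x - ((x >> 1) & 0x55);           // 2-bit sums (the 0x55 splat)
      x = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit sums (the 0x33 splat)
      x = (x + (x >> 4)) & 0x0f;           // full-byte count (the 0x0f splat)
      return x;
    }
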
src/codegen/x64/macro-assembler-x64.h
@@ -608,6 +608,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);
......
src/compiler/backend/x64/code-generator-x64.cc
@@ -3995,67 +3995,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I8x16Popcnt: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
XMMRegister tmp = i.TempSimd128Register(0);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
__ vpandn(kScratchDoubleReg, tmp, src);
__ vpand(dst, tmp, src);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
__ vpshufb(dst, tmp, dst);
__ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
__ vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
__ xorps(tmp, tmp);
__ pavgb(tmp, src);
if (dst != src) {
__ movaps(dst, src);
}
__ andps(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55()));
__ psubb(dst, tmp);
Operand splat_0x33 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33());
__ movaps(tmp, dst);
__ andps(dst, splat_0x33);
__ psrlw(tmp, 2);
__ andps(tmp, splat_0x33);
__ paddb(dst, tmp);
__ movaps(tmp, dst);
__ psrlw(dst, 4);
__ paddb(dst, tmp);
__ andps(dst,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
__ movaps(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
Operand mask = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask());
__ Move(kScratchDoubleReg, tmp);
__ andps(tmp, src);
__ andnps(kScratchDoubleReg, src);
__ psrlw(kScratchDoubleReg, 4);
__ movaps(dst, mask);
__ pshufb(dst, tmp);
__ movaps(tmp, mask);
__ pshufb(tmp, kScratchDoubleReg);
__ paddb(dst, tmp);
}
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.TempSimd128Register(0));
break;
}
case kX64S128Load8Splat: {
......
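
Note: the block deleted from this case is byte-for-byte the body that now lives in TurboAssembler::I8x16Popcnt in the first hunk, so the TurboFan backend reduces to a single delegating call that Liftoff can share.
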
src/wasm/baseline/arm/liftoff-assembler-arm.h
@@ -3402,6 +3402,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
}
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i8x16.popcnt");
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
vdup(Neon8, liftoff::GetSimd128Register(dst), src.gp());
......
src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@@ -2452,6 +2452,11 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
}
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i8x16.popcnt");
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Dup(dst.fp().V16B(), src.gp().W());
......
src/wasm/baseline/ia32/liftoff-assembler-ia32.h
@@ -2874,6 +2874,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
Pshufb(dst.fp(), lhs.fp(), mask);
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
bailout(kSimd, "i8x16.popcnt");
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
......
src/wasm/baseline/liftoff-assembler.h
@@ -887,6 +887,7 @@ class LiftoffAssembler : public TurboAssembler {
bool is_swizzle);
inline void emit_i8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_popcnt(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
......
src/wasm/baseline/liftoff-compiler.cc
@@ -2946,6 +2946,8 @@ class LiftoffCompiler {
switch (opcode) {
case wasm::kExprI8x16Swizzle:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_swizzle);
case wasm::kExprI8x16Popcnt:
return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_popcnt);
case wasm::kExprI8x16Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
case wasm::kExprI16x8Splat:
......
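
For the dispatch just above, EmitUnOp<kS128, kS128> is Liftoff's generic unary-op plumbing: it pops the s128 operand off the value stack, picks a destination register of the right class, invokes the per-architecture emit function, and pushes the result. Roughly, as a simplified sketch under assumed helper names (not V8's exact code, which also handles cases like kI32 inputs and register-class mismatches):

    // Simplified sketch of what EmitUnOp<kS128, kS128>(fn) does; see the
    // real helper in liftoff-compiler.cc for the general version.
    void EmitS128UnOp(LiftoffAssembler* assm,
                      void (LiftoffAssembler::*fn)(LiftoffRegister,
                                                   LiftoffRegister)) {
      LiftoffRegister src = assm->PopToRegister();  // pop the s128 input
      // Reuse src as dst when possible; otherwise grab a fresh FP register.
      LiftoffRegister dst = assm->GetUnusedRegister(kFpReg, {src}, {});
      (assm->*fn)(dst, src);                        // e.g. emit_i8x16_popcnt
      assm->PushRegister(kWasmS128, dst);           // push the s128 result
    }
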
src/wasm/baseline/x64/liftoff-assembler-x64.h
@@ -2472,6 +2472,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
Pshufb(dst.fp(), lhs.fp(), mask);
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
LiftoffRegister src) {
I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
......
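
On x64, emit_i8x16_popcnt hands liftoff::kScratchDoubleReg2 to the shared helper: I8x16Popcnt DCHECKs that tmp is distinct from both dst and src, and it already clobbers kScratchDoubleReg internally, so Liftoff needs a second scratch register here.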