Commit c9e36e78 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][ia32][liftoff] Implement i8x16.popcnt

Extract i8x16.popcnt implementation into a macro-assembler function, and
reuse it in Liftoff.

We need an additional temporary XMMRegister whose lifetime overlaps with
those of dst and src, so make sure to pin both of those registers when
requesting an unused XMMRegister.

Bug: v8:11002
Change-Id: I13400b139add6f12316b3f398a796e6bf5a1ea7f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2676921
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72615}
parent 9f37d0cd
......@@ -798,6 +798,75 @@ void TurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
}
}
// Emits a per-byte population count of |src| into |dst| (wasm i8x16.popcnt).
//
// Three paths are selected on the host CPU:
//  - AVX: split each byte into nibbles and look both up in a 16-entry
//    popcount table with VPSHUFB, then add the two partial counts.
//  - ATOM: PSHUFB-free divide-and-conquer bit counting (see below).
//  - SSSE3 baseline: same nibble-table idea with non-VEX instructions.
//
// |tmp1| and |tmp2| are clobbered and must be distinct from |dst| and |src|;
// |scratch| is a GP register used to address the external-reference masks.
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp1, XMMRegister tmp2,
                                 Register scratch) {
  DCHECK_NE(dst, tmp1);
  DCHECK_NE(src, tmp1);
  DCHECK_NE(dst, tmp2);
  DCHECK_NE(src, tmp2);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // tmp2 = high nibble of each byte (still in the high bit positions),
    // dst = low nibble of each byte.
    vmovdqa(tmp1, ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                      scratch));
    vpandn(tmp2, tmp1, src);
    vpand(dst, tmp1, src);
    // tmp1 = table mapping nibble value -> its popcount.
    vmovdqa(tmp1, ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
                      scratch));
    // The low nibbles of tmp2 are zero, so this word-wide shift cannot leak
    // bits across byte boundaries.
    vpsrlw(tmp2, tmp2, 4);
    vpshufb(dst, tmp1, dst);    // dst = popcount(low nibble)
    vpshufb(tmp2, tmp1, tmp2);  // tmp2 = popcount(high nibble)
    vpaddb(dst, dst, tmp2);
  } else if (CpuFeatures::IsSupported(ATOM)) {
    // Pre-Goldmont low-power Intel microarchitectures have very slow
    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
    // algorithm on these processors. ATOM CPU feature captures exactly
    // the right set of processors.
    //
    // Stage 1: dst = src - ((src >> 1) & 0x55) leaves the bit count of each
    // 2-bit pair in that pair. Note PAVGB must NOT be used to form the
    // shifted value: it computes (a + b + 1) >> 1, and the rounding "+1"
    // corrupts the even-numbered bits (the ones kept by the 0x55 mask) of
    // odd bytes -- e.g. byte 0x01 would come out as popcount 0. PSRLW does
    // shift across byte borders, but the stray bit lands in bit 7 of each
    // byte and the 0x55 mask clears it.
    movaps(tmp1, src);
    psrlw(tmp1, 1);
    if (dst != src) {
      movaps(dst, src);
    }
    andps(tmp1,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_i8x16_splat_0x55(), scratch));
    psubb(dst, tmp1);
    // Stage 2: sum adjacent 2-bit counts into 4-bit per-nibble counts.
    Operand splat_0x33 = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
    movaps(tmp1, dst);
    andps(dst, splat_0x33);
    psrlw(tmp1, 2);
    andps(tmp1, splat_0x33);
    paddb(dst, tmp1);
    // Stage 3: add the two nibble counts of each byte; the sum is at most 8,
    // so it fits in the low nibble kept by the 0x0f mask.
    movaps(tmp1, dst);
    psrlw(dst, 4);
    paddb(dst, tmp1);
    andps(dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
  } else {
    CpuFeatureScope sse_scope(this, SSSE3);
    movaps(tmp1,
           ExternalReferenceAsOperand(
               ExternalReference::address_of_wasm_i8x16_splat_0x0f(), scratch));
    Operand mask = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
    if (tmp2 != tmp1) {
      movaps(tmp2, tmp1);
    }
    // tmp1 = low nibbles, tmp2 = high nibbles shifted down into place (the
    // low nibbles of tmp2 are zero before the shift, so no bits cross byte
    // boundaries).
    andps(tmp1, src);
    andnps(tmp2, src);
    psrlw(tmp2, 4);
    // Look both nibble values up in the popcount table and add the results.
    movaps(dst, mask);
    pshufb(dst, tmp1);
    movaps(tmp1, mask);
    pshufb(tmp1, tmp2);
    paddb(dst, tmp1);
  }
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
DCHECK_GE(63, shift);
if (shift >= 32) {
......
......@@ -662,6 +662,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
// Pushes a general-purpose register onto the stack (wrapper over `push`).
void Push(Register src) { push(src); }
// Pushes a memory operand onto the stack (wrapper over `push`).
void Push(Operand src) { push(src); }
......
......@@ -3771,74 +3771,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I8x16Popcnt: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
XMMRegister tmp = i.TempSimd128Register(0);
Register scratch = i.TempRegister(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
__ vpandn(kScratchDoubleReg, tmp, src);
__ vpand(dst, tmp, src);
__ vmovdqa(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
scratch));
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
__ vpshufb(dst, tmp, dst);
__ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
__ vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
// Pre-Goldmont low-power Intel microarchitectures have very slow
// PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
// algorithm on these processors. ATOM CPU feature captures exactly
// the right set of processors.
__ xorps(tmp, tmp);
__ pavgb(tmp, src);
if (dst != src) {
__ movaps(dst, src);
}
__ andps(tmp, __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x55(),
scratch));
__ psubb(dst, tmp);
Operand splat_0x33 = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
__ movaps(tmp, dst);
__ andps(dst, splat_0x33);
__ psrlw(tmp, 2);
__ andps(tmp, splat_0x33);
__ paddb(dst, tmp);
__ movaps(tmp, dst);
__ psrlw(dst, 4);
__ paddb(dst, tmp);
__ andps(dst, __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
} else {
CpuFeatureScope sse_scope(tasm(), SSSE3);
__ movaps(tmp,
__ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
scratch));
Operand mask = __ ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
if (kScratchDoubleReg != tmp) {
__ movaps(kScratchDoubleReg, tmp);
}
__ andps(tmp, src);
__ andnps(kScratchDoubleReg, src);
__ psrlw(kScratchDoubleReg, 4);
__ movaps(dst, mask);
__ pshufb(dst, tmp);
__ movaps(tmp, mask);
__ pshufb(tmp, kScratchDoubleReg);
__ paddb(dst, tmp);
}
__ I8x16Popcnt(i.OutputSimd128Register(), i.InputSimd128Register(0),
kScratchDoubleReg, i.TempSimd128Register(0),
i.TempRegister(1));
break;
}
case kIA32S128Const: {
......
......@@ -2876,7 +2876,11 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
// Lowers wasm i8x16.popcnt in Liftoff: per-byte population count of |src|
// into |dst|, delegating to the shared TurboAssembler::I8x16Popcnt helper.
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
                                         LiftoffRegister src) {
  // Note: the former `bailout(kSimd, "i8x16.popcnt")` is removed -- leaving
  // it in would mark the op as unsupported and then emit code anyway.
  // A GP scratch register is needed to materialize the external-reference
  // mask operands inside I8x16Popcnt.
  Register scratch = GetUnusedRegister(RegClass::kGpReg, {}).gp();
  // The helper needs a second temporary XMM register whose lifetime overlaps
  // with dst and src, so pin both when picking it.
  XMMRegister tmp =
      GetUnusedRegister(RegClass::kFpReg, LiftoffRegList::ForRegs(dst, src))
          .fp();
  I8x16Popcnt(dst.fp(), src.fp(), liftoff::kScratchDoubleReg, tmp, scratch);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.