Commit dd90d107 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][ia32] Implement i8x16.popcnt

Code sequence from https://github.com/WebAssembly/simd/pull/379, and
exactly the same as x64, with minor tweaks for
ExternalReferenceAsOperand.

Bug: v8:11002
Change-Id: Icbfdac62b21c2734ad4886b3d48f34e29f7a8222
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2664860
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72495}
parent 809b6481
......@@ -1492,6 +1492,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// Emits VBROADCASTSS dst, src for a memory source operand.
// Encoded through vinstr as opcode 0x18 with the 66 prefix in the 0F38
// opcode map and W0. xmm0 is passed in vinstr's second register slot
// (presumably because the instruction has no second source register --
// confirm against vinstr).
void vbroadcastss(XMMRegister dst, Operand src) {
vinstr(0x18, dst, xmm0, src, k66, k0F38, kW0);
}
// Emits the load form of VMOVDQA: dst <- src.
// Encoded through vinstr as opcode 0x6F with the 66 prefix in the 0F opcode
// map, WIG. Differs from vmovdqu only in the prefix (66 here, F3 there).
// NOTE(review): the aligned move faults on a memory operand that is not
// 16-byte aligned, so callers must pass suitably aligned addresses.
void vmovdqa(XMMRegister dst, Operand src) {
vinstr(0x6F, dst, xmm0, src, k66, k0F, kWIG);
}
// Emits the load form of VMOVDQU: dst <- src.
// Encoded through vinstr as opcode 0x6F with the F3 prefix in the 0F opcode
// map, WIG. xmm0 is passed in vinstr's second register slot (presumably
// unused by this instruction -- confirm against vinstr).
void vmovdqu(XMMRegister dst, Operand src) {
vinstr(0x6F, dst, xmm0, src, kF3, k0F, kWIG);
}
......
......@@ -3677,6 +3677,77 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
break;
}
case kIA32I8x16Popcnt: {
  // i8x16.popcnt: writes to each byte lane of dst the number of set bits in
  // the corresponding byte of src. One of three instruction sequences is
  // emitted depending on the CPU features (code sequences follow
  // https://github.com/WebAssembly/simd/pull/379).
  XMMRegister dst = i.OutputSimd128Register();
  XMMRegister src = i.InputSimd128Register(0);
  // Temp indices mirror the temps array built by the instruction selector:
  // index 0 is the SIMD temp, index 1 is the general-purpose scratch.
  XMMRegister tmp = i.TempSimd128Register(0);
  Register scratch = i.TempRegister(1);
  if (CpuFeatures::IsSupported(AVX)) {
    // Nibble-lookup algorithm using non-destructive AVX forms.
    CpuFeatureScope avx_scope(tasm(), AVX);
    // tmp = nibble mask (the external reference is named splat_0x0f).
    __ vmovdqa(tmp,
               __ ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                   scratch));
    // kScratchDoubleReg = ~tmp & src (high nibbles, still in place).
    __ vpandn(kScratchDoubleReg, tmp, src);
    // dst = tmp & src (low nibbles).
    __ vpand(dst, tmp, src);
    // tmp is reused for the lookup table consumed by vpshufb below
    // (presumably the bit count of each 4-bit index -- see the definition
    // of wasm_i8x16_popcnt_mask).
    __ vmovdqa(tmp,
               __ ExternalReferenceAsOperand(
                   ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
                   scratch));
    // Move the high nibbles down into index position. The shift works on
    // 16-bit lanes; the low nibbles were already cleared by vpandn, so
    // nothing spills into a neighbouring byte's index bits.
    __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
    // Look up both nibbles in the table and sum the two partial counts.
    __ vpshufb(dst, tmp, dst);
    __ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
    __ vpaddb(dst, dst, kScratchDoubleReg);
  } else if (CpuFeatures::IsSupported(ATOM)) {
    // Pre-Goldmont low-power Intel microarchitectures have very slow
    // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
    // algorithm on these processors. ATOM CPU feature captures exactly
    // the right set of processors.
    //
    // Step 1: tmp = (src >> 1) & 0x55; dst = src - tmp, which leaves the
    // bit count of every 2-bit field in that field.
    // The shift must be a plain logical shift. PAVGB against zero is not
    // equivalent: it rounds up and yields (src + 1) >> 1, which corrupts
    // the count for every odd byte (e.g. 0x01 would count as 0). PSRLW
    // shifts 16-bit lanes, but the bit that crosses a byte boundary lands
    // in bit 7, which the 0x55 mask clears.
    __ movaps(tmp, src);
    __ psrlw(tmp, 1);
    if (dst != src) {
      __ movaps(dst, src);
    }
    __ andps(tmp, __ ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_splat_0x55(),
                      scratch));
    __ psubb(dst, tmp);
    // Step 2: add adjacent 2-bit counts into 4-bit fields:
    // dst = (dst & 0x33) + ((dst >> 2) & 0x33).
    // NOTE(review): splat_0x33 may be addressed through scratch, so scratch
    // must not be reused until the last use of this operand -- confirm
    // against ExternalReferenceAsOperand.
    Operand splat_0x33 = __ ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
    __ movaps(tmp, dst);
    __ andps(dst, splat_0x33);
    __ psrlw(tmp, 2);
    __ andps(tmp, splat_0x33);
    __ paddb(dst, tmp);
    // Step 3: add the two 4-bit counts of each byte and keep the low nibble:
    // dst = (dst + (dst >> 4)) & 0x0f. Bits shifted in from the neighbouring
    // byte only reach the high nibble, which the final mask discards.
    __ movaps(tmp, dst);
    __ psrlw(dst, 4);
    __ paddb(dst, tmp);
    __ andps(dst, __ ExternalReferenceAsOperand(
                      ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                      scratch));
  } else {
    // Same nibble-lookup algorithm as the AVX path, written with destructive
    // two-operand SSE instructions; PSHUFB needs SSSE3.
    CpuFeatureScope sse_scope(tasm(), SSSE3);
    // tmp = nibble mask.
    __ movaps(tmp,
              __ ExternalReferenceAsOperand(
                  ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                  scratch));
    // NOTE(review): mask may be addressed through scratch; scratch is not
    // written again in this branch, so the operand stays usable for both
    // table loads below -- confirm against ExternalReferenceAsOperand.
    Operand mask = __ ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
    if (kScratchDoubleReg != tmp) {
      __ movaps(kScratchDoubleReg, tmp);
    }
    // tmp = low nibbles of src; kScratchDoubleReg = high nibbles moved down.
    __ andps(tmp, src);
    __ andnps(kScratchDoubleReg, src);
    __ psrlw(kScratchDoubleReg, 4);
    // PSHUFB overwrites its first operand with the looked-up bytes, so the
    // table is reloaded for each lookup. src is last read above, before dst
    // is first written.
    __ movaps(dst, mask);
    __ pshufb(dst, tmp);
    __ movaps(tmp, mask);
    __ pshufb(tmp, kScratchDoubleReg);
    __ paddb(dst, tmp);
  }
  break;
}
case kIA32S128Const: {
XMMRegister dst = i.OutputSimd128Register();
Register tmp = i.TempRegister(0);
......
......@@ -342,6 +342,7 @@ namespace compiler {
V(IA32I8x16Abs) \
V(IA32I8x16BitMask) \
V(IA32I8x16SignSelect) \
V(IA32I8x16Popcnt) \
V(IA32S128Const) \
V(IA32S128Zero) \
V(IA32S128AllOnes) \
......
......@@ -324,6 +324,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I8x16Abs:
case kIA32I8x16BitMask:
case kIA32I8x16SignSelect:
case kIA32I8x16Popcnt:
case kIA32S128Const:
case kIA32S128Zero:
case kIA32S128AllOnes:
......
......@@ -3131,6 +3131,16 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
VisitRRSimd(this, node, kIA32I16x8ExtAddPairwiseI8x16U);
}
// Selects kIA32I8x16Popcnt for an i8x16.popcnt node.
// The instruction takes one SIMD input and produces one SIMD output, plus
// two temporaries consumed by the code generator.
void InstructionSelector::VisitI8x16Popcnt(Node* node) {
  IA32OperandGenerator g(this);
  // The output constraint is the same with and without AVX, so no CPU
  // feature check is needed here.
  InstructionOperand dst = g.DefineAsRegister(node);
  // Order matters: the code generator reads temp 0 as a SIMD register and
  // temp 1 as a general-purpose register.
  InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
  // The input is requested as a unique register (presumably so it cannot
  // alias the output or the temps, which are written before the input's
  // last read -- confirm against UseUniqueRegister).
  Emit(kIA32I8x16Popcnt, dst, g.UseUniqueRegister(node->InputAt(0)),
       arraysize(temps), temps);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {
......
......@@ -2759,11 +2759,6 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_X64 || !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// TODO(v8:11002) Prototype i8x16.popcnt.
// Stub compiled only under the enclosing #if (target is none of ARM64, ARM,
// X64): visiting an i8x16.popcnt node hits UNIMPLEMENTED().
void InstructionSelector::VisitI8x16Popcnt(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 && \
!V8_TARGET_ARCH_IA32
// TODO(v8:11086) Prototype extended pairwise add.
......
......@@ -1315,6 +1315,10 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer("vmovd %s,", NameOfXMMRegister(regop));
current += PrintRightOperand(current);
break;
case 0x6f:
  // vmovdqa xmm, xmm/m128: the r/m operand is an XMM register or memory, so
  // decode it with the XMM-aware printer. A register-direct form would
  // otherwise be shown with a general-purpose register name; memory
  // operands print identically either way.
  AppendToBuffer("vmovdqa %s,", NameOfXMMRegister(regop));
  current += PrintRightXMMOperand(current);
  break;
case 0x70:
AppendToBuffer("vpshufd %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
......
......@@ -807,6 +807,7 @@ TEST(DisasmIa320) {
__ vmovddup(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovshdup(xmm1, xmm2);
__ vbroadcastss(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovdqa(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vmovd(xmm0, edi);
......
......@@ -2728,8 +2728,6 @@ WASM_SIMD_TEST(I8x16Abs) {
RunI8x16UnOpTest(execution_tier, lower_simd, kExprI8x16Abs, Abs);
}
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
// TODO(v8:11002) Prototype i8x16.popcnt.
WASM_SIMD_TEST_NO_LOWERING(I8x16Popcnt) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
......@@ -2751,7 +2749,6 @@ WASM_SIMD_TEST_NO_LOWERING(I8x16Popcnt) {
}
}
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
// Tests both signed and unsigned conversion from I16x8 (packing).
WASM_SIMD_TEST(I8x16ConvertI16x8) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment