Commit 71fc222f, authored by Marat Dukhan, committed by Commit Bot

[wasm-simd][x64] Specialize i8x16.popcnt for Atom with slow PSHUFB

i8x16.popcnt uses the PSHUFB instruction, which is slow on old Atom
processors. Add an extra i8x16.popcnt implementation for those processors,
using a HAKMEM-inspired divide-and-conquer algorithm.

R=zhin@chromium.org, gdeepti@chromium.org

Change-Id: I4e130428fea8c3cf3be1bd6da7308fc752b2132a
Bug: v8:11002
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2656858
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Marat Dukhan <maratek@google.com>
Cr-Commit-Position: refs/heads/master@{#72421}
parent 8e7347dc
......@@ -93,6 +93,18 @@ constexpr struct alignas(16) {
} wasm_i8x16_splat_0x0f = {uint64_t{0x0F0F0F0F'0F0F0F0F},
uint64_t{0x0F0F0F0F'0F0F0F0F}};
// 16-byte-aligned 128-bit constant whose every byte is 0x33 (binary
// 00110011), stored as two 64-bit halves.
constexpr struct alignas(16) {
  uint64_t a;
  uint64_t b;
} wasm_i8x16_splat_0x33 = {
    UINT64_C(0x3333333333333333),
    UINT64_C(0x3333333333333333),
};
// 16-byte-aligned 128-bit constant whose every byte is 0x55 (binary
// 01010101), stored as two 64-bit halves.
constexpr struct alignas(16) {
  uint64_t a;
  uint64_t b;
} wasm_i8x16_splat_0x55 = {
    UINT64_C(0x5555555555555555),
    UINT64_C(0x5555555555555555),
};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
......@@ -582,6 +594,14 @@ ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x0f() {
return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x0f));
}
// Returns an ExternalReference wrapping the address of the
// wasm_i8x16_splat_0x33 constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x33() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i8x16_splat_0x33);
  return ExternalReference(splat_address);
}
// Returns an ExternalReference wrapping the address of the
// wasm_i8x16_splat_0x55 constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x55() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i8x16_splat_0x55);
  return ExternalReference(splat_address);
}
// Returns an ExternalReference wrapping the address of the
// wasm_i16x8_splat_0x0001 constant.
ExternalReference ExternalReference::address_of_wasm_i16x8_splat_0x0001() {
  const Address splat_address =
      reinterpret_cast<Address>(&wasm_i16x8_splat_0x0001);
  return ExternalReference(splat_address);
}
......
......@@ -118,6 +118,8 @@ class StatsCounter;
V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01") \
V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f") \
V(address_of_wasm_i8x16_splat_0x33, "wasm_i8x16_splat_0x33") \
V(address_of_wasm_i8x16_splat_0x55, "wasm_i8x16_splat_0x55") \
V(address_of_wasm_i16x8_splat_0x0001, "wasm_i16x8_splat_0x0001") \
V(bytecode_size_table_address, "Bytecodes::bytecode_size_table_address") \
V(check_object_type, "check_object_type") \
......
......@@ -3941,6 +3941,33 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vpshufb(dst, tmp, dst);
__ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
__ vpaddb(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(ATOM)) {
  // Pre-Goldmont low-power Intel microarchitectures have very slow
  // PSHUFB instruction, thus use PSHUFB-free divide-and-conquer
  // algorithm on these processors. ATOM CPU feature captures exactly
  // the right set of processors.
  //
  // NOTE(review): this sequence requires tmp to be a register distinct
  // from both src and dst — confirm against the operand setup above.
  //
  // Step 1: per-byte counts of each 2-bit field:
  //   dst = src - ((src >> 1) & 0x55)
  // The shift must be a true logical shift. PAVGB against zero is not a
  // substitute: it computes (x + 1) >> 1, which is off by one for every
  // odd byte (e.g. x = 1 would yield a count of 0). PSRLW shifts 16-bit
  // lanes, so bit 0 of the upper byte leaks into bit 7 of the lower byte;
  // the 0x55 mask has bit 7 clear and discards it.
  __ movaps(tmp, src);
  __ psrlw(tmp, 1);
  if (dst != src) {
    __ movaps(dst, src);
  }
  __ andps(tmp,
           __ ExternalReferenceAsOperand(
               ExternalReference::address_of_wasm_i8x16_splat_0x55()));
  __ psubb(dst, tmp);
  // Step 2: add neighbouring 2-bit counts into 4-bit fields:
  //   dst = (dst & 0x33) + ((dst >> 2) & 0x33)
  // Bits shifted in across the byte boundary are removed by the 0x33 mask.
  Operand splat_0x33 = __ ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_splat_0x33());
  __ movaps(tmp, dst);
  __ andps(dst, splat_0x33);
  __ psrlw(tmp, 2);
  __ andps(tmp, splat_0x33);
  __ paddb(dst, tmp);
  // Step 3: add the two 4-bit counts of each byte:
  //   dst = (dst + (dst >> 4)) & 0x0f
  // The final mask drops both the stale high nibble and the bits that the
  // 16-bit shift pulled in from the adjacent byte.
  __ movaps(tmp, dst);
  __ psrlw(dst, 4);
  __ paddb(dst, tmp);
  __ andps(dst,
           __ ExternalReferenceAsOperand(
               ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
} else {
__ movaps(tmp,
__ ExternalReferenceAsOperand(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment