Commit 173d6608 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Optimize i8x16.popcnt with aligned moves

movups is slower than movaps on older hardware (e.g. Core 2), even
when the operand is aligned. This is not an issue on modern hardware.

Also move i8x16.splat(0x0F) to an external reference so we can load the
mask directly.
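
For illustration, the idea in plain SSE2 intrinsics (standalone sketch,
not V8 code; names are made up): a 16-byte-aligned constant in static
storage can be fetched with one aligned load instead of being
materialized with a splat sequence.

```cpp
#include <emmintrin.h>  // SSE2
#include <cstdint>

// alignas(16) makes the aligned-load forms (movaps/movdqa) legal here.
alignas(16) static const uint8_t kSplat0x0F[16] = {
    0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
    0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F};

static __m128i LoadSplat0x0F() {
  // Compiles to a single aligned 128-bit load.
  return _mm_load_si128(reinterpret_cast<const __m128i*>(kSplat0x0F));
}
```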

Bug: v8:11002
Change-Id: I0b01c27a142024d50b9faaa9e7bd6a1fe169e141
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2643242
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72336}
parent a6eefe14
@@ -81,6 +81,12 @@ constexpr struct alignas(16) {
 } wasm_i8x16_popcnt_mask = {uint64_t{0x03020201'02010100},
                             uint64_t{0x04030302'03020201}};
+constexpr struct alignas(16) {
+  uint64_t a;
+  uint64_t b;
+} wasm_i8x16_splat_0x0f = {uint64_t{0x0F0F0F0F'0F0F0F0F},
+                           uint64_t{0x0F0F0F0F'0F0F0F0F}};
 constexpr struct alignas(16) {
   uint64_t a;
   uint64_t b;
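
As an aside, the byte layout of wasm_i8x16_popcnt_mask above is exactly
popcount(i) for nibble values i = 0..15, stored little-endian. A quick
standalone check (illustrative test code, not part of this CL):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t mask[2] = {0x03020201'02010100, 0x04030302'03020201};
  for (int i = 0; i < 16; ++i) {
    // Byte i of the 16-byte table should hold the bit count of i.
    int byte = (mask[i / 8] >> (8 * (i % 8))) & 0xFF;
    assert(byte == __builtin_popcount(i));  // GCC/Clang builtin
  }
}
```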
@@ -556,6 +562,10 @@ ExternalReference ExternalReference::address_of_wasm_i8x16_popcnt_mask() {
   return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_popcnt_mask));
 }
+ExternalReference ExternalReference::address_of_wasm_i8x16_splat_0x0f() {
+  return ExternalReference(reinterpret_cast<Address>(&wasm_i8x16_splat_0x0f));
+}
 ExternalReference
 ExternalReference::address_of_wasm_f64x2_convert_low_i32x4_u_int_mask() {
   return ExternalReference(
......
@@ -116,6 +116,7 @@ class StatsCounter;
   V(address_of_the_hole_nan, "the_hole_nan")                               \
   V(address_of_uint32_bias, "uint32_bias")                                 \
   V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask")           \
+  V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f")             \
   V(bytecode_size_table_address, "Bytecodes::bytecode_size_table_address") \
   V(check_object_type, "check_object_type")                                \
   V(compute_integer_hash, "ComputeSeededHash")                             \
......
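
The list above is a standard X-macro table: each V(name, "description")
row is expanded elsewhere into an accessor declaration and an entry in a
name table. A generic sketch of the pattern (illustrative only; V8's
actual expansion macros differ):

```cpp
#define REFERENCE_LIST(V)                                        \
  V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
  V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f")

// One expansion produces accessor declarations...
#define DECLARE(name, desc) static ExternalReference name();
// ...another produces human-readable names for tooling/tracing.
#define NAME_ENTRY(name, desc) desc,
// Usage: REFERENCE_LIST(DECLARE) inside a class body;
//        const char* names[] = {REFERENCE_LIST(NAME_ENTRY)};
```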
@@ -2929,6 +2929,15 @@ void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   }
 }
+void Assembler::movaps(XMMRegister dst, Operand src) {
+  DCHECK(!IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x28);
+  emit_sse_operand(dst, src);
+}
 void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) {
   DCHECK(is_uint8(imm8));
   EnsureSpace ensure_space(this);
@@ -3513,6 +3522,14 @@ void Assembler::vmovq(Register dst, XMMRegister src) {
   emit_sse_operand(src, dst);
 }
+void Assembler::vmovdqa(XMMRegister dst, Operand src) {
+  DCHECK(IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F, kWIG);
+  emit(0x6F);
+  emit_sse_operand(dst, src);
+}
 void Assembler::vmovdqa(XMMRegister dst, XMMRegister src) {
   DCHECK(IsEnabled(AVX));
   EnsureSpace ensure_space(this);
......
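
For reference, the two emitters above produce the standard aligned-load
encodings (opcode summaries per the Intel SDM; informal notes, not
generated output):

```cpp
// movaps xmm, xmm/m128   =>  0F 28 /r
//   non-VEX; the memory form requires 16-byte alignment, else #GP.
// vmovdqa xmm, xmm/m128  =>  VEX.128.66.0F.WIG 6F /r
//   AVX; the memory form likewise faults on an unaligned operand.
```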
@@ -921,6 +921,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void ucomiss(XMMRegister dst, XMMRegister src);
   void ucomiss(XMMRegister dst, Operand src);
   void movaps(XMMRegister dst, XMMRegister src);
+  void movaps(XMMRegister dst, Operand src);
   // Don't use this unless it's important to keep the
   // top half of the destination register unchanged.
@@ -1331,6 +1332,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   }
   void vmovsd(XMMRegister dst, Operand src) { vsd(0x10, dst, xmm0, src); }
   void vmovsd(Operand dst, XMMRegister src) { vsd(0x11, src, xmm0, dst); }
+  void vmovdqa(XMMRegister dst, Operand src);
   void vmovdqa(XMMRegister dst, XMMRegister src);
   void vmovdqu(XMMRegister dst, Operand src);
   void vmovdqu(Operand dst, XMMRegister src);
@@ -1514,6 +1516,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   }
   void vmovaps(XMMRegister dst, XMMRegister src) { vps(0x28, dst, xmm0, src); }
+  void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); }
   void vmovups(XMMRegister dst, XMMRegister src) { vps(0x10, dst, xmm0, src); }
   void vmovups(XMMRegister dst, Operand src) { vps(0x10, dst, xmm0, src); }
   void vmovups(Operand dst, XMMRegister src) { vps(0x11, src, xmm0, dst); }
......
@@ -3928,35 +3928,33 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src = i.InputSimd128Register(0);
       XMMRegister tmp = i.TempSimd128Register(0);
-      // tmp = wasm_i8x16_splat(0x0F)
-      __ Move(tmp, uint32_t{0x0F0F0F0F});
       if (CpuFeatures::IsSupported(AVX)) {
         CpuFeatureScope avx_scope(tasm(), AVX);
-        if (CpuFeatures::IsSupported(AVX2)) {
-          CpuFeatureScope avx2_scope(tasm(), AVX2);
-          __ vpbroadcastd(tmp, tmp);
-        } else {
-          __ vpshufd(tmp, tmp, 0);
-        }
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
         __ vpandn(kScratchDoubleReg, tmp, src);
         __ vpand(dst, tmp, src);
-        __ Move(tmp, 0x04030302'03020201, 0x03020201'02010100);
+        __ vmovdqa(tmp,
+                   __ ExternalReferenceAsOperand(
+                       ExternalReference::address_of_wasm_i8x16_popcnt_mask()));
         __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 4);
         __ vpshufb(dst, tmp, dst);
         __ vpshufb(kScratchDoubleReg, tmp, kScratchDoubleReg);
         __ vpaddb(dst, dst, kScratchDoubleReg);
       } else {
+        __ movaps(tmp,
+                  __ ExternalReferenceAsOperand(
+                      ExternalReference::address_of_wasm_i8x16_splat_0x0f()));
         Operand mask = __ ExternalReferenceAsOperand(
             ExternalReference::address_of_wasm_i8x16_popcnt_mask());
-        __ shufps(tmp, tmp, 0);
         __ Move(kScratchDoubleReg, tmp);
         __ andps(tmp, src);
         __ andnps(kScratchDoubleReg, src);
         __ psrlw(kScratchDoubleReg, 4);
-        __ movups(dst, mask);
+        __ movaps(dst, mask);
         __ pshufb(dst, tmp);
-        __ movups(tmp, mask);
+        __ movaps(tmp, mask);
         __ pshufb(tmp, kScratchDoubleReg);
         __ paddb(dst, tmp);
       }
......
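
The sequence above is the classic pshufb nibble-lookup popcount: mask
out the low and high nibbles of each byte, use each nibble as an index
into a 16-entry bit-count table, and add the two lookups. A minimal
standalone sketch with SSSE3 intrinsics (illustration only, not V8
code):

```cpp
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8

__m128i I8x16Popcnt(__m128i v) {
  const __m128i low_mask = _mm_set1_epi8(0x0F);
  // Byte i of the table holds popcount(i), for i in 0..15.
  const __m128i table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4);
  __m128i lo = _mm_and_si128(v, low_mask);                     // low nibbles
  __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), low_mask);  // high nibbles
  // pshufb picks table[nibble] independently for each byte lane.
  return _mm_add_epi8(_mm_shuffle_epi8(table, lo),
                      _mm_shuffle_epi8(table, hi));
}
```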
@@ -396,6 +396,7 @@ TEST(DisasmX64) {
   __ cvttps2dq(xmm0, xmm1);
   __ cvttps2dq(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ movaps(xmm0, xmm1);
+  __ movaps(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ movdqa(xmm0, Operand(rsp, 12));
   __ movdqa(Operand(rsp, 12), xmm0);
   __ movdqu(xmm0, Operand(rsp, 12));
@@ -660,6 +661,7 @@ TEST(DisasmX64) {
   __ vmovsd(Operand(rbx, rcx, times_4, 10000), xmm0);
   __ vmovdqa(xmm4, xmm5);
+  __ vmovdqa(xmm4, Operand(rbx, rcx, times_4, 10000));
   __ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
@@ -692,6 +694,7 @@ TEST(DisasmX64) {
   __ vcvtsd2si(rdi, xmm9);
   __ vmovaps(xmm10, xmm11);
+  __ vmovaps(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ vmovapd(xmm7, xmm0);
   __ vmovupd(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ vmovupd(Operand(rbx, rcx, times_4, 10000), xmm0);
......