Commit f4f7f618 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][ia32][x64] Optimize Pshufb calls for AVX

Add a new macro-assembler function Pshufb that does not require dst
and src to be equal, since AVX vpshufb takes 3 operands. On SSE,
pshufb shuffles dst in place, so if dst and src are not equal, emit a
movapd(dst, src) first. This saves a move on AVX.
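
For reference, here is a minimal standalone sketch of the byte-shuffle
semantics that Pshufb wraps (plain SSSE3 intrinsics, not V8 code; the
file name and compile flags are assumptions for the demo). Each mask
byte selects a source byte by its low four bits, and a mask byte with
the high bit set zeroes that lane:

```cpp
// pshufb_demo.cc (hypothetical file name) -- build: g++ -mssse3 pshufb_demo.cc
#include <immintrin.h>

#include <cstdint>
#include <cstdio>

int main() {
  // Source lanes hold the byte values 0..15.
  alignas(16) uint8_t src_bytes[16];
  for (int i = 0; i < 16; i++) src_bytes[i] = static_cast<uint8_t>(i);
  const __m128i src =
      _mm_load_si128(reinterpret_cast<const __m128i*>(src_bytes));

  // This mask reverses the lanes. A mask byte with the high bit set
  // (e.g. 0x80) would zero that lane instead, which is the property the
  // shuffle/swizzle lowerings in this CL rely on.
  alignas(16) const uint8_t mask_bytes[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                              7,  6,  5,  4,  3,  2,  1, 0};
  const __m128i mask =
      _mm_load_si128(reinterpret_cast<const __m128i*>(mask_bytes));

  // _mm_shuffle_epi8 compiles to pshufb/vpshufb. SSE pshufb overwrites
  // its first register operand; AVX vpshufb writes a separate
  // destination, so no preparatory move is needed -- that is the move
  // the new three-operand Pshufb saves.
  const __m128i dst = _mm_shuffle_epi8(src, mask);

  alignas(16) uint8_t out[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), dst);
  for (int i = 0; i < 16; i++) printf("%d ", out[i]);
  printf("\n");  // prints: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
  return 0;
}
```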

Small cleanup to use kScratchDoubleReg2 instead of requesting
a temporary register (x64).
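
Concretely, a sketch reconstructed from the x64 Liftoff hunks below
(no new API here): the old code built each 128-bit shuffle mask
through a freshly allocated temp register, while the dedicated second
scratch register lets a single Move materialize it:

```cpp
// Before -- allocate tmp_simd, build the mask in two 64-bit halves:
//   TurboAssembler::Move(tmp_simd.fp(), mask1[0]);
//   movq(kScratchRegister, mask1[1]);
//   Pinsrq(tmp_simd.fp(), kScratchRegister, uint8_t{1});
// After -- one Move into the dedicated scratch register:
//   TurboAssembler::Move(liftoff::kScratchDoubleReg2, mask1[1], mask1[0]);
```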

Bug: v8:9561
Change-Id: I131ad0456b272da857350762582cac1fb240ae40
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2513868
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70917}
parent 9a49b229
@@ -1545,15 +1545,20 @@ void TurboAssembler::Psignd(XMMRegister dst, Operand src) {
   FATAL("no AVX or SSE3 support");
 }
 
-void TurboAssembler::Pshufb(XMMRegister dst, Operand src) {
+void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(this, AVX);
-    vpshufb(dst, dst, src);
+    vpshufb(dst, src, mask);
     return;
   }
   if (CpuFeatures::IsSupported(SSSE3)) {
+    // Make sure these are different so that we won't overwrite mask.
+    DCHECK(!mask.is_reg(dst));
     CpuFeatureScope sse_scope(this, SSSE3);
-    pshufb(dst, src);
+    if (dst != src) {
+      movapd(dst, src);
+    }
+    pshufb(dst, mask);
     return;
   }
   FATAL("no AVX or SSE3 support");
@@ -480,8 +480,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
 #undef AVX_OP3_XO_SSE4
 #undef AVX_OP3_WITH_TYPE_SCOPE
-  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, Operand(src)); }
-  void Pshufb(XMMRegister dst, Operand src);
+  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, dst, src); }
+  void Pshufb(XMMRegister dst, Operand src) { Pshufb(dst, dst, src); }
+  // Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
+  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
+    Pshufb(dst, src, Operand(mask));
+  }
+  void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
 
   void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
     Pblendw(dst, Operand(src), imm8);
   }
@@ -1936,6 +1936,22 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
   }
 }
 
+void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
+                            XMMRegister mask) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpshufb(dst, src, mask);
+  } else {
+    // Make sure these are different so that we won't overwrite mask.
+    DCHECK_NE(dst, mask);
+    if (dst != src) {
+      movapd(dst, src);
+    }
+    CpuFeatureScope sse_scope(this, SSSE3);
+    pshufb(dst, mask);
+  }
+}
+
 void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(this, AVX);
@@ -549,6 +549,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister mask);
 
+  // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
+  void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
+
   void CompareRoot(Register with, RootIndex index);
   void CompareRoot(Operand with, RootIndex index);
@@ -2705,16 +2705,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     for (int i = 3; i >= 0; i--) {
      push_imm32(imms[i]);
    }
-    if (CpuFeatures::IsSupported(AVX)) {
-      CpuFeatureScope scope(this, AVX);
-      vpshufb(dst.fp(), lhs.fp(), Operand(esp, 0));
-    } else {
-      if (dst != lhs) {
-        movups(dst.fp(), lhs.fp());
-      }
-      CpuFeatureScope sse_scope(this, SSSE3);
-      pshufb(dst.fp(), Operand(esp, 0));
-    }
+    Pshufb(dst.fp(), lhs.fp(), Operand(esp, 0));
     mov(esp, tmp.gp());
     return;
   }
@@ -2729,7 +2720,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     }
     push(Immediate(mask));
   }
-  Pshufb(liftoff::kScratchDoubleReg, Operand(esp, 0));
+  Pshufb(liftoff::kScratchDoubleReg, lhs.fp(), Operand(esp, 0));
 
   for (int i = 3; i >= 0; i--) {
     uint32_t mask = 0;
@@ -2740,10 +2731,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     }
     push(Immediate(mask));
   }
-  if (dst.fp() != rhs.fp()) {
-    movups(dst.fp(), rhs.fp());
-  }
-  Pshufb(dst.fp(), Operand(esp, 0));
+  Pshufb(dst.fp(), rhs.fp(), Operand(esp, 0));
   Por(dst.fp(), liftoff::kScratchDoubleReg);
   mov(esp, tmp.gp());
 }
@@ -2757,10 +2745,7 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   TurboAssembler::Move(mask, uint32_t{0x70707070});
   Pshufd(mask, mask, uint8_t{0x0});
   Paddusb(mask, rhs.fp());
-  if (lhs != dst) {
-    Movaps(dst.fp(), lhs.fp());
-  }
-  Pshufb(dst.fp(), mask);
+  Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
@@ -2329,23 +2329,10 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     wasm::SimdShuffle::Pack16Lanes(imms, shuffle);
     TurboAssembler::Move(kScratchDoubleReg, make_uint64(imms[3], imms[2]),
                          make_uint64(imms[1], imms[0]));
-    if (CpuFeatures::IsSupported(AVX)) {
-      CpuFeatureScope scope(this, AVX);
-      vpshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
-    } else {
-      if (dst != lhs) {
-        movups(dst.fp(), lhs.fp());
-      }
-      CpuFeatureScope sse_scope(this, SSSE3);
-      pshufb(dst.fp(), kScratchDoubleReg);
-    }
+    Pshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
     return;
   }
 
-  LiftoffRegister tmp_simd =
-      GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs, rhs));
-  Movups(kScratchDoubleReg, lhs.fp());
-
   uint64_t mask1[2] = {};
   for (int i = 15; i >= 0; i--) {
     uint8_t lane = shuffle[i];
@@ -2353,10 +2340,8 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     mask1[j] <<= 8;
     mask1[j] |= lane < kSimd128Size ? lane : 0x80;
   }
-  TurboAssembler::Move(tmp_simd.fp(), mask1[0]);
-  movq(kScratchRegister, mask1[1]);
-  Pinsrq(tmp_simd.fp(), kScratchRegister, uint8_t{1});
-  Pshufb(kScratchDoubleReg, tmp_simd.fp());
+  TurboAssembler::Move(liftoff::kScratchDoubleReg2, mask1[1], mask1[0]);
+  Pshufb(kScratchDoubleReg, lhs.fp(), liftoff::kScratchDoubleReg2);
 
   uint64_t mask2[2] = {};
   for (int i = 15; i >= 0; i--) {
@@ -2365,14 +2350,9 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     mask2[j] <<= 8;
     mask2[j] |= lane >= kSimd128Size ? (lane & 0x0F) : 0x80;
   }
-  TurboAssembler::Move(tmp_simd.fp(), mask2[0]);
-  movq(kScratchRegister, mask2[1]);
-  Pinsrq(tmp_simd.fp(), kScratchRegister, uint8_t{1});
+  TurboAssembler::Move(liftoff::kScratchDoubleReg2, mask2[1], mask2[0]);
 
-  if (dst.fp() != rhs.fp()) {
-    Movups(dst.fp(), rhs.fp());
-  }
-  Pshufb(dst.fp(), tmp_simd.fp());
+  Pshufb(dst.fp(), rhs.fp(), liftoff::kScratchDoubleReg2);
   Por(dst.fp(), kScratchDoubleReg);
 }
@@ -2385,10 +2365,7 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   TurboAssembler::Move(mask, uint32_t{0x70707070});
   Pshufd(mask, mask, uint8_t{0x0});
   Paddusb(mask, rhs.fp());
-  if (lhs != dst) {
-    Movaps(dst.fp(), lhs.fp());
-  }
-  Pshufb(dst.fp(), mask);
+  Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,