Commit f4f7f618 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][ia32][x64] Optimize Pshufb calls for AVX

Add a new macro-assembler function Pshufb that does not require dst and
src to be equal, since vpshufb can take 3 operands. On SSE, if dst and
src are not equal, emit a movapd(dst, src). This saves a move on AVX.
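
For illustration only (not part of the change), a minimal standalone sketch
of the dispatch this introduces; XMMRegister, the AVX probe, and the emitter
calls are stubbed here rather than being V8's real TurboAssembler types:

  #include <cassert>

  struct XMMRegister { int code; };
  inline bool operator==(XMMRegister a, XMMRegister b) { return a.code == b.code; }
  inline bool operator!=(XMMRegister a, XMMRegister b) { return !(a == b); }

  // Stubs standing in for the CPU-feature probe and instruction emitters.
  bool HasAVX();
  void vpshufb(XMMRegister dst, XMMRegister src, XMMRegister mask);
  void pshufb(XMMRegister dst, XMMRegister mask);
  void movapd(XMMRegister dst, XMMRegister src);

  // Three-operand Pshufb: AVX's vpshufb is non-destructive, so dst, src and
  // mask may all differ; SSE's pshufb overwrites its first operand, so src
  // is copied into dst first when they are not the same register.
  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
    if (HasAVX()) {
      vpshufb(dst, src, mask);  // single instruction, no extra move
      return;
    }
    assert(dst != mask);        // pshufb(dst, mask) would clobber the mask
    if (dst != src) movapd(dst, src);
    pshufb(dst, mask);
  }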

Small cleanup to use kScratchDoubleReg2 instead of requesting a tmp
register (x64).

Bug: v8:9561
Change-Id: I131ad0456b272da857350762582cac1fb240ae40
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2513868
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70917}
parent 9a49b229
@@ -1545,15 +1545,20 @@ void TurboAssembler::Psignd(XMMRegister dst, Operand src) {
   FATAL("no AVX or SSE3 support");
 }
 
-void TurboAssembler::Pshufb(XMMRegister dst, Operand src) {
+void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(this, AVX);
-    vpshufb(dst, dst, src);
+    vpshufb(dst, src, mask);
     return;
   }
   if (CpuFeatures::IsSupported(SSSE3)) {
+    // Make sure these are different so that we won't overwrite mask.
+    DCHECK(!mask.is_reg(dst));
     CpuFeatureScope sse_scope(this, SSSE3);
-    pshufb(dst, src);
+    if (dst != src) {
+      movapd(dst, src);
+    }
+    pshufb(dst, mask);
     return;
   }
   FATAL("no AVX or SSE3 support");
...
@@ -480,8 +480,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
 #undef AVX_OP3_XO_SSE4
 #undef AVX_OP3_WITH_TYPE_SCOPE
 
-  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, Operand(src)); }
-  void Pshufb(XMMRegister dst, Operand src);
+  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, dst, src); }
+  void Pshufb(XMMRegister dst, Operand src) { Pshufb(dst, dst, src); }
+  // Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
+  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
+    Pshufb(dst, src, Operand(mask));
+  }
+  void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
 
   void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
     Pblendw(dst, Operand(src), imm8);
   }
...
@@ -1936,6 +1936,22 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
   }
 }
 
+void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
+                            XMMRegister mask) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vpshufb(dst, src, mask);
+  } else {
+    // Make sure these are different so that we won't overwrite mask.
+    DCHECK_NE(dst, mask);
+    if (dst != src) {
+      movapd(dst, src);
+    }
+    CpuFeatureScope sse_scope(this, SSSE3);
+    pshufb(dst, mask);
+  }
+}
+
 void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(this, AVX);
...
@@ -549,6 +549,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister mask);
 
+  // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
+  void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
+
   void CompareRoot(Register with, RootIndex index);
   void CompareRoot(Operand with, RootIndex index);
...
@@ -2705,16 +2705,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     for (int i = 3; i >= 0; i--) {
       push_imm32(imms[i]);
     }
-    if (CpuFeatures::IsSupported(AVX)) {
-      CpuFeatureScope scope(this, AVX);
-      vpshufb(dst.fp(), lhs.fp(), Operand(esp, 0));
-    } else {
-      if (dst != lhs) {
-        movups(dst.fp(), lhs.fp());
-      }
-      CpuFeatureScope sse_scope(this, SSSE3);
-      pshufb(dst.fp(), Operand(esp, 0));
-    }
+    Pshufb(dst.fp(), lhs.fp(), Operand(esp, 0));
     mov(esp, tmp.gp());
     return;
   }
@@ -2729,7 +2720,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     }
     push(Immediate(mask));
   }
-  Pshufb(liftoff::kScratchDoubleReg, Operand(esp, 0));
+  Pshufb(liftoff::kScratchDoubleReg, lhs.fp(), Operand(esp, 0));
 
   for (int i = 3; i >= 0; i--) {
     uint32_t mask = 0;
@@ -2740,10 +2731,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     }
     push(Immediate(mask));
   }
-  if (dst.fp() != rhs.fp()) {
-    movups(dst.fp(), rhs.fp());
-  }
-  Pshufb(dst.fp(), Operand(esp, 0));
+  Pshufb(dst.fp(), rhs.fp(), Operand(esp, 0));
   Por(dst.fp(), liftoff::kScratchDoubleReg);
   mov(esp, tmp.gp());
 }
@@ -2757,10 +2745,7 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   TurboAssembler::Move(mask, uint32_t{0x70707070});
   Pshufd(mask, mask, uint8_t{0x0});
   Paddusb(mask, rhs.fp());
-  if (lhs != dst) {
-    Movaps(dst.fp(), lhs.fp());
-  }
-  Pshufb(dst.fp(), mask);
+  Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
...
@@ -2329,23 +2329,10 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     wasm::SimdShuffle::Pack16Lanes(imms, shuffle);
     TurboAssembler::Move(kScratchDoubleReg, make_uint64(imms[3], imms[2]),
                          make_uint64(imms[1], imms[0]));
-    if (CpuFeatures::IsSupported(AVX)) {
-      CpuFeatureScope scope(this, AVX);
-      vpshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
-    } else {
-      if (dst != lhs) {
-        movups(dst.fp(), lhs.fp());
-      }
-      CpuFeatureScope sse_scope(this, SSSE3);
-      pshufb(dst.fp(), kScratchDoubleReg);
-    }
+    Pshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
     return;
   }
 
-  LiftoffRegister tmp_simd =
-      GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs, rhs));
-  Movups(kScratchDoubleReg, lhs.fp());
-
   uint64_t mask1[2] = {};
   for (int i = 15; i >= 0; i--) {
     uint8_t lane = shuffle[i];
@@ -2353,10 +2340,8 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     mask1[j] <<= 8;
     mask1[j] |= lane < kSimd128Size ? lane : 0x80;
   }
-  TurboAssembler::Move(tmp_simd.fp(), mask1[0]);
-  movq(kScratchRegister, mask1[1]);
-  Pinsrq(tmp_simd.fp(), kScratchRegister, uint8_t{1});
-  Pshufb(kScratchDoubleReg, tmp_simd.fp());
+  TurboAssembler::Move(liftoff::kScratchDoubleReg2, mask1[1], mask1[0]);
+  Pshufb(kScratchDoubleReg, lhs.fp(), liftoff::kScratchDoubleReg2);
 
   uint64_t mask2[2] = {};
   for (int i = 15; i >= 0; i--) {
@@ -2365,14 +2350,9 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
     mask2[j] <<= 8;
     mask2[j] |= lane >= kSimd128Size ? (lane & 0x0F) : 0x80;
   }
-  TurboAssembler::Move(tmp_simd.fp(), mask2[0]);
-  movq(kScratchRegister, mask2[1]);
-  Pinsrq(tmp_simd.fp(), kScratchRegister, uint8_t{1});
-
-  if (dst.fp() != rhs.fp()) {
-    Movups(dst.fp(), rhs.fp());
-  }
-  Pshufb(dst.fp(), tmp_simd.fp());
+  TurboAssembler::Move(liftoff::kScratchDoubleReg2, mask2[1], mask2[0]);
+
+  Pshufb(dst.fp(), rhs.fp(), liftoff::kScratchDoubleReg2);
   Por(dst.fp(), kScratchDoubleReg);
 }
@@ -2385,10 +2365,7 @@ void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
   TurboAssembler::Move(mask, uint32_t{0x70707070});
   Pshufd(mask, mask, uint8_t{0x0});
   Paddusb(mask, rhs.fp());
-  if (lhs != dst) {
-    Movaps(dst.fp(), lhs.fp());
-  }
-  Pshufb(dst.fp(), mask);
+  Pshufb(dst.fp(), lhs.fp(), mask);
 }
 
 void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
...