Commit 4b90ad75 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Move some macro-assembler functions to shared

These functions have the same signature for both the SSE and AVX
versions, so we move them all into SharedTurboAssembler. A couple of
call sites need fixing up, since we now use a template helper to call
the right function, whereas previously the wrappers were overloaded
and allowed implicit conversions from int to uint8_t.

Bug: v8:11589
Change-Id: I8b4146ba1fb838f6b0d6f78f6b95495b8988fc4c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2800569
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73794}
parent 562c4251
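
For readers unfamiliar with the pattern, below is a minimal, self-contained sketch of the template-helper dispatch that the AVX_OP macros in SharedTurboAssembler rely on, and of why the call sites touched here now need explicit uint8_t{} immediates. It is not V8 code: FakeAssembler, has_avx, and the pshufd/vpshufd stubs are invented stand-ins for illustration only.

// Minimal sketch, not V8 code: FakeAssembler, has_avx and the
// pshufd/vpshufd stubs are invented stand-ins for illustration.
#include <cstdint>
#include <iostream>

struct FakeAssembler {
  bool has_avx = false;
  // Stand-ins for the SSE and AVX encodings of an instruction.
  void pshufd(int dst, int src, std::uint8_t imm) {
    std::cout << "pshufd xmm" << dst << ", xmm" << src << ", " << int{imm} << "\n";
  }
  void vpshufd(int dst, int src, std::uint8_t imm) {
    std::cout << "vpshufd xmm" << dst << ", xmm" << src << ", " << int{imm} << "\n";
  }
};

// The helper: given pointers to the AVX and SSE member functions, emit
// whichever variant the CPU supports (compare AvxHelper::emit in the diff).
template <typename Dst, typename... Args>
struct AvxHelper {
  FakeAssembler* assm;
  template <void (FakeAssembler::*avx)(Dst, Args...),
            void (FakeAssembler::*sse)(Dst, Args...)>
  void emit(Dst dst, Args... args) {
    if (assm->has_avx) {
      (assm->*avx)(dst, args...);
    } else {
      (assm->*sse)(dst, args...);
    }
  }
};

struct SharedMacroAssembler : FakeAssembler {
  // Roughly what an AVX_OP(Pshufd, pshufd) expansion boils down to. Because
  // Args is deduced exactly from the call site, the immediate must already
  // be a uint8_t; an int argument no longer converts implicitly, hence the
  // uint8_t{...} casts added at the call sites in this CL.
  template <typename Dst, typename... Args>
  void Pshufd(Dst dst, Args... args) {
    AvxHelper<Dst, Args...>{this}
        .template emit<&FakeAssembler::vpshufd, &FakeAssembler::pshufd>(
            dst, args...);
  }
};

int main() {
  SharedMacroAssembler masm;
  masm.Pshufd(0, 1, std::uint8_t{0x44});  // SSE path: prints pshufd ...
  masm.has_avx = true;
  masm.Pshufd(0, 1, std::uint8_t{0x44});  // AVX path: prints vpshufd ...
}

Because emit's template parameters pin down the exact member-function signature, a call such as masm.Pshufd(0, 1, 0x44) no longer compiles; the immediate has to be spelled uint8_t{0x44}, which is the kind of call-site fixup this change makes in the code generator and in Liftoff.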
......@@ -301,93 +301,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
// may be bigger than 2^16 - 1. Requires a scratch register.
void Ret(int bytes_dropped, Register scratch);
// Instructions whose SSE and AVX take the same number and type of operands.
#define AVX_OP3_WITH_TYPE(macro_name, name, dst_type, src1_type, src2_type) \
void macro_name(dst_type dst, src1_type src1, src2_type src2) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, src1, src2); \
} else { \
name(dst, src1, src2); \
} \
}
AVX_OP3_WITH_TYPE(Pshufhw, pshufhw, XMMRegister, Operand, uint8_t)
AVX_OP3_WITH_TYPE(Pshufhw, pshufhw, XMMRegister, XMMRegister, uint8_t)
AVX_OP3_WITH_TYPE(Pshuflw, pshuflw, XMMRegister, Operand, uint8_t)
AVX_OP3_WITH_TYPE(Pshuflw, pshuflw, XMMRegister, XMMRegister, uint8_t)
AVX_OP3_WITH_TYPE(Pshufd, pshufd, XMMRegister, Operand, uint8_t)
AVX_OP3_WITH_TYPE(Pshufd, pshufd, XMMRegister, XMMRegister, uint8_t)
#undef AVX_OP3_WITH_TYPE
// Same as AVX_OP3_WITH_TYPE above but with SSE scope.
#define AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, dst_type, src1_type, \
src2_type, sse_scope) \
void macro_name(dst_type dst, src1_type src1, src2_type src2) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, src1, src2); \
} else { \
CpuFeatureScope scope(this, sse_scope); \
name(dst, src1, src2); \
} \
}
AVX_OP3_WITH_TYPE_SCOPE(Pextrb, pextrb, Operand, XMMRegister, uint8_t, SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Pextrb, pextrb, Register, XMMRegister, uint8_t,
SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Pextrw, pextrw, Operand, XMMRegister, uint8_t, SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Pextrw, pextrw, Register, XMMRegister, uint8_t,
SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Roundps, roundps, XMMRegister, XMMRegister,
RoundingMode, SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Roundpd, roundpd, XMMRegister, XMMRegister,
RoundingMode, SSE4_1)
#undef AVX_OP3_WITH_TYPE_SCOPE
// SSE/SSE2 instructions with AVX version.
#define AVX_OP2_WITH_TYPE(macro_name, name, dst_type, src_type) \
void macro_name(dst_type dst, src_type src) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, src); \
} else { \
name(dst, src); \
} \
}
AVX_OP2_WITH_TYPE(Movsd, movsd, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Movsd, movsd, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Rcpps, rcpps, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Rsqrtps, rsqrtps, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Movdqu, movdqu, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Movdqu, movdqu, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Movd, movd, XMMRegister, Register)
AVX_OP2_WITH_TYPE(Movd, movd, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Movd, movd, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movd, movd, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtdq2ps, cvtdq2ps, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Cvtdq2ps, cvtdq2ps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtdq2pd, cvtdq2pd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtps2pd, cvtps2pd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtpd2ps, cvtpd2ps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Cvttps2dq, cvttps2dq, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtps, sqrtps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtpd, sqrtpd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtpd, sqrtpd, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Movaps, movaps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Movups, movups, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Movups, movups, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Movups, movups, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Movapd, movapd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Movapd, movapd, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Movupd, movupd, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Pmovmskb, pmovmskb, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movmskpd, movmskpd, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movmskps, movmskps, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movlps, movlps, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Movhps, movhps, Operand, XMMRegister)
#undef AVX_OP2_WITH_TYPE
// Only use these macros when non-destructive source of AVX version is not
// needed.
#define AVX_OP3_WITH_TYPE(macro_name, name, dst_type, src_type) \
......@@ -630,6 +543,14 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
#undef AVX_OP3_XO_SSE4
#undef AVX_OP3_WITH_TYPE_SCOPE
// TODO(zhin): Remove after moving more definitions into SharedTurboAssembler.
void Movlps(Operand dst, XMMRegister src) {
SharedTurboAssembler::Movlps(dst, src);
}
void Movhps(Operand dst, XMMRegister src) {
SharedTurboAssembler::Movhps(dst, src);
}
void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, dst, src); }
void Pshufb(XMMRegister dst, Operand src) { Pshufb(dst, dst, src); }
// Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
......
......@@ -18,6 +18,17 @@
namespace v8 {
namespace internal {
void SharedTurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovapd(dst, src);
} else {
// On SSE, movaps is 1 byte shorter than movapd, and has the same
// behavior.
movaps(dst, src);
}
}
void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
uint8_t laneidx) {
if (laneidx == 0) {
......
......@@ -25,6 +25,20 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
public:
using TurboAssemblerBase::TurboAssemblerBase;
void Movapd(XMMRegister dst, XMMRegister src);
template <typename Dst, typename Src>
void Movdqu(Dst dst, Src src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqu(dst, src);
} else {
// movups is 1 byte shorter than movdqu. On most SSE systems, this incurs
// no delay moving between integer and floating-point domain.
movups(dst, src);
}
}
template <typename Dst, typename... Args>
struct AvxHelper {
Assembler* assm;
......@@ -98,11 +112,37 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
AVX_OP(Pmullw, pmullw)
AVX_OP(Cvtdq2pd, cvtdq2pd)
AVX_OP(Cvtdq2ps, cvtdq2ps)
AVX_OP(Cvtps2pd, cvtps2pd)
AVX_OP(Cvtpd2ps, cvtpd2ps)
AVX_OP(Cvttps2dq, cvttps2dq)
AVX_OP(Movaps, movaps)
AVX_OP(Movd, movd)
AVX_OP(Movhps, movhps)
AVX_OP(Movlps, movlps)
AVX_OP(Movmskpd, movmskpd)
AVX_OP(Movmskps, movmskps)
AVX_OP(Movss, movss)
AVX_OP(Movsd, movsd)
AVX_OP(Movupd, movupd)
AVX_OP(Movups, movups)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Pmullw, pmullw)
AVX_OP(Pshuflw, pshuflw)
AVX_OP(Pshufhw, pshufhw)
AVX_OP(Pshufd, pshufd)
AVX_OP(Rcpps, rcpps)
AVX_OP(Rsqrtps, rsqrtps)
AVX_OP(Sqrtps, sqrtps)
AVX_OP(Sqrtpd, sqrtpd)
AVX_OP_SSE4_1(Extractps, extractps)
AVX_OP_SSE4_1(Pextrb, pextrb)
AVX_OP_SSE4_1(Pextrw, pextrw)
AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
AVX_OP_SSE4_1(Roundps, roundps)
AVX_OP_SSE4_1(Roundpd, roundpd)
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
......
......@@ -739,35 +739,6 @@ void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
}
}
void TurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovapd(dst, src);
} else {
// On SSE, movaps is 1 byte shorter than movapd, and has the same behavior.
movaps(dst, src);
}
}
template <typename Dst, typename Src>
void TurboAssembler::Movdqu(Dst dst, Src src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqu(dst, src);
} else {
// movups is 1 byte shorter than movdqu. On most SSE systems, this incurs
// no delay moving between integer and floating-point domain.
movups(dst, src);
}
}
template void TurboAssembler::Movdqu<XMMRegister, Operand>(XMMRegister dst,
Operand src);
template void TurboAssembler::Movdqu<Operand, XMMRegister>(Operand dst,
XMMRegister src);
template void TurboAssembler::Movdqu<XMMRegister, XMMRegister>(XMMRegister dst,
XMMRegister src);
void TurboAssembler::Cvtss2sd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
......
......@@ -68,17 +68,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Orps, orps)
AVX_OP(Xorps, xorps)
AVX_OP(Xorpd, xorpd)
AVX_OP(Movd, movd)
AVX_OP(Movq, movq)
AVX_OP(Movaps, movaps)
AVX_OP(Movups, movups)
AVX_OP(Movmskps, movmskps)
AVX_OP(Movmskpd, movmskpd)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Movsd, movsd)
AVX_OP(Movhlps, movhlps)
AVX_OP(Movlps, movlps)
AVX_OP(Movhps, movhps)
AVX_OP(Pcmpeqb, pcmpeqb)
AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pcmpeqd, pcmpeqd)
......@@ -110,9 +101,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Cmpnlepd, cmpnlepd)
AVX_OP(Sqrtss, sqrtss)
AVX_OP(Sqrtsd, sqrtsd)
AVX_OP(Sqrtps, sqrtps)
AVX_OP(Sqrtpd, sqrtpd)
AVX_OP(Cvttps2dq, cvttps2dq)
AVX_OP(Cvttpd2dq, cvttpd2dq)
AVX_OP(Ucomiss, ucomiss)
AVX_OP(Ucomisd, ucomisd)
......@@ -155,18 +143,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Divpd, divpd)
AVX_OP(Maxps, maxps)
AVX_OP(Maxpd, maxpd)
AVX_OP(Cvtdq2ps, cvtdq2ps)
AVX_OP(Cvtdq2pd, cvtdq2pd)
AVX_OP(Cvtpd2ps, cvtpd2ps)
AVX_OP(Cvtps2pd, cvtps2pd)
AVX_OP(Rcpps, rcpps)
AVX_OP(Rsqrtps, rsqrtps)
AVX_OP(Addps, addps)
AVX_OP(Subps, subps)
AVX_OP(Mulps, mulps)
AVX_OP(Divps, divps)
AVX_OP(Pshuflw, pshuflw)
AVX_OP(Pshufhw, pshufhw)
AVX_OP(Packsswb, packsswb)
AVX_OP(Packuswb, packuswb)
AVX_OP(Packssdw, packssdw)
......@@ -178,7 +158,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Punpckhdq, punpckhdq)
AVX_OP(Punpcklqdq, punpcklqdq)
AVX_OP(Punpckhqdq, punpckhqdq)
AVX_OP(Pshufd, pshufd)
AVX_OP(Cmpps, cmpps)
AVX_OP(Cmppd, cmppd)
AVX_OP(Movlhps, movlhps)
......@@ -214,11 +193,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
AVX_OP_SSE4_1(Pextrb, pextrb)
AVX_OP_SSE4_1(Pextrw, pextrw)
AVX_OP_SSE4_1(Pextrq, pextrq)
AVX_OP_SSE4_1(Roundps, roundps)
AVX_OP_SSE4_1(Roundpd, roundpd)
AVX_OP_SSE4_1(Roundss, roundss)
AVX_OP_SSE4_1(Roundsd, roundsd)
AVX_OP_SSE4_2(Pcmpgtq, pcmpgtq)
......@@ -285,13 +260,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
Label* condition_met,
Label::Distance condition_met_distance = Label::kFar);
void Movapd(XMMRegister dst, XMMRegister src);
void Movdqa(XMMRegister dst, Operand src);
void Movdqa(XMMRegister dst, XMMRegister src);
template <typename Dst, typename Src>
void Movdqu(Dst dst, Src src);
void Cvtss2sd(XMMRegister dst, XMMRegister src);
void Cvtss2sd(XMMRegister dst, Operand src);
void Cvtsd2ss(XMMRegister dst, XMMRegister src);
......
......@@ -2143,7 +2143,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
__ Pinsrd(dst, i.InputRegister(0), 0);
__ Pinsrd(dst, i.InputOperand(1), 1);
__ Pshufd(dst, dst, 0x44);
__ Pshufd(dst, dst, uint8_t{0x44});
break;
}
case kIA32I64x2ReplaceLaneI32Pair: {
......@@ -2561,7 +2561,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32I32x4Splat: {
XMMRegister dst = i.OutputSimd128Register();
__ Movd(dst, i.InputOperand(0));
__ Pshufd(dst, dst, 0x0);
__ Pshufd(dst, dst, uint8_t{0x0});
break;
}
case kIA32I32x4ExtractLane: {
......@@ -2874,13 +2874,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32I16x8Splat: {
XMMRegister dst = i.OutputSimd128Register();
__ Movd(dst, i.InputOperand(0));
__ Pshuflw(dst, dst, 0x0);
__ Pshufd(dst, dst, 0x0);
__ Pshuflw(dst, dst, uint8_t{0x0});
__ Pshufd(dst, dst, uint8_t{0x0});
break;
}
case kIA32I16x8ExtractLaneS: {
Register dst = i.OutputRegister();
__ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
__ Pextrw(dst, i.InputSimd128Register(0), i.InputUint8(1));
__ movsx_w(dst, dst);
break;
}
......@@ -3192,7 +3192,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16ExtractLaneS: {
Register dst = i.OutputRegister();
__ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
__ Pextrb(dst, i.InputSimd128Register(0), i.InputUint8(1));
__ movsx_b(dst, dst);
break;
}
......@@ -3238,7 +3238,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputUint8(index + 1));
} else {
Register dst = i.OutputRegister();
__ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
__ Pextrb(dst, i.InputSimd128Register(0), i.InputUint8(1));
}
break;
}
......@@ -3250,7 +3250,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputUint8(index + 1));
} else {
Register dst = i.OutputRegister();
__ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
__ Pextrw(dst, i.InputSimd128Register(0), i.InputUint8(1));
}
break;
}
......@@ -3299,7 +3299,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, 0);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
} else {
// Take shift value modulo 8.
......@@ -3459,7 +3459,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, 0);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
} else {
// Unpack the bytes into words, do logical shifts, and repack.
......@@ -3730,12 +3730,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32S32x4Swizzle: {
DCHECK_EQ(2, instr->InputCount());
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputUint8(1));
break;
}
case kIA32S32x4Shuffle: {
DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above.
int8_t shuffle = i.InputInt8(2);
uint8_t shuffle = i.InputUint8(2);
DCHECK_NE(0xe4, shuffle); // A simple blend should be handled below.
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
......@@ -3747,16 +3747,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
case kIA32S16x8HalfShuffle1: {
XMMRegister dst = i.OutputSimd128Register();
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
__ Pshufhw(dst, dst, i.InputInt8(2));
__ Pshuflw(dst, i.InputOperand(0), i.InputUint8(1));
__ Pshufhw(dst, dst, i.InputUint8(2));
break;
}
case kIA32S16x8HalfShuffle2: {
XMMRegister dst = i.OutputSimd128Register();
__ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
__ Pshufhw(dst, dst, i.InputInt8(3));
__ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputUint8(2));
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputUint8(3));
__ Pshuflw(dst, i.InputOperand(0), i.InputUint8(2));
__ Pshufhw(dst, dst, i.InputUint8(3));
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
break;
}
......@@ -3766,22 +3766,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32S16x8Dup: {
XMMRegister dst = i.OutputSimd128Register();
Operand src = i.InputOperand(0);
int8_t lane = i.InputInt8(1) & 0x7;
int8_t lane4 = lane & 0x3;
int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
uint8_t lane = i.InputUint8(1) & 0x7;
uint8_t lane4 = lane & 0x3;
uint8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
if (lane < 4) {
__ Pshuflw(dst, src, half_dup);
__ Pshufd(dst, dst, 0);
__ Pshufd(dst, dst, uint8_t{0});
} else {
__ Pshufhw(dst, src, half_dup);
__ Pshufd(dst, dst, 0xaa);
__ Pshufd(dst, dst, uint8_t{0xaa});
}
break;
}
case kIA32S8x16Dup: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1) & 0xf;
uint8_t lane = i.InputUint8(1) & 0xf;
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
if (lane < 8) {
......@@ -3798,14 +3798,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
}
lane &= 0x7;
int8_t lane4 = lane & 0x3;
int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
uint8_t lane4 = lane & 0x3;
uint8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
if (lane < 4) {
__ Pshuflw(dst, dst, half_dup);
__ Pshufd(dst, dst, 0);
__ Pshufd(dst, dst, uint8_t{0});
} else {
__ Pshufhw(dst, dst, half_dup);
__ Pshufd(dst, dst, 0xaa);
__ Pshufd(dst, dst, uint8_t{0xaa});
}
break;
}
......
......@@ -2917,21 +2917,21 @@ void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshuflw(dst.fp(), dst.fp(), 0);
Pshufd(dst.fp(), dst.fp(), 0);
Pshuflw(dst.fp(), dst.fp(), uint8_t{0});
Pshufd(dst.fp(), dst.fp(), uint8_t{0});
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshufd(dst.fp(), dst.fp(), 0);
Pshufd(dst.fp(), dst.fp(), uint8_t{0});
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Pinsrd(dst.fp(), src.low_gp(), 0);
Pinsrd(dst.fp(), src.high_gp(), 1);
Pshufd(dst.fp(), dst.fp(), 0x44);
Pshufd(dst.fp(), dst.fp(), uint8_t{0x44});
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
......@@ -3419,7 +3419,7 @@ void LiftoffAssembler::emit_i8x16_shri_u(LiftoffRegister dst,
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
mov(tmp, mask);
Movd(liftoff::kScratchDoubleReg, tmp);
Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, 0);
Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, uint8_t{0});
Pand(dst.fp(), liftoff::kScratchDoubleReg);
}
......