Commit add293e8 authored by Ng Zhi An, committed by Commit Bot

[x64][ia32] Move more AVX_OP into SharedTurboAssembler

Bug: v8:11589
Change-Id: I30dbdbc6266d703ce697352780da1d543afbb457
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2826711
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73965}
parent d338a86b
...@@ -1852,34 +1852,6 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) { ...@@ -1852,34 +1852,6 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
pshufb(dst, mask); pshufb(dst, mask);
} }
void TurboAssembler::Pblendw(XMMRegister dst, Operand src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpblendw(dst, dst, src, imm8);
return;
}
if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
pblendw(dst, src, imm8);
return;
}
FATAL("no AVX or SSE4.1 support");
}
void TurboAssembler::Palignr(XMMRegister dst, Operand src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpalignr(dst, dst, src, imm8);
return;
}
if (CpuFeatures::IsSupported(SSSE3)) {
CpuFeatureScope sse_scope(this, SSSE3);
palignr(dst, src, imm8);
return;
}
FATAL("no AVX or SSE3 support");
}
void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) { void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
if (imm8 == 0) { if (imm8 == 0) {
Movd(dst, src); Movd(dst, src);
......
...@@ -305,96 +305,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -305,96 +305,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Pcmpeqb, pcmpeqb) AVX_OP(Pcmpeqb, pcmpeqb)
AVX_OP(Pcmpeqw, pcmpeqw) AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pcmpeqd, pcmpeqd) AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
// Same as AVX_OP3_WITH_TYPE but supports a CpuFeatureScope
#define AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, dst_type, src_type, \
sse_scope) \
void macro_name(dst_type dst, src_type src) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, dst, src); \
} else if (CpuFeatures::IsSupported(sse_scope)) { \
CpuFeatureScope scope(this, sse_scope); \
name(dst, src); \
} \
}
#define AVX_OP2_XO(macro_name, name, sse_scope) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, \
sse_scope) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, sse_scope)
AVX_OP2_XO(Psignb, psignb, SSSE3)
AVX_OP2_XO(Psignw, psignw, SSSE3)
AVX_OP2_XO(Psignd, psignd, SSSE3)
AVX_OP2_XO(Pcmpeqq, pcmpeqq, SSE4_1)
#undef AVX_OP2_XO
#undef AVX_OP2_WITH_TYPE_SCOPE
// Only use this macro when dst and src1 is the same in SSE case.
#define AVX_PACKED_OP3_WITH_TYPE(macro_name, name, dst_type, src_type) \
void macro_name(dst_type dst, dst_type src1, src_type src2) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, src1, src2); \
} else { \
DCHECK_EQ(dst, src1); \
name(dst, src2); \
} \
}
#define AVX_PACKED_OP3(macro_name, name) \
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, XMMRegister) \
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, Operand)
AVX_PACKED_OP3(Unpcklps, unpcklps)
AVX_PACKED_OP3(Andnps, andnps)
AVX_PACKED_OP3(Addps, addps)
AVX_PACKED_OP3(Addpd, addpd)
AVX_PACKED_OP3(Subps, subps)
AVX_PACKED_OP3(Subpd, subpd)
AVX_PACKED_OP3(Mulps, mulps)
AVX_PACKED_OP3(Mulpd, mulpd)
AVX_PACKED_OP3(Divps, divps)
AVX_PACKED_OP3(Divpd, divpd)
AVX_PACKED_OP3(Cmpeqpd, cmpeqpd)
AVX_PACKED_OP3(Cmpneqpd, cmpneqpd)
AVX_PACKED_OP3(Cmpltpd, cmpltpd)
AVX_PACKED_OP3(Cmpleps, cmpleps)
AVX_PACKED_OP3(Cmplepd, cmplepd)
AVX_PACKED_OP3(Minps, minps)
AVX_PACKED_OP3(Minpd, minpd)
AVX_PACKED_OP3(Maxps, maxps)
AVX_PACKED_OP3(Maxpd, maxpd)
AVX_PACKED_OP3(Cmpunordps, cmpunordps)
AVX_PACKED_OP3(Cmpunordpd, cmpunordpd)
AVX_PACKED_OP3(Psllw, psllw)
AVX_PACKED_OP3(Pslld, pslld)
AVX_PACKED_OP3(Psllq, psllq)
AVX_PACKED_OP3(Psrlw, psrlw)
AVX_PACKED_OP3(Psrld, psrld)
AVX_PACKED_OP3(Psrad, psrad)
AVX_PACKED_OP3(Paddd, paddd)
AVX_PACKED_OP3(Paddq, paddq)
AVX_PACKED_OP3(Pmuludq, pmuludq)
AVX_PACKED_OP3(Pavgb, pavgb)
AVX_PACKED_OP3(Pavgw, pavgw)
AVX_PACKED_OP3(Pminub, pminub)
AVX_PACKED_OP3(Pmaxub, pmaxub)
AVX_PACKED_OP3(Paddusb, paddusb)
AVX_PACKED_OP3(Psubusb, psubusb)
AVX_PACKED_OP3(Pcmpgtb, pcmpgtb)
AVX_PACKED_OP3(Paddb, paddb)
AVX_PACKED_OP3(Paddsb, paddsb)
AVX_PACKED_OP3(Psubsb, psubsb)
#undef AVX_PACKED_OP3
AVX_PACKED_OP3_WITH_TYPE(Psllw, psllw, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Pslld, pslld, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psllq, psllq, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psrlw, psrlw, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psrld, psrld, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psrad, psrad, XMMRegister, uint8_t)
#undef AVX_PACKED_OP3_WITH_TYPE
// Macro for instructions that have 2 operands for AVX version and 1 operand for // Macro for instructions that have 2 operands for AVX version and 1 operand for
// SSE version. Will move src1 to dst if dst != src1. // SSE version. Will move src1 to dst if dst != src1.
...@@ -416,35 +327,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -416,35 +327,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP3_WITH_MOVE(Pmaddwd, pmaddwd, XMMRegister, Operand) AVX_OP3_WITH_MOVE(Pmaddwd, pmaddwd, XMMRegister, Operand)
#undef AVX_OP3_WITH_MOVE #undef AVX_OP3_WITH_MOVE
#define AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, dst_type, src_type, \
sse_scope) \
void macro_name(dst_type dst, dst_type src1, src_type src2) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope scope(this, AVX); \
v##name(dst, src1, src2); \
return; \
} \
if (CpuFeatures::IsSupported(sse_scope)) { \
CpuFeatureScope scope(this, sse_scope); \
DCHECK_EQ(dst, src1); \
name(dst, src2); \
return; \
} \
UNREACHABLE(); \
}
#define AVX_OP3_XO_SSE4(macro_name, name) \
AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, SSE4_1) \
AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1)
AVX_OP3_WITH_TYPE_SCOPE(Haddps, haddps, XMMRegister, Operand, SSE3)
AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd)
AVX_OP3_XO_SSE4(Pminsb, pminsb)
AVX_OP3_XO_SSE4(Pmaxsb, pmaxsb)
AVX_OP3_XO_SSE4(Pcmpeqq, pcmpeqq)
#undef AVX_OP3_XO_SSE4
#undef AVX_OP3_WITH_TYPE_SCOPE
// TODO(zhin): Remove after moving more definitions into SharedTurboAssembler. // TODO(zhin): Remove after moving more definitions into SharedTurboAssembler.
void Movlps(Operand dst, XMMRegister src) { void Movlps(Operand dst, XMMRegister src) {
SharedTurboAssembler::Movlps(dst, src); SharedTurboAssembler::Movlps(dst, src);
...@@ -461,16 +343,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -461,16 +343,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
} }
void Pshufb(XMMRegister dst, XMMRegister src, Operand mask); void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
Pblendw(dst, Operand(src), imm8);
}
void Pblendw(XMMRegister dst, Operand src, uint8_t imm8);
void Palignr(XMMRegister dst, XMMRegister src, uint8_t imm8) {
Palignr(dst, Operand(src), imm8);
}
void Palignr(XMMRegister dst, Operand src, uint8_t imm8);
void Pextrd(Register dst, XMMRegister src, uint8_t imm8); void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
void Pinsrb(XMMRegister dst, Register src, int8_t imm8) { void Pinsrb(XMMRegister dst, Register src, int8_t imm8) {
Pinsrb(dst, Operand(src), imm8); Pinsrb(dst, Operand(src), imm8);
......
...@@ -39,7 +39,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -39,7 +39,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
} }
} }
// Helper struct to implement functions that checks for AVX support and // Helper struct to implement functions that check for AVX support and
// dispatch to the appropriate AVX/SSE instruction. // dispatch to the appropriate AVX/SSE instruction.
template <typename Dst, typename Arg, typename... Args> template <typename Dst, typename Arg, typename... Args>
struct AvxHelper { struct AvxHelper {
...@@ -145,14 +145,30 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -145,14 +145,30 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
} }
// Keep this list sorted by required extension, then instruction name. // Keep this list sorted by required extension, then instruction name.
AVX_OP(Addpd, addpd)
AVX_OP(Addps, addps)
AVX_OP(Andnpd, andnpd) AVX_OP(Andnpd, andnpd)
AVX_OP(Andnps, andnps)
AVX_OP(Andpd, andpd) AVX_OP(Andpd, andpd)
AVX_OP(Andps, andps) AVX_OP(Andps, andps)
AVX_OP(Cmpeqpd, cmpeqpd)
AVX_OP(Cmplepd, cmplepd)
AVX_OP(Cmpleps, cmpleps)
AVX_OP(Cmpltpd, cmpltpd)
AVX_OP(Cmpneqpd, cmpneqpd)
AVX_OP(Cmpunordpd, cmpunordpd)
AVX_OP(Cmpunordps, cmpunordps)
AVX_OP(Cvtdq2pd, cvtdq2pd) AVX_OP(Cvtdq2pd, cvtdq2pd)
AVX_OP(Cvtdq2ps, cvtdq2ps) AVX_OP(Cvtdq2ps, cvtdq2ps)
AVX_OP(Cvtpd2ps, cvtpd2ps) AVX_OP(Cvtpd2ps, cvtpd2ps)
AVX_OP(Cvtps2pd, cvtps2pd) AVX_OP(Cvtps2pd, cvtps2pd)
AVX_OP(Cvttps2dq, cvttps2dq) AVX_OP(Cvttps2dq, cvttps2dq)
AVX_OP(Divpd, divpd)
AVX_OP(Divps, divps)
AVX_OP(Maxpd, maxpd)
AVX_OP(Maxps, maxps)
AVX_OP(Minpd, minpd)
AVX_OP(Minps, minps)
AVX_OP(Movaps, movaps) AVX_OP(Movaps, movaps)
AVX_OP(Movd, movd) AVX_OP(Movd, movd)
AVX_OP(Movhlps, movhlps) AVX_OP(Movhlps, movhlps)
...@@ -164,25 +180,46 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -164,25 +180,46 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Movss, movss) AVX_OP(Movss, movss)
AVX_OP(Movupd, movupd) AVX_OP(Movupd, movupd)
AVX_OP(Movups, movups) AVX_OP(Movups, movups)
AVX_OP(Mulpd, mulpd)
AVX_OP(Mulps, mulps)
AVX_OP(Orpd, orpd) AVX_OP(Orpd, orpd)
AVX_OP(Orps, orps) AVX_OP(Orps, orps)
AVX_OP(Packssdw, packssdw) AVX_OP(Packssdw, packssdw)
AVX_OP(Packsswb, packsswb) AVX_OP(Packsswb, packsswb)
AVX_OP(Packuswb, packuswb) AVX_OP(Packuswb, packuswb)
AVX_OP(Paddb, paddb)
AVX_OP(Paddd, paddd)
AVX_OP(Paddq, paddq)
AVX_OP(Paddsb, paddsb)
AVX_OP(Paddusb, paddusb) AVX_OP(Paddusb, paddusb)
AVX_OP(Paddusw, paddusw) AVX_OP(Paddusw, paddusw)
AVX_OP(Paddw, paddw)
AVX_OP(Pand, pand) AVX_OP(Pand, pand)
AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw)
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminub, pminub)
AVX_OP(Pmovmskb, pmovmskb) AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Pmullw, pmullw) AVX_OP(Pmullw, pmullw)
AVX_OP(Pmuludq, pmuludq)
AVX_OP(Por, por) AVX_OP(Por, por)
AVX_OP(Pshufd, pshufd) AVX_OP(Pshufd, pshufd)
AVX_OP(Pshufhw, pshufhw) AVX_OP(Pshufhw, pshufhw)
AVX_OP(Pshuflw, pshuflw) AVX_OP(Pshuflw, pshuflw)
AVX_OP(Pslld, pslld)
AVX_OP(Psllq, psllq)
AVX_OP(Psllw, psllw)
AVX_OP(Psrad, psrad)
AVX_OP(Psraw, psraw) AVX_OP(Psraw, psraw)
AVX_OP(Psrld, psrld)
AVX_OP(Psrlq, psrlq) AVX_OP(Psrlq, psrlq)
AVX_OP(Psrlw, psrlw)
AVX_OP(Psubb, psubb) AVX_OP(Psubb, psubb)
AVX_OP(Psubd, psubd) AVX_OP(Psubd, psubd)
AVX_OP(Psubq, psubq) AVX_OP(Psubq, psubq)
AVX_OP(Psubsb, psubsb)
AVX_OP(Psubusb, psubusb)
AVX_OP(Psubw, psubw) AVX_OP(Psubw, psubw)
AVX_OP(Punpckhbw, punpckhbw) AVX_OP(Punpckhbw, punpckhbw)
AVX_OP(Punpckhdq, punpckhdq) AVX_OP(Punpckhdq, punpckhdq)
...@@ -199,16 +236,31 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -199,16 +236,31 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Sqrtps, sqrtps) AVX_OP(Sqrtps, sqrtps)
AVX_OP(Sqrtsd, sqrtsd) AVX_OP(Sqrtsd, sqrtsd)
AVX_OP(Sqrtss, sqrtss) AVX_OP(Sqrtss, sqrtss)
AVX_OP(Subpd, subpd)
AVX_OP(Subps, subps)
AVX_OP(Unpcklps, unpcklps)
AVX_OP(Xorpd, xorpd) AVX_OP(Xorpd, xorpd)
AVX_OP(Xorps, xorps) AVX_OP(Xorps, xorps)
AVX_OP_SSE3(Haddps, haddps)
AVX_OP_SSE3(Movddup, movddup) AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSE3(Movshdup, movshdup) AVX_OP_SSE3(Movshdup, movshdup)
AVX_OP_SSSE3(Pabsb, pabsb) AVX_OP_SSSE3(Pabsb, pabsb)
AVX_OP_SSSE3(Pabsd, pabsd) AVX_OP_SSSE3(Pabsd, pabsd)
AVX_OP_SSSE3(Pabsw, pabsw) AVX_OP_SSSE3(Pabsw, pabsw)
AVX_OP_SSSE3(Palignr, palignr)
AVX_OP_SSSE3(Psignb, psignb)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Psignw, psignw)
AVX_OP_SSE4_1(Extractps, extractps) AVX_OP_SSE4_1(Extractps, extractps)
AVX_OP_SSE4_1(Pblendw, pblendw)
AVX_OP_SSE4_1(Pextrb, pextrb) AVX_OP_SSE4_1(Pextrb, pextrb)
AVX_OP_SSE4_1(Pextrw, pextrw) AVX_OP_SSE4_1(Pextrw, pextrw)
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pminsb, pminsb)
AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw) AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq) AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd) AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
......
...@@ -1274,6 +1274,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1274,6 +1274,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
SSE_CMP_P(cmpeq, 0x0) SSE_CMP_P(cmpeq, 0x0)
SSE_CMP_P(cmplt, 0x1) SSE_CMP_P(cmplt, 0x1)
SSE_CMP_P(cmple, 0x2) SSE_CMP_P(cmple, 0x2)
SSE_CMP_P(cmpunord, 0x3)
SSE_CMP_P(cmpneq, 0x4) SSE_CMP_P(cmpneq, 0x4)
SSE_CMP_P(cmpnlt, 0x5) SSE_CMP_P(cmpnlt, 0x5)
SSE_CMP_P(cmpnle, 0x6) SSE_CMP_P(cmpnle, 0x6)
...@@ -1571,6 +1572,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1571,6 +1572,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
AVX_CMP_P(vcmpeq, 0x0) AVX_CMP_P(vcmpeq, 0x0)
AVX_CMP_P(vcmplt, 0x1) AVX_CMP_P(vcmplt, 0x1)
AVX_CMP_P(vcmple, 0x2) AVX_CMP_P(vcmple, 0x2)
AVX_CMP_P(vcmpunord, 0x3)
AVX_CMP_P(vcmpneq, 0x4) AVX_CMP_P(vcmpneq, 0x4)
AVX_CMP_P(vcmpnlt, 0x5) AVX_CMP_P(vcmpnlt, 0x5)
AVX_CMP_P(vcmpnle, 0x6) AVX_CMP_P(vcmpnle, 0x6)
......
...@@ -1885,16 +1885,6 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1, ...@@ -1885,16 +1885,6 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
} }
} }
void TurboAssembler::Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vunpcklps(dst, src1, src2);
} else {
DCHECK_EQ(dst, src1);
unpcklps(dst, src2);
}
}
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
byte imm8) { byte imm8) {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
...@@ -2039,26 +2029,6 @@ void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, ...@@ -2039,26 +2029,6 @@ void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2,
imm8, base::Optional<CpuFeature>(SSE4_1)); imm8, base::Optional<CpuFeature>(SSE4_1));
} }
void TurboAssembler::Psllq(XMMRegister dst, byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsllq(dst, dst, imm8);
} else {
DCHECK(!IsEnabled(AVX));
psllq(dst, imm8);
}
}
void TurboAssembler::Pslld(XMMRegister dst, byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpslld(dst, dst, imm8);
} else {
DCHECK(!IsEnabled(AVX));
pslld(dst, imm8);
}
}
void TurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1, void TurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister mask) { XMMRegister src2, XMMRegister mask) {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
...@@ -2396,21 +2366,6 @@ void TurboAssembler::Negpd(XMMRegister dst) { ...@@ -2396,21 +2366,6 @@ void TurboAssembler::Negpd(XMMRegister dst) {
ExternalReference::address_of_double_neg_constant())); ExternalReference::address_of_double_neg_constant()));
} }
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
Psrld(dst, dst, imm8);
}
void TurboAssembler::Psrld(XMMRegister dst, XMMRegister src, byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrld(dst, src, imm8);
} else {
DCHECK(!IsEnabled(AVX));
DCHECK_EQ(dst, src);
psrld(dst, imm8);
}
}
void TurboAssembler::Lzcntl(Register dst, Register src) { void TurboAssembler::Lzcntl(Register dst, Register src) {
if (CpuFeatures::IsSupported(LZCNT)) { if (CpuFeatures::IsSupported(LZCNT)) {
CpuFeatureScope scope(this, LZCNT); CpuFeatureScope scope(this, LZCNT);
......
...@@ -66,91 +66,43 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -66,91 +66,43 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
AVX_OP(Subsd, subsd) AVX_OP(Subsd, subsd)
AVX_OP(Divss, divss) AVX_OP(Divss, divss)
AVX_OP(Divsd, divsd) AVX_OP(Divsd, divsd)
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pcmpgtw, pcmpgtw) AVX_OP(Pcmpgtw, pcmpgtw)
AVX_OP(Pmaxsw, pmaxsw) AVX_OP(Pmaxsw, pmaxsw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminsw, pminsw) AVX_OP(Pminsw, pminsw)
AVX_OP(Pminub, pminub)
AVX_OP(Addss, addss) AVX_OP(Addss, addss)
AVX_OP(Addsd, addsd) AVX_OP(Addsd, addsd)
AVX_OP(Mulsd, mulsd) AVX_OP(Mulsd, mulsd)
AVX_OP(Andnps, andnps)
AVX_OP(Cmpeqps, cmpeqps) AVX_OP(Cmpeqps, cmpeqps)
AVX_OP(Cmpltps, cmpltps) AVX_OP(Cmpltps, cmpltps)
AVX_OP(Cmpleps, cmpleps)
AVX_OP(Cmpneqps, cmpneqps) AVX_OP(Cmpneqps, cmpneqps)
AVX_OP(Cmpnltps, cmpnltps) AVX_OP(Cmpnltps, cmpnltps)
AVX_OP(Cmpnleps, cmpnleps) AVX_OP(Cmpnleps, cmpnleps)
AVX_OP(Cmpeqpd, cmpeqpd)
AVX_OP(Cmpltpd, cmpltpd)
AVX_OP(Cmplepd, cmplepd)
AVX_OP(Cmpneqpd, cmpneqpd)
AVX_OP(Cmpnltpd, cmpnltpd) AVX_OP(Cmpnltpd, cmpnltpd)
AVX_OP(Cmpnlepd, cmpnlepd) AVX_OP(Cmpnlepd, cmpnlepd)
AVX_OP(Cvttpd2dq, cvttpd2dq) AVX_OP(Cvttpd2dq, cvttpd2dq)
AVX_OP(Ucomiss, ucomiss) AVX_OP(Ucomiss, ucomiss)
AVX_OP(Ucomisd, ucomisd) AVX_OP(Ucomisd, ucomisd)
AVX_OP(Psubsb, psubsb)
AVX_OP(Psubsw, psubsw) AVX_OP(Psubsw, psubsw)
AVX_OP(Psubusb, psubusb)
AVX_OP(Psubusw, psubusw) AVX_OP(Psubusw, psubusw)
AVX_OP(Pslld, pslld)
AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw)
AVX_OP(Psrad, psrad)
AVX_OP(Psllw, psllw)
AVX_OP(Psllq, psllq)
AVX_OP(Psrlw, psrlw)
AVX_OP(Psrld, psrld)
AVX_OP(Paddb, paddb)
AVX_OP(Paddw, paddw)
AVX_OP(Paddd, paddd)
AVX_OP(Paddq, paddq)
AVX_OP(Paddsb, paddsb)
AVX_OP(Paddsw, paddsw) AVX_OP(Paddsw, paddsw)
AVX_OP(Pcmpgtd, pcmpgtd) AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Pmuludq, pmuludq)
AVX_OP(Addpd, addpd)
AVX_OP(Subpd, subpd)
AVX_OP(Mulpd, mulpd)
AVX_OP(Minps, minps)
AVX_OP(Minpd, minpd)
AVX_OP(Divpd, divpd)
AVX_OP(Maxps, maxps)
AVX_OP(Maxpd, maxpd)
AVX_OP(Addps, addps)
AVX_OP(Subps, subps)
AVX_OP(Mulps, mulps)
AVX_OP(Divps, divps)
AVX_OP(Pcmpeqb, pcmpeqb) AVX_OP(Pcmpeqb, pcmpeqb)
AVX_OP(Pcmpeqw, pcmpeqw) AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pcmpeqd, pcmpeqd) AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Cmpps, cmpps)
AVX_OP(Cmppd, cmppd)
AVX_OP(Movlhps, movlhps) AVX_OP(Movlhps, movlhps)
AVX_OP_SSE3(Haddps, haddps)
AVX_OP_SSSE3(Phaddd, phaddd) AVX_OP_SSSE3(Phaddd, phaddd)
AVX_OP_SSSE3(Phaddw, phaddw) AVX_OP_SSSE3(Phaddw, phaddw)
AVX_OP_SSSE3(Pshufb, pshufb) AVX_OP_SSSE3(Pshufb, pshufb)
AVX_OP_SSSE3(Psignb, psignb)
AVX_OP_SSSE3(Psignw, psignw)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Palignr, palignr)
AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq) AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
AVX_OP_SSE4_1(Packusdw, packusdw) AVX_OP_SSE4_1(Packusdw, packusdw)
AVX_OP_SSE4_1(Pminsb, pminsb)
AVX_OP_SSE4_1(Pminsd, pminsd) AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pminuw, pminuw) AVX_OP_SSE4_1(Pminuw, pminuw)
AVX_OP_SSE4_1(Pminud, pminud) AVX_OP_SSE4_1(Pminud, pminud)
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pmaxuw, pmaxuw) AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
AVX_OP_SSE4_1(Pmaxud, pmaxud) AVX_OP_SSE4_1(Pmaxud, pmaxud)
AVX_OP_SSE4_1(Pmulld, pmulld) AVX_OP_SSE4_1(Pmulld, pmulld)
AVX_OP_SSE4_1(Insertps, insertps) AVX_OP_SSE4_1(Insertps, insertps)
AVX_OP_SSE4_1(Pinsrq, pinsrq) AVX_OP_SSE4_1(Pinsrq, pinsrq)
AVX_OP_SSE4_1(Pblendw, pblendw)
AVX_OP_SSE4_1(Pextrq, pextrq) AVX_OP_SSE4_1(Pextrq, pextrq)
AVX_OP_SSE4_1(Roundss, roundss) AVX_OP_SSE4_1(Roundss, roundss)
AVX_OP_SSE4_1(Roundsd, roundsd) AVX_OP_SSE4_1(Roundsd, roundsd)
...@@ -427,7 +379,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -427,7 +379,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2); void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2); void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Unpcklps(XMMRegister dst, XMMRegister src1, Operand src2);
// Shufps that will mov src1 into dst if AVX is not supported. // Shufps that will mov src1 into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8); void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);
...@@ -445,14 +396,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler { ...@@ -445,14 +396,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
void Pinsrq(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8); void Pinsrq(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8);
void Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8); void Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
void Psllq(XMMRegister dst, int imm8) { Psllq(dst, static_cast<byte>(imm8)); }
void Psllq(XMMRegister dst, byte imm8);
void Pslld(XMMRegister dst, byte imm8);
void Psrld(XMMRegister dst, byte imm8);
// Supports both AVX (dst != src1) and SSE (checks that dst == src1).
void Psrld(XMMRegister dst, XMMRegister src, byte imm8);
void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2, void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask); XMMRegister mask);
void Blendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, void Blendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
......
...@@ -2181,7 +2181,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2181,7 +2181,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// Set up a mask [0x80000000,0,0x80000000,0]. // Set up a mask [0x80000000,0,0x80000000,0].
__ Pcmpeqb(tmp2, tmp2); __ Pcmpeqb(tmp2, tmp2);
__ Psllq(tmp2, tmp2, 63); __ Psllq(tmp2, tmp2, byte{63});
__ Psrlq(tmp2, tmp2, tmp); __ Psrlq(tmp2, tmp2, tmp);
__ Psrlq(dst, src, tmp); __ Psrlq(dst, src, tmp);
...@@ -2218,7 +2218,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2218,7 +2218,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pmuludq(tmp2, tmp2, left); __ Pmuludq(tmp2, tmp2, left);
__ Paddq(tmp2, tmp2, tmp1); __ Paddq(tmp2, tmp2, tmp1);
__ Psllq(tmp2, tmp2, 32); __ Psllq(tmp2, tmp2, byte{32});
__ Pmuludq(dst, left, right); __ Pmuludq(dst, left, right);
__ Paddq(dst, dst, tmp2); __ Paddq(dst, dst, tmp2);
...@@ -2357,10 +2357,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2357,10 +2357,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0); XMMRegister src = i.InputSimd128Register(0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg); // zeros __ Pxor(kScratchDoubleReg, kScratchDoubleReg); // zeros
__ Pblendw(kScratchDoubleReg, src, 0x55); // get lo 16 bits __ Pblendw(kScratchDoubleReg, src, uint8_t{0x55}); // get lo 16 bits
__ Psubd(dst, src, kScratchDoubleReg); // get hi 16 bits __ Psubd(dst, src, kScratchDoubleReg); // get hi 16 bits
__ Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg); // convert lo exactly __ Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg); // convert lo exactly
__ Psrld(dst, dst, 1); // divide by 2 to get in unsigned range __ Psrld(dst, dst, byte{1}); // divide by 2 to get in unsigned range
__ Cvtdq2ps(dst, dst); // convert hi exactly __ Cvtdq2ps(dst, dst); // convert hi exactly
__ Addps(dst, dst, dst); // double hi, exactly __ Addps(dst, dst, dst); // double hi, exactly
__ Addps(dst, dst, kScratchDoubleReg); // add hi and lo, may round. __ Addps(dst, dst, kScratchDoubleReg); // add hi and lo, may round.
...@@ -2371,11 +2371,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2371,11 +2371,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src = i.InputSimd128Register(0); XMMRegister src = i.InputSimd128Register(0);
if (dst == src) { if (dst == src) {
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psrld(kScratchDoubleReg, kScratchDoubleReg, 1); __ Psrld(kScratchDoubleReg, kScratchDoubleReg, byte{1});
__ Andps(dst, kScratchDoubleReg); __ Andps(dst, kScratchDoubleReg);
} else { } else {
__ Pcmpeqd(dst, dst); __ Pcmpeqd(dst, dst);
__ Psrld(dst, dst, 1); __ Psrld(dst, dst, byte{1});
__ Andps(dst, src); __ Andps(dst, src);
} }
break; break;
...@@ -2385,11 +2385,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2385,11 +2385,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src = i.InputSimd128Register(0); XMMRegister src = i.InputSimd128Register(0);
if (dst == src) { if (dst == src) {
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Pslld(kScratchDoubleReg, kScratchDoubleReg, 31); __ Pslld(kScratchDoubleReg, kScratchDoubleReg, byte{31});
__ Xorps(dst, kScratchDoubleReg); __ Xorps(dst, kScratchDoubleReg);
} else { } else {
__ Pcmpeqd(dst, dst); __ Pcmpeqd(dst, dst);
__ Pslld(dst, dst, 31); __ Pslld(dst, dst, byte{31});
__ Xorps(dst, src); __ Xorps(dst, src);
} }
break; break;
...@@ -2580,7 +2580,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2580,7 +2580,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Cvttps2dq(dst, dst); __ Cvttps2dq(dst, dst);
// Set top bit if >=0 is now < 0 // Set top bit if >=0 is now < 0
__ Pand(kScratchDoubleReg, dst); __ Pand(kScratchDoubleReg, dst);
__ Psrad(kScratchDoubleReg, kScratchDoubleReg, 31); __ Psrad(kScratchDoubleReg, kScratchDoubleReg, byte{31});
// Set positive overflow lanes to 0x7FFFFFFF // Set positive overflow lanes to 0x7FFFFFFF
__ Pxor(dst, kScratchDoubleReg); __ Pxor(dst, kScratchDoubleReg);
break; break;
...@@ -3739,7 +3739,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3739,7 +3739,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_NE(0xe4, shuffle); // A simple blend should be handled below. DCHECK_NE(0xe4, shuffle); // A simple blend should be handled below.
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle); __ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle); __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
__ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3)); __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputUint8(3));
break; break;
} }
case kIA32S16x8Blend: case kIA32S16x8Blend:
...@@ -3757,7 +3757,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3757,7 +3757,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputUint8(3)); __ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputUint8(3));
__ Pshuflw(dst, i.InputOperand(0), i.InputUint8(2)); __ Pshuflw(dst, i.InputOperand(0), i.InputUint8(2));
__ Pshufhw(dst, dst, i.InputUint8(3)); __ Pshufhw(dst, dst, i.InputUint8(3));
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4)); __ Pblendw(dst, kScratchDoubleReg, i.InputUint8(4));
break; break;
} }
case kIA32S8x16Alignr: case kIA32S8x16Alignr:
......
...@@ -1486,7 +1486,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -1486,7 +1486,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// TODO(bmeurer): Use RIP relative 128-bit constants. // TODO(bmeurer): Use RIP relative 128-bit constants.
XMMRegister tmp = i.ToDoubleRegister(instr->TempAt(0)); XMMRegister tmp = i.ToDoubleRegister(instr->TempAt(0));
__ Pcmpeqd(tmp, tmp); __ Pcmpeqd(tmp, tmp);
__ Psllq(tmp, 31); __ Psllq(tmp, byte{31});
__ Xorps(i.OutputDoubleRegister(), tmp); __ Xorps(i.OutputDoubleRegister(), tmp);
break; break;
} }
...@@ -2439,7 +2439,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2439,7 +2439,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// propagate -0's and NaNs, which may be non-canonical. // propagate -0's and NaNs, which may be non-canonical.
__ Orpd(kScratchDoubleReg, dst); __ Orpd(kScratchDoubleReg, dst);
// Canonicalize NaNs by quieting and clearing the payload. // Canonicalize NaNs by quieting and clearing the payload.
__ Cmppd(dst, kScratchDoubleReg, int8_t{3}); __ Cmpunordpd(dst, kScratchDoubleReg);
__ Orpd(kScratchDoubleReg, dst); __ Orpd(kScratchDoubleReg, dst);
__ Psrlq(dst, byte{13}); __ Psrlq(dst, byte{13});
__ Andnpd(dst, kScratchDoubleReg); __ Andnpd(dst, kScratchDoubleReg);
...@@ -2461,7 +2461,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2461,7 +2461,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// Propagate sign discrepancy and (subtle) quiet NaNs. // Propagate sign discrepancy and (subtle) quiet NaNs.
__ Subpd(kScratchDoubleReg, dst); __ Subpd(kScratchDoubleReg, dst);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic. // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmppd(dst, kScratchDoubleReg, int8_t{3}); __ Cmpunordpd(dst, kScratchDoubleReg);
__ Psrlq(dst, byte{13}); __ Psrlq(dst, byte{13});
__ Andnpd(dst, kScratchDoubleReg); __ Andnpd(dst, kScratchDoubleReg);
break; break;
...@@ -2671,7 +2671,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2671,7 +2671,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// propagate -0's and NaNs, which may be non-canonical. // propagate -0's and NaNs, which may be non-canonical.
__ Orps(kScratchDoubleReg, dst); __ Orps(kScratchDoubleReg, dst);
// Canonicalize NaNs by quieting and clearing the payload. // Canonicalize NaNs by quieting and clearing the payload.
__ Cmpps(dst, kScratchDoubleReg, int8_t{3}); __ Cmpunordps(dst, kScratchDoubleReg);
__ Orps(kScratchDoubleReg, dst); __ Orps(kScratchDoubleReg, dst);
__ Psrld(dst, byte{10}); __ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg); __ Andnps(dst, kScratchDoubleReg);
...@@ -2693,7 +2693,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2693,7 +2693,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// Propagate sign discrepancy and (subtle) quiet NaNs. // Propagate sign discrepancy and (subtle) quiet NaNs.
__ Subps(kScratchDoubleReg, dst); __ Subps(kScratchDoubleReg, dst);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic. // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmpps(dst, kScratchDoubleReg, int8_t{3}); __ Cmpunordps(dst, kScratchDoubleReg);
__ Psrld(dst, byte{10}); __ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg); __ Andnps(dst, kScratchDoubleReg);
break; break;
...@@ -2851,7 +2851,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2851,7 +2851,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pmuludq(tmp2, left); __ Pmuludq(tmp2, left);
__ Paddq(tmp2, tmp1); __ Paddq(tmp2, tmp1);
__ Psllq(tmp2, 32); __ Psllq(tmp2, byte{32});
__ Pmuludq(left, right); __ Pmuludq(left, right);
__ Paddq(left, tmp2); // left == dst __ Paddq(left, tmp2); // left == dst
......
...@@ -3896,7 +3896,7 @@ void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst, ...@@ -3896,7 +3896,7 @@ void LiftoffAssembler::emit_i64x2_shr_s(LiftoffRegister dst,
// Set up a mask [0x80000000,0,0x80000000,0]. // Set up a mask [0x80000000,0,0x80000000,0].
Pcmpeqb(tmp, tmp); Pcmpeqb(tmp, tmp);
Psllq(tmp, tmp, 63); Psllq(tmp, tmp, byte{63});
Psrlq(tmp, tmp, shift); Psrlq(tmp, tmp, shift);
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
...@@ -3919,7 +3919,7 @@ void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst, ...@@ -3919,7 +3919,7 @@ void LiftoffAssembler::emit_i64x2_shri_s(LiftoffRegister dst,
// Set up a mask [0x80000000,0,0x80000000,0]. // Set up a mask [0x80000000,0,0x80000000,0].
Pcmpeqb(tmp, tmp); Pcmpeqb(tmp, tmp);
Psllq(tmp, tmp, 63); Psllq(tmp, tmp, byte{63});
Psrlq(tmp, tmp, byte{shift}); Psrlq(tmp, tmp, byte{shift});
liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlq, &Assembler::psrlq, 6>( liftoff::EmitSimdShiftOpImm<&Assembler::vpsrlq, &Assembler::psrlq, 6>(
...@@ -3969,7 +3969,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3969,7 +3969,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
Psrlq(tmp2.fp(), byte{32}); Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp()); Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp()); Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), tmp2.fp(), 32); Psllq(tmp2.fp(), tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>( liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs); this, dst, lhs, rhs);
Paddq(dst.fp(), dst.fp(), tmp2.fp()); Paddq(dst.fp(), dst.fp(), tmp2.fp());
...@@ -4032,11 +4032,11 @@ void LiftoffAssembler::emit_f32x4_abs(LiftoffRegister dst, ...@@ -4032,11 +4032,11 @@ void LiftoffAssembler::emit_f32x4_abs(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
if (dst.fp() == src.fp()) { if (dst.fp() == src.fp()) {
Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Psrld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, 1); Psrld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{1});
Andps(dst.fp(), liftoff::kScratchDoubleReg); Andps(dst.fp(), liftoff::kScratchDoubleReg);
} else { } else {
Pcmpeqd(dst.fp(), dst.fp()); Pcmpeqd(dst.fp(), dst.fp());
Psrld(dst.fp(), dst.fp(), 1); Psrld(dst.fp(), dst.fp(), byte{1});
Andps(dst.fp(), src.fp()); Andps(dst.fp(), src.fp());
} }
} }
...@@ -4045,11 +4045,11 @@ void LiftoffAssembler::emit_f32x4_neg(LiftoffRegister dst, ...@@ -4045,11 +4045,11 @@ void LiftoffAssembler::emit_f32x4_neg(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
if (dst.fp() == src.fp()) { if (dst.fp() == src.fp()) {
Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Pslld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, 31); Pslld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{31});
Xorps(dst.fp(), liftoff::kScratchDoubleReg); Xorps(dst.fp(), liftoff::kScratchDoubleReg);
} else { } else {
Pcmpeqd(dst.fp(), dst.fp()); Pcmpeqd(dst.fp(), dst.fp());
Pslld(dst.fp(), dst.fp(), 31); Pslld(dst.fp(), dst.fp(), byte{31});
Xorps(dst.fp(), src.fp()); Xorps(dst.fp(), src.fp());
} }
} }
...@@ -4201,11 +4201,11 @@ void LiftoffAssembler::emit_f64x2_neg(LiftoffRegister dst, ...@@ -4201,11 +4201,11 @@ void LiftoffAssembler::emit_f64x2_neg(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
if (dst.fp() == src.fp()) { if (dst.fp() == src.fp()) {
Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg); Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Psllq(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, 63); Psllq(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{63});
Xorpd(dst.fp(), liftoff::kScratchDoubleReg); Xorpd(dst.fp(), liftoff::kScratchDoubleReg);
} else { } else {
Pcmpeqd(dst.fp(), dst.fp()); Pcmpeqd(dst.fp(), dst.fp());
Psllq(dst.fp(), dst.fp(), 63); Psllq(dst.fp(), dst.fp(), byte{63});
Xorpd(dst.fp(), src.fp()); Xorpd(dst.fp(), src.fp());
} }
} }
......
...@@ -3510,7 +3510,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3510,7 +3510,7 @@ void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
Psrlq(tmp2.fp(), byte{32}); Psrlq(tmp2.fp(), byte{32});
Pmuludq(tmp2.fp(), lhs.fp()); Pmuludq(tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp1.fp()); Paddq(tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), 32); Psllq(tmp2.fp(), byte{32});
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>( liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmuludq, &Assembler::pmuludq>(
this, dst, lhs, rhs); this, dst, lhs, rhs);
Paddq(dst.fp(), tmp2.fp()); Paddq(dst.fp(), tmp2.fp());
...@@ -3586,11 +3586,11 @@ void LiftoffAssembler::emit_f32x4_neg(LiftoffRegister dst, ...@@ -3586,11 +3586,11 @@ void LiftoffAssembler::emit_f32x4_neg(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
if (dst.fp() == src.fp()) { if (dst.fp() == src.fp()) {
Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg); Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
Pslld(kScratchDoubleReg, static_cast<byte>(31)); Pslld(kScratchDoubleReg, byte{31});
Xorps(dst.fp(), kScratchDoubleReg); Xorps(dst.fp(), kScratchDoubleReg);
} else { } else {
Pcmpeqd(dst.fp(), dst.fp()); Pcmpeqd(dst.fp(), dst.fp());
Pslld(dst.fp(), static_cast<byte>(31)); Pslld(dst.fp(), byte{31});
Xorps(dst.fp(), src.fp()); Xorps(dst.fp(), src.fp());
} }
} }
...@@ -3674,7 +3674,7 @@ void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3674,7 +3674,7 @@ void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
// propagate -0's and NaNs, which may be non-canonical. // propagate -0's and NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp()); Orps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload. // Canonicalize NaNs by quieting and clearing the payload.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3}); Cmpunordps(dst.fp(), kScratchDoubleReg);
Orps(kScratchDoubleReg, dst.fp()); Orps(kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), byte{10}); Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg); Andnps(dst.fp(), kScratchDoubleReg);
...@@ -3706,7 +3706,7 @@ void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3706,7 +3706,7 @@ void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
// Propagate sign discrepancy and (subtle) quiet NaNs. // Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(kScratchDoubleReg, dst.fp()); Subps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic. // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3}); Cmpunordps(dst.fp(), kScratchDoubleReg);
Psrld(dst.fp(), byte{10}); Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg); Andnps(dst.fp(), kScratchDoubleReg);
} }
...@@ -3830,7 +3830,7 @@ void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3830,7 +3830,7 @@ void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
// propagate -0's and NaNs, which may be non-canonical. // propagate -0's and NaNs, which may be non-canonical.
Orpd(kScratchDoubleReg, dst.fp()); Orpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload. // Canonicalize NaNs by quieting and clearing the payload.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3}); Cmpunordpd(dst.fp(), kScratchDoubleReg);
Orpd(kScratchDoubleReg, dst.fp()); Orpd(kScratchDoubleReg, dst.fp());
Psrlq(dst.fp(), byte{13}); Psrlq(dst.fp(), byte{13});
Andnpd(dst.fp(), kScratchDoubleReg); Andnpd(dst.fp(), kScratchDoubleReg);
...@@ -3862,7 +3862,7 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3862,7 +3862,7 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
// Propagate sign discrepancy and (subtle) quiet NaNs. // Propagate sign discrepancy and (subtle) quiet NaNs.
Subpd(kScratchDoubleReg, dst.fp()); Subpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic. // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3}); Cmpunordpd(dst.fp(), kScratchDoubleReg);
Psrlq(dst.fp(), byte{13}); Psrlq(dst.fp(), byte{13});
Andnpd(dst.fp(), kScratchDoubleReg); Andnpd(dst.fp(), kScratchDoubleReg);
} }
......
...@@ -559,6 +559,8 @@ TEST(DisasmX64) { ...@@ -559,6 +559,8 @@ TEST(DisasmX64) {
__ cmpltps(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmpltps(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpleps(xmm5, xmm1); __ cmpleps(xmm5, xmm1);
__ cmpleps(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmpleps(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpunordps(xmm5, xmm1);
__ cmpunordps(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpneqps(xmm5, xmm1); __ cmpneqps(xmm5, xmm1);
__ cmpneqps(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmpneqps(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpnltps(xmm5, xmm1); __ cmpnltps(xmm5, xmm1);
...@@ -573,6 +575,8 @@ TEST(DisasmX64) { ...@@ -573,6 +575,8 @@ TEST(DisasmX64) {
__ cmpltpd(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmpltpd(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmplepd(xmm5, xmm1); __ cmplepd(xmm5, xmm1);
__ cmplepd(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmplepd(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpunordpd(xmm5, xmm1);
__ cmpunordpd(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpneqpd(xmm5, xmm1); __ cmpneqpd(xmm5, xmm1);
__ cmpneqpd(xmm5, Operand(rbx, rcx, times_4, 10000)); __ cmpneqpd(xmm5, Operand(rbx, rcx, times_4, 10000));
__ cmpnltpd(xmm5, xmm1); __ cmpnltpd(xmm5, xmm1);
...@@ -727,6 +731,8 @@ TEST(DisasmX64) { ...@@ -727,6 +731,8 @@ TEST(DisasmX64) {
__ vcmpltps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmpltps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpleps(xmm5, xmm4, xmm1); __ vcmpleps(xmm5, xmm4, xmm1);
__ vcmpleps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmpleps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpunordps(xmm5, xmm4, xmm1);
__ vcmpunordps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpneqps(xmm5, xmm4, xmm1); __ vcmpneqps(xmm5, xmm4, xmm1);
__ vcmpneqps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmpneqps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpnltps(xmm5, xmm4, xmm1); __ vcmpnltps(xmm5, xmm4, xmm1);
...@@ -741,6 +747,8 @@ TEST(DisasmX64) { ...@@ -741,6 +747,8 @@ TEST(DisasmX64) {
__ vcmpltpd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmpltpd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmplepd(xmm5, xmm4, xmm1); __ vcmplepd(xmm5, xmm4, xmm1);
__ vcmplepd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmplepd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpunordpd(xmm5, xmm4, xmm1);
__ vcmpunordpd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpneqpd(xmm5, xmm4, xmm1); __ vcmpneqpd(xmm5, xmm4, xmm1);
__ vcmpneqpd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000)); __ vcmpneqpd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpnltpd(xmm5, xmm4, xmm1); __ vcmpnltpd(xmm5, xmm4, xmm1);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment