// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_
#define V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_

#include "src/base/macros.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/external-reference.h"
#include "src/codegen/turbo-assembler.h"

#if V8_TARGET_ARCH_IA32
#include "src/codegen/ia32/register-ia32.h"
#elif V8_TARGET_ARCH_X64
#include "src/codegen/x64/register-x64.h"
#else
#error Unsupported target architecture.
#endif

namespace v8 {
namespace internal {
class Assembler;

// For WebAssembly we care about the full floating point register. If we are
// not running Wasm, we can get away with saving half of those registers.
#if V8_ENABLE_WEBASSEMBLY
constexpr int kStackSavedSavedFPSize = 2 * kDoubleSize;
#else
constexpr int kStackSavedSavedFPSize = kDoubleSize;
#endif  // V8_ENABLE_WEBASSEMBLY

// Base class for SharedTurboAssemblerBase. This class contains macro-assembler
// functions that can be shared across ia32 and x64 without any template
// machinery, i.e. it does not require the CRTP pattern that
// SharedTurboAssemblerBase exposes. This allows us to keep the bulk of the
// definitions in a separate source file, rather than putting everything inside
// this header.
class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
 public:
  using TurboAssemblerBase::TurboAssemblerBase;

  void Move(Register dst, uint32_t src);
  // Move if registers are not identical.
  void Move(Register dst, Register src);
  void Add(Register dst, Immediate src);
  void And(Register dst, Immediate src);

  // Will move src1 to dst if AVX is not supported.
  void Movhps(XMMRegister dst, XMMRegister src1, Operand src2);
  void Movlps(XMMRegister dst, XMMRegister src1, Operand src2);

  void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                XMMRegister mask);

  template <typename Op>
  void Pinsrb(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrb, &Assembler::pinsrb, dst, src1, src2,
                imm8, load_pc_offset, {SSE4_1});
  }

  template <typename Op>
  void Pinsrw(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    PinsrHelper(this, &Assembler::vpinsrw, &Assembler::pinsrw, dst, src1, src2,
                imm8, load_pc_offset);
  }

  // Supports both SSE and AVX. Moves src to dst if they are not equal on SSE.
  template <typename Op>
  void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpshufb(dst, src, mask);
    } else {
      // Make sure these are different so that we won't overwrite mask.
      DCHECK_NE(mask, dst);
      if (dst != src) {
        movaps(dst, src);
      }
      CpuFeatureScope sse_scope(this, SSSE3);
      pshufb(dst, mask);
    }
  }

  template <typename Op>
  void Pshufb(XMMRegister dst, Op mask) {
    Pshufb(dst, dst, mask);
  }
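
  // Illustrative sketch of the dispatch above (not code that is emitted):
  // with AVX, Pshufb(xmm0, xmm1, xmm2) assembles to
  // "vpshufb xmm0, xmm1, xmm2"; without AVX it first assembles
  // "movaps xmm0, xmm1" (only when dst != src) and then "pshufb xmm0, xmm2",
  // i.e. the SSE path always shuffles dst in place.
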
  // Shufps that will mov src1 into dst if AVX is not supported.
  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
              uint8_t imm8);

  // Helper struct to implement functions that check for AVX support and
  // dispatch to the appropriate AVX/SSE instruction.
  template <typename Dst, typename Arg, typename... Args>
  struct AvxHelper {
    Assembler* assm;
    base::Optional<CpuFeature> feature = base::nullopt;
    // Call a method where the AVX version expects the dst argument to be
    // duplicated.
    // E.g. Andps(x, y) -> vandps(x, x, y)
    //                  -> andps(x, y)
    template <void (Assembler::*avx)(Dst, Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }

    // Call a method in the AVX form (one more operand); if AVX is not
    // supported, check that dst == first src.
    // E.g. Andps(x, y, z) -> vandps(x, y, z)
    //                     -> andps(x, z) and check that x == y
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK_EQ(dst, arg);
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, args...);
      } else {
        DCHECK_EQ(dst, arg);
        (assm->*no_avx)(dst, args...);
      }
    }

    // Call a method where the AVX version expects no duplicated dst argument.
    // E.g. Movddup(x, y) -> vmovddup(x, y)
    //                    -> movddup(x, y)
    template <void (Assembler::*avx)(Dst, Arg, Args...),
              void (Assembler::*no_avx)(Dst, Arg, Args...)>
    void emit(Dst dst, Arg arg, Args... args) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope scope(assm, AVX);
        (assm->*avx)(dst, arg, args...);
      } else if (feature.has_value()) {
        DCHECK(CpuFeatures::IsSupported(*feature));
        CpuFeatureScope scope(assm, *feature);
        (assm->*no_avx)(dst, arg, args...);
      } else {
        (assm->*no_avx)(dst, arg, args...);
      }
    }
  };

#define AVX_OP(macro_name, name)                                        \
  template <typename Dst, typename Arg, typename... Args>               \
  void macro_name(Dst dst, Arg arg, Args... args) {                     \
    AvxHelper<Dst, Arg, Args...>{this}                                  \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg, \
                                                              args...); \
  }
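
  // Usage sketch (hypothetical call site): AVX_OP(Addps, addps) below defines
  // Addps(dst, src), so Addps(xmm0, xmm1) assembles to
  // "vaddps xmm0, xmm0, xmm1" when AVX is available and to "addps xmm0, xmm1"
  // otherwise, mirroring the Andps example documented in AvxHelper.
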
  // Define a macro which uses |avx_name| when AVX is supported, and |sse_name|
  // when AVX is not supported. This is useful for bit-wise instructions like
  // andpd/andps, where the behavior is exactly the same, but the *ps
  // version is 1 byte shorter, and on SSE-only processors there is no
  // performance difference since those processors don't differentiate integer
  // and floating-point domains.
  // Note: we require |avx_name| to be the AVX instruction without the "v"
  // prefix. If we required the full AVX instruction name and the caller
  // accidentally passed in an SSE instruction, we would compile without any
  // issues and generate the SSE instruction. By appending "v" here, we ensure
  // that we will generate an AVX instruction.
#define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name)     \
  template <typename Dst, typename Arg, typename... Args>              \
  void macro_name(Dst dst, Arg arg, Args... args) {                    \
    AvxHelper<Dst, Arg, Args...>{this}                                 \
        .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \
            dst, arg, args...);                                        \
  }

#define AVX_OP_SSE3(macro_name, name)                                     \
  template <typename Dst, typename Arg, typename... Args>                 \
  void macro_name(Dst dst, Arg arg, Args... args) {                       \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE3)}  \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,   \
                                                              args...);   \
  }

#define AVX_OP_SSSE3(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                 \
  void macro_name(Dst dst, Arg arg, Args... args) {                       \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSSE3)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,   \
                                                              args...);   \
  }

#define AVX_OP_SSE4_1(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }

#define AVX_OP_SSE4_2(macro_name, name)                                    \
  template <typename Dst, typename Arg, typename... Args>                  \
  void macro_name(Dst dst, Arg arg, Args... args) {                        \
    AvxHelper<Dst, Arg, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \
        .template emit<&Assembler::v##name, &Assembler::name>(dst, arg,    \
                                                              args...);    \
  }
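
  // Two usage sketches for the macros above (illustrative only):
  // AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps) defines Pxor(dst, src),
  // which assembles to "vpxor" under AVX but falls back to the shorter,
  // bit-identical "xorps" on SSE. AVX_OP_SSE4_1(Pmaxsd, pmaxsd) defines
  // Pmaxsd(dst, src), which emits "vpmaxsd" under AVX and otherwise emits
  // "pmaxsd" inside an SSE4_1 CpuFeatureScope (AvxHelper DCHECKs that SSE4_1
  // is actually supported).
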
  // Keep this list sorted by required extension, then instruction name.
  AVX_OP(Addpd, addpd)
  AVX_OP(Addps, addps)
  AVX_OP(Addsd, addsd)
  AVX_OP(Addss, addss)
  AVX_OP(Andnpd, andnpd)
  AVX_OP(Andnps, andnps)
  AVX_OP(Andpd, andpd)
  AVX_OP(Andps, andps)
  AVX_OP(Cmpeqpd, cmpeqpd)
  AVX_OP(Cmpeqps, cmpeqps)
  AVX_OP(Cmplepd, cmplepd)
  AVX_OP(Cmpleps, cmpleps)
  AVX_OP(Cmpltpd, cmpltpd)
  AVX_OP(Cmpltps, cmpltps)
  AVX_OP(Cmpneqpd, cmpneqpd)
  AVX_OP(Cmpneqps, cmpneqps)
  AVX_OP(Cmpunordpd, cmpunordpd)
  AVX_OP(Cmpunordps, cmpunordps)
  AVX_OP(Cvtdq2pd, cvtdq2pd)
  AVX_OP(Cvtdq2ps, cvtdq2ps)
  AVX_OP(Cvtpd2ps, cvtpd2ps)
  AVX_OP(Cvtps2pd, cvtps2pd)
  AVX_OP(Cvtsd2ss, cvtsd2ss)
  AVX_OP(Cvtss2sd, cvtss2sd)
  AVX_OP(Cvttpd2dq, cvttpd2dq)
  AVX_OP(Cvttps2dq, cvttps2dq)
  AVX_OP(Cvttsd2si, cvttsd2si)
  AVX_OP(Cvttss2si, cvttss2si)
  AVX_OP(Divpd, divpd)
  AVX_OP(Divps, divps)
  AVX_OP(Divsd, divsd)
  AVX_OP(Divss, divss)
  AVX_OP(Maxpd, maxpd)
  AVX_OP(Maxps, maxps)
  AVX_OP(Minpd, minpd)
  AVX_OP(Minps, minps)
  AVX_OP(Movaps, movaps)
  AVX_OP(Movd, movd)
  AVX_OP(Movhlps, movhlps)
  AVX_OP(Movhps, movhps)
  AVX_OP(Movlps, movlps)
  AVX_OP(Movmskpd, movmskpd)
  AVX_OP(Movmskps, movmskps)
  AVX_OP(Movsd, movsd)
  AVX_OP(Movss, movss)
  AVX_OP(Movupd, movupd)
  AVX_OP(Movups, movups)
  AVX_OP(Mulpd, mulpd)
  AVX_OP(Mulps, mulps)
  AVX_OP(Mulsd, mulsd)
  AVX_OP(Mulss, mulss)
  AVX_OP(Orpd, orpd)
  AVX_OP(Orps, orps)
  AVX_OP(Packssdw, packssdw)
  AVX_OP(Packsswb, packsswb)
  AVX_OP(Packuswb, packuswb)
  AVX_OP(Paddb, paddb)
  AVX_OP(Paddd, paddd)
  AVX_OP(Paddq, paddq)
  AVX_OP(Paddsb, paddsb)
  AVX_OP(Paddsw, paddsw)
  AVX_OP(Paddusb, paddusb)
  AVX_OP(Paddusw, paddusw)
  AVX_OP(Paddw, paddw)
  AVX_OP(Pavgb, pavgb)
  AVX_OP(Pavgw, pavgw)
  AVX_OP(Pcmpgtb, pcmpgtb)
  AVX_OP(Pcmpgtd, pcmpgtd)
  AVX_OP(Pcmpgtw, pcmpgtw)
  AVX_OP(Pcmpeqb, pcmpeqb)
  AVX_OP(Pcmpeqd, pcmpeqd)
  AVX_OP(Pcmpeqw, pcmpeqw)
  AVX_OP(Pmaddwd, pmaddwd)
  AVX_OP(Pmaxsw, pmaxsw)
  AVX_OP(Pmaxub, pmaxub)
  AVX_OP(Pminsw, pminsw)
  AVX_OP(Pminub, pminub)
  AVX_OP(Pmovmskb, pmovmskb)
  AVX_OP(Pmullw, pmullw)
  AVX_OP(Pmuludq, pmuludq)
  AVX_OP(Pshufd, pshufd)
  AVX_OP(Pshufhw, pshufhw)
  AVX_OP(Pshuflw, pshuflw)
  AVX_OP(Pslld, pslld)
  AVX_OP(Psllq, psllq)
  AVX_OP(Psllw, psllw)
  AVX_OP(Psrad, psrad)
  AVX_OP(Psraw, psraw)
  AVX_OP(Psrld, psrld)
  AVX_OP(Psrlq, psrlq)
  AVX_OP(Psrlw, psrlw)
  AVX_OP(Psubb, psubb)
  AVX_OP(Psubd, psubd)
  AVX_OP(Psubq, psubq)
  AVX_OP(Psubsb, psubsb)
  AVX_OP(Psubsw, psubsw)
  AVX_OP(Psubusb, psubusb)
  AVX_OP(Psubusw, psubusw)
  AVX_OP(Psubw, psubw)
  AVX_OP(Punpckhbw, punpckhbw)
  AVX_OP(Punpckhdq, punpckhdq)
  AVX_OP(Punpckhqdq, punpckhqdq)
  AVX_OP(Punpckhwd, punpckhwd)
  AVX_OP(Punpcklbw, punpcklbw)
  AVX_OP(Punpckldq, punpckldq)
  AVX_OP(Punpcklqdq, punpcklqdq)
  AVX_OP(Punpcklwd, punpcklwd)
  AVX_OP(Rcpps, rcpps)
  AVX_OP(Rsqrtps, rsqrtps)
  AVX_OP(Sqrtpd, sqrtpd)
  AVX_OP(Sqrtps, sqrtps)
  AVX_OP(Sqrtsd, sqrtsd)
  AVX_OP(Sqrtss, sqrtss)
  AVX_OP(Subpd, subpd)
  AVX_OP(Subps, subps)
  AVX_OP(Subsd, subsd)
  AVX_OP(Subss, subss)
  AVX_OP(Ucomisd, ucomisd)
  AVX_OP(Ucomiss, ucomiss)
  AVX_OP(Unpcklps, unpcklps)
  AVX_OP(Xorpd, xorpd)
  AVX_OP(Xorps, xorps)

  // Many AVX processors have separate integer/floating-point domains, so use
  // vmovaps if AVX is supported. On SSE, movaps is 1 byte shorter than movdqa,
  // and has the same behavior. Most SSE processors also don't have the same
  // delay moving between integer and floating-point domains.
  AVX_OP_WITH_DIFF_SSE_INSTR(Movapd, movapd, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqa, movdqa, movaps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Movdqu, movdqu, movups)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps)
  AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps)

  AVX_OP_SSE3(Haddps, haddps)
  AVX_OP_SSE3(Movddup, movddup)
  AVX_OP_SSE3(Movshdup, movshdup)

  AVX_OP_SSSE3(Pabsb, pabsb)
  AVX_OP_SSSE3(Pabsd, pabsd)
  AVX_OP_SSSE3(Pabsw, pabsw)
  AVX_OP_SSSE3(Palignr, palignr)
  AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
  AVX_OP_SSSE3(Psignb, psignb)
  AVX_OP_SSSE3(Psignd, psignd)
  AVX_OP_SSSE3(Psignw, psignw)

  AVX_OP_SSE4_1(Extractps, extractps)
  AVX_OP_SSE4_1(Insertps, insertps)
  AVX_OP_SSE4_1(Packusdw, packusdw)
  AVX_OP_SSE4_1(Pblendw, pblendw)
  AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
  AVX_OP_SSE4_1(Pextrb, pextrb)
  AVX_OP_SSE4_1(Pextrw, pextrw)
  AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
  AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
  AVX_OP_SSE4_1(Pmaxud, pmaxud)
  AVX_OP_SSE4_1(Pmaxuw, pmaxuw)
  AVX_OP_SSE4_1(Pminsb, pminsb)
  AVX_OP_SSE4_1(Pminsd, pminsd)
  AVX_OP_SSE4_1(Pminud, pminud)
  AVX_OP_SSE4_1(Pminuw, pminuw)
  AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
  AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
  AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
  AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
  AVX_OP_SSE4_1(Pmovzxbd, pmovzxbd)
  AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
  AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
  AVX_OP_SSE4_1(Pmulld, pmulld)
  AVX_OP_SSE4_1(Ptest, ptest)
  AVX_OP_SSE4_1(Roundpd, roundpd)
  AVX_OP_SSE4_1(Roundps, roundps)
  AVX_OP_SSE4_1(Roundsd, roundsd)
  AVX_OP_SSE4_1(Roundss, roundss)

#undef AVX_OP
#undef AVX_OP_WITH_DIFF_SSE_INSTR
#undef AVX_OP_SSE3
#undef AVX_OP_SSSE3
#undef AVX_OP_SSE4_1
#undef AVX_OP_SSE4_2

  void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
  void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
                        uint8_t lane);
  void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Splat(XMMRegister dst, DoubleRegister src);
  void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
  void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
  void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
  void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
                XMMRegister tmp2);
  void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2,
                Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
                 XMMRegister tmp);
  void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2,
                 Register tmp1, XMMRegister tmp2);
  void I8x16ShrU(XMMRegister dst, XMMRegister src1, Register src2,
                 Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void I16x8Splat(XMMRegister dst, Register src);
  void I16x8Splat(XMMRegister dst, Operand src);
  void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                      XMMRegister scratch, bool is_signed);
  void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
  void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  // Will move src1 to dst if AVX is not supported.
  void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                        XMMRegister scratch);
  void I16x8DotI8x16I7x16S(XMMRegister dst, XMMRegister src1,
                           XMMRegister src2);
  void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
                                 XMMRegister tmp);
  // Requires that dst == src1 if AVX is not supported.
  void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
  void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void I64x2Neg(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2Abs(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  void I64x2GtS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
                XMMRegister scratch);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, uint8_t shift,
                 XMMRegister xmm_tmp);
  void I64x2ShrS(XMMRegister dst, XMMRegister src, Register shift,
                 XMMRegister xmm_tmp, XMMRegister xmm_shift,
                 Register tmp_shift);
  void I64x2Mul(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister tmp1, XMMRegister tmp2);
  void I64x2ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                   XMMRegister scratch, bool low, bool is_signed);
  void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
  void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
                              XMMRegister scratch);
  void S128Not(XMMRegister dst, XMMRegister src, XMMRegister scratch);
  // Requires dst == mask when AVX is not supported.
  void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
                  XMMRegister src2, XMMRegister scratch);
  void S128Load8Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
  void S128Load32Splat(XMMRegister dst, Operand src);
  void S128Store64Lane(Operand dst, XMMRegister src, uint8_t laneidx);

  void F64x2Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F64x2Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfma(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);
  void F32x4Qfms(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister src3, XMMRegister tmp);

 protected:
  template <typename Op>
  using AvxFn = void (Assembler::*)(XMMRegister, XMMRegister, Op, uint8_t);
  template <typename Op>
  using NoAvxFn = void (Assembler::*)(XMMRegister, Op, uint8_t);

  template <typename Op>
  void PinsrHelper(Assembler* assm, AvxFn<Op> avx, NoAvxFn<Op> noavx,
                   XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
                   uint32_t* load_pc_offset = nullptr,
                   base::Optional<CpuFeature> feature = base::nullopt) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(assm, AVX);
      if (load_pc_offset) *load_pc_offset = assm->pc_offset();
      (assm->*avx)(dst, src1, src2, imm8);
      return;
    }

    if (dst != src1) assm->movaps(dst, src1);
    if (load_pc_offset) *load_pc_offset = assm->pc_offset();
    if (feature.has_value()) {
      DCHECK(CpuFeatures::IsSupported(*feature));
      CpuFeatureScope scope(assm, *feature);
      (assm->*noavx)(dst, src2, imm8);
    } else {
      (assm->*noavx)(dst, src2, imm8);
    }
  }

 private:
  template <typename Op>
  void I8x16SplatPreAvx2(XMMRegister dst, Op src, XMMRegister scratch);
  template <typename Op>
  void I16x8SplatPreAvx2(XMMRegister dst, Op src);
};

// Common base class template shared by ia32 and x64 TurboAssembler. This uses
// the Curiously Recurring Template Pattern (CRTP), where Impl is the actual
// class (subclass of SharedTurboAssemblerBase instantiated with the actual
// class). This allows static polymorphism, where member functions can be
// moved into SharedTurboAssembler, and we can also call into member functions
// defined in the ia32- or x64-specific TurboAssembler from within this
// template class, via Impl.
//
// Note: all member functions must be defined in this header file so that the
// compiler can generate code for the function definitions. See
// https://isocpp.org/wiki/faq/templates#templates-defn-vs-decl for rationale.
// If a function does not need polymorphism, move it into SharedTurboAssembler,
// and define it outside of this header.
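//
// For example (a sketch of how the pattern is used, not a definition from this
// header): the ia32 and x64 TurboAssembler classes each derive from
// SharedTurboAssemblerBase<TurboAssembler>, so a call like
// impl()->PinsrdPreSse41(...) below resolves at compile time to the
// architecture-specific implementation, with no virtual dispatch involved.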
template <typename Impl>
class V8_EXPORT_PRIVATE SharedTurboAssemblerBase
    : public SharedTurboAssembler {
  using SharedTurboAssembler::SharedTurboAssembler;

 public:
  void Abspd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_double_abs_constant());
  }

  void Absps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Andps,
              ExternalReference::address_of_float_abs_constant());
  }

  void Negpd(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_double_neg_constant());
  }

  void Negps(XMMRegister dst, XMMRegister src, Register tmp) {
    FloatUnop(dst, src, tmp, &SharedTurboAssembler::Xorps,
              ExternalReference::address_of_float_neg_constant());
  }

  void Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
    if (imm8 == 0) {
      Movd(dst, src);
      return;
    }

    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vpextrd(dst, src, imm8);
    } else if (CpuFeatures::IsSupported(SSE4_1)) {
      CpuFeatureScope sse_scope(this, SSE4_1);
      pextrd(dst, src, imm8);
    } else {
      DCHECK_LT(imm8, 2);
      impl()->PextrdPreSse41(dst, src, imm8);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    if (CpuFeatures::IsSupported(SSE4_1)) {
      PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1,
                  src2, imm8, load_pc_offset,
                  base::Optional<CpuFeature>(SSE4_1));
    } else {
      if (dst != src1) {
        movaps(dst, src1);
      }
      impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset);
    }
  }

  template <typename Op>
  void Pinsrd(XMMRegister dst, Op src, uint8_t imm8,
              uint32_t* load_pc_offset = nullptr) {
    Pinsrd(dst, dst, src, imm8, load_pc_offset);
  }

  void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
                             Register scratch) {
    ASM_CODE_COMMENT(this);
    // dst = [ src_low, 0x43300000, src_high, 0x43300000 ];
    // 0x43300000'00000000 is a special double where the significand bits
    // precisely represent all uint32 values.
    if (!CpuFeatures::IsSupported(AVX) && dst != src) {
      movaps(dst, src);
      src = dst;
    }
    Unpcklps(dst, src,
             ExternalReferenceAsOperand(
                 ExternalReference::
                     address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
                 scratch));
    Subpd(dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(),
              scratch));
  }
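
  // Worked example of the 2^52 trick above (arithmetic only, no emitted
  // code): 0x43300000'00000000 is the double 2^52 = 4503599627370496.0.
  // Placing a uint32 value v in the low 32 bits yields exactly the double
  // 2^52 + v, since a double's 52 significand bits can represent any integer
  // below 2^52. Subtracting 2^52 then recovers v exactly, e.g. for
  // v = 0xFFFFFFFF:
  // 4503599627370496.0 + 4294967295.0 - 4503599627370496.0 = 4294967295.0.
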
  void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp,
                          Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);

    // This algorithm works by:
    // 1. lanes with NaNs are zeroed
    // 2. lanes >= 2147483648.0f (MAX_INT32 + 1) are set to 0xffff'ffff
    // 3. cvttps2dq sets all out-of-range lanes to 0x8000'0000
    //   a. correct for underflows (< MIN_INT32)
    //   b. wrong for overflows, and we know which lanes overflow from 2.
    // 4. adjust for 3b by xor-ing 2 and 3
    //   a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (MAX_INT32)
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vcmpeqps(tmp, src, src);
      vandps(dst, src, tmp);
      vcmpgeps(tmp, src, op);
      vcvttps2dq(dst, dst);
      vpxor(dst, dst, tmp);
    } else {
      if (src == dst) {
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
        movaps(tmp, op);
        cmpleps(tmp, dst);
        cvttps2dq(dst, dst);
        xorps(dst, tmp);
      } else {
        movaps(tmp, op);
        cmpleps(tmp, src);
        cvttps2dq(dst, src);
        xorps(dst, tmp);
        movaps(tmp, src);
        cmpeqps(tmp, tmp);
        andps(dst, tmp);
      }
    }
  }

  void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      XMMRegister original_dst = dst;
      // Make sure we don't overwrite src.
      if (dst == src) {
        DCHECK_NE(src, scratch);
        dst = scratch;
      }
      // dst = 0 if src == NaN, else all ones.
      vcmpeqpd(dst, src, src);
      // dst = 0 if src == NaN, else INT32_MAX as double.
      vandpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
      // dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
      vminpd(dst, src, dst);
      // Values > INT32_MAX are already saturated, values < INT32_MIN raise an
      // exception, which is masked and returns 0x80000000.
      vcvttpd2dq(original_dst, dst);
    } else {
      if (dst != src) {
        movaps(dst, src);
      }
      movaps(scratch, dst);
      cmpeqpd(scratch, dst);
      andps(scratch,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_int32_max_as_double(),
                tmp));
      minpd(dst, scratch);
      cvttpd2dq(dst, dst);
    }
  }

  void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
                               XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vxorpd(scratch, scratch, scratch);
      // Saturate to 0.
      vmaxpd(dst, src, scratch);
      // Saturate to UINT32_MAX.
      vminpd(
          dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
      // Truncate.
      vroundpd(dst, dst, kRoundToZero);
      // Add to special double where significand bits == uint32.
      vaddpd(dst, dst,
             ExternalReferenceAsOperand(
                 ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      // Extract low 32 bits of each double's significand, zero top lanes.
      // dst = [dst[0], dst[2], 0, 0]
      vshufps(dst, dst, scratch, 0x88);
    } else {
      CpuFeatureScope scope(this, SSE4_1);
      if (dst != src) {
        movaps(dst, src);
      }
      xorps(scratch, scratch);
      maxpd(dst, scratch);
      minpd(dst,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_uint32_max_as_double(),
                tmp));
      roundpd(dst, dst, kRoundToZero);
      addpd(dst,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_double_2_power_52(), tmp));
      shufps(dst, scratch, 0x88);
    }
  }

  void I32x4TruncF64x2UZero(XMMRegister dst, XMMRegister src, Register tmp,
                            XMMRegister scratch) {
    // TODO(zhin): call this from I32x4TruncSatF64x2UZero.
    ASM_CODE_COMMENT(this);
    if (dst != src && !CpuFeatures::IsSupported(AVX)) {
      movaps(dst, src);
      src = dst;
    }
    // Same as I32x4TruncSatF64x2UZero but without the saturation.
    Roundpd(dst, src, kRoundToZero);
    // Add to special double where significand bits == uint32.
    Addpd(dst, dst,
          ExternalReferenceAsOperand(
              ExternalReference::address_of_wasm_double_2_power_52(), tmp));
    // Extract low 32 bits of each double's significand, zero top lanes.
    // dst = [dst[0], dst[2], 0, 0]
    Shufps(dst, dst, scratch, 0x88);
  }
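
  // Worked single-lane example for I32x4TruncSatF64x2UZero above (numbers
  // only): a lane of -1.5 clamps to 0.0 via maxpd and a lane of 5.0e9 clamps
  // to 4294967295.0 via minpd; truncating toward zero and adding 2^52 then
  // leaves the clamped integer in the low 32 bits of each double (0xFFFFFFFF
  // for the 5.0e9 lane), which the final shufps packs into the two low i32
  // lanes.
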
  void I32x4TruncF32x4U(XMMRegister dst, XMMRegister src, Register scratch,
                        XMMRegister tmp) {
    ASM_CODE_COMMENT(this);
    Operand int32_overflow_op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vcmpltps(tmp, src, int32_overflow_op);
    } else {
      movaps(tmp, src);
      cmpltps(tmp, int32_overflow_op);
    }
    // In tmp, lanes < INT32_MAX are left alone, other lanes are zeroed.
    Pand(tmp, src);
    // tmp = src with all the valid conversions
    if (dst != src) {
      Movaps(dst, src);
    }
    // In dst, lanes < INT32_MAX are zeroed, other lanes left alone.
    Pxor(dst, tmp);
    // tmp contains only lanes which can be converted correctly (< INT32_MAX)
    Cvttps2dq(tmp, tmp);
    // Bit-trick follows:
    // All integers in (INT32_MAX, UINT32_MAX] that are representable as
    // floats have bit patterns in [0x4f00'0000, 0x4f80'0000).
    // The bit representation of such an integer is its float representation
    // shifted left by 8, with the top bit set.
    // For example, given 2147483904.0f (which fits in UINT32_MAX):
    //
    // 01001111 00000000 00000000 00000001 (float 0x4f00'0001)
    //          ^^^^^^^^ ^^^^^^^^ ^^^^^^^^
    //          these are exactly the top 24 bits of the int representation,
    //          but the top bit of the 24 needs to be flipped
    // 10000000 00000000 00000001 00000000 (int 0x8000'0100)
    //
    // So what needs to be done is to flip bit 23, which is the lowest bit of
    // the exponent, which means multiply by 2 (or addps to itself).
    Addps(dst, dst, dst);
    // Then shift to get the bit representation of the int.
    Pslld(dst, byte{8});
    // Merge the converted lanes and bit shifted lanes.
    Paddd(dst, tmp);
  }

  void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
    // pmaddwd multiplies signed words in src and op, producing
    // signed doublewords, then adds pairwise.
    // src = |a|b|c|d|e|f|g|h|
    // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }
    Pmaddwd(dst, src, op);
  }
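
  // Concrete instance of the pmaddwd trick above: for
  // src = |1|2|3|4|5|6|7|8| (i16 lanes), multiplying by the 0x0001 splat and
  // adding adjacent products yields dst = |3|7|11|15| (i32 lanes).
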
  void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
                                 XMMRegister scratch, Register tmp) {
    ASM_CODE_COMMENT(this);
    // pmaddubsw treats the first operand as unsigned, so pass the external
    // reference to it as the first operand.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(scratch, op);
      vpmaddubsw(dst, scratch, src);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst == src) {
        movaps(scratch, op);
        pmaddubsw(scratch, src);
        movaps(dst, scratch);
      } else {
        movaps(dst, op);
        pmaddubsw(dst, src);
      }
    }
  }

  void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                                 Register scratch) {
    ASM_CODE_COMMENT(this);
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpmaddubsw(dst, src, op);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      if (dst != src) {
        movaps(dst, src);
      }
      pmaddubsw(dst, op);
    }
  }

  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
                    XMMRegister scratch, Register tmp, bool omit_add = false) {
    ASM_CODE_COMMENT(this);
    if (omit_add) {
      // We have determined that the indices are immediates, and they are
      // either within bounds, or the top bit is set, so we can omit the add.
      Pshufb(dst, src, mask);
      return;
    }

    // Out-of-range indices should return 0, so add 112 (0x70): any index > 15
    // then ends up with its top bit set (additions saturate at 255), and
    // pshufb zeroes that lane.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpaddusb(scratch, mask, op);
      vpshufb(dst, src, scratch);
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(scratch, op);
      if (dst != src) {
        DCHECK_NE(dst, mask);
        movaps(dst, src);
      }
      paddusb(scratch, mask);
      pshufb(dst, scratch);
    }
  }
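
  // Index sketch for the swizzle above: an in-range index 5 becomes
  // 5 + 112 = 117 (0x75); pshufb ignores bits 4-6, sees a clear top bit, and
  // selects byte 5. An out-of-range index 17 becomes 129 (0x81), and an index
  // of 200 saturates to 255 (0xff); both have the top bit set, so pshufb
  // writes zero to those lanes.
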
  void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                   XMMRegister tmp2, Register scratch) {
    ASM_CODE_COMMENT(this);
    DCHECK_NE(dst, tmp1);
    DCHECK_NE(src, tmp1);
    DCHECK_NE(dst, tmp2);
    DCHECK_NE(src, tmp2);
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vmovdqa(tmp1,
              ExternalReferenceAsOperand(
                  ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                  scratch));
      vpandn(tmp2, tmp1, src);
      vpand(dst, tmp1, src);
      vmovdqa(tmp1,
              ExternalReferenceAsOperand(
                  ExternalReference::address_of_wasm_i8x16_popcnt_mask(),
                  scratch));
      vpsrlw(tmp2, tmp2, 4);
      vpshufb(dst, tmp1, dst);
      vpshufb(tmp2, tmp1, tmp2);
      vpaddb(dst, dst, tmp2);
    } else if (CpuFeatures::IsSupported(INTEL_ATOM)) {
      // Pre-Goldmont low-power Intel microarchitectures have a very slow
      // PSHUFB instruction, so use a PSHUFB-free divide-and-conquer algorithm
      // on these processors. The ATOM CPU feature captures exactly the right
      // set of processors.
      movaps(tmp1, src);
      psrlw(tmp1, 1);
      if (dst != src) {
        movaps(dst, src);
      }
      andps(tmp1,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_i8x16_splat_0x55(),
                scratch));
      psubb(dst, tmp1);
      Operand splat_0x33 = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_splat_0x33(), scratch);
      movaps(tmp1, dst);
      andps(dst, splat_0x33);
      psrlw(tmp1, 2);
      andps(tmp1, splat_0x33);
      paddb(dst, tmp1);
      movaps(tmp1, dst);
      psrlw(dst, 4);
      paddb(dst, tmp1);
      andps(dst,
            ExternalReferenceAsOperand(
                ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                scratch));
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      movaps(tmp1,
             ExternalReferenceAsOperand(
                 ExternalReference::address_of_wasm_i8x16_splat_0x0f(),
                 scratch));
      Operand mask = ExternalReferenceAsOperand(
          ExternalReference::address_of_wasm_i8x16_popcnt_mask(), scratch);
      if (tmp2 != tmp1) {
        movaps(tmp2, tmp1);
      }
      andps(tmp1, src);
      andnps(tmp2, src);
      psrlw(tmp2, 4);
      movaps(dst, mask);
      pshufb(dst, tmp1);
      movaps(tmp1, mask);
      pshufb(tmp1, tmp2);
      paddb(dst, tmp1);
    }
  }

 private:
  // All implementation-specific methods must be called through this.
  Impl* impl() { return static_cast<Impl*>(this); }

  Operand ExternalReferenceAsOperand(ExternalReference reference,
                                     Register scratch) {
    return impl()->ExternalReferenceAsOperand(reference, scratch);
  }

  using FloatInstruction = void (SharedTurboAssembler::*)(XMMRegister,
                                                          XMMRegister, Operand);
  void FloatUnop(XMMRegister dst, XMMRegister src, Register tmp,
                 FloatInstruction op, ExternalReference ext) {
    if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
      movaps(dst, src);
      src = dst;
    }
    SharedTurboAssembler* assm = this;
    (assm->*op)(dst, src, ExternalReferenceAsOperand(ext, tmp));
  }
};

}  // namespace internal
}  // namespace v8

#endif  // V8_CODEGEN_SHARED_IA32_X64_MACRO_ASSEMBLER_SHARED_IA32_X64_H_