Commit 83a635e0, authored by Weiliang Lin, committed by Benedikt Meurer

[x64] Introduce FMA3 instructions on scalar data elements.

R=bmeurer@chromium.org

Review URL: https://codereview.chromium.org/757503002

Patch from Weiliang Lin <weiliang.lin@intel.com>.

Cr-Commit-Position: refs/heads/master@{#25509}
parent 819955b2
......@@ -291,32 +291,35 @@ static bool HasListItem(const char* list, const char* item) {
#endif // V8_HOST_ARCH_IA32 || V8_HOST_ARCH_X64
CPU::CPU() : stepping_(0),
model_(0),
ext_model_(0),
family_(0),
ext_family_(0),
type_(0),
implementer_(0),
architecture_(0),
part_(0),
has_fpu_(false),
has_cmov_(false),
has_sahf_(false),
has_mmx_(false),
has_sse_(false),
has_sse2_(false),
has_sse3_(false),
has_ssse3_(false),
has_sse41_(false),
has_sse42_(false),
has_idiva_(false),
has_neon_(false),
has_thumb2_(false),
has_vfp_(false),
has_vfp3_(false),
has_vfp3_d32_(false),
is_fp64_mode_(false) {
CPU::CPU()
: stepping_(0),
model_(0),
ext_model_(0),
family_(0),
ext_family_(0),
type_(0),
implementer_(0),
architecture_(0),
part_(0),
has_fpu_(false),
has_cmov_(false),
has_sahf_(false),
has_mmx_(false),
has_sse_(false),
has_sse2_(false),
has_sse3_(false),
has_ssse3_(false),
has_sse41_(false),
has_sse42_(false),
has_avx_(false),
has_fma3_(false),
has_idiva_(false),
has_neon_(false),
has_thumb2_(false),
has_vfp_(false),
has_vfp3_(false),
has_vfp3_d32_(false),
is_fp64_mode_(false) {
memcpy(vendor_, "Unknown", 8);
#if V8_OS_NACL
// Portable host shouldn't do feature detection.
......@@ -356,6 +359,8 @@ CPU::CPU() : stepping_(0),
has_ssse3_ = (cpu_info[2] & 0x00000200) != 0;
has_sse41_ = (cpu_info[2] & 0x00080000) != 0;
has_sse42_ = (cpu_info[2] & 0x00100000) != 0;
// In this CPUID word (the same one the SSSE3/SSE4.x bits above come from),
// bit 27 is OSXSAVE and bit 28 is AVX. AVX is only usable when BOTH are set,
// so compare against the full mask; the previous "!= 0" test reported AVX
// when just one of the two bits was present.
// NOTE(review): a complete check also reads XCR0 via XGETBV to confirm the OS
// preserves the extended register state; that needs a helper that is not
// visible in this hunk.
has_avx_ = (cpu_info[2] & 0x18000000) == 0x18000000;
// FMA3 (bit 12) is only reported together with AVX.
if (has_avx_) has_fma3_ = (cpu_info[2] & 0x00001000) != 0;
}
#if V8_HOST_ARCH_IA32
......
......@@ -68,6 +68,8 @@ class CPU FINAL {
bool has_ssse3() const { return has_ssse3_; }
bool has_sse41() const { return has_sse41_; }
bool has_sse42() const { return has_sse42_; }
// AVX / FMA3 availability as recorded by the CPU() constructor.
bool has_avx() const { return has_avx_; }
bool has_fma3() const { return has_fma3_; }
// arm features
bool has_idiva() const { return has_idiva_; }
......@@ -101,6 +103,8 @@ class CPU FINAL {
bool has_ssse3_;
bool has_sse41_;
bool has_sse42_;
// Initialised to false and then updated by the CPU() constructor's feature
// probing; exposed through has_avx() / has_fma3().
bool has_avx_;
bool has_fma3_;
bool has_idiva_;
bool has_neon_;
bool has_thumb2_;
......
......@@ -421,6 +421,8 @@ DEFINE_BOOL(enable_sse4_1, true,
"enable use of SSE4.1 instructions if available")
DEFINE_BOOL(enable_sahf, true,
"enable use of SAHF instruction if available (X64 only)")
// Switches for the AVX and FMA3 code paths. CpuFeatures::ProbeImpl marks a
// feature as supported only when the CPU reports it AND the matching flag is
// set.
DEFINE_BOOL(enable_avx, true, "enable use of AVX instructions if available")
DEFINE_BOOL(enable_fma3, true, "enable use of FMA3 instructions if available")
DEFINE_BOOL(enable_vfp3, ENABLE_VFP3_DEFAULT,
"enable use of VFP3 instructions if available")
DEFINE_BOOL(enable_armv7, ENABLE_ARMV7_DEFAULT,
......
......@@ -617,6 +617,8 @@ enum CpuFeature {
SSE4_1,
SSE3,
SAHF,
AVX,
FMA3,
// ARM
VFP3,
ARMv7,
......
......@@ -27,12 +27,19 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
if (cpu.has_sse41() && FLAG_enable_sse4_1) supported_ |= 1u << SSE4_1;
if (cpu.has_sse3() && FLAG_enable_sse3) supported_ |= 1u << SSE3;
// SAHF is not generally available in long mode.
if (cpu.has_sahf() && FLAG_enable_sahf) supported_|= 1u << SAHF;
if (cpu.has_sahf() && FLAG_enable_sahf) supported_ |= 1u << SAHF;
if (cpu.has_avx() && FLAG_enable_avx) supported_ |= 1u << AVX;
if (cpu.has_fma3() && FLAG_enable_fma3) supported_ |= 1u << FMA3;
}
// Currently a no-op: the body is empty.
void CpuFeatures::PrintTarget() { }
void CpuFeatures::PrintFeatures() { }
void CpuFeatures::PrintFeatures() {
printf("SSE3=%d SSE4_1=%d SAHF=%d AVX=%d FMA3=%d\n",
CpuFeatures::IsSupported(SSE3), CpuFeatures::IsSupported(SSE4_1),
CpuFeatures::IsSupported(SAHF), CpuFeatures::IsSupported(AVX),
CpuFeatures::IsSupported(FMA3));
}
// -----------------------------------------------------------------------------
......@@ -2638,6 +2645,104 @@ void Assembler::movapd(XMMRegister dst, XMMRegister src) {
}
// addss dst, src (register form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x58, then
// the operand encoding for (dst, src).
void Assembler::addss(XMMRegister dst, XMMRegister src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x58;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// addss dst, [src] (memory-operand form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x58, then
// the operand encoding for (dst, src).
void Assembler::addss(XMMRegister dst, const Operand& src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x58;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// subss dst, src (register form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x5C, then
// the operand encoding for (dst, src).
void Assembler::subss(XMMRegister dst, XMMRegister src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x5C;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// subss dst, [src] (memory-operand form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x5C, then
// the operand encoding for (dst, src).
void Assembler::subss(XMMRegister dst, const Operand& src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x5C;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// mulss dst, src (register form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x59, then
// the operand encoding for (dst, src).
void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x59;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// mulss dst, [src] (memory-operand form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x59, then
// the operand encoding for (dst, src).
void Assembler::mulss(XMMRegister dst, const Operand& src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x59;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// divss dst, src (register form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x5E, then
// the operand encoding for (dst, src).
void Assembler::divss(XMMRegister dst, XMMRegister src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x5E;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// divss dst, [src] (memory-operand form).
// Byte sequence: 0xF3 prefix, optional REX, 0x0F escape, opcode 0x5E, then
// the operand encoding for (dst, src).
void Assembler::divss(XMMRegister dst, const Operand& src) {
  const int kPrefix = 0xF3;
  const int kEscape = 0x0F;
  const int kOpcode = 0x5E;

  EnsureSpace ensure_space(this);
  emit(kPrefix);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// ucomiss dst, src (register form).
// Byte sequence: optional REX, 0x0f escape, opcode 0x2e, then the operand
// encoding for (dst, src). Unlike the *ss arithmetic emitters there is no
// leading prefix byte.
void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  const int kEscape = 0x0f;
  const int kOpcode = 0x2e;

  EnsureSpace ensure_space(this);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
// ucomiss dst, [src] (memory-operand form).
// Byte sequence: optional REX, 0x0f escape, opcode 0x2e, then the operand
// encoding for (dst, src). Unlike the *ss arithmetic emitters there is no
// leading prefix byte.
void Assembler::ucomiss(XMMRegister dst, const Operand& src) {
  const int kEscape = 0x0f;
  const int kOpcode = 0x2e;

  EnsureSpace ensure_space(this);
  emit_optional_rex_32(dst, src);
  emit(kEscape);
  emit(kOpcode);
  emit_sse_operand(dst, src);
}
void Assembler::movss(XMMRegister dst, const Operand& src) {
EnsureSpace ensure_space(this);
emit(0xF3); // single
......@@ -3077,6 +3182,86 @@ void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) {
}
// Byte 1 of the 3-byte VEX prefix, register/register form.
// The high bit of |reg| and the high bit of |rm| are packed two positions
// apart, complemented and shifted into bits 7..5; |m| (checked to be 1..3)
// fills the low bits.
void Assembler::emit_vex3_byte1(XMMRegister reg, XMMRegister rm, byte m) {
  DCHECK(1 <= m && m <= 3);
  // The middle position stays clear here, so it reads as 1 once complemented.
  const int extension_bits = (reg.high_bit() << 2) | rm.high_bit();
  const byte inverted_rxb = ~extension_bits << 5;
  emit(inverted_rxb | m);
}
// Byte 1 of the 3-byte VEX prefix, register/memory form.
// The high bit of |reg| is combined with the operand's rex_ bits, the result
// is complemented and shifted into bits 7..5; |m| (checked to be 1..3) fills
// the low bits.
void Assembler::emit_vex3_byte1(XMMRegister reg, const Operand& rm, byte m) {
  DCHECK(1 <= m && m <= 3);
  const int extension_bits = (reg.high_bit() << 2) | rm.rex_;
  const byte inverted_rxb = ~extension_bits << 5;
  emit(inverted_rxb | m);
}
// Byte 1 of the 2-byte VEX prefix.
// The high bit of |reg| is placed above the register code of |v|, the
// combination is complemented and shifted into bits 7..3; |lpp| (checked to
// be at most 3) fills the low bits.
void Assembler::emit_vex2_byte1(XMMRegister reg, XMMRegister v, byte lpp) {
  DCHECK(lpp <= 3);
  const int reg_and_v = (reg.high_bit() << 4) | v.code();
  const byte inverted_rv = ~reg_and_v << 3;
  emit(inverted_rv | lpp);
}
// Byte 2 of the 3-byte VEX prefix.
// Layout: |w| (0 or 1) in bit 7, the complemented 4-bit code of |v| in
// bits 6..3, and |lpp| (checked to be at most 3) in the low bits.
void Assembler::emit_vex3_byte2(byte w, XMMRegister v, byte lpp) {
  DCHECK(w <= 1);
  DCHECK(lpp <= 3);
  const int w_field = w << 7;
  const int inverted_v_field = (~v.code() & 0xf) << 3;
  emit(w_field | inverted_v_field | lpp);
}
// Shared emitter for the *sd FMA3 instructions, register form of |src2|.
// Emits a 3-byte VEX prefix (byte 1 built from dst/src2 with selector 0x02,
// byte 2 with w = 1, src1 in the v field and lpp = 0x01), then the opcode
// byte |op| and the operand encoding for (dst, src2). Requires FMA3 to be
// enabled.
void Assembler::vfmasd(byte op, XMMRegister dst, XMMRegister src1,
                       XMMRegister src2) {
  const byte kMapSelect = 0x02;
  const byte kW = 0x1;
  const byte kLpp = 0x01;

  DCHECK(IsEnabled(FMA3));
  EnsureSpace ensure_space(this);
  emit_vex3_byte0();
  emit_vex3_byte1(dst, src2, kMapSelect);
  emit_vex3_byte2(kW, src1, kLpp);
  emit(op);
  emit_sse_operand(dst, src2);
}
// Shared emitter for the *sd FMA3 instructions, memory form of |src2|.
// Emits a 3-byte VEX prefix (byte 1 built from dst/src2 with selector 0x02,
// byte 2 with w = 1, src1 in the v field and lpp = 0x01), then the opcode
// byte |op| and the operand encoding for (dst, src2). Requires FMA3 to be
// enabled.
void Assembler::vfmasd(byte op, XMMRegister dst, XMMRegister src1,
                       const Operand& src2) {
  const byte kMapSelect = 0x02;
  const byte kW = 0x1;
  const byte kLpp = 0x01;

  DCHECK(IsEnabled(FMA3));
  EnsureSpace ensure_space(this);
  emit_vex3_byte0();
  emit_vex3_byte1(dst, src2, kMapSelect);
  emit_vex3_byte2(kW, src1, kLpp);
  emit(op);
  emit_sse_operand(dst, src2);
}
// Shared emitter for the *ss FMA3 instructions, register form of |src2|.
// Emits a 3-byte VEX prefix (byte 1 built from dst/src2 with selector 0x02,
// byte 2 with w = 0, src1 in the v field and lpp = 0x01), then the opcode
// byte |op| and the operand encoding for (dst, src2). Requires FMA3 to be
// enabled.
void Assembler::vfmass(byte op, XMMRegister dst, XMMRegister src1,
                       XMMRegister src2) {
  const byte kMapSelect = 0x02;
  const byte kW = 0x0;
  const byte kLpp = 0x01;

  DCHECK(IsEnabled(FMA3));
  EnsureSpace ensure_space(this);
  emit_vex3_byte0();
  emit_vex3_byte1(dst, src2, kMapSelect);
  emit_vex3_byte2(kW, src1, kLpp);
  emit(op);
  emit_sse_operand(dst, src2);
}
// Shared emitter for the *ss FMA3 instructions, memory form of |src2|.
// Emits a 3-byte VEX prefix (byte 1 built from dst/src2 with selector 0x02,
// byte 2 with w = 0, src1 in the v field and lpp = 0x01), then the opcode
// byte |op| and the operand encoding for (dst, src2). Requires FMA3 to be
// enabled.
void Assembler::vfmass(byte op, XMMRegister dst, XMMRegister src1,
                       const Operand& src2) {
  const byte kMapSelect = 0x02;
  const byte kW = 0x0;
  const byte kLpp = 0x01;

  DCHECK(IsEnabled(FMA3));
  EnsureSpace ensure_space(this);
  emit_vex3_byte0();
  emit_vex3_byte1(dst, src2, kMapSelect);
  emit_vex3_byte2(kW, src1, kLpp);
  emit(op);
  emit_sse_operand(dst, src2);
}
void Assembler::emit_sse_operand(XMMRegister reg, const Operand& adr) {
Register ireg = { reg.code() };
emit_operand(ireg, adr);
......
......@@ -1014,6 +1014,17 @@ class Assembler : public AssemblerBase {
void sahf();
// SSE instructions
// Scalar single-precision add/sub/mul/div and unordered compare. Each
// instruction is declared twice: once with a register source and once with a
// memory-operand source.
void addss(XMMRegister dst, XMMRegister src);
void addss(XMMRegister dst, const Operand& src);
void subss(XMMRegister dst, XMMRegister src);
void subss(XMMRegister dst, const Operand& src);
void mulss(XMMRegister dst, XMMRegister src);
void mulss(XMMRegister dst, const Operand& src);
void divss(XMMRegister dst, XMMRegister src);
void divss(XMMRegister dst, const Operand& src);
void ucomiss(XMMRegister dst, XMMRegister src);
void ucomiss(XMMRegister dst, const Operand& src);
void movaps(XMMRegister dst, XMMRegister src);
void movss(XMMRegister dst, const Operand& src);
void movss(const Operand& dst, XMMRegister src);
......@@ -1123,6 +1134,157 @@ class Assembler : public AssemblerBase {
void roundsd(XMMRegister dst, XMMRegister src, RoundingMode mode);
// AVX instruction
// vfmadd{132,213,231}sd: thin wrappers that forward to vfmasd() with the
// opcode byte 0x99, 0xa9 or 0xb9 respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfmadd132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0x99, dst, src1, src2);
}
void vfmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xa9, dst, src1, src2);
}
void vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xb9, dst, src1, src2);
}
void vfmadd132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0x99, dst, src1, src2);
}
void vfmadd213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xa9, dst, src1, src2);
}
void vfmadd231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xb9, dst, src1, src2);
}
// vfmsub{132,213,231}sd: thin wrappers that forward to vfmasd() with the
// opcode byte 0x9b, 0xab or 0xbb respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfmsub132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0x9b, dst, src1, src2);
}
void vfmsub213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xab, dst, src1, src2);
}
void vfmsub231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xbb, dst, src1, src2);
}
void vfmsub132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0x9b, dst, src1, src2);
}
void vfmsub213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xab, dst, src1, src2);
}
void vfmsub231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xbb, dst, src1, src2);
}
// vfnmadd{132,213,231}sd: thin wrappers that forward to vfmasd() with the
// opcode byte 0x9d, 0xad or 0xbd respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfnmadd132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0x9d, dst, src1, src2);
}
void vfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xad, dst, src1, src2);
}
void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xbd, dst, src1, src2);
}
void vfnmadd132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0x9d, dst, src1, src2);
}
void vfnmadd213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xad, dst, src1, src2);
}
void vfnmadd231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xbd, dst, src1, src2);
}
// vfnmsub{132,213,231}sd: thin wrappers that forward to vfmasd() with the
// opcode byte 0x9f, 0xaf or 0xbf respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfnmsub132sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0x9f, dst, src1, src2);
}
void vfnmsub213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xaf, dst, src1, src2);
}
void vfnmsub231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmasd(0xbf, dst, src1, src2);
}
void vfnmsub132sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0x9f, dst, src1, src2);
}
void vfnmsub213sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xaf, dst, src1, src2);
}
void vfnmsub231sd(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmasd(0xbf, dst, src1, src2);
}
// Common emitters behind every *sd FMA3 wrapper; |op| is the opcode byte and
// |src2| is either a register or a memory operand.
void vfmasd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmasd(byte op, XMMRegister dst, XMMRegister src1, const Operand& src2);
// vfmadd{132,213,231}ss: thin wrappers that forward to vfmass() with the
// opcode byte 0x99, 0xa9 or 0xb9 respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfmadd132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0x99, dst, src1, src2);
}
void vfmadd213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xa9, dst, src1, src2);
}
void vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xb9, dst, src1, src2);
}
void vfmadd132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0x99, dst, src1, src2);
}
void vfmadd213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xa9, dst, src1, src2);
}
void vfmadd231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xb9, dst, src1, src2);
}
// vfmsub{132,213,231}ss: thin wrappers that forward to vfmass() with the
// opcode byte 0x9b, 0xab or 0xbb respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfmsub132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0x9b, dst, src1, src2);
}
void vfmsub213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xab, dst, src1, src2);
}
void vfmsub231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xbb, dst, src1, src2);
}
void vfmsub132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0x9b, dst, src1, src2);
}
void vfmsub213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xab, dst, src1, src2);
}
void vfmsub231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xbb, dst, src1, src2);
}
// vfnmadd{132,213,231}ss: thin wrappers that forward to vfmass() with the
// opcode byte 0x9d, 0xad or 0xbd respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfnmadd132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0x9d, dst, src1, src2);
}
void vfnmadd213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xad, dst, src1, src2);
}
void vfnmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xbd, dst, src1, src2);
}
void vfnmadd132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0x9d, dst, src1, src2);
}
void vfnmadd213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xad, dst, src1, src2);
}
void vfnmadd231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xbd, dst, src1, src2);
}
// vfnmsub{132,213,231}ss: thin wrappers that forward to vfmass() with the
// opcode byte 0x9f, 0xaf or 0xbf respectively. The first three take |src2|
// as a register, the last three as a memory operand.
void vfnmsub132ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0x9f, dst, src1, src2);
}
void vfnmsub213ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xaf, dst, src1, src2);
}
void vfnmsub231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmass(0xbf, dst, src1, src2);
}
void vfnmsub132ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0x9f, dst, src1, src2);
}
void vfnmsub213ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xaf, dst, src1, src2);
}
void vfnmsub231ss(XMMRegister dst, XMMRegister src1, const Operand& src2) {
vfmass(0xbf, dst, src1, src2);
}
// Common emitters behind every *ss FMA3 wrapper; |op| is the opcode byte and
// |src2| is either a register or a memory operand.
void vfmass(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmass(byte op, XMMRegister dst, XMMRegister src1, const Operand& src2);
// Debugging
void Print();
......@@ -1316,6 +1478,14 @@ class Assembler : public AssemblerBase {
}
}
// Emit vex prefix
// Byte 0 is a fixed value: 0xc5 for the 2-byte form and 0xc4 for the 3-byte
// form. The remaining helpers pack register bits and the caller-supplied
// fields (|lpp|, |m|, |w|) into the following prefix byte(s).
void emit_vex2_byte0() { emit(0xc5); }
void emit_vex2_byte1(XMMRegister reg, XMMRegister v, byte lpp);
void emit_vex3_byte0() { emit(0xc4); }
// Overloaded for a register and for a memory r/m operand.
void emit_vex3_byte1(XMMRegister reg, XMMRegister rm, byte m);
void emit_vex3_byte1(XMMRegister reg, const Operand& rm, byte m);
void emit_vex3_byte2(byte w, XMMRegister v, byte lpp);
// Emit the ModR/M byte, and optionally the SIB byte and
// 1- or 4-byte offset for a memory operand. Also encodes
// the second operand of the operation, a register or operation
......
This diff is collapsed.
This diff is collapsed.
......@@ -51,7 +51,7 @@ TEST(DisasmX64) {
CcTest::InitializeVM();
Isolate* isolate = CcTest::i_isolate();
HandleScope scope(isolate);
v8::internal::byte buffer[2048];
v8::internal::byte buffer[4096];
Assembler assm(isolate, buffer, sizeof buffer);
DummyStaticFunction(NULL); // just bloody use it (DELETE; debugging)
......@@ -394,6 +394,14 @@ TEST(DisasmX64) {
__ xorps(xmm0, Operand(rbx, rcx, times_4, 10000));
// Arithmetic operation
__ addss(xmm1, xmm0);
__ addss(xmm1, Operand(rbx, rcx, times_4, 10000));
__ mulss(xmm1, xmm0);
__ mulss(xmm1, Operand(rbx, rcx, times_4, 10000));
__ subss(xmm1, xmm0);
__ subss(xmm1, Operand(rbx, rcx, times_4, 10000));
__ divss(xmm1, xmm0);
__ divss(xmm1, Operand(rbx, rcx, times_4, 10000));
__ addps(xmm1, xmm0);
__ addps(xmm1, Operand(rbx, rcx, times_4, 10000));
__ subps(xmm1, xmm0);
......@@ -402,6 +410,9 @@ TEST(DisasmX64) {
__ mulps(xmm1, Operand(rbx, rcx, times_4, 10000));
__ divps(xmm1, xmm0);
__ divps(xmm1, Operand(rbx, rcx, times_4, 10000));
__ ucomiss(xmm0, xmm1);
__ ucomiss(xmm0, Operand(rbx, rcx, times_4, 10000));
}
// SSE 2 instructions
{
......@@ -464,6 +475,74 @@ TEST(DisasmX64) {
}
}
// FMA3 instruction
{
if (CpuFeatures::IsSupported(FMA3)) {
CpuFeatureScope scope(&assm, FMA3);
__ vfmadd132sd(xmm0, xmm1, xmm2);
__ vfmadd132sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd213sd(xmm0, xmm1, xmm2);
__ vfmadd213sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd231sd(xmm0, xmm1, xmm2);
__ vfmadd231sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd132sd(xmm9, xmm10, xmm11);
__ vfmadd132sd(xmm9, xmm10, Operand(r9, r11, times_4, 10000));
__ vfmadd213sd(xmm9, xmm10, xmm11);
__ vfmadd213sd(xmm9, xmm10, Operand(r9, r11, times_4, 10000));
__ vfmadd231sd(xmm9, xmm10, xmm11);
__ vfmadd231sd(xmm9, xmm10, Operand(r9, r11, times_4, 10000));
__ vfmsub132sd(xmm0, xmm1, xmm2);
__ vfmsub132sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmsub213sd(xmm0, xmm1, xmm2);
__ vfmsub213sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmsub231sd(xmm0, xmm1, xmm2);
__ vfmsub231sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd132sd(xmm0, xmm1, xmm2);
__ vfnmadd132sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd213sd(xmm0, xmm1, xmm2);
__ vfnmadd213sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd231sd(xmm0, xmm1, xmm2);
__ vfnmadd231sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub132sd(xmm0, xmm1, xmm2);
__ vfnmsub132sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub213sd(xmm0, xmm1, xmm2);
__ vfnmsub213sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub231sd(xmm0, xmm1, xmm2);
__ vfnmsub231sd(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd132ss(xmm0, xmm1, xmm2);
__ vfmadd132ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd213ss(xmm0, xmm1, xmm2);
__ vfmadd213ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmadd231ss(xmm0, xmm1, xmm2);
__ vfmadd231ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmsub132ss(xmm0, xmm1, xmm2);
__ vfmsub132ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmsub213ss(xmm0, xmm1, xmm2);
__ vfmsub213ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfmsub231ss(xmm0, xmm1, xmm2);
__ vfmsub231ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd132ss(xmm0, xmm1, xmm2);
__ vfnmadd132ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd213ss(xmm0, xmm1, xmm2);
__ vfnmadd213ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmadd231ss(xmm0, xmm1, xmm2);
__ vfnmadd231ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub132ss(xmm0, xmm1, xmm2);
__ vfnmsub132ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub213ss(xmm0, xmm1, xmm2);
__ vfnmsub213ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
__ vfnmsub231ss(xmm0, xmm1, xmm2);
__ vfnmsub231ss(xmm0, xmm1, Operand(rbx, rcx, times_4, 10000));
}
}
// xchg.
{
__ xchgq(rax, rax);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment