Commit 506c0979 authored by Zhi An Ng, committed by Commit Bot

[x64] Sort out move instructions in codegen

On AVX, it is better to use the appropriate integer or floating-point
moves, depending on which instructions produce and consume the moved
values, since there can be a delay when a value crosses between the
integer and floating-point domains. On SSE systems this matters less,
and we can use movaps/movups, which are 1 byte shorter than
movdqa/movdqu.

This patch cleans up a couple of places and defines the macro-assembler
functions Movdqa, Movdqu, and Movapd, which fall back to movaps/movups
when AVX is not supported.
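For illustration, a minimal standalone sketch of the dispatch pattern these
helpers follow (not V8 code; HasAVX and the printed mnemonics are stand-ins
for CpuFeatures::IsSupported(AVX) and the real instruction emitters):

  #include <cstdio>

  // Stand-in for CpuFeatures::IsSupported(AVX); hard-coded for the sketch.
  static bool HasAVX() { return true; }

  // Register-to-register 128-bit integer move: prefer the domain-correct
  // AVX encoding, otherwise fall back to the shorter SSE encoding.
  static void Movdqa(const char* dst, const char* src) {
    if (HasAVX()) {
      // Keep the move in the integer domain to avoid a domain-crossing delay.
      std::printf("vmovdqa %s, %s\n", dst, src);
    } else {
      // movaps is 1 byte shorter than movdqa and behaves identically here.
      std::printf("movaps %s, %s\n", dst, src);
    }
  }

  int main() { Movdqa("xmm0", "xmm1"); }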

Change-Id: Iba6c54e218875f1a70f61792978d7b3f69edfb4b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2599843
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71884}
parent ddd9b349
@@ -2750,6 +2750,15 @@ void Assembler::movdqa(XMMRegister dst, Operand src) {
   emit_sse_operand(dst, src);
 }
 
+void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  emit(0x66);
+  emit_rex_64(src, dst);
+  emit(0x0F);
+  emit(0x7F);
+  emit_sse_operand(src, dst);
+}
+
 void Assembler::movdqu(Operand dst, XMMRegister src) {
   EnsureSpace ensure_space(this);
   emit(0xF3);
@@ -3486,6 +3495,14 @@ void Assembler::vmovq(Register dst, XMMRegister src) {
   emit_sse_operand(src, dst);
 }
 
+void Assembler::vmovdqa(XMMRegister dst, XMMRegister src) {
+  DCHECK(IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F, kWIG);
+  emit(0x6F);
+  emit_sse_operand(dst, src);
+}
+
 void Assembler::vmovdqu(XMMRegister dst, Operand src) {
   DCHECK(IsEnabled(AVX));
   EnsureSpace ensure_space(this);
@@ -3502,6 +3519,14 @@ void Assembler::vmovdqu(Operand dst, XMMRegister src) {
   emit_sse_operand(src, dst);
 }
 
+void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  DCHECK(IsEnabled(AVX));
+  EnsureSpace ensure_space(this);
+  emit_vex_prefix(src, xmm0, dst, kL128, kF3, k0F, kWIG);
+  emit(0x7F);
+  emit_sse_operand(src, dst);
+}
+
 void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
   DCHECK(IsEnabled(AVX));
   EnsureSpace ensure_space(this);
......
@@ -1195,6 +1195,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void movdqa(Operand dst, XMMRegister src);
   void movdqa(XMMRegister dst, Operand src);
+  void movdqa(XMMRegister dst, XMMRegister src);
   void movdqu(Operand dst, XMMRegister src);
   void movdqu(XMMRegister dst, Operand src);
@@ -1328,8 +1329,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   }
   void vmovsd(XMMRegister dst, Operand src) { vsd(0x10, dst, xmm0, src); }
   void vmovsd(Operand dst, XMMRegister src) { vsd(0x11, src, xmm0, dst); }
+  void vmovdqa(XMMRegister dst, XMMRegister src);
   void vmovdqu(XMMRegister dst, Operand src);
   void vmovdqu(Operand dst, XMMRegister src);
+  void vmovdqu(XMMRegister dst, XMMRegister src);
   void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
   void vmovlps(Operand dst, XMMRegister src);
......
@@ -694,6 +694,49 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1,
   return bytes;
 }
 
+void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    // Many AVX processors have separate integer/floating-point domains. Use the
+    // appropriate instructions.
+    vmovdqa(dst, src);
+  } else {
+    // On SSE, movaps is 1 byte shorter than movdqa, and has the same behavior.
+    // Most SSE processors also don't have the same delay moving between integer
+    // and floating-point domains.
+    movaps(dst, src);
+  }
+}
+
+void TurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vmovapd(dst, src);
+  } else {
+    // On SSE, movaps is 1 byte shorter than movapd, and has the same behavior.
+    movaps(dst, src);
+  }
+}
+
+template <typename Dst, typename Src>
+void TurboAssembler::Movdqu(Dst dst, Src src) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vmovdqu(dst, src);
+  } else {
+    // movups is 1 byte shorter than movdqu. On most SSE systems, this incurs
+    // no delay moving between integer and floating-point domain.
+    movups(dst, src);
+  }
+}
+
+template void TurboAssembler::Movdqu<XMMRegister, Operand>(XMMRegister dst,
+                                                           Operand src);
+template void TurboAssembler::Movdqu<Operand, XMMRegister>(Operand dst,
+                                                           XMMRegister src);
+template void TurboAssembler::Movdqu<XMMRegister, XMMRegister>(XMMRegister dst,
+                                                               XMMRegister src);
+
 void TurboAssembler::Cvtss2sd(XMMRegister dst, XMMRegister src) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(this, AVX);
......
@@ -141,14 +141,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Movd, movd)
   AVX_OP(Movq, movq)
   AVX_OP(Movaps, movaps)
-  AVX_OP(Movapd, movapd)
   AVX_OP(Movups, movups)
   AVX_OP(Movmskps, movmskps)
   AVX_OP(Movmskpd, movmskpd)
   AVX_OP(Pmovmskb, pmovmskb)
   AVX_OP(Movss, movss)
   AVX_OP(Movsd, movsd)
-  AVX_OP(Movdqu, movdqu)
   AVX_OP(Movhlps, movhlps)
   AVX_OP(Movlps, movlps)
   AVX_OP(Movhps, movhps)
@@ -355,6 +353,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                      Label* condition_met,
                      Label::Distance condition_met_distance = Label::kFar);
 
+  void Movapd(XMMRegister dst, XMMRegister src);
+  void Movdqa(XMMRegister dst, XMMRegister src);
+
+  template <typename Dst, typename Src>
+  void Movdqu(Dst dst, Src src);
+
   void Cvtss2sd(XMMRegister dst, XMMRegister src);
   void Cvtss2sd(XMMRegister dst, Operand src);
   void Cvtsd2ss(XMMRegister dst, XMMRegister src);
......
@@ -2839,7 +2839,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       if (dst == src) {
-        __ Movapd(kScratchDoubleReg, src);
+        __ Movdqa(kScratchDoubleReg, src);
         src = kScratchDoubleReg;
       }
       __ Pxor(dst, dst);
@@ -2889,8 +2889,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister tmp1 = i.TempSimd128Register(0);
       XMMRegister tmp2 = i.TempSimd128Register(1);
-      __ Movaps(tmp1, left);
-      __ Movaps(tmp2, right);
+      __ Movdqa(tmp1, left);
+      __ Movdqa(tmp2, right);
       // Multiply high dword of each qword of left with right.
       __ Psrlq(tmp1, 32);
@@ -3546,8 +3546,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       // right= BBbb BBbb ... BBbb BBbb
       // t = 00AA 00AA ... 00AA 00AA
       // s = 00BB 00BB ... 00BB 00BB
-      __ Movaps(tmp, dst);
-      __ Movaps(kScratchDoubleReg, right);
+      __ Movdqa(tmp, dst);
+      __ Movdqa(kScratchDoubleReg, right);
       __ Psrlw(tmp, byte{8});
       __ Psrlw(kScratchDoubleReg, byte{8});
       // dst = left * 256
@@ -3696,12 +3696,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
         XMMRegister mask = i.InputSimd128Register(2);
         DCHECK_EQ(xmm0, mask);
-        __ movapd(kScratchDoubleReg, mask);
-        __ pxor(mask, mask);
+        __ movaps(kScratchDoubleReg, mask);
+        __ xorps(mask, mask);
         __ pcmpgtw(mask, kScratchDoubleReg);
         __ pblendvb(i.OutputSimd128Register(), i.InputSimd128Register(1));
         // Restore mask.
-        __ movapd(mask, kScratchDoubleReg);
+        __ movaps(mask, kScratchDoubleReg);
       }
       break;
     }
@@ -3747,7 +3747,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      if (dst == src) {
-        __ Movaps(kScratchDoubleReg, dst);
+        __ Movdqa(kScratchDoubleReg, dst);
        __ Pcmpeqd(dst, dst);
        __ Pxor(dst, kScratchDoubleReg);
      } else {
@@ -3816,7 +3816,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       } else {  // two input operands
         DCHECK_NE(tmp_simd, i.InputSimd128Register(1));
         DCHECK_EQ(6, instr->InputCount());
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 0);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 0);
         uint32_t mask1[4] = {};
         for (int j = 5; j > 1; j--) {
           uint32_t lanes = i.InputUint32(j);
@@ -3830,9 +3830,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         uint32_t mask2[4] = {};
         if (instr->InputAt(1)->IsSimd128Register()) {
           XMMRegister src1 = i.InputSimd128Register(1);
-          if (src1 != dst) __ movups(dst, src1);
+          if (src1 != dst) __ Movdqa(dst, src1);
         } else {
-          __ Movups(dst, i.InputOperand(1));
+          __ Movdqu(dst, i.InputOperand(1));
         }
         for (int j = 5; j > 1; j--) {
           uint32_t lanes = i.InputUint32(j);
@@ -4065,7 +4065,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
         __ Psrld(kScratchDoubleReg, byte{16});
         src2 = kScratchDoubleReg;
       }
@@ -4091,7 +4091,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
         __ Psrlw(kScratchDoubleReg, byte{8});
         src2 = kScratchDoubleReg;
       }
@@ -4104,7 +4104,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister src2 = dst;
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (instr->InputCount() == 2) {
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
        __ Psllw(kScratchDoubleReg, byte{8});
        __ Psrlw(kScratchDoubleReg, byte{8});
        src2 = kScratchDoubleReg;
@@ -4119,10 +4119,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Psllw(dst, byte{8});
       if (instr->InputCount() == 1) {
-        __ Movups(kScratchDoubleReg, dst);
+        __ Movdqa(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
         __ Psllw(kScratchDoubleReg, byte{8});
       }
       __ Psrlw(dst, byte{8});
@@ -4134,10 +4134,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       __ Psrlw(dst, byte{8});
       if (instr->InputCount() == 1) {
-        __ Movups(kScratchDoubleReg, dst);
+        __ Movdqa(kScratchDoubleReg, dst);
       } else {
         DCHECK_EQ(2, instr->InputCount());
-        ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
+        ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
         __ Psrlw(kScratchDoubleReg, byte{8});
       }
       __ Psllw(kScratchDoubleReg, byte{8});
@@ -4156,7 +4156,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ Pshuflw(dst, dst, shuffle_mask);
         __ Pshufhw(dst, dst, shuffle_mask);
       }
-      __ Movaps(kScratchDoubleReg, dst);
+      __ Movdqa(kScratchDoubleReg, dst);
       __ Psrlw(kScratchDoubleReg, byte{8});
       __ Psllw(dst, byte{8});
       __ Por(dst, kScratchDoubleReg);
......
@@ -1529,6 +1529,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
                        NameOfXMMRegister(regop));
         current += PrintRightOperand(current);
         break;
+      case 0x6F:
+        AppendToBuffer("vmovdqa %s,", NameOfXMMRegister(regop));
+        current += PrintRightXMMOperand(current);
+        break;
       case 0x70:
         AppendToBuffer("vpshufd %s,", NameOfXMMRegister(regop));
         current += PrintRightXMMOperand(current);
......
@@ -450,6 +450,7 @@ TEST(DisasmX64) {
   __ movupd(Operand(rbx, rcx, times_4, 10000), xmm0);
   __ movdqa(xmm0, Operand(rbx, rcx, times_4, 10000));
   __ movdqa(Operand(rbx, rcx, times_4, 10000), xmm0);
+  __ movdqa(xmm0, xmm1);
   __ ucomisd(xmm0, xmm1);
   __ ucomisd(xmm8, Operand(rbx, rdx, times_4, 10000));
@@ -658,8 +659,11 @@ TEST(DisasmX64) {
     __ vmovsd(xmm9, Operand(rbx, rcx, times_4, 10000));
     __ vmovsd(Operand(rbx, rcx, times_4, 10000), xmm0);
+    __ vmovdqa(xmm4, xmm5);
     __ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
     __ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
+    __ vmovdqu(xmm4, xmm5);
     __ vmovhlps(xmm1, xmm3, xmm5);
     __ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
......