Commit 506c0979 authored by Zhi An Ng, committed by Commit Bot

[x64] Sort out move instructions in codegen

In AVX, it is better to use the appropriate integer or floating-point
moves depending on which instructions produce/consume these moves, since
there can be a delay when moving between the integer and floating-point
domains. On SSE systems it is less important, and we can use movaps/movups,
which are 1 byte shorter than movdqa/movdqu.

This patch cleans up a couple of places and defines the macro-assembler
functions Movdqa, Movdqu, and Movapd, which call into movaps/movups when AVX
is not supported.
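
For context (worked by hand from the Intel SDM opcode tables, not part of this
change), the one-byte saving comes from the mandatory prefix on the integer
forms:

// movaps xmm, xmm/m128  ->  0F 28 /r      (no mandatory prefix)
// movups xmm, xmm/m128  ->  0F 10 /r      (no mandatory prefix)
// movdqa xmm, xmm/m128  ->  66 0F 6F /r   (0x66 prefix: one byte longer)
// movdqu xmm, xmm/m128  ->  F3 0F 6F /r   (0xF3 prefix: one byte longer)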

Change-Id: Iba6c54e218875f1a70f61792978d7b3f69edfb4b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2599843
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71884}
parent ddd9b349
......@@ -2750,6 +2750,15 @@ void Assembler::movdqa(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
EnsureSpace ensure_space(this);
emit(0x66);
emit_rex_64(src, dst);
emit(0x0F);
emit(0x7F);
emit_sse_operand(src, dst);
}
void Assembler::movdqu(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
emit(0xF3);
......@@ -3486,6 +3495,14 @@ void Assembler::vmovq(Register dst, XMMRegister src) {
emit_sse_operand(src, dst);
}
void Assembler::vmovdqa(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F, kWIG);
emit(0x6F);
emit_sse_operand(dst, src);
}
void Assembler::vmovdqu(XMMRegister dst, Operand src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
......@@ -3502,6 +3519,14 @@ void Assembler::vmovdqu(Operand dst, XMMRegister src) {
emit_sse_operand(src, dst);
}
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(src, xmm0, dst, kL128, kF3, k0F, kWIG);
emit(0x7F);
emit_sse_operand(src, dst);
}
void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
......
......@@ -1195,6 +1195,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movdqa(Operand dst, XMMRegister src);
void movdqa(XMMRegister dst, Operand src);
void movdqa(XMMRegister dst, XMMRegister src);
void movdqu(Operand dst, XMMRegister src);
void movdqu(XMMRegister dst, Operand src);
......@@ -1328,8 +1329,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
void vmovsd(XMMRegister dst, Operand src) { vsd(0x10, dst, xmm0, src); }
void vmovsd(Operand dst, XMMRegister src) { vsd(0x11, src, xmm0, dst); }
void vmovdqa(XMMRegister dst, XMMRegister src);
void vmovdqu(XMMRegister dst, Operand src);
void vmovdqu(Operand dst, XMMRegister src);
void vmovdqu(XMMRegister dst, XMMRegister src);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovlps(Operand dst, XMMRegister src);
......
......@@ -694,6 +694,49 @@ int TurboAssembler::PopCallerSaved(SaveFPRegsMode fp_mode, Register exclusion1,
return bytes;
}
void TurboAssembler::Movdqa(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
// Many AVX processors have separate integer/floating-point domains. Use the
// appropriate instructions.
vmovdqa(dst, src);
} else {
// On SSE, movaps is 1 byte shorter than movdqa, and has the same behavior.
// Most SSE processors also don't have the same delay moving between integer
// and floating-point domains.
movaps(dst, src);
}
}
void TurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovapd(dst, src);
} else {
// On SSE, movaps is 1 byte shorter than movapd, and has the same behavior.
movaps(dst, src);
}
}
template <typename Dst, typename Src>
void TurboAssembler::Movdqu(Dst dst, Src src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vmovdqu(dst, src);
} else {
// movups is 1 byte shorter than movdqu. On most SSE systems, this incurs
// no delay moving between the integer and floating-point domains.
movups(dst, src);
}
}
template void TurboAssembler::Movdqu<XMMRegister, Operand>(XMMRegister dst,
Operand src);
template void TurboAssembler::Movdqu<Operand, XMMRegister>(Operand dst,
XMMRegister src);
template void TurboAssembler::Movdqu<XMMRegister, XMMRegister>(XMMRegister dst,
XMMRegister src);
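
Because the Movdqu template body lives in the .cc file, only the three explicit
instantiations above are usable from other translation units. A hypothetical
call site (sketch only; the helper name is invented here) resolves like this:

// Hypothetical sketch, not part of this change: each call maps onto one of the
// explicit Movdqu instantiations above.
void CopySimd128ViaScratch(TurboAssembler* tasm, XMMRegister dst, Operand mem) {
  tasm->Movdqu(kScratchDoubleReg, mem);  // Movdqu<XMMRegister, Operand>
  tasm->Movdqu(dst, kScratchDoubleReg);  // Movdqu<XMMRegister, XMMRegister>
}
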
void TurboAssembler::Cvtss2sd(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
......
......@@ -141,14 +141,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movd, movd)
AVX_OP(Movq, movq)
AVX_OP(Movaps, movaps)
AVX_OP(Movapd, movapd)
AVX_OP(Movups, movups)
AVX_OP(Movmskps, movmskps)
AVX_OP(Movmskpd, movmskpd)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Movss, movss)
AVX_OP(Movsd, movsd)
AVX_OP(Movdqu, movdqu)
AVX_OP(Movhlps, movhlps)
AVX_OP(Movlps, movlps)
AVX_OP(Movhps, movhps)
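
For reference, each AVX_OP entry above expands to a forwarding wrapper that
prefers the VEX-encoded form when AVX is available. Conceptually (a simplified
sketch, not V8's actual macro, which forwards arbitrary argument lists),
AVX_OP(Movups, movups) behaves like:

// Simplified sketch of the dispatch an AVX_OP wrapper performs.
void Movups(XMMRegister dst, Operand src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vmovups(dst, src);
  } else {
    movups(dst, src);
  }
}

Movdqu and Movapd leave this list because their generated wrappers would fall
back to movdqu/movapd on SSE; the hand-written helpers added in this change
fall back to the shorter movups/movaps instead.
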
......@@ -355,6 +353,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Label* condition_met,
Label::Distance condition_met_distance = Label::kFar);
void Movapd(XMMRegister dst, XMMRegister src);
void Movdqa(XMMRegister dst, XMMRegister src);
template <typename Dst, typename Src>
void Movdqu(Dst dst, Src src);
void Cvtss2sd(XMMRegister dst, XMMRegister src);
void Cvtss2sd(XMMRegister dst, Operand src);
void Cvtsd2ss(XMMRegister dst, XMMRegister src);
......
......@@ -2839,7 +2839,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (dst == src) {
__ Movapd(kScratchDoubleReg, src);
__ Movdqa(kScratchDoubleReg, src);
src = kScratchDoubleReg;
}
__ Pxor(dst, dst);
......@@ -2889,8 +2889,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
__ Movaps(tmp1, left);
__ Movaps(tmp2, right);
__ Movdqa(tmp1, left);
__ Movdqa(tmp2, right);
// Multiply high dword of each qword of left with right.
__ Psrlq(tmp1, 32);
......@@ -3546,8 +3546,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
__ Movaps(tmp, dst);
__ Movaps(kScratchDoubleReg, right);
__ Movdqa(tmp, dst);
__ Movdqa(kScratchDoubleReg, right);
__ Psrlw(tmp, byte{8});
__ Psrlw(kScratchDoubleReg, byte{8});
// dst = left * 256
......@@ -3696,12 +3696,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister mask = i.InputSimd128Register(2);
DCHECK_EQ(xmm0, mask);
__ movapd(kScratchDoubleReg, mask);
__ pxor(mask, mask);
__ movaps(kScratchDoubleReg, mask);
__ xorps(mask, mask);
__ pcmpgtw(mask, kScratchDoubleReg);
__ pblendvb(i.OutputSimd128Register(), i.InputSimd128Register(1));
// Restore mask.
__ movapd(mask, kScratchDoubleReg);
__ movaps(mask, kScratchDoubleReg);
}
break;
}
......@@ -3747,7 +3747,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (dst == src) {
__ Movaps(kScratchDoubleReg, dst);
__ Movdqa(kScratchDoubleReg, dst);
__ Pcmpeqd(dst, dst);
__ Pxor(dst, kScratchDoubleReg);
} else {
......@@ -3816,7 +3816,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} else { // two input operands
DCHECK_NE(tmp_simd, i.InputSimd128Register(1));
DCHECK_EQ(6, instr->InputCount());
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 0);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 0);
uint32_t mask1[4] = {};
for (int j = 5; j > 1; j--) {
uint32_t lanes = i.InputUint32(j);
......@@ -3830,9 +3830,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint32_t mask2[4] = {};
if (instr->InputAt(1)->IsSimd128Register()) {
XMMRegister src1 = i.InputSimd128Register(1);
if (src1 != dst) __ movups(dst, src1);
if (src1 != dst) __ Movdqa(dst, src1);
} else {
__ Movups(dst, i.InputOperand(1));
__ Movdqu(dst, i.InputOperand(1));
}
for (int j = 5; j > 1; j--) {
uint32_t lanes = i.InputUint32(j);
......@@ -4065,7 +4065,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
__ Psrld(kScratchDoubleReg, byte{16});
src2 = kScratchDoubleReg;
}
......@@ -4091,7 +4091,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
__ Psrlw(kScratchDoubleReg, byte{8});
src2 = kScratchDoubleReg;
}
......@@ -4104,7 +4104,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
__ Psllw(kScratchDoubleReg, byte{8});
__ Psrlw(kScratchDoubleReg, byte{8});
src2 = kScratchDoubleReg;
......@@ -4119,10 +4119,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ Psllw(dst, byte{8});
if (instr->InputCount() == 1) {
__ Movups(kScratchDoubleReg, dst);
__ Movdqa(kScratchDoubleReg, dst);
} else {
DCHECK_EQ(2, instr->InputCount());
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
__ Psllw(kScratchDoubleReg, byte{8});
}
__ Psrlw(dst, byte{8});
......@@ -4134,10 +4134,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ Psrlw(dst, byte{8});
if (instr->InputCount() == 1) {
__ Movups(kScratchDoubleReg, dst);
__ Movdqa(kScratchDoubleReg, dst);
} else {
DCHECK_EQ(2, instr->InputCount());
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 1);
ASSEMBLE_SIMD_INSTR(Movdqu, kScratchDoubleReg, 1);
__ Psrlw(kScratchDoubleReg, byte{8});
}
__ Psllw(kScratchDoubleReg, byte{8});
......@@ -4156,7 +4156,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pshuflw(dst, dst, shuffle_mask);
__ Pshufhw(dst, dst, shuffle_mask);
}
__ Movaps(kScratchDoubleReg, dst);
__ Movdqa(kScratchDoubleReg, dst);
__ Psrlw(kScratchDoubleReg, byte{8});
__ Psllw(dst, byte{8});
__ Por(dst, kScratchDoubleReg);
......
......@@ -1529,6 +1529,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
NameOfXMMRegister(regop));
current += PrintRightOperand(current);
break;
case 0x6F:
AppendToBuffer("vmovdqa %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x70:
AppendToBuffer("vpshufd %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
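
As a hand-worked check of the 0x6F case added above (not part of the change,
and assuming the assembler picks the two-byte VEX form here), the new
disassembler test's vmovdqa(xmm4, xmm5), added further down, encodes and now
disassembles as:

// C5 F9 6F E5   ->   vmovdqa xmm4,xmm5
//   C5 = two-byte VEX escape, F9 = inverted R/vvvv bits with L=0 (128-bit)
//   and pp=01 (the 66 prefix), 6F = the opcode handled above,
//   E5 = ModRM selecting xmm4 (reg) and xmm5 (rm).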
......
......@@ -450,6 +450,7 @@ TEST(DisasmX64) {
__ movupd(Operand(rbx, rcx, times_4, 10000), xmm0);
__ movdqa(xmm0, Operand(rbx, rcx, times_4, 10000));
__ movdqa(Operand(rbx, rcx, times_4, 10000), xmm0);
__ movdqa(xmm0, xmm1);
__ ucomisd(xmm0, xmm1);
__ ucomisd(xmm8, Operand(rbx, rdx, times_4, 10000));
......@@ -658,8 +659,11 @@ TEST(DisasmX64) {
__ vmovsd(xmm9, Operand(rbx, rcx, times_4, 10000));
__ vmovsd(Operand(rbx, rcx, times_4, 10000), xmm0);
__ vmovdqa(xmm4, xmm5);
__ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
__ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
__ vmovdqu(xmm4, xmm5);
__ vmovhlps(xmm1, xmm3, xmm5);
__ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
......