Commit 6cb61e63 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][x64] Optimize f64x2.extract_lane

pextrq + movq crosses register files twice, which is not efficient.

Optimize this by:
- checking if lane 0, do nothing if dst == src (macro-assembler helper)
- use vmovhlps on AVX, with src as the operands to avoid false
dependency on dst
- use movhlps otherwise, this is shorter than shufpd, and faster on
older systems

Change-Id: I3486d87224c048b3229c2f92359b8b8e6d5fd025
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589056
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71751}
parent 3bc06ed3
...@@ -1284,6 +1284,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1284,6 +1284,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle); void pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle);
void pshuflw(XMMRegister dst, Operand src, uint8_t shuffle); void pshuflw(XMMRegister dst, Operand src, uint8_t shuffle);
// Emits SSE MOVHLPS (opcode 0F 12, register-register form): copies the
// high 64 bits of src into the low 64 bits of dst, leaving the high 64
// bits of dst unchanged. Register-only — the memory form of 0F 12 decodes
// as MOVLPS instead.
void movhlps(XMMRegister dst, XMMRegister src) {
  sse_instr(dst, src, 0x0F, 0x12);
}
void movlhps(XMMRegister dst, XMMRegister src) { void movlhps(XMMRegister dst, XMMRegister src) {
sse_instr(dst, src, 0x0F, 0x16); sse_instr(dst, src, 0x0F, 0x16);
} }
...@@ -1386,6 +1389,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1386,6 +1389,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) { void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG); vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
} }
// Emits AVX VMOVHLPS (VEX.0F 0x12, W-ignored): dst[63:0] = src2[127:64],
// dst[127:64] = src1[127:64]. The three-operand form lets callers pass the
// same register as src1 and src2 to avoid a false dependency on dst.
void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
void vcvtss2sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { void vcvtss2sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG); vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG);
} }
......
...@@ -1293,6 +1293,12 @@ void TurboAssembler::Move(Register dst, Register src) { ...@@ -1293,6 +1293,12 @@ void TurboAssembler::Move(Register dst, Register src) {
} }
} }
// Register-to-register SIMD move. Emits nothing when source and
// destination are already the same register, mirroring the GP-register
// Move overload above it.
void TurboAssembler::Move(XMMRegister dst, XMMRegister src) {
  if (dst == src) return;  // Self-move: no instruction needed.
  movaps(dst, src);
}
void TurboAssembler::MovePair(Register dst0, Register src0, Register dst1, void TurboAssembler::MovePair(Register dst0, Register src0, Register dst1,
Register src1) { Register src1) {
if (dst0 != src1) { if (dst0 != src1) {
......
...@@ -442,6 +442,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -442,6 +442,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Move if the registers are not identical. // Move if the registers are not identical.
void Move(Register target, Register source); void Move(Register target, Register source);
void Move(XMMRegister target, XMMRegister source);
void Move(Register dst, Handle<HeapObject> source, void Move(Register dst, Handle<HeapObject> source,
RelocInfo::Mode rmode = RelocInfo::FULL_EMBEDDED_OBJECT); RelocInfo::Mode rmode = RelocInfo::FULL_EMBEDDED_OBJECT);
......
...@@ -2453,8 +2453,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2453,8 +2453,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64F64x2ExtractLane: { case kX64F64x2ExtractLane: {
__ Pextrq(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1)); DoubleRegister dst = i.OutputDoubleRegister();
__ Movq(i.OutputDoubleRegister(), kScratchRegister); XMMRegister src = i.InputSimd128Register(0);
uint8_t lane = i.InputUint8(1);
if (lane == 0) {
__ Move(dst, src);
} else {
DCHECK_EQ(1, lane);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Pass src as operand to avoid false-dependency on dst.
__ vmovhlps(dst, src, src);
} else {
__ movhlps(dst, src);
}
}
break; break;
} }
case kX64F64x2Sqrt: { case kX64F64x2Sqrt: {
......
...@@ -1381,9 +1381,15 @@ int DisassemblerX64::AVXInstruction(byte* data) { ...@@ -1381,9 +1381,15 @@ int DisassemblerX64::AVXInstruction(byte* data) {
AppendToBuffer(",%s", NameOfXMMRegister(regop)); AppendToBuffer(",%s", NameOfXMMRegister(regop));
break; break;
case 0x12: case 0x12:
if (mod == 0b11) {
AppendToBuffer("vmovhlps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
} else {
AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop), AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv)); NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current); current += PrintRightXMMOperand(current);
}
break; break;
case 0x13: case 0x13:
AppendToBuffer("vmovlps "); AppendToBuffer("vmovlps ");
...@@ -2065,8 +2071,13 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) { ...@@ -2065,8 +2071,13 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
// movups xmm/m128, xmm // movups xmm/m128, xmm
current += PrintOperands("movups", XMMOPER_XMMREG_OP_ORDER, current); current += PrintOperands("movups", XMMOPER_XMMREG_OP_ORDER, current);
} else if (opcode == 0x12) { } else if (opcode == 0x12) {
// movhlps xmm1, xmm2
// movlps xmm1, m64 // movlps xmm1, m64
if (mod == 0b11) {
current += PrintOperands("movhlps", XMMREG_XMMOPER_OP_ORDER, current);
} else {
current += PrintOperands("movlps", XMMREG_OPER_OP_ORDER, current); current += PrintOperands("movlps", XMMREG_OPER_OP_ORDER, current);
}
} else if (opcode == 0x13) { } else if (opcode == 0x13) {
// movlps m64, xmm1 // movlps m64, xmm1
AppendToBuffer("movlps "); AppendToBuffer("movlps ");
......
...@@ -401,8 +401,10 @@ TEST(DisasmX64) { ...@@ -401,8 +401,10 @@ TEST(DisasmX64) {
__ movdqu(xmm0, Operand(rsp, 12)); __ movdqu(xmm0, Operand(rsp, 12));
__ movdqu(Operand(rsp, 12), xmm0); __ movdqu(Operand(rsp, 12), xmm0);
__ movdqu(xmm1, xmm0); __ movdqu(xmm1, xmm0);
__ movhlps(xmm5, xmm1);
__ movlps(xmm8, Operand(rbx, rcx, times_4, 10000)); __ movlps(xmm8, Operand(rbx, rcx, times_4, 10000));
__ movlps(Operand(rbx, rcx, times_4, 10000), xmm9); __ movlps(Operand(rbx, rcx, times_4, 10000), xmm9);
__ movlhps(xmm5, xmm1);
__ movhps(xmm8, Operand(rbx, rcx, times_4, 10000)); __ movhps(xmm8, Operand(rbx, rcx, times_4, 10000));
__ movhps(Operand(rbx, rcx, times_4, 10000), xmm9); __ movhps(Operand(rbx, rcx, times_4, 10000), xmm9);
__ shufps(xmm0, xmm9, 0x0); __ shufps(xmm0, xmm9, 0x0);
...@@ -577,7 +579,6 @@ TEST(DisasmX64) { ...@@ -577,7 +579,6 @@ TEST(DisasmX64) {
__ movups(xmm5, xmm1); __ movups(xmm5, xmm1);
__ movups(xmm5, Operand(rdx, 4)); __ movups(xmm5, Operand(rdx, 4));
__ movups(Operand(rdx, 4), xmm5); __ movups(Operand(rdx, 4), xmm5);
__ movlhps(xmm5, xmm1);
__ pmulld(xmm5, xmm1); __ pmulld(xmm5, xmm1);
__ pmulld(xmm5, Operand(rdx, 4)); __ pmulld(xmm5, Operand(rdx, 4));
__ pmullw(xmm5, xmm1); __ pmullw(xmm5, xmm1);
...@@ -659,8 +660,10 @@ TEST(DisasmX64) { ...@@ -659,8 +660,10 @@ TEST(DisasmX64) {
__ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000)); __ vmovdqu(xmm9, Operand(rbx, rcx, times_4, 10000));
__ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0); __ vmovdqu(Operand(rbx, rcx, times_4, 10000), xmm0);
__ vmovhlps(xmm1, xmm3, xmm5);
__ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000)); __ vmovlps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
__ vmovlps(Operand(rbx, rcx, times_4, 10000), xmm9); __ vmovlps(Operand(rbx, rcx, times_4, 10000), xmm9);
__ vmovlhps(xmm1, xmm3, xmm5);
__ vmovhps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000)); __ vmovhps(xmm8, xmm9, Operand(rbx, rcx, times_4, 10000));
__ vmovhps(Operand(rbx, rcx, times_4, 10000), xmm12); __ vmovhps(Operand(rbx, rcx, times_4, 10000), xmm12);
...@@ -693,7 +696,6 @@ TEST(DisasmX64) { ...@@ -693,7 +696,6 @@ TEST(DisasmX64) {
__ vmovups(xmm5, xmm1); __ vmovups(xmm5, xmm1);
__ vmovups(xmm5, Operand(rdx, 4)); __ vmovups(xmm5, Operand(rdx, 4));
__ vmovups(Operand(rdx, 4), xmm5); __ vmovups(Operand(rdx, 4), xmm5);
__ vmovlhps(xmm1, xmm3, xmm5);
__ vandps(xmm0, xmm9, xmm2); __ vandps(xmm0, xmm9, xmm2);
__ vandps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000)); __ vandps(xmm9, xmm1, Operand(rbx, rcx, times_4, 10000));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment