Commit 741e5a66 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][ia32][x64] More optimization for f32x4.extract_lane

We can optimize this instruction further. The new sequences leave some
junk in the top lanes of dst, but that doesn't matter:

- when lane is 1: use movshdup, which is 4 bytes long
- when lane is 2: use movhlps, which is 3 bytes long
- otherwise: use shufps (4 bytes) or pshufd (5 bytes)

All of these are shorter than insertps (6 bytes). A standalone sketch of
the selection logic follows below.
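
A minimal standalone C++ sketch of that selection logic, assuming the lane
is already known at code-generation time. The helper ChooseExtractLaneInsn,
its string return values, and the printf demo are illustrative only and not
part of V8; the real dispatch is in the code-generator hunks further down.

#include <cstdint>
#include <cstdio>
#include <string>

// Mirrors the commit's strategy: pick the shortest SSE sequence for
// f32x4.extract_lane, accepting junk in the upper lanes of dst.
std::string ChooseExtractLaneInsn(uint8_t lane, bool dst_is_src) {
  if (lane == 0) {
    // Lane 0 is already in place; otherwise just move the register.
    return dst_is_src ? "(nothing, value already in place)" : "movaps dst,src";
  }
  if (lane == 1) return "movshdup dst,src";               // 4 bytes
  if (lane == 2 && dst_is_src) return "movhlps dst,src";  // 3 bytes
  if (dst_is_src) return "shufps dst,src,lane";           // 4 bytes
  return "pshufd dst,src,lane";  // 5 bytes, avoids a false dependency on dst
}

int main() {
  for (int lane = 0; lane < 4; ++lane) {
    std::printf("lane %d, dst==src: %s\n", lane,
                ChooseExtractLaneInsn(static_cast<uint8_t>(lane), true).c_str());
    std::printf("lane %d, dst!=src: %s\n", lane,
                ChooseExtractLaneInsn(static_cast<uint8_t>(lane), false).c_str());
  }
  return 0;
}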

Change-Id: I0e524431d1832e297e8c8bb418d42382d93fa691
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591850
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71813}
parent 08c2c005
@@ -2395,6 +2395,15 @@ void Assembler::movddup(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::movshdup(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(SSE3));
EnsureSpace ensure_space(this);
EMIT(0xF3);
EMIT(0x0F);
EMIT(0x16);
emit_sse_operand(dst, src);
}
void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) {
DCHECK(is_uint8(imm8));
EnsureSpace ensure_space(this);
@@ -2414,6 +2423,13 @@ void Assembler::shufpd(XMMRegister dst, XMMRegister src, byte imm8) {
EMIT(imm8);
}
void Assembler::movhlps(XMMRegister dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x12);
emit_sse_operand(dst, src);
}
void Assembler::movlps(XMMRegister dst, Operand src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
@@ -2883,6 +2899,10 @@ void Assembler::vshufpd(XMMRegister dst, XMMRegister src1, Operand src2,
EMIT(imm8);
}
void Assembler::vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
@@ -861,6 +861,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void shufps(XMMRegister dst, XMMRegister src, byte imm8);
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void movhlps(XMMRegister dst, XMMRegister src);
void movlps(XMMRegister dst, Operand src);
void movlps(Operand dst, XMMRegister src);
void movhps(XMMRegister dst, Operand src);
@@ -1050,6 +1051,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// SSE3 instructions
void movddup(XMMRegister dst, Operand src);
void movddup(XMMRegister dst, XMMRegister src) { movddup(dst, Operand(src)); }
void movshdup(XMMRegister dst, XMMRegister src);
// Use SSE4_1 encoding for pextrw reg, xmm, imm8 for consistency
void pextrw(Register dst, XMMRegister src, uint8_t offset) {
@@ -1380,6 +1382,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovlps(Operand dst, XMMRegister src);
void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
@@ -1483,6 +1486,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmovddup(XMMRegister dst, XMMRegister src) {
vmovddup(dst, Operand(src));
}
void vmovshdup(XMMRegister dst, XMMRegister src) {
vinstr(0x16, dst, xmm0, src, kF3, k0F, kWIG);
}
void vbroadcastss(XMMRegister dst, Operand src) {
vinstr(0x18, dst, xmm0, src, k66, k0F38, kW0);
}
@@ -1875,6 +1875,19 @@ void TurboAssembler::Extractps(Operand dst, XMMRegister src, uint8_t imm8) {
extractps(dst, src, imm8);
}
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vshufps(dst, src1, src2, imm8);
} else {
if (dst != src1) {
movaps(dst, src1);
}
shufps(dst, src2, imm8);
}
}
void TurboAssembler::Lzcnt(Register dst, Operand src) {
if (CpuFeatures::IsSupported(LZCNT)) {
CpuFeatureScope scope(this, LZCNT);
@@ -386,6 +386,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Orpd, orpd)
AVX_OP3_XO(Andnpd, andnpd)
AVX_OP3_XO(Pmullw, pmullw)
AVX_OP3_WITH_TYPE(Movhlps, movhlps, XMMRegister, XMMRegister)
#undef AVX_OP3_XO
#undef AVX_OP3_WITH_TYPE
@@ -476,6 +477,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, SSE3) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE3)
AVX_OP2_XO_SSE3(Movddup, movddup)
AVX_OP2_WITH_TYPE_SCOPE(Movshdup, movshdup, XMMRegister, XMMRegister, SSE3)
#undef AVX_OP2_XO_SSE3
@@ -576,6 +578,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Vbroadcastss(XMMRegister dst, Operand src);
void Extractps(Operand dst, XMMRegister src, uint8_t imm8);
// Shufps that will mov src1 into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
uint8_t imm8);
// Expression support
// cvtsi2sd instruction only writes to the low 64-bit of dst register, which
// hinders register renaming and makes dependence chains longer. So we use
@@ -3393,6 +3393,14 @@ void Assembler::vmovddup(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::vmovshdup(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, xmm0, src, kL128, kF3, k0F, kWIG);
emit(0x16);
emit_sse_operand(dst, src);
}
void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
@@ -4080,6 +4088,16 @@ void Assembler::movddup(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::movshdup(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(SSE3));
EnsureSpace ensure_space(this);
emit(0xF3);
emit_optional_rex_32(dst, src);
emit(0x0F);
emit(0x16);
emit_sse_operand(dst, src);
}
void Assembler::psrldq(XMMRegister dst, uint8_t shift) {
EnsureSpace ensure_space(this);
emit(0x66);
@@ -1024,6 +1024,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void lddqu(XMMRegister dst, Operand src);
void movddup(XMMRegister dst, Operand src);
void movddup(XMMRegister dst, XMMRegister src);
void movshdup(XMMRegister dst, XMMRegister src);
// SSSE3
void ssse3_instr(XMMRegister dst, XMMRegister src, byte prefix, byte escape1,
@@ -1294,6 +1295,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// AVX instruction
void vmovddup(XMMRegister dst, XMMRegister src);
void vmovddup(XMMRegister dst, Operand src);
void vmovshdup(XMMRegister dst, XMMRegister src);
void vbroadcastss(XMMRegister dst, Operand src);
void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
@@ -149,6 +149,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movss, movss)
AVX_OP(Movsd, movsd)
AVX_OP(Movdqu, movdqu)
AVX_OP(Movhlps, movhlps)
AVX_OP(Movlps, movlps)
AVX_OP(Movhps, movhps)
AVX_OP(Pcmpeqb, pcmpeqb)
@@ -253,6 +254,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movlhps, movlhps)
AVX_OP_SSE3(Haddps, haddps)
AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSE3(Movshdup, movshdup)
AVX_OP_SSSE3(Phaddd, phaddd)
AVX_OP_SSSE3(Phaddw, phaddw)
AVX_OP_SSSE3(Pshufb, pshufb)
@@ -2396,18 +2396,21 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src = i.InputSimd128Register(0);
uint8_t lane = i.InputUint8(1);
DCHECK_LT(lane, 4);
if (lane == 0 && dst == src) {
break;
}
uint8_t zmask = 0xE; // Zero top 3 lanes.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Use src for both operands to avoid false-dependency on dst.
__ vinsertps(dst, src, src, zmask | (lane << 6));
// These instructions are shorter than insertps, but will leave junk in
// the top lanes of dst.
if (lane == 0) {
if (dst != src) {
__ Movaps(dst, src);
}
} else if (lane == 1) {
__ Movshdup(dst, src);
} else if (lane == 2 && dst == src) {
// Check dst == src to avoid false dependency on dst.
__ Movhlps(dst, src);
} else if (dst == src) {
__ Shufps(dst, src, src, lane);
} else {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ insertps(dst, src, zmask | (lane << 6));
__ Pshufd(dst, src, lane);
}
break;
}
@@ -2581,17 +2581,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src = i.InputSimd128Register(0);
uint8_t lane = i.InputUint8(1);
DCHECK_LT(lane, 4);
if (lane == 0 && dst == src) {
break;
}
uint8_t zmask = 0xE; // Zero top 3 lanes.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Use src for both operands to avoid false-dependency on dst.
__ vinsertps(dst, src, src, zmask | (lane << 6));
// These instructions are shorter than insertps, but will leave junk in
// the top lanes of dst.
if (lane == 0) {
__ Move(dst, src);
} else if (lane == 1) {
__ Movshdup(dst, src);
} else if (lane == 2 && dst == src) {
// Check dst == src to avoid false dependency on dst.
__ Movhlps(dst, src);
} else if (dst == src) {
__ Shufps(dst, src, src, lane);
} else {
__ insertps(dst, src, zmask | (lane << 6));
__ Pshufd(dst, src, lane);
}
break;
}
@@ -958,6 +958,10 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
current += PrintRightXMMOperand(current);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x16:
AppendToBuffer("vmovshdup %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x51:
AppendToBuffer("vsqrtss %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
@@ -1122,7 +1126,13 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x12:
AppendToBuffer("vmovlps %s,", NameOfXMMRegister(regop));
if (mod == 0b11) {
AppendToBuffer("vmovhlps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
} else {
AppendToBuffer("vmovlps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
}
current += PrintRightXMMOperand(current);
break;
case 0x13:
@@ -1863,7 +1873,11 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*(data + 2), &mod, &regop, &rm);
if (f0byte == 0x12) {
data += 2;
AppendToBuffer("movlps %s,", NameOfXMMRegister(regop));
if (mod == 0b11) {
AppendToBuffer("movhlps %s,", NameOfXMMRegister(regop));
} else {
AppendToBuffer("movlps %s,", NameOfXMMRegister(regop));
}
data += PrintRightXMMOperand(data);
} else if (f0byte == 0x13) {
data += 2;
@@ -2667,6 +2681,12 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movss %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (b2 == 0x16) {
data += 3;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movshdup %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (b2 == 0x5A) {
data += 3;
int mod, regop, rm;
@@ -1093,6 +1093,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
}
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x16:
AppendToBuffer("vmovshdup %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x2A:
AppendToBuffer("%s %s,%s,", vex_w() ? "vcvtqsi2ss" : "vcvtlsi2ss",
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
@@ -2020,6 +2024,8 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
current += PrintOperands("movss", XMMREG_OPER_OP_ORDER, current);
} else if (opcode == 0x11) {
current += PrintOperands("movss", OPER_XMMREG_OP_ORDER, current);
} else if (opcode == 0x16) {
current += PrintOperands("movshdup", XMMREG_XMMOPER_OP_ORDER, current);
} else if (opcode == 0x2A) {
// CVTSI2SS: integer to XMM single conversion.
current += PrintOperands(mnemonic, XMMREG_OPER_OP_ORDER, current);
@@ -396,6 +396,7 @@ TEST(DisasmIa320) {
__ cvtsd2ss(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movq(xmm0, Operand(edx, 4));
__ movhlps(xmm0, xmm1);
__ movlps(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movlps(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movhps(xmm0, Operand(ebx, ecx, times_4, 10000));
@@ -592,6 +593,7 @@ TEST(DisasmIa320) {
__ haddps(xmm1, Operand(ebx, ecx, times_4, 10000));
__ movddup(xmm1, Operand(eax, 5));
__ movddup(xmm1, xmm2);
__ movshdup(xmm1, xmm2);
}
}
@@ -709,6 +711,7 @@ TEST(DisasmIa320) {
__ vhaddps(xmm0, xmm1, xmm2);
__ vhaddps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovhlps(xmm0, xmm1, xmm2);
__ vmovlps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovlps(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vmovhps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
@@ -802,6 +805,7 @@ TEST(DisasmIa320) {
__ vmovddup(xmm1, xmm2);
__ vmovddup(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovshdup(xmm1, xmm2);
__ vbroadcastss(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
@@ -512,6 +512,7 @@ TEST(DisasmX64) {
__ lddqu(xmm1, Operand(rdx, 4));
__ movddup(xmm1, Operand(rax, 5));
__ movddup(xmm1, xmm2);
__ movshdup(xmm1, xmm2);
}
}
@@ -841,6 +842,7 @@ TEST(DisasmX64) {
__ vmovddup(xmm1, xmm2);
__ vmovddup(xmm1, Operand(rbx, rcx, times_4, 10000));
__ vmovshdup(xmm1, xmm2);
__ vbroadcastss(xmm1, Operand(rbx, rcx, times_4, 10000));
}
}