Commit 83a58b70 authored by Yolanda Chen, committed by V8 LUCI CQ

[x64] Implement 256-bit assembly for v(p)broadcast*

Bug: v8:12228
Change-Id: I434b07e3d7a2e270dc7dd26950b9dd047eb46a56
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3219944
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Yolanda Chen <yolanda.chen@intel.com>
Cr-Commit-Position: refs/heads/main@{#77446}
parent dfbd9edb
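In short, this CL folds the two XMM-only vbroadcastss emitters into a macro that also generates the 256-bit overloads, collapses four vinstr overloads into one template with explicit YMM instantiations, templates the AVX2_INSTRUCTION macro over destination and source types, and updates the disassembler and tests for the new encodings. A minimal usage sketch (the helper name is hypothetical; register choices mirror the tests below):

#include "src/codegen/x64/assembler-x64.h"

namespace v8 {
namespace internal {

// Hypothetical helper, not part of this CL: emits the new 256-bit
// broadcast forms into an existing Assembler once AVX2 is available.
void EmitBroadcastDemo(Assembler* assm) {
  CpuFeatureScope avx2_scope(assm, AVX2);
  assm->vbroadcastss(ymm7, xmm0);  // splat xmm0[31:0] into all 8 dwords of ymm7
  assm->vpbroadcastb(ymm2, xmm1);  // splat byte 0 of xmm1 into all 32 bytes
  assm->vpbroadcastw(ymm5, Operand(rbx, rcx, times_4, 10000));  // word from memory
}

}  // namespace internal
}  // namespace v8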
src/codegen/x64/assembler-x64.cc
@@ -3441,21 +3441,24 @@ void Assembler::vmovshdup(XMMRegister dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
emit(0x18);
emit_sse_operand(dst, src);
}
void Assembler::vbroadcastss(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(AVX2));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
emit(0x18);
emit_sse_operand(dst, src);
}
#define BROADCASTSS(SIMDRegister, length) \
void Assembler::vbroadcastss(SIMDRegister dst, Operand src) { \
DCHECK(IsEnabled(AVX)); \
EnsureSpace ensure_space(this); \
emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0); \
emit(0x18); \
emit_sse_operand(dst, src); \
} \
void Assembler::vbroadcastss(SIMDRegister dst, XMMRegister src) { \
DCHECK(IsEnabled(AVX2)); \
EnsureSpace ensure_space(this); \
emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0); \
emit(0x18); \
emit_sse_operand(dst, src); \
}
BROADCASTSS(XMMRegister, L128)
BROADCASTSS(YMMRegister, L256)
#undef BROADCASTSS
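For readers tracing the macro, BROADCASTSS(YMMRegister, L256) expands mechanically to the pair below. Note the asymmetric DCHECKs: the memory-source form of vbroadcastss is part of baseline AVX, while the register-source form only exists from AVX2 on.

// Mechanical expansion of BROADCASTSS(YMMRegister, L256), shown for clarity:
void Assembler::vbroadcastss(YMMRegister dst, Operand src) {
  DCHECK(IsEnabled(AVX));  // memory-source broadcast is baseline AVX
  EnsureSpace ensure_space(this);
  emit_vex_prefix(dst, xmm0, src, kL256, k66, k0F38, kW0);
  emit(0x18);
  emit_sse_operand(dst, src);
}
void Assembler::vbroadcastss(YMMRegister dst, XMMRegister src) {
  DCHECK(IsEnabled(AVX2));  // register-source broadcast requires AVX2
  EnsureSpace ensure_space(this);
  emit_vex_prefix(dst, xmm0, src, kL256, k66, k0F38, kW0);
  emit(0x18);
  emit_sse_operand(dst, src);
}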
void Assembler::fma_instr(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2, VectorLength l, SIMDPrefix pp,
@@ -3652,17 +3655,6 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
emit_sse_operand(dst, src2);
}
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
YMMRegister src2, SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature) {
DCHECK(IsEnabled(feature));
DCHECK(feature == AVX || feature == AVX2);
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kL256, pp, m, w);
emit(op);
emit_sse_operand(dst, src2);
}
void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature) {
@@ -3674,9 +3666,9 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
emit_sse_operand(dst, src2);
}
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature) {
template <typename Reg1, typename Reg2, typename Op>
void Assembler::vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature) {
DCHECK(IsEnabled(feature));
DCHECK(feature == AVX || feature == AVX2);
EnsureSpace ensure_space(this);
@@ -3685,6 +3677,19 @@ void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
emit_sse_operand(dst, src2);
}
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, XMMRegister src1, XMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, YMMRegister src1, Operand src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, XMMRegister src1, Operand src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
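The template definition stays in the .cc file, so the four YMM shapes used by callers are instantiated explicitly above and re-declared with extern template in the header (further down in this diff); that way only this translation unit instantiates them while the symbols remain exported. A stripped-down sketch of the pattern, with hypothetical names:

// header: declare the template, forbid implicit instantiation of Emit<int>
template <typename T>
void Emit(T value);
extern template void Emit<int>(int);

// source file: define the template and provide the one real instantiation
template <typename T>
void Emit(T value) { /* ... */ }
template void Emit<int>(int);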
void Assembler::vps(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
DCHECK(IsEnabled(AVX));
...
src/codegen/x64/assembler-x64.h
@@ -42,6 +42,7 @@
#include <memory>
#include <vector>
#include "src/base/export-template.h"
#include "src/codegen/assembler.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/label.h"
@@ -930,14 +931,12 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vinstr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
void vinstr(byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2,
SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature = AVX2);
void vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
void vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
SIMDPrefix pp, LeadingOpcode m, VexW w,
CpuFeature feature = AVX2);
template <typename Reg1, typename Reg2, typename Op>
void vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature = AVX2);
// SSE instructions
void sse_instr(XMMRegister dst, XMMRegister src, byte escape, byte opcode);
@@ -1290,6 +1289,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmovshdup(XMMRegister dst, XMMRegister src);
void vbroadcastss(XMMRegister dst, Operand src);
void vbroadcastss(XMMRegister dst, XMMRegister src);
void vbroadcastss(YMMRegister dst, Operand src);
void vbroadcastss(YMMRegister dst, XMMRegister src);
void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
VectorLength l, SIMDPrefix pp, LeadingOpcode m, VexW w);
@@ -1735,11 +1736,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// AVX2 instructions
#define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode) \
void instr(XMMRegister dst, XMMRegister src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
} \
void instr(XMMRegister dst, Operand src) { \
template <typename Reg, typename Op> \
void instr(Reg dst, Op src) { \
vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
AVX2); \
}
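Each list entry now declares a single template instead of two fixed overloads. For example, AVX2_INSTRUCTION(vpbroadcastd, 66, 0F, 38, 58) expands mechanically to:

// Mechanical expansion of AVX2_INSTRUCTION(vpbroadcastd, 66, 0F, 38, 58):
template <typename Reg, typename Op>
void vpbroadcastd(Reg dst, Op src) {
  vinstr(0x58, dst, xmm0, src, k66, k0F38, kW0, AVX2);
}
// The Reg/Op types pick the matching vinstr overload or template
// instantiation, so vpbroadcastd(xmm1, xmm2) emits VEX.128 while
// vpbroadcastd(ymm9, Operand(...)) emits VEX.256.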
@@ -2418,6 +2416,23 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
#endif
};
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
YMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
XMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
Operand src2, SIMDPrefix pp, LeadingOpcode m,
VexW w, CpuFeature feature);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
Operand src2, SIMDPrefix pp, LeadingOpcode m,
VexW w, CpuFeature feature);
// Helper class that ensures that there is enough space for generating
// instructions and relocation information. The constructor makes
// sure that there is enough space and (in debug mode) the destructor
...
src/codegen/x64/sse-instr.h
@@ -186,7 +186,7 @@
#define SSE4_2_INSTRUCTION_LIST(V) V(pcmpgtq, 66, 0F, 38, 37)
// These require AVX2, and we only define the VEX-128 versions.
// These require AVX2.
#define AVX2_BROADCAST_LIST(V) \
V(vpbroadcastd, 66, 0F, 38, 58) \
V(vpbroadcastb, 66, 0F, 38, 78) \
...
src/diagnostics/x64/disasm-x64.cc
@@ -891,7 +891,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
switch (opcode) {
case 0x18:
AppendToBuffer("vbroadcastss %s,", NameOfAVXRegister(regop));
current += PrintRightAVXOperand(current);
current += PrintRightXMMOperand(current);
break;
case 0x98:
AppendToBuffer("vfmadd132p%c %s,%s,", float_size_code(),
@@ -1017,7 +1017,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
#define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code) \
case 0x##code: \
AppendToBuffer("" #instruction " %s,", NameOfAVXRegister(regop)); \
current += PrintRightAVXOperand(current); \
current += PrintRightXMMOperand(current); \
break;
AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
#undef DISASSEMBLE_AVX2_BROADCAST
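The switch from PrintRightAVXOperand to PrintRightXMMOperand is deliberate: a broadcast reads at most an XMM register (or a narrow memory slot) even when it writes a YMM destination, so the source operand must be printed at XMM width. For the V(vpbroadcastd, 66, 0F, 38, 58) entry the macro expands to:

// Mechanical expansion for the vpbroadcastd list entry:
case 0x58:
  AppendToBuffer("vpbroadcastd %s,", NameOfAVXRegister(regop));
  current += PrintRightXMMOperand(current);  // source printed at XMM width
  break;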
...
test/cctest/test-assembler-x64.cc
@@ -2536,6 +2536,7 @@ TEST(AssemblerX64Regmove256bit) {
__ vmovdqu(ymm10, ymm11);
__ vmovdqu(ymm9, Operand(rbx, rcx, times_4, 10000));
__ vmovdqu(Operand(rbx, rcx, times_4, 10000), ymm0);
__ vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000));
CodeDesc desc;
masm.GetCode(isolate, &desc);
@@ -2558,11 +2559,15 @@ TEST(AssemblerX64Regmove256bit) {
// vmovdqu ymm9,YMMWORD PTR [rbx+rcx*4+0x2710]
0xC5, 0x7E, 0x6F, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00,
// vmovdqu YMMWORD PTR [rbx+rcx*4+0x2710],ymm0
0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00};
0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00,
// vbroadcastss ymm7, DWORD PTR [rbx+rcx*4+0x2710]
0xc4, 0xe2, 0x7d, 0x18, 0xbc, 0x8b, 0x10, 0x27, 0x00,
0x00};
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
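For anyone auditing the expected byte sequences by hand, here is an annotated copy of the vbroadcastss ymm7, [rbx+rcx*4+0x2710] encoding checked above; the VEX.128 form differs only in the L bit of the third VEX byte (the array name is illustrative):

#include <cstdint>

const uint8_t kVbroadcastssYmmMem[] = {
    0xc4,                    // three-byte VEX escape
    0xe2,                    // ~R=1 ~X=1 ~B=1, m-mmmm=00010 (0F 38 opcode map)
    0x7d,                    // W=0, ~vvvv=1111 (unused), L=1 (256-bit), pp=01 (0x66)
    0x18,                    // VBROADCASTSS opcode
    0xbc,                    // ModRM: mod=10 (disp32), reg=111 (ymm7), rm=100 (SIB)
    0x8b,                    // SIB: scale=4 (rcx*4), index=001 (rcx), base=011 (rbx)
    0x10, 0x27, 0x00, 0x00,  // disp32 = 0x2710 = 10000
};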
TEST(AssemblerX64LaneOp256bit) {
TEST(AssemblerX64AVX2Op256bit) {
if (!CpuFeatures::IsSupported(AVX2)) return;
CcTest::InitializeVM();
v8::HandleScope scope(CcTest::isolate());
@@ -2581,6 +2586,11 @@ TEST(AssemblerX64LaneOp256bit) {
__ vpblendw(ymm2, ymm3, Operand(rbx, rcx, times_4, 10000), 23);
__ vpalignr(ymm10, ymm11, ymm12, 4);
__ vpalignr(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000), 4);
__ vbroadcastss(ymm7, xmm0);
__ vpbroadcastb(ymm2, xmm1);
__ vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000));
__ vpbroadcastw(ymm15, xmm4);
__ vpbroadcastw(ymm5, Operand(rbx, rcx, times_4, 10000));
CodeDesc desc;
masm.GetCode(isolate, &desc);
@@ -2611,7 +2621,17 @@ TEST(AssemblerX64LaneOp256bit) {
// vpalignr ymm10, ymm11, ymm12, 4
0xC4, 0x43, 0x25, 0x0F, 0xD4, 0x04,
// vpalignr ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710], 4
0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04};
0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04,
// vbroadcastss ymm7, xmm0
0xc4, 0xe2, 0x7d, 0x18, 0xf8,
// vpbroadcastb ymm2, xmm1
0xc4, 0xe2, 0x7d, 0x78, 0xd1,
// vpbroadcastb ymm3, BYTE PTR [rbx+rcx*4+0x2710]
0xc4, 0xe2, 0x7d, 0x78, 0x9c, 0x8b, 0x10, 0x27, 0x00, 0x00,
// vpbroadcastw ymm15, xmm4
0xc4, 0x62, 0x7d, 0x79, 0xfc,
// vpbroadcastw ymm5, WORD PTR [rbx+rcx*4+0x2710]
0xc4, 0xe2, 0x7d, 0x79, 0xac, 0x8b, 0x10, 0x27, 0x00, 0x00};
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
...
test/cctest/test-disasm-x64.cc
@@ -1312,14 +1312,36 @@ UNINITIALIZED_TEST(DisasmX64CheckOutputAVX) {
UNINITIALIZED_TEST(DisasmX64YMMRegister) {
if (!CpuFeatures::IsSupported(AVX)) return;
DisassemblerTester t;
CpuFeatureScope fscope(t.assm(), AVX);
// Short immediate instructions
COMPARE("c5fd6fc1 vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
COMPARE("c5f77cc2 vhaddps ymm0,ymm1,ymm2",
vhaddps(ymm0, ymm1, ymm2));
COMPARE("c5f77c848b10270000 vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
{
CpuFeatureScope fscope(t.assm(), AVX);
// Short immediate instructions
COMPARE("c5fd6fc1 vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
COMPARE("c5f77cc2 vhaddps ymm0,ymm1,ymm2",
vhaddps(ymm0, ymm1, ymm2));
COMPARE("c5f77c848b10270000 vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
COMPARE("c4e27d18bc8b10270000 vbroadcastss ymm7,[rbx+rcx*4+0x2710]",
vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000)));
}
if (!CpuFeatures::IsSupported(AVX2)) return;
{
CpuFeatureScope fscope(t.assm(), AVX2);
// Short immediate instructions
COMPARE("c4e27d18d1 vbroadcastss ymm2,xmm1",
vbroadcastss(ymm2, xmm1));
COMPARE("c4e27d789c8b10270000 vpbroadcastb ymm3,[rbx+rcx*4+0x2710]",
vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000)));
COMPARE("c4e27d79d3 vpbroadcastw ymm2,xmm3",
vpbroadcastw(ymm2, xmm3));
COMPARE("c4c27d58f8 vpbroadcastd ymm7,xmm8",
vpbroadcastd(ymm7, xmm8));
COMPARE("c4627d588c8b10270000 vpbroadcastd ymm9,[rbx+rcx*4+0x2710]",
vpbroadcastd(ymm9, Operand(rbx, rcx, times_4, 10000)));
}
}
#undef __
...