Commit 83a58b70 authored by Yolanda Chen, committed by V8 LUCI CQ

[x64] Implement 256-bit assembly for v(p)broadcast*

Bug: v8:12228
Change-Id: I434b07e3d7a2e270dc7dd26950b9dd047eb46a56
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3219944
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Yolanda Chen <yolanda.chen@intel.com>
Cr-Commit-Position: refs/heads/main@{#77446}
parent dfbd9edb
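
This change extends the x64 assembler so that vbroadcastss and the vpbroadcast* family accept YMMRegister destinations and emit the VEX.256 (VEX.L = 1) encodings. As a rough illustration of the new surface (a sketch only, written in the style of the tests further down in this diff, which are the authoritative call sites), 256-bit broadcasts are emitted just like their 128-bit counterparts:

  // Broadcast the dword at [rbx + rcx*4 + 10000] into all eight lanes of ymm7 (AVX).
  __ vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000));
  // Broadcast the low byte of xmm1 into all 32 byte lanes of ymm2 (AVX2).
  __ vpbroadcastb(ymm2, xmm1);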
@@ -3441,21 +3441,24 @@ void Assembler::vmovshdup(XMMRegister dst, XMMRegister src) {
   emit_sse_operand(dst, src);
 }
 
-void Assembler::vbroadcastss(XMMRegister dst, Operand src) {
-  DCHECK(IsEnabled(AVX));
-  EnsureSpace ensure_space(this);
-  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
-  emit(0x18);
-  emit_sse_operand(dst, src);
-}
-
-void Assembler::vbroadcastss(XMMRegister dst, XMMRegister src) {
-  DCHECK(IsEnabled(AVX2));
-  EnsureSpace ensure_space(this);
-  emit_vex_prefix(dst, xmm0, src, kL128, k66, k0F38, kW0);
-  emit(0x18);
-  emit_sse_operand(dst, src);
-}
+#define BROADCASTSS(SIMDRegister, length)                           \
+  void Assembler::vbroadcastss(SIMDRegister dst, Operand src) {     \
+    DCHECK(IsEnabled(AVX));                                         \
+    EnsureSpace ensure_space(this);                                 \
+    emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0);    \
+    emit(0x18);                                                     \
+    emit_sse_operand(dst, src);                                     \
+  }                                                                 \
+  void Assembler::vbroadcastss(SIMDRegister dst, XMMRegister src) { \
+    DCHECK(IsEnabled(AVX2));                                        \
+    EnsureSpace ensure_space(this);                                 \
+    emit_vex_prefix(dst, xmm0, src, k##length, k66, k0F38, kW0);    \
+    emit(0x18);                                                     \
+    emit_sse_operand(dst, src);                                     \
+  }
+BROADCASTSS(XMMRegister, L128)
+BROADCASTSS(YMMRegister, L256)
+#undef BROADCASTSS
 
 void Assembler::fma_instr(byte op, XMMRegister dst, XMMRegister src1,
                           XMMRegister src2, VectorLength l, SIMDPrefix pp,
@@ -3652,17 +3655,6 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1,
   emit_sse_operand(dst, src2);
 }
 
-void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
-                       YMMRegister src2, SIMDPrefix pp, LeadingOpcode m, VexW w,
-                       CpuFeature feature) {
-  DCHECK(IsEnabled(feature));
-  DCHECK(feature == AVX || feature == AVX2);
-  EnsureSpace ensure_space(this);
-  emit_vex_prefix(dst, src1, src2, kL256, pp, m, w);
-  emit(op);
-  emit_sse_operand(dst, src2);
-}
-
 void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
                        SIMDPrefix pp, LeadingOpcode m, VexW w,
                        CpuFeature feature) {
@@ -3674,9 +3666,9 @@ void Assembler::vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
   emit_sse_operand(dst, src2);
 }
 
-void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
-                       SIMDPrefix pp, LeadingOpcode m, VexW w,
-                       CpuFeature feature) {
+template <typename Reg1, typename Reg2, typename Op>
+void Assembler::vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
+                       LeadingOpcode m, VexW w, CpuFeature feature) {
   DCHECK(IsEnabled(feature));
   DCHECK(feature == AVX || feature == AVX2);
   EnsureSpace ensure_space(this);
@@ -3685,6 +3677,19 @@ void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
   emit_sse_operand(dst, src2);
 }
 
+template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
+    byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2, SIMDPrefix pp,
+    LeadingOpcode m, VexW w, CpuFeature feature);
+template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
+    byte op, YMMRegister dst, XMMRegister src1, XMMRegister src2, SIMDPrefix pp,
+    LeadingOpcode m, VexW w, CpuFeature feature);
+template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
+    byte op, YMMRegister dst, YMMRegister src1, Operand src2, SIMDPrefix pp,
+    LeadingOpcode m, VexW w, CpuFeature feature);
+template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
+    byte op, YMMRegister dst, XMMRegister src1, Operand src2, SIMDPrefix pp,
+    LeadingOpcode m, VexW w, CpuFeature feature);
+
 void Assembler::vps(byte op, XMMRegister dst, XMMRegister src1,
                     XMMRegister src2) {
   DCHECK(IsEnabled(AVX));
......
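
For reference, the BROADCASTSS macro above is instantiated once per vector length; BROADCASTSS(YMMRegister, L256) expands to roughly the following (a sketch of the preprocessor output, not literal source):

  void Assembler::vbroadcastss(YMMRegister dst, Operand src) {
    DCHECK(IsEnabled(AVX));
    EnsureSpace ensure_space(this);
    // kL256 sets VEX.L = 1, selecting the 256-bit form of the instruction.
    emit_vex_prefix(dst, xmm0, src, kL256, k66, k0F38, kW0);
    emit(0x18);  // opcode byte for vbroadcastss
    emit_sse_operand(dst, src);
  }
  void Assembler::vbroadcastss(YMMRegister dst, XMMRegister src) {
    DCHECK(IsEnabled(AVX2));  // the register-source form requires AVX2
    EnsureSpace ensure_space(this);
    emit_vex_prefix(dst, xmm0, src, kL256, k66, k0F38, kW0);
    emit(0x18);
    emit_sse_operand(dst, src);
  }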
@@ -42,6 +42,7 @@
 #include <memory>
 #include <vector>
 
+#include "src/base/export-template.h"
 #include "src/codegen/assembler.h"
 #include "src/codegen/cpu-features.h"
 #include "src/codegen/label.h"
@@ -930,14 +931,12 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vinstr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
               SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
-  void vinstr(byte op, YMMRegister dst, YMMRegister src1, YMMRegister src2,
-              SIMDPrefix pp, LeadingOpcode m, VexW w,
-              CpuFeature feature = AVX2);
   void vinstr(byte op, XMMRegister dst, XMMRegister src1, Operand src2,
               SIMDPrefix pp, LeadingOpcode m, VexW w, CpuFeature feature = AVX);
-  void vinstr(byte op, YMMRegister dst, YMMRegister src1, Operand src2,
-              SIMDPrefix pp, LeadingOpcode m, VexW w,
-              CpuFeature feature = AVX2);
+
+  template <typename Reg1, typename Reg2, typename Op>
+  void vinstr(byte op, Reg1 dst, Reg2 src1, Op src2, SIMDPrefix pp,
+              LeadingOpcode m, VexW w, CpuFeature feature = AVX2);
 
   // SSE instructions
   void sse_instr(XMMRegister dst, XMMRegister src, byte escape, byte opcode);
@@ -1290,6 +1289,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vmovshdup(XMMRegister dst, XMMRegister src);
   void vbroadcastss(XMMRegister dst, Operand src);
   void vbroadcastss(XMMRegister dst, XMMRegister src);
+  void vbroadcastss(YMMRegister dst, Operand src);
+  void vbroadcastss(YMMRegister dst, XMMRegister src);
   void fma_instr(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                  VectorLength l, SIMDPrefix pp, LeadingOpcode m, VexW w);
@@ -1735,11 +1736,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   // AVX2 instructions
 #define AVX2_INSTRUCTION(instr, prefix, escape1, escape2, opcode)           \
-  void instr(XMMRegister dst, XMMRegister src) {                            \
-    vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
-           AVX2);                                                           \
-  }                                                                         \
-  void instr(XMMRegister dst, Operand src) {                                \
+  template <typename Reg, typename Op>                                      \
+  void instr(Reg dst, Op src) {                                             \
     vinstr(0x##opcode, dst, xmm0, src, k##prefix, k##escape1##escape2, kW0, \
            AVX2);                                                           \
   }
@@ -2418,6 +2416,23 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
 #endif
 };
 
+extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
+    void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
+                           YMMRegister src2, SIMDPrefix pp,
+                           LeadingOpcode m, VexW w, CpuFeature feature);
+extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
+    void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
+                           XMMRegister src2, SIMDPrefix pp,
+                           LeadingOpcode m, VexW w, CpuFeature feature);
+extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
+    void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
+                           Operand src2, SIMDPrefix pp, LeadingOpcode m,
+                           VexW w, CpuFeature feature);
+extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
+    void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
+                           Operand src2, SIMDPrefix pp, LeadingOpcode m,
+                           VexW w, CpuFeature feature);
+
 // Helper class that ensures that there is enough space for generating
 // instructions and relocation information. The constructor makes
 // sure that there is enough space and (in debug mode) the destructor
......
@@ -186,7 +186,7 @@
 #define SSE4_2_INSTRUCTION_LIST(V) V(pcmpgtq, 66, 0F, 38, 37)
 
-// These require AVX2, and we only define the VEX-128 versions.
+// These require AVX2.
 #define AVX2_BROADCAST_LIST(V)    \
   V(vpbroadcastd, 66, 0F, 38, 58) \
   V(vpbroadcastb, 66, 0F, 38, 78) \
......
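
With the AVX2_INSTRUCTION macro in the header now templated over the register and operand types, each entry of AVX2_BROADCAST_LIST yields both the XMM and the YMM overloads from a single definition. For example, the V(vpbroadcastb, 66, 0F, 38, 78) entry expands to roughly the following member (a sketch of the generated code, not literal source):

  template <typename Reg, typename Op>
  void vpbroadcastb(Reg dst, Op src) {
    // 0x78 is the opcode from the list entry; k66 and k0F38 come from its
    // prefix and escape fields, and the register types select 128- vs 256-bit.
    vinstr(0x78, dst, xmm0, src, k66, k0F38, kW0, AVX2);
  }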
@@ -891,7 +891,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
     switch (opcode) {
       case 0x18:
         AppendToBuffer("vbroadcastss %s,", NameOfAVXRegister(regop));
-        current += PrintRightAVXOperand(current);
+        current += PrintRightXMMOperand(current);
         break;
       case 0x98:
         AppendToBuffer("vfmadd132p%c %s,%s,", float_size_code(),
@@ -1017,7 +1017,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
 #define DISASSEMBLE_AVX2_BROADCAST(instruction, _1, _2, _3, code)     \
   case 0x##code:                                                      \
     AppendToBuffer("" #instruction " %s,", NameOfAVXRegister(regop)); \
-    current += PrintRightAVXOperand(current);                         \
+    current += PrintRightXMMOperand(current);                         \
     break;
       AVX2_BROADCAST_LIST(DISASSEMBLE_AVX2_BROADCAST)
 #undef DISASSEMBLE_AVX2_BROADCAST
......
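
The switch to PrintRightXMMOperand above reflects that a broadcast's source is always an XMM register or a narrow memory operand, even when VEX.L = 1 selects a YMM destination. As a worked decode of one byte sequence checked by the tests below (field layout per the three-byte VEX encoding):

  // c4 e2 7d 18 f8   =>   vbroadcastss ymm7, xmm0
  // c4   three-byte VEX escape
  // e2   ~R=1, ~X=1, ~B=1; m-mmmm = 00010 (0F 38 opcode map)
  // 7d   W=0, ~vvvv=1111 (unused), L=1 (256-bit destination), pp=01 (66 prefix)
  // 18   opcode: vbroadcastss
  // f8   ModRM: mod=11, reg=111 (ymm7), rm=000 (xmm0 source)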
@@ -2536,6 +2536,7 @@ TEST(AssemblerX64Regmove256bit) {
   __ vmovdqu(ymm10, ymm11);
   __ vmovdqu(ymm9, Operand(rbx, rcx, times_4, 10000));
   __ vmovdqu(Operand(rbx, rcx, times_4, 10000), ymm0);
+  __ vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000));
 
   CodeDesc desc;
   masm.GetCode(isolate, &desc);
@@ -2558,11 +2559,15 @@ TEST(AssemblerX64Regmove256bit) {
       // vmovdqu ymm9,YMMWORD PTR [rbx+rcx*4+0x2710]
      0xC5, 0x7E, 0x6F, 0x8C, 0x8B, 0x10, 0x27, 0x00, 0x00,
       // vmovdqu YMMWORD PTR [rbx+rcx*4+0x2710],ymm0
-      0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00};
+      0xC5, 0xFE, 0x7F, 0x84, 0x8B, 0x10, 0x27, 0x00, 0x00,
+      // vbroadcastss ymm7, DWORD PTR [rbx+rcx*4+0x2710]
+      0xc4, 0xe2, 0x7d, 0x18, 0xbc, 0x8b, 0x10, 0x27, 0x00,
+      0x00};
   CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
 }
 
-TEST(AssemblerX64LaneOp256bit) {
+TEST(AssemblerX64AVX2Op256bit) {
   if (!CpuFeatures::IsSupported(AVX2)) return;
   CcTest::InitializeVM();
   v8::HandleScope scope(CcTest::isolate());
@@ -2581,6 +2586,11 @@ TEST(AssemblerX64LaneOp256bit) {
   __ vpblendw(ymm2, ymm3, Operand(rbx, rcx, times_4, 10000), 23);
   __ vpalignr(ymm10, ymm11, ymm12, 4);
   __ vpalignr(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000), 4);
+  __ vbroadcastss(ymm7, xmm0);
+  __ vpbroadcastb(ymm2, xmm1);
+  __ vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000));
+  __ vpbroadcastw(ymm15, xmm4);
+  __ vpbroadcastw(ymm5, Operand(rbx, rcx, times_4, 10000));
 
   CodeDesc desc;
   masm.GetCode(isolate, &desc);
@@ -2611,7 +2621,17 @@ TEST(AssemblerX64LaneOp256bit) {
       // vpalignr ymm10, ymm11, ymm12, 4
       0xC4, 0x43, 0x25, 0x0F, 0xD4, 0x04,
       // vpalignr ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710], 4
-      0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04};
+      0xC4, 0x63, 0x25, 0x0F, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00, 0x04,
+      // vbroadcastss ymm7, xmm0
+      0xc4, 0xe2, 0x7d, 0x18, 0xf8,
+      // vpbroadcastb ymm2, xmm1
+      0xc4, 0xe2, 0x7d, 0x78, 0xd1,
+      // vpbroadcastb ymm3, BYTE PTR [rbx+rcx*4+0x2710]
+      0xc4, 0xe2, 0x7d, 0x78, 0x9c, 0x8b, 0x10, 0x27, 0x00, 0x00,
+      // vpbroadcastw ymm15, xmm4
+      0xc4, 0x62, 0x7d, 0x79, 0xfc,
+      // vpbroadcastw ymm5, WORD PTR [rbx+rcx*4+0x2710]
+      0xc4, 0xe2, 0x7d, 0x79, 0xac, 0x8b, 0x10, 0x27, 0x00, 0x00};
   CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
 }
......
@@ -1312,14 +1312,36 @@ UNINITIALIZED_TEST(DisasmX64CheckOutputAVX) {
 UNINITIALIZED_TEST(DisasmX64YMMRegister) {
   if (!CpuFeatures::IsSupported(AVX)) return;
   DisassemblerTester t;
-  CpuFeatureScope fscope(t.assm(), AVX);
-
-  // Short immediate instructions
-  COMPARE("c5fd6fc1             vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
-  COMPARE("c5f77cc2             vhaddps ymm0,ymm1,ymm2",
-          vhaddps(ymm0, ymm1, ymm2));
-  COMPARE("c5f77c848b10270000   vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
-          vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
+
+  {
+    CpuFeatureScope fscope(t.assm(), AVX);
+
+    // Short immediate instructions
+    COMPARE("c5fd6fc1             vmovdqa ymm0,ymm1", vmovdqa(ymm0, ymm1));
+    COMPARE("c5f77cc2             vhaddps ymm0,ymm1,ymm2",
+            vhaddps(ymm0, ymm1, ymm2));
+    COMPARE("c5f77c848b10270000   vhaddps ymm0,ymm1,[rbx+rcx*4+0x2710]",
+            vhaddps(ymm0, ymm1, Operand(rbx, rcx, times_4, 10000)));
+    COMPARE("c4e27d18bc8b10270000 vbroadcastss ymm7,[rbx+rcx*4+0x2710]",
+            vbroadcastss(ymm7, Operand(rbx, rcx, times_4, 10000)));
+  }
+
+  if (!CpuFeatures::IsSupported(AVX2)) return;
+  {
+    CpuFeatureScope fscope(t.assm(), AVX2);
+    // Short immediate instructions
+    COMPARE("c4e27d18d1           vbroadcastss ymm2,xmm1",
+            vbroadcastss(ymm2, xmm1));
+    COMPARE("c4e27d789c8b10270000 vpbroadcastb ymm3,[rbx+rcx*4+0x2710]",
+            vpbroadcastb(ymm3, Operand(rbx, rcx, times_4, 10000)));
+    COMPARE("c4e27d79d3           vpbroadcastw ymm2,xmm3",
+            vpbroadcastw(ymm2, xmm3));
+    COMPARE("c4c27d58f8           vpbroadcastd ymm7,xmm8",
+            vpbroadcastd(ymm7, xmm8));
+    COMPARE("c4627d588c8b10270000 vpbroadcastd ymm9,[rbx+rcx*4+0x2710]",
+            vpbroadcastd(ymm9, Operand(rbx, rcx, times_4, 10000)));
+  }
 }
 
 #undef __
......