Commit 411bb69c authored by jiepan, committed by V8 LUCI CQ

[x64] Implement 256-bit assembler for SSE2_AVX instructions.

Not all SSE2 instructions can be extended to 256-bit-wide AVX
instructions: AVX supports only 128-bit-wide packed integer
operands, while AVX2 supports both 128-bit and 256-bit widths.
Moreover, the 256-bit shift instructions take the shift count in
an XMM register or m128 memory operand, while all their other
operands are YMM registers or m256 operands. We therefore split
SSE2_INSTRUCTION_LIST into three lists: packed double, packed
integer, and packed integer shift (see the sketch below).
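
For illustration, a rough sketch of the 256-bit overloads each list
ends up declaring, based on the macros in the diff below (the 128-bit
forms from the shared DECLARE_SSE2_AVX_INSTRUCTION macro are unchanged):

    // Packed double (SSE2_INSTRUCTION_LIST_PD): the 256-bit form
    // needs only AVX.
    void vandpd(YMMRegister dst, YMMRegister src1, YMMRegister src2);
    // Packed integer (SSE2_INSTRUCTION_LIST_PI): the 256-bit form
    // needs AVX2.
    void vpaddb(YMMRegister dst, YMMRegister src1, YMMRegister src2);
    // Packed integer shift (SSE2_INSTRUCTION_LIST_SHIFT): the shift
    // count stays in an XMM register (or m128) even for the 256-bit form.
    void vpsllq(YMMRegister dst, YMMRegister src1, XMMRegister count);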

Bug: v8:12228
Change-Id: Ieb240673ec51eec4315871e873e145a59bf16d5a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3246760
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Jie Pan <jie.pan@intel.com>
Cr-Commit-Position: refs/heads/main@{#77583}
parent aa0b4a21
@@ -3686,6 +3686,9 @@ template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, YMMRegister src1, Operand src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, YMMRegister src1, XMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
template EXPORT_TEMPLATE_DEFINE(V8_EXPORT_PRIVATE) void Assembler::vinstr(
byte op, YMMRegister dst, XMMRegister src1, Operand src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
......
@@ -995,7 +995,42 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0); \
}
SSE2_INSTRUCTION_LIST(DECLARE_SSE2_AVX_INSTRUCTION)
#define DECLARE_SSE2_PD_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
DECLARE_SSE2_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
void v##instruction(YMMRegister dst, YMMRegister src1, YMMRegister src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX); \
} \
void v##instruction(YMMRegister dst, YMMRegister src1, Operand src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX); \
}
SSE2_INSTRUCTION_LIST_PD(DECLARE_SSE2_PD_AVX_INSTRUCTION)
#undef DECLARE_SSE2_PD_AVX_INSTRUCTION
#define DECLARE_SSE2_PI_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
DECLARE_SSE2_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
void v##instruction(YMMRegister dst, YMMRegister src1, YMMRegister src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX2); \
} \
void v##instruction(YMMRegister dst, YMMRegister src1, Operand src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX2); \
}
SSE2_INSTRUCTION_LIST_PI(DECLARE_SSE2_PI_AVX_INSTRUCTION)
#undef DECLARE_SSE2_PI_AVX_INSTRUCTION
#define DECLARE_SSE2_SHIFT_AVX_INSTRUCTION(instruction, prefix, escape, \
opcode) \
DECLARE_SSE2_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
void v##instruction(YMMRegister dst, YMMRegister src1, XMMRegister src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX2); \
} \
void v##instruction(YMMRegister dst, YMMRegister src1, Operand src2) { \
vinstr(0x##opcode, dst, src1, src2, k##prefix, k##escape, kW0, AVX2); \
}
SSE2_INSTRUCTION_LIST_SHIFT(DECLARE_SSE2_SHIFT_AVX_INSTRUCTION)
#undef DECLARE_SSE2_SHIFT_AVX_INSTRUCTION
#undef DECLARE_SSE2_AVX_INSTRUCTION
#define DECLARE_SSE2_UNOP_AVX_INSTRUCTION(instruction, prefix, escape, opcode) \
@@ -2429,6 +2464,10 @@ void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
Operand src2, SIMDPrefix pp, LeadingOpcode m,
VexW w, CpuFeature feature);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, YMMRegister src1,
XMMRegister src2, SIMDPrefix pp,
LeadingOpcode m, VexW w, CpuFeature feature);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void Assembler::vinstr(byte op, YMMRegister dst, XMMRegister src1,
Operand src2, SIMDPrefix pp, LeadingOpcode m,
VexW w, CpuFeature feature);
......
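
The two hunks above are the matching halves of the extern-template
pattern: the header promises that a given instantiation of the vinstr
member template exists (extern template ... EXPORT_TEMPLATE_DECLARE),
and assembler-x64.cc provides it (template ... EXPORT_TEMPLATE_DEFINE),
so each new dst/src register combination used by the macros needs both
edits. A minimal stand-alone sketch of the pattern, with hypothetical
simplified names and without V8's export macros:

    // asm.h (hypothetical)
    #include <cstdint>
    struct XMMRegister { int code; };
    struct YMMRegister { int code; };
    struct Assembler {
      template <typename Reg1, typename Reg2>
      void vinstr(uint8_t op, YMMRegister dst, Reg1 src1, Reg2 src2);
    };
    // Promise: this combination is instantiated in exactly one .cc file.
    extern template void Assembler::vinstr(uint8_t, YMMRegister,
                                           YMMRegister, XMMRegister);

    // asm.cc (hypothetical)
    template <typename Reg1, typename Reg2>
    void Assembler::vinstr(uint8_t op, YMMRegister dst, Reg1 src1,
                           Reg2 src2) {
      // ... emit the VEX-encoded instruction bytes ...
    }
    // The single real instantiation the extern declaration points at.
    template void Assembler::vinstr(uint8_t, YMMRegister, YMMRegister,
                                    XMMRegister);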
@@ -39,74 +39,86 @@
V(maxss, F3, 0F, 5F)
// Keep sorted by last code.
#define SSE2_INSTRUCTION_LIST(V) \
V(andpd, 66, 0F, 54) \
V(andnpd, 66, 0F, 55) \
V(orpd, 66, 0F, 56) \
V(xorpd, 66, 0F, 57) \
V(addpd, 66, 0F, 58) \
V(mulpd, 66, 0F, 59) \
V(subpd, 66, 0F, 5C) \
V(minpd, 66, 0F, 5D) \
V(divpd, 66, 0F, 5E) \
V(maxpd, 66, 0F, 5F) \
V(punpcklbw, 66, 0F, 60) \
V(punpcklwd, 66, 0F, 61) \
V(punpckldq, 66, 0F, 62) \
V(packsswb, 66, 0F, 63) \
V(pcmpgtb, 66, 0F, 64) \
V(pcmpgtw, 66, 0F, 65) \
V(pcmpgtd, 66, 0F, 66) \
V(packuswb, 66, 0F, 67) \
V(punpckhbw, 66, 0F, 68) \
V(punpckhwd, 66, 0F, 69) \
V(punpckhdq, 66, 0F, 6A) \
V(packssdw, 66, 0F, 6B) \
V(punpcklqdq, 66, 0F, 6C) \
V(punpckhqdq, 66, 0F, 6D) \
V(pcmpeqb, 66, 0F, 74) \
V(pcmpeqw, 66, 0F, 75) \
V(pcmpeqd, 66, 0F, 76) \
V(psrlw, 66, 0F, D1) \
V(psrld, 66, 0F, D2) \
V(psrlq, 66, 0F, D3) \
V(paddq, 66, 0F, D4) \
V(pmullw, 66, 0F, D5) \
V(psubusb, 66, 0F, D8) \
V(psubusw, 66, 0F, D9) \
V(pminub, 66, 0F, DA) \
V(pand, 66, 0F, DB) \
V(paddusb, 66, 0F, DC) \
V(paddusw, 66, 0F, DD) \
V(pmaxub, 66, 0F, DE) \
V(pandn, 66, 0F, DF) \
V(pavgb, 66, 0F, E0) \
V(psraw, 66, 0F, E1) \
V(psrad, 66, 0F, E2) \
V(pavgw, 66, 0F, E3) \
V(pmulhuw, 66, 0F, E4) \
V(pmulhw, 66, 0F, E5) \
V(psubsb, 66, 0F, E8) \
V(psubsw, 66, 0F, E9) \
V(pminsw, 66, 0F, EA) \
V(por, 66, 0F, EB) \
V(paddsb, 66, 0F, EC) \
V(paddsw, 66, 0F, ED) \
V(pmaxsw, 66, 0F, EE) \
V(pxor, 66, 0F, EF) \
V(psllw, 66, 0F, F1) \
V(pslld, 66, 0F, F2) \
V(psllq, 66, 0F, F3) \
V(pmuludq, 66, 0F, F4) \
V(pmaddwd, 66, 0F, F5) \
V(psubb, 66, 0F, F8) \
V(psubw, 66, 0F, F9) \
V(psubd, 66, 0F, FA) \
V(psubq, 66, 0F, FB) \
V(paddb, 66, 0F, FC) \
V(paddw, 66, 0F, FD) \
// SSE2 instructions operating on packed double-precision values.
#define SSE2_INSTRUCTION_LIST_PD(V) \
V(andpd, 66, 0F, 54) \
V(andnpd, 66, 0F, 55) \
V(orpd, 66, 0F, 56) \
V(xorpd, 66, 0F, 57) \
V(addpd, 66, 0F, 58) \
V(mulpd, 66, 0F, 59) \
V(subpd, 66, 0F, 5C) \
V(minpd, 66, 0F, 5D) \
V(divpd, 66, 0F, 5E) \
V(maxpd, 66, 0F, 5F)
// SSE2 instructions operating on packed integer values.
#define SSE2_INSTRUCTION_LIST_PI(V) \
V(punpcklbw, 66, 0F, 60) \
V(punpcklwd, 66, 0F, 61) \
V(punpckldq, 66, 0F, 62) \
V(packsswb, 66, 0F, 63) \
V(pcmpgtb, 66, 0F, 64) \
V(pcmpgtw, 66, 0F, 65) \
V(pcmpgtd, 66, 0F, 66) \
V(packuswb, 66, 0F, 67) \
V(punpckhbw, 66, 0F, 68) \
V(punpckhwd, 66, 0F, 69) \
V(punpckhdq, 66, 0F, 6A) \
V(packssdw, 66, 0F, 6B) \
V(punpcklqdq, 66, 0F, 6C) \
V(punpckhqdq, 66, 0F, 6D) \
V(pcmpeqb, 66, 0F, 74) \
V(pcmpeqw, 66, 0F, 75) \
V(pcmpeqd, 66, 0F, 76) \
V(paddq, 66, 0F, D4) \
V(pmullw, 66, 0F, D5) \
V(psubusb, 66, 0F, D8) \
V(psubusw, 66, 0F, D9) \
V(pminub, 66, 0F, DA) \
V(pand, 66, 0F, DB) \
V(paddusb, 66, 0F, DC) \
V(paddusw, 66, 0F, DD) \
V(pmaxub, 66, 0F, DE) \
V(pandn, 66, 0F, DF) \
V(pavgb, 66, 0F, E0) \
V(pavgw, 66, 0F, E3) \
V(pmulhuw, 66, 0F, E4) \
V(pmulhw, 66, 0F, E5) \
V(psubsb, 66, 0F, E8) \
V(psubsw, 66, 0F, E9) \
V(pminsw, 66, 0F, EA) \
V(por, 66, 0F, EB) \
V(paddsb, 66, 0F, EC) \
V(paddsw, 66, 0F, ED) \
V(pmaxsw, 66, 0F, EE) \
V(pxor, 66, 0F, EF) \
V(pmuludq, 66, 0F, F4) \
V(pmaddwd, 66, 0F, F5) \
V(psubb, 66, 0F, F8) \
V(psubw, 66, 0F, F9) \
V(psubd, 66, 0F, FA) \
V(psubq, 66, 0F, FB) \
V(paddb, 66, 0F, FC) \
V(paddw, 66, 0F, FD) \
V(paddd, 66, 0F, FE)
// SSE2 shift instructions taking the shift count in an XMM register or
// m128 operand.
#define SSE2_INSTRUCTION_LIST_SHIFT(V) \
V(psrlw, 66, 0F, D1) \
V(psrld, 66, 0F, D2) \
V(psrlq, 66, 0F, D3) \
V(psraw, 66, 0F, E1) \
V(psrad, 66, 0F, E2) \
V(psllw, 66, 0F, F1) \
V(pslld, 66, 0F, F2) \
V(psllq, 66, 0F, F3)
#define SSE2_INSTRUCTION_LIST(V) \
SSE2_INSTRUCTION_LIST_PD(V) \
SSE2_INSTRUCTION_LIST_PI(V) \
SSE2_INSTRUCTION_LIST_SHIFT(V)
// SSE2 instructions whose AVX version has two operands.
#define SSE2_UNOP_INSTRUCTION_LIST(V) \
V(ucomisd, 66, 0F, 2E) \
......
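
Note that SSE2_INSTRUCTION_LIST itself is now just the concatenation of
the three sub-lists, so any existing 128-bit code that expands
SSE2_INSTRUCTION_LIST still sees every entry. A minimal stand-alone
model of this X-macro composition, using hypothetical names:

    #define LIST_PD(V) V(foo)
    #define LIST_PI(V) V(bar) V(baz)
    // Old callers keep seeing everything through the combined list.
    #define LIST_ALL(V) LIST_PD(V) LIST_PI(V)

    #define DECLARE(name) void name();
    LIST_ALL(DECLARE)  // expands to: void foo(); void bar(); void baz();
    #undef DECLARE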
@@ -2644,6 +2644,8 @@ TEST(AssemblerX64FloatingPoint256bit) {
Assembler masm(AssemblerOptions{}, buffer->CreateView());
CpuFeatureScope fscope(&masm, AVX);
__ vandpd(ymm1, ymm3, ymm5);
__ vminpd(ymm2, ymm3, Operand(r8, r9, times_4, 10000));
__ vsqrtps(ymm0, ymm1);
__ vunpcklps(ymm2, ymm3, ymm14);
__ vsubps(ymm10, ymm11, ymm12);
@@ -2661,7 +2663,11 @@ TEST(AssemblerX64FloatingPoint256bit) {
code->Print(os);
#endif
byte expected[] = {// VSQRTPS
byte expected[] = {// vandpd ymm1, ymm3, ymm5
0xC5, 0xE5, 0x54, 0xCD,
// vminpd ymm2, ymm3, YMMWORD PTR [r8+r9*4+0x2710]
0xC4, 0x81, 0x65, 0x5D, 0x94, 0x88, 0x10, 0x27, 0x00, 0x00,
// VSQRTPS
0xC5, 0xFC, 0x51, 0xC1,
// VUNPCKLPS
0xC4, 0xC1, 0x64, 0x14, 0xD6,
@@ -2678,6 +2684,54 @@ TEST(AssemblerX64FloatingPoint256bit) {
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
TEST(AssemblerX64Integer256bit) {
if (!CpuFeatures::IsSupported(AVX2)) return;
CcTest::InitializeVM();
v8::HandleScope scope(CcTest::isolate());
auto buffer = AllocateAssemblerBuffer();
Isolate* isolate = CcTest::i_isolate();
Assembler masm(AssemblerOptions{}, buffer->CreateView());
CpuFeatureScope fscope(&masm, AVX2);
// SSE2_AVX_INSTRUCTION
__ vpunpcklbw(ymm9, ymm2, ymm0);
__ vpacksswb(ymm8, ymm3, ymm1);
__ vpcmpgtw(ymm2, ymm7, ymm9);
__ vpand(ymm2, ymm3, ymm4);
__ vpmaxsw(ymm10, ymm11, Operand(rbx, rcx, times_4, 10000));
__ vpaddb(ymm1, ymm2, ymm3);
__ vpsraw(ymm7, ymm1, xmm4);
__ vpsllq(ymm3, ymm2, xmm1);
CodeDesc desc;
masm.GetCode(isolate, &desc);
#ifdef OBJECT_PRINT
Handle<Code> code =
Factory::CodeBuilder(isolate, desc, CodeKind::FOR_TESTING).Build();
StdoutStream os;
code->Print(os);
#endif
byte expected[] = {// SSE2_AVX_INSTRUCTION
// vpunpcklbw ymm9, ymm2, ymm0
0xC5, 0x6D, 0x60, 0xC8,
// vpacksswb ymm8, ymm3, ymm1
0xC5, 0x65, 0x63, 0xC1,
// vpcmpgtw ymm2, ymm7, ymm9
0xC4, 0xC1, 0x45, 0x65, 0xD1,
// vpand ymm2, ymm3, ymm4
0xC5, 0xE5, 0xDB, 0xD4,
// vpmaxsw ymm10, ymm11, YMMWORD PTR [rbx+rcx*4+0x2710]
0xC5, 0x25, 0xEE, 0x94, 0x8B, 0x10, 0x27, 0x00, 0x00,
// vpaddb ymm1, ymm2, ymm3
0xC5, 0xED, 0xFC, 0xCB,
// vpsraw ymm7, ymm1, xmm4
0xC5, 0xF5, 0xE1, 0xFC,
// vpsllq ymm3, ymm2, xmm1
0xC5, 0xED, 0xF3, 0xD9};
CHECK_EQ(0, memcmp(expected, desc.buffer, sizeof(expected)));
}
TEST(CpuFeatures_ProbeImpl) {
// Support for a newer extension implies support for the older extensions.
CHECK_IMPLIES(CpuFeatures::IsSupported(FMA3), CpuFeatures::IsSupported(AVX));
......
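
As a sanity check on the new test's expected bytes, the vpand sequence
(0xC5 0xE5 0xDB 0xD4) decodes per the two-byte VEX format: in 0xE5,
bit 7 is the inverted REX.R (1, so no extension), bits 6-3 are the
inverted vvvv field (1100, so vvvv = 0011 = ymm3, the first source),
bit 2 is L = 1 (256-bit), and bits 1-0 are pp = 01 (the 0x66 prefix);
0xDB is the pand opcode; ModRM 0xD4 is mod=11, reg=010 (dst ymm2),
rm=100 (src2 ymm4). A rough stand-alone model of that encoding, not
V8's emitter:

    #include <cstdint>
    #include <cstdio>

    // Toy two-byte-VEX encoder for vpand ymm_dst, ymm_src1, ymm_src2
    // (register codes 0-7 only; for illustration, not V8's vinstr).
    void emit_vpand(uint8_t dst, uint8_t src1, uint8_t src2) {
      uint8_t bytes[4];
      bytes[0] = 0xC5;  // two-byte VEX escape
      // bit 7: inverted REX.R = 1; bits 6-3: inverted src1 (vvvv);
      // bit 2: L = 1 (256-bit); bits 1-0: pp = 01 (0x66 prefix).
      bytes[1] = 0x80 | ((~src1 & 0xF) << 3) | 0x04 | 0x01;
      bytes[2] = 0xDB;  // opcode: pand
      bytes[3] = 0xC0 | (dst << 3) | src2;  // ModRM: mod=11, reg=dst, rm=src2
      for (uint8_t b : bytes) std::printf("0x%02X ", b);
    }

    int main() { emit_vpand(2, 3, 4); }  // prints: 0xC5 0xE5 0xDB 0xD4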