Commit 4a716fea authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Add AVX codegen for some x64 instructions

This adds AVX codegen for extractps, insertps, and cvtdq2ps. These require
SSE4_1, so AvxHelper is modified to carry an optional CpuFeature for SSE4
operations, and the proper CPU feature scope is opened before calling the
non-AVX fallback.

Bug: v8:9561
Change-Id: Iad2be7ebab41b96f7eb74f4e2bd9776002e6a76c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1874378
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64529}
parent 081114b5
...@@ -2802,7 +2802,7 @@ void Assembler::movdqu(XMMRegister dst, Operand src) { ...@@ -2802,7 +2802,7 @@ void Assembler::movdqu(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src); emit_sse_operand(dst, src);
} }
void Assembler::extractps(Register dst, XMMRegister src, byte imm8) { void Assembler::extractps(Register dst, XMMRegister src, int8_t imm8) {
DCHECK(IsEnabled(SSE4_1)); DCHECK(IsEnabled(SSE4_1));
DCHECK(is_uint8(imm8)); DCHECK(is_uint8(imm8));
EnsureSpace ensure_space(this); EnsureSpace ensure_space(this);
......
...@@ -1084,7 +1084,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1084,7 +1084,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// SSE 4.1 instruction // SSE 4.1 instruction
void insertps(XMMRegister dst, XMMRegister src, byte imm8); void insertps(XMMRegister dst, XMMRegister src, byte imm8);
void insertps(XMMRegister dst, Operand src, byte imm8); void insertps(XMMRegister dst, Operand src, byte imm8);
void extractps(Register dst, XMMRegister src, byte imm8); void extractps(Register dst, XMMRegister src, int8_t imm8);
void pextrb(Register dst, XMMRegister src, int8_t imm8); void pextrb(Register dst, XMMRegister src, int8_t imm8);
void pextrb(Operand dst, XMMRegister src, int8_t imm8); void pextrb(Operand dst, XMMRegister src, int8_t imm8);
void pextrw(Register dst, XMMRegister src, int8_t imm8); void pextrw(Register dst, XMMRegister src, int8_t imm8);
...@@ -1580,6 +1580,20 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1580,6 +1580,20 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x72, xmm4, dst, src, k66, k0F, kWIG); vinstr(0x72, xmm4, dst, src, k66, k0F, kWIG);
emit(imm8); emit(imm8);
} }
  // AVX form of SSE4.1 insertps (VEX.66.0F3A 0x21): copies src1 into dst with
  // one 32-bit lane replaced from src2; imm8[5:4] selects the destination
  // lane (see the ReplaceLane codegen that builds this immediate).
  void vinsertps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 byte imm8) {
    vinstr(0x21, dst, src1, src2, k66, k0F3A, kWIG);
    emit(imm8);
  }
  // Memory-operand form of vinsertps: the replacement 32-bit value is loaded
  // from src2 instead of an XMM register; imm8[5:4] selects the dst lane.
  void vinsertps(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8) {
    vinstr(0x21, dst, src1, src2, k66, k0F3A, kWIG);
    emit(imm8);
  }
  // AVX form of SSE4.1 extractps (VEX.66.0F3A 0x17): stores the 32-bit lane
  // of src selected by imm8 into the general-purpose register dst.
  void vextractps(Register dst, XMMRegister src, int8_t imm8) {
    // vinstr's operand encoding expects XMM operands, so re-wrap the GPR's
    // code as an XMMRegister purely for encoding (same trick as vpextrb).
    XMMRegister idst = XMMRegister::from_code(dst.code());
    vinstr(0x17, src, xmm0, idst, k66, k0F3A, kWIG);
    emit(imm8);
  }
void vpextrb(Register dst, XMMRegister src, uint8_t imm8) { void vpextrb(Register dst, XMMRegister src, uint8_t imm8) {
XMMRegister idst = XMMRegister::from_code(dst.code()); XMMRegister idst = XMMRegister::from_code(dst.code());
vinstr(0x14, src, xmm0, idst, k66, k0F3A, kW0); vinstr(0x14, src, xmm0, idst, k66, k0F3A, kW0);
...@@ -1638,6 +1652,12 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -1638,6 +1652,12 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x70, dst, xmm0, src, k66, k0F, kWIG); vinstr(0x70, dst, xmm0, src, k66, k0F, kWIG);
emit(imm8); emit(imm8);
} }
  // AVX form of cvtdq2ps (VEX.0F 0x5B): converts packed int32 lanes in src
  // to single-precision floats in dst (used by F32x4SConvertI32x4).
  void vcvtdq2ps(XMMRegister dst, XMMRegister src) {
    vinstr(0x5B, dst, xmm0, src, kNone, k0F, kWIG);
  }
  // Memory-operand form of vcvtdq2ps: the packed int32 source is loaded
  // from memory instead of an XMM register.
  void vcvtdq2ps(XMMRegister dst, Operand src) {
    vinstr(0x5B, dst, xmm0, src, kNone, k0F, kWIG);
  }
void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2); void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vps(byte op, XMMRegister dst, XMMRegister src1, Operand src2); void vps(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
......
...@@ -80,6 +80,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -80,6 +80,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
template <typename Dst, typename... Args> template <typename Dst, typename... Args>
struct AvxHelper { struct AvxHelper {
Assembler* assm; Assembler* assm;
base::Optional<CpuFeature> feature = base::nullopt;
// Call a method where the AVX version expects the dst argument to be // Call a method where the AVX version expects the dst argument to be
// duplicated. // duplicated.
template <void (Assembler::*avx)(Dst, Dst, Args...), template <void (Assembler::*avx)(Dst, Dst, Args...),
...@@ -88,6 +89,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -88,6 +89,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(assm, AVX); CpuFeatureScope scope(assm, AVX);
(assm->*avx)(dst, dst, args...); (assm->*avx)(dst, dst, args...);
} else if (feature.has_value()) {
DCHECK(CpuFeatures::IsSupported(*feature));
CpuFeatureScope scope(assm, *feature);
(assm->*no_avx)(dst, args...);
} else { } else {
(assm->*no_avx)(dst, args...); (assm->*no_avx)(dst, args...);
} }
...@@ -100,6 +105,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -100,6 +105,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(assm, AVX); CpuFeatureScope scope(assm, AVX);
(assm->*avx)(dst, args...); (assm->*avx)(dst, args...);
} else if (feature.has_value()) {
DCHECK(CpuFeatures::IsSupported(*feature));
CpuFeatureScope scope(assm, *feature);
(assm->*no_avx)(dst, args...);
} else { } else {
(assm->*no_avx)(dst, args...); (assm->*no_avx)(dst, args...);
} }
...@@ -113,6 +122,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -113,6 +122,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \ .template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
} }
// Like AVX_OP, but for instructions whose non-AVX fallback requires SSE4_1:
// the optional CpuFeature passed to AvxHelper makes it open a CpuFeatureScope
// for SSE4_1 before emitting the non-AVX instruction.
#define AVX_OP_SSE4_1(macro_name, name) \
template <typename Dst, typename... Args> \
void macro_name(Dst dst, Args... args) { \
AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
AVX_OP(Subsd, subsd) AVX_OP(Subsd, subsd)
AVX_OP(Divss, divss) AVX_OP(Divss, divss)
AVX_OP(Divsd, divsd) AVX_OP(Divsd, divsd)
...@@ -167,17 +182,20 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -167,17 +182,20 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Psrad, psrad) AVX_OP(Psrad, psrad)
AVX_OP(Psrld, psrld) AVX_OP(Psrld, psrld)
AVX_OP(Paddd, paddd) AVX_OP(Paddd, paddd)
AVX_OP(Pmulld, pmulld)
AVX_OP(Pminsd, pminsd)
AVX_OP(Pminud, pminud)
AVX_OP(Pmaxsd, pmaxsd)
AVX_OP(Pmaxud, pmaxud)
AVX_OP(Pcmpgtd, pcmpgtd) AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Addpd, addpd) AVX_OP(Addpd, addpd)
AVX_OP(Subpd, subpd) AVX_OP(Subpd, subpd)
AVX_OP(Mulpd, mulpd) AVX_OP(Mulpd, mulpd)
AVX_OP(Divpd, divpd) AVX_OP(Divpd, divpd)
AVX_OP(Shufps, shufps) AVX_OP(Shufps, shufps)
AVX_OP(Cvtdq2ps, cvtdq2ps)
AVX_OP_SSE4_1(Pmulld, pmulld)
AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pminud, pminud)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pmaxud, pmaxud)
AVX_OP_SSE4_1(Extractps, extractps)
AVX_OP_SSE4_1(Insertps, insertps)
#undef AVX_OP #undef AVX_OP
......
...@@ -2443,26 +2443,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2443,26 +2443,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64F32x4ExtractLane: { case kX64F32x4ExtractLane: {
CpuFeatureScope sse_scope(tasm(), SSE4_1); __ Extractps(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1));
__ extractps(kScratchRegister, i.InputSimd128Register(0), i.InputInt8(1)); __ Movd(i.OutputDoubleRegister(), kScratchRegister);
__ movd(i.OutputDoubleRegister(), kScratchRegister);
break; break;
} }
case kX64F32x4ReplaceLane: { case kX64F32x4ReplaceLane: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
// The insertps instruction uses imm8[5:4] to indicate the lane // The insertps instruction uses imm8[5:4] to indicate the lane
// that needs to be replaced. // that needs to be replaced.
byte select = i.InputInt8(1) << 4 & 0x30; byte select = i.InputInt8(1) << 4 & 0x30;
if (instr->InputAt(2)->IsFPRegister()) { if (instr->InputAt(2)->IsFPRegister()) {
__ insertps(i.OutputSimd128Register(), i.InputDoubleRegister(2), __ Insertps(i.OutputSimd128Register(), i.InputDoubleRegister(2),
select); select);
} else { } else {
__ insertps(i.OutputSimd128Register(), i.InputOperand(2), select); __ Insertps(i.OutputSimd128Register(), i.InputOperand(2), select);
} }
break; break;
} }
case kX64F32x4SConvertI32x4: { case kX64F32x4SConvertI32x4: {
__ cvtdq2ps(i.OutputSimd128Register(), i.InputSimd128Register(0)); __ Cvtdq2ps(i.OutputSimd128Register(), i.InputSimd128Register(0));
break; break;
} }
case kX64F32x4UConvertI32x4: { case kX64F32x4UConvertI32x4: {
......
...@@ -941,12 +941,23 @@ int DisassemblerX64::AVXInstruction(byte* data) { ...@@ -941,12 +941,23 @@ int DisassemblerX64::AVXInstruction(byte* data) {
current += PrintRightOperand(current); current += PrintRightOperand(current);
AppendToBuffer(",%s,0x%x,", NameOfXMMRegister(regop), *current++); AppendToBuffer(",%s,0x%x,", NameOfXMMRegister(regop), *current++);
break; break;
case 0x17:
AppendToBuffer("vextractps ");
current += PrintRightOperand(current);
AppendToBuffer(",%s,0x%x,", NameOfXMMRegister(regop), *current++);
break;
case 0x20: case 0x20:
AppendToBuffer("vpinsrb %s,%s,", NameOfXMMRegister(regop), AppendToBuffer("vpinsrb %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv)); NameOfXMMRegister(vvvv));
current += PrintRightByteOperand(current); current += PrintRightByteOperand(current);
AppendToBuffer(",0x%x", *current++); AppendToBuffer(",0x%x", *current++);
break; break;
case 0x21:
AppendToBuffer("vinsertps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
current += PrintRightByteOperand(current);
AppendToBuffer(",0x%x", *current++);
break;
case 0x22: case 0x22:
AppendToBuffer("vpinsrd %s,%s,", NameOfXMMRegister(regop), AppendToBuffer("vpinsrd %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv)); NameOfXMMRegister(vvvv));
...@@ -1276,6 +1287,10 @@ int DisassemblerX64::AVXInstruction(byte* data) { ...@@ -1276,6 +1287,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
NameOfXMMRegister(vvvv)); NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current); current += PrintRightXMMOperand(current);
break; break;
case 0x5B:
AppendToBuffer("vcvtdq2ps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0xC2: { case 0xC2: {
AppendToBuffer("vcmpps %s,%s,", NameOfXMMRegister(regop), AppendToBuffer("vcmpps %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv)); NameOfXMMRegister(vvvv));
......
...@@ -751,6 +751,10 @@ TEST(DisasmX64) { ...@@ -751,6 +751,10 @@ TEST(DisasmX64) {
#undef EMIT_SSE2_AVXINSTR #undef EMIT_SSE2_AVXINSTR
#undef EMIT_SSE34_AVXINSTR #undef EMIT_SSE34_AVXINSTR
__ vinsertps(xmm1, xmm2, xmm3, 1);
__ vinsertps(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 1);
__ vextractps(rax, xmm1, 1);
__ vlddqu(xmm1, Operand(rbx, rcx, times_4, 10000)); __ vlddqu(xmm1, Operand(rbx, rcx, times_4, 10000));
__ vpsllw(xmm0, xmm15, 21); __ vpsllw(xmm0, xmm15, 21);
__ vpsrlw(xmm0, xmm15, 21); __ vpsrlw(xmm0, xmm15, 21);
...@@ -771,6 +775,9 @@ TEST(DisasmX64) { ...@@ -771,6 +775,9 @@ TEST(DisasmX64) {
__ vpinsrd(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 2); __ vpinsrd(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 2);
__ vpshufd(xmm1, xmm2, 85); __ vpshufd(xmm1, xmm2, 85);
__ vshufps(xmm3, xmm2, xmm3, 3); __ vshufps(xmm3, xmm2, xmm3, 3);
__ vcvtdq2ps(xmm5, xmm1);
__ vcvtdq2ps(xmm5, Operand(rdx, 4));
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment