Commit d60809aa authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Add AVX for some i64x2 instructions

Also add missing disasm for SSE4_2 instruction.

Bug: v8:9561
Change-Id: Idc8d3c0e59f0e9aff57ebdcc5774bba375828597
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1986386
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65769}
parent ba14c2f3
......@@ -1004,6 +1004,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
SSSE3_INSTRUCTION_LIST(DECLARE_SSE34_AVX_INSTRUCTION)
SSE4_INSTRUCTION_LIST(DECLARE_SSE34_AVX_INSTRUCTION)
SSE4_2_INSTRUCTION_LIST(DECLARE_SSE34_AVX_INSTRUCTION)
#undef DECLARE_SSE34_AVX_INSTRUCTION
#define DECLARE_SSE4_PMOV_AVX_INSTRUCTION(instruction, prefix, escape1, \
......
......@@ -136,6 +136,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE4_1)} \
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
#define AVX_OP_SSE4_2(macro_name, name) \
template <typename Dst, typename... Args> \
void macro_name(Dst dst, Args... args) { \
AvxHelper<Dst, Args...>{this, base::Optional<CpuFeature>(SSE4_2)} \
.template emit<&Assembler::v##name, &Assembler::name>(dst, args...); \
}
AVX_OP(Subsd, subsd)
AVX_OP(Divss, divss)
AVX_OP(Divsd, divsd)
......@@ -193,7 +199,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Psrad, psrad)
AVX_OP(Psrld, psrld)
AVX_OP(Paddd, paddd)
AVX_OP(Paddq, paddq)
AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Pmuludq, pmuludq)
AVX_OP(Addpd, addpd)
AVX_OP(Subpd, subpd)
AVX_OP(Mulpd, mulpd)
......@@ -221,6 +229,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP_SSSE3(Pshufb, pshufb)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Palignr, palignr)
AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
AVX_OP_SSE4_1(Pmulld, pmulld)
AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pminud, pminud)
......@@ -237,6 +246,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
AVX_OP_SSE4_1(Pextrq, pextrq)
AVX_OP_SSE4_2(Pcmpgtq, pcmpgtq)
#undef AVX_OP
......
......@@ -2696,38 +2696,37 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I64x2Add: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ paddq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Paddq(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I64x2Sub: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ psubq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Psubq(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I64x2Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister tmp1 = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
__ movaps(tmp1, left);
__ movaps(tmp2, right);
__ Movaps(tmp1, left);
__ Movaps(tmp2, right);
// Multiply high dword of each qword of left with right.
__ psrlq(tmp1, 32);
__ pmuludq(tmp1, right);
__ Psrlq(tmp1, 32);
__ Pmuludq(tmp1, right);
// Multiply high dword of each qword of right with left.
__ psrlq(tmp2, 32);
__ pmuludq(tmp2, left);
__ Psrlq(tmp2, 32);
__ Pmuludq(tmp2, left);
__ paddq(tmp2, tmp1);
__ psllq(tmp2, 32);
__ Paddq(tmp2, tmp1);
__ Psllq(tmp2, 32);
__ pmuludq(left, right);
__ paddq(left, tmp2); // left == dst
__ Pmuludq(left, right);
__ Paddq(left, tmp2); // left == dst
break;
}
case kX64I64x2MinS: {
......@@ -2792,36 +2791,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I64x2Eq: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pcmpeqq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpeqq(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I64x2Ne: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ pcmpeqq(tmp, tmp);
__ pxor(i.OutputSimd128Register(), tmp);
__ Pcmpeqq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpeqq(tmp, tmp);
__ Pxor(i.OutputSimd128Register(), tmp);
break;
}
case kX64I64x2GtS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_2);
__ pcmpgtq(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Pcmpgtq(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I64x2GeS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_2);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(1);
XMMRegister tmp = i.TempSimd128Register(0);
__ movaps(tmp, src);
__ pcmpgtq(tmp, dst);
__ pcmpeqd(dst, dst);
__ pxor(dst, tmp);
__ Movaps(tmp, src);
__ Pcmpgtq(tmp, dst);
__ Pcmpeqd(dst, dst);
__ Pxor(dst, tmp);
break;
}
case kX64I64x2ShrU: {
......@@ -2880,18 +2875,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I64x2GtU: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_2);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(1);
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ psllq(kScratchDoubleReg, 63);
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psllq(kScratchDoubleReg, 63);
__ movaps(tmp, src);
__ pxor(tmp, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
__ pcmpgtq(dst, tmp);
__ Movaps(tmp, src);
__ Pxor(tmp, kScratchDoubleReg);
__ Pxor(dst, kScratchDoubleReg);
__ Pcmpgtq(dst, tmp);
break;
}
case kX64I64x2GeU: {
......@@ -2901,15 +2895,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src = i.InputSimd128Register(1);
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ psllq(kScratchDoubleReg, 63);
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psllq(kScratchDoubleReg, 63);
__ movaps(tmp, src);
__ pxor(dst, kScratchDoubleReg);
__ pxor(tmp, kScratchDoubleReg);
__ pcmpgtq(tmp, dst);
__ pcmpeqd(dst, dst);
__ pxor(dst, tmp);
__ Movaps(tmp, src);
__ Pxor(dst, kScratchDoubleReg);
__ Pxor(tmp, kScratchDoubleReg);
__ Pcmpgtq(tmp, dst);
__ Pcmpeqd(dst, dst);
__ Pxor(dst, tmp);
break;
}
case kX64I32x4Splat: {
......
......@@ -916,6 +916,7 @@ int DisassemblerX64::AVXInstruction(byte* data) {
SSSE3_INSTRUCTION_LIST(DECLARE_SSE_AVX_DIS_CASE)
SSE4_INSTRUCTION_LIST(DECLARE_SSE_AVX_DIS_CASE)
SSE4_2_INSTRUCTION_LIST(DECLARE_SSE_AVX_DIS_CASE)
#undef DECLARE_SSE_AVX_DIS_CASE
#define DECLARE_SSE_PMOV_AVX_DIS_CASE(instruction, notUsed1, notUsed2, \
......
......@@ -739,6 +739,7 @@ TEST(DisasmX64) {
SSE2_INSTRUCTION_LIST(EMIT_SSE2_AVXINSTR)
SSSE3_INSTRUCTION_LIST(EMIT_SSE34_AVXINSTR)
SSE4_INSTRUCTION_LIST(EMIT_SSE34_AVXINSTR)
SSE4_2_INSTRUCTION_LIST(EMIT_SSE34_AVXINSTR)
#undef EMIT_SSE2_AVXINSTR
#undef EMIT_SSE34_AVXINSTR
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment