Commit 1effe529 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Add AVX codegen

Mostly for f32x4 instructions.

Bug: v8:9561
Change-Id: I3a3dc06305acb9e336c494fc399cf5d21518c0e8
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1950488
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65382}
parent 2450b3bc
@@ -1634,6 +1634,16 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
     emit(imm8);
   }
+  void vpblendw(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+                uint8_t mask) {
+    vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG);
+    emit(mask);
+  }
+  void vpblendw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t mask) {
+    vinstr(0x0E, dst, src1, src2, k66, k0F3A, kWIG);
+    emit(mask);
+  }
   void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
   void vps(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
   void vps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2,
...
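For reference, vpblendw mixes its two sources at 16-bit granularity: bit i of the immediate mask selects word i from src2, a clear bit selects it from src1. The code above emits opcode 0x0E in the 66 0F 3A map, followed by the mask byte. A minimal scalar model of the lane semantics (the helper name is illustrative, not part of this change):

#include <cstdint>

// Scalar model of vpblendw dst, src1, src2, mask.
void BlendW(uint16_t dst[8], const uint16_t src1[8], const uint16_t src2[8],
            uint8_t mask) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = (mask & (1 << i)) ? src2[i] : src1[i];
  }
}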
@@ -139,6 +139,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Subsd, subsd)
   AVX_OP(Divss, divss)
   AVX_OP(Divsd, divsd)
+  AVX_OP(Orps, orps)
   AVX_OP(Xorps, xorps)
   AVX_OP(Xorpd, xorpd)
   AVX_OP(Movd, movd)
@@ -193,8 +194,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Addpd, addpd)
   AVX_OP(Subpd, subpd)
   AVX_OP(Mulpd, mulpd)
+  AVX_OP(Minps, minps)
   AVX_OP(Minpd, minpd)
   AVX_OP(Divpd, divpd)
+  AVX_OP(Maxps, maxps)
   AVX_OP(Maxpd, maxpd)
   AVX_OP(Shufps, shufps)
   AVX_OP(Cvtdq2ps, cvtdq2ps)
@@ -208,6 +211,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pshuflw, pshuflw)
   AVX_OP(Punpcklqdq, punpcklqdq)
   AVX_OP(Pshufd, pshufd)
+  AVX_OP(Cmpps, cmpps)
   AVX_OP(Cmppd, cmppd)
   AVX_OP_SSE3(Movddup, movddup)
   AVX_OP_SSSE3(Pshufb, pshufb)
@@ -220,6 +224,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSE4_1(Extractps, extractps)
   AVX_OP_SSE4_1(Insertps, insertps)
   AVX_OP_SSE4_1(Pinsrq, pinsrq)
+  AVX_OP_SSE4_1(Pblendw, pblendw)
 #undef AVX_OP
...
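Each AVX_OP(Macro, name) entry generates a macro-assembler wrapper that emits the VEX-encoded v-form when AVX is available and falls back to the SSE form otherwise; that is what lets the code generator below call Orps, Minps, Cmpps, etc. unconditionally. A simplified sketch of how one expansion behaves (not the exact V8 macro, which is templated):

void TurboAssembler::Orps(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vorps(dst, dst, src);  // three-operand AVX form; dst doubles as src1
  } else {
    orps(dst, src);  // two-operand SSE form
  }
}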
@@ -2463,14 +2463,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_NE(i.OutputSimd128Register(), kScratchDoubleReg);
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
-      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
-      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
-      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
-      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
-      __ psrld(dst, 1);       // divide by 2 to get in unsigned range
-      __ cvtdq2ps(dst, dst);  // convert hi exactly
-      __ addps(dst, dst);     // double hi, exactly
-      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
+      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);  // zeros
+      __ Pblendw(kScratchDoubleReg, dst,
+                 static_cast<uint8_t>(0x55));             // get lo 16 bits
+      __ Psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
+      __ Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
+      __ Psrld(dst,
+               static_cast<byte>(1));  // divide by 2 to get in unsigned range
+      __ Cvtdq2ps(dst, dst);           // convert hi exactly
+      __ Addps(dst, dst);              // double hi, exactly
+      __ Addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
       break;
     }
     case kX64F32x4Abs: {
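This unsigned-to-float sequence works because each 16-bit half of an unsigned 32-bit lane is exactly representable as a float, so only the final add can round. A one-lane scalar model (hypothetical helper, for illustration only):

#include <cstdint>

float UnsignedToFloat(uint32_t x) {
  uint32_t lo = x & 0xFFFF;  // pblendw against zeros keeps the low words
  uint32_t hi = x - lo;      // psubd leaves the high 16 bits
  float f_lo = static_cast<float>(static_cast<int32_t>(lo));  // exact
  // Halve hi so it fits the signed range cvtdq2ps expects, convert
  // (still exact: at most 16 significant bits), then double back.
  float f_hi = static_cast<float>(static_cast<int32_t>(hi >> 1));
  f_hi += f_hi;
  return f_hi + f_lo;  // the only step that may round
}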
@@ -2545,16 +2547,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       // The minps instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform minps in both orders, merge the results, and adjust.
-      __ movaps(kScratchDoubleReg, src1);
-      __ minps(kScratchDoubleReg, dst);
-      __ minps(dst, src1);
+      __ Movaps(kScratchDoubleReg, src1);
+      __ Minps(kScratchDoubleReg, dst);
+      __ Minps(dst, src1);
       // Propagate -0's and NaNs, which may be non-canonical.
-      __ orps(kScratchDoubleReg, dst);
+      __ Orps(kScratchDoubleReg, dst);
       // Canonicalize NaNs by quieting and clearing the payload.
-      __ cmpps(dst, kScratchDoubleReg, 3);
-      __ orps(kScratchDoubleReg, dst);
-      __ psrld(dst, 10);
-      __ andnps(dst, kScratchDoubleReg);
+      __ Cmpps(dst, kScratchDoubleReg, static_cast<int8_t>(3));
+      __ Orps(kScratchDoubleReg, dst);
+      __ Psrld(dst, static_cast<byte>(10));
+      __ Andnps(dst, kScratchDoubleReg);
       break;
     }
     case kX64F32x4Max: {
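The tail of the Min sequence canonicalizes NaNs: cmpps with predicate 3 (unordered) yields an all-ones mask in lanes where either operand is NaN, the following orps forces those scratch lanes to all-ones, and the shifted mask lets andnps clear the 22 low payload bits, leaving a quiet NaN. A one-lane bit-level model (assumed helper, for illustration):

#include <cstdint>

uint32_t CanonicalizeMinLane(uint32_t merged, bool unordered) {
  uint32_t mask = unordered ? 0xFFFFFFFFu : 0u;  // cmpps(..., 3)
  uint32_t quieted = merged | mask;  // orps: NaN lanes -> all-ones
  uint32_t payload = mask >> 10;     // psrld: 0x003FFFFF on NaN lanes
  return ~payload & quieted;         // andnps: NaN lanes -> 0xFFC00000,
                                     // other lanes pass through unchanged
}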
@@ -2563,39 +2565,41 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       // The maxps instruction doesn't propagate NaNs and +0's in its first
       // operand. Perform maxps in both orders, merge the results, and adjust.
-      __ movaps(kScratchDoubleReg, src1);
-      __ maxps(kScratchDoubleReg, dst);
-      __ maxps(dst, src1);
+      __ Movaps(kScratchDoubleReg, src1);
+      __ Maxps(kScratchDoubleReg, dst);
+      __ Maxps(dst, src1);
       // Find discrepancies.
-      __ xorps(dst, kScratchDoubleReg);
+      __ Xorps(dst, kScratchDoubleReg);
       // Propagate NaNs, which may be non-canonical.
-      __ orps(kScratchDoubleReg, dst);
+      __ Orps(kScratchDoubleReg, dst);
       // Propagate sign discrepancy and (subtle) quiet NaNs.
-      __ subps(kScratchDoubleReg, dst);
+      __ Subps(kScratchDoubleReg, dst);
       // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
-      __ cmpps(dst, kScratchDoubleReg, 3);
-      __ psrld(dst, 10);
-      __ andnps(dst, kScratchDoubleReg);
+      __ Cmpps(dst, kScratchDoubleReg, static_cast<int8_t>(3));
+      __ Psrld(dst, static_cast<byte>(10));
+      __ Andnps(dst, kScratchDoubleReg);
       break;
     }
     case kX64F32x4Eq: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ cmpps(i.OutputSimd128Register(), i.InputSimd128Register(1), 0x0);
+      __ Cmpps(i.OutputSimd128Register(), i.InputSimd128Register(1),
+               static_cast<int8_t>(0x0));
       break;
     }
     case kX64F32x4Ne: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ cmpps(i.OutputSimd128Register(), i.InputSimd128Register(1), 0x4);
+      __ Cmpps(i.OutputSimd128Register(), i.InputSimd128Register(1),
+               static_cast<int8_t>(0x4));
       break;
     }
     case kX64F32x4Lt: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ cmpltps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Cmpltps(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64F32x4Le: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ cmpleps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Cmpleps(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64F32x4Qfma: {
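The immediates passed to Cmpps select the SSE comparison predicate; cmpltps and cmpleps are the conventional mnemonics for immediates 1 and 2. The values used in this file, for reference:

enum CmppsPredicate : int8_t {
  kEq = 0x0,     // kX64F32x4Eq
  kLt = 0x1,     // kX64F32x4Lt, via cmpltps
  kLe = 0x2,     // kX64F32x4Le, via cmpleps
  kUnord = 0x3,  // true if either operand is NaN; used above to
                 // canonicalize NaNs in the Min/Max sequences
  kNeq = 0x4,    // kX64F32x4Ne
};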
@@ -2605,9 +2609,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                      i.InputSimd128Register(2));
       } else {
         XMMRegister tmp = i.TempSimd128Register(0);
-        __ movaps(tmp, i.InputSimd128Register(2));
-        __ mulps(tmp, i.InputSimd128Register(1));
-        __ addps(i.OutputSimd128Register(), tmp);
+        __ Movaps(tmp, i.InputSimd128Register(2));
+        __ Mulps(tmp, i.InputSimd128Register(1));
+        __ Addps(i.OutputSimd128Register(), tmp);
       }
       break;
     }
@@ -2618,9 +2622,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                      i.InputSimd128Register(2));
       } else {
         XMMRegister tmp = i.TempSimd128Register(0);
-        __ movaps(tmp, i.InputSimd128Register(2));
-        __ mulps(tmp, i.InputSimd128Register(1));
-        __ subps(i.OutputSimd128Register(), tmp);
+        __ Movaps(tmp, i.InputSimd128Register(2));
+        __ Mulps(tmp, i.InputSimd128Register(1));
+        __ Subps(i.OutputSimd128Register(), tmp);
       }
       break;
     }
...
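When FMA3 is unavailable, the Qfma/Qfms cases fall back to a separate multiply and add/subtract, which rounds twice instead of once; the wasm qfma/qfms proposal tolerates either result. A one-lane model of the two behaviors (operand roles inferred from the fallback sequence above):

#include <cmath>

float QfmaFused(float a, float b, float c) { return std::fmaf(b, c, a); }
float QfmaFallback(float a, float b, float c) { return a + b * c; }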
@@ -930,6 +930,12 @@ int DisassemblerX64::AVXInstruction(byte* data) {
       current += PrintRightXMMOperand(current);
       AppendToBuffer(",0x%x", *current++);
       break;
+    case 0x0E:
+      AppendToBuffer("vpblendw %s,%s,", NameOfXMMRegister(regop),
+                     NameOfXMMRegister(vvvv));
+      current += PrintRightXMMOperand(current);
+      AppendToBuffer(",0x%x", *current++);
+      break;
     case 0x14:
       AppendToBuffer("vpextrb ");
       current += PrintRightByteOperand(current);
...
@@ -770,6 +770,8 @@ TEST(DisasmX64) {
     __ vpshuflw(xmm1, xmm2, 85);
     __ vpshuflw(xmm1, Operand(rbx, rcx, times_4, 10000), 85);
     __ vshufps(xmm3, xmm2, xmm3, 3);
+    __ vpblendw(xmm1, xmm2, xmm3, 23);
+    __ vpblendw(xmm1, xmm2, Operand(rbx, rcx, times_4, 10000), 23);
     __ vmovddup(xmm1, xmm2);
     __ vmovddup(xmm1, Operand(rbx, rcx, times_4, 10000));
...
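Given the AppendToBuffer format added to the disassembler above, the two new test instructions should decode along the lines of the following (the exact memory-operand spelling is an assumption; the immediate 23 prints as 0x17):

vpblendw xmm1,xmm2,xmm3,0x17
vpblendw xmm1,xmm2,[rbx+rcx*4+0x2710],0x17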