Commit 307490b0 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Add AVX codegen for i32x4 conversions and hadd

Bug: v8:9561
Change-Id: I4a2c6217dea540b81256dcc833412da573f54795
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2069403
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66587}
parent 92a2f0c7
......@@ -1273,6 +1273,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// Emits vcvtss2sd with a memory second source: forwards dst, src1 and the
// Operand src2 to vinstr with opcode 0x5a and the kF3 / k0F / kWIG selectors.
void vcvtss2sd(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x5a, dst, src1, src2, kF3, k0F, kWIG);
}
// Emits vcvttps2dq dst, src via vinstr with opcode 0x5b and the
// kF3 / k0F / kWIG selectors.
// The instruction takes only two operands here, so xmm0 is passed in the
// middle (first-source) position of vinstr.
// NOTE(review): xmm0 presumably acts as the "unused" placeholder for the
// VEX.vvvv field -- confirm against vinstr's encoding.
void vcvttps2dq(XMMRegister dst, XMMRegister src) {
vinstr(0x5b, dst, xmm0, src, kF3, k0F, kWIG);
}
void vcvtlsi2sd(XMMRegister dst, XMMRegister src1, Register src2) {
XMMRegister isrc2 = XMMRegister::from_code(src2.code());
vinstr(0x2a, dst, src1, isrc2, kF2, k0F, kW0);
......
......@@ -176,6 +176,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Sqrtsd, sqrtsd)
AVX_OP(Sqrtps, sqrtps)
AVX_OP(Sqrtpd, sqrtpd)
AVX_OP(Cvttps2dq, cvttps2dq)
AVX_OP(Ucomiss, ucomiss)
AVX_OP(Ucomisd, ucomisd)
AVX_OP(Paddusb, paddusb)
......@@ -219,6 +220,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Cmppd, cmppd)
AVX_OP(Movlhps, movlhps)
AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSSE3(Phaddd, phaddd)
AVX_OP_SSSE3(Pshufb, pshufb)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Palignr, palignr)
......
......@@ -2912,18 +2912,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister tmp = i.TempSimd128Register(0);
// NAN->0
__ movaps(tmp, dst);
__ cmpeqps(tmp, tmp);
__ pand(dst, tmp);
__ Movaps(tmp, dst);
__ Cmpeqps(tmp, tmp);
__ Pand(dst, tmp);
// Set top bit if >= 0 (but not -0.0!)
__ pxor(tmp, dst);
__ Pxor(tmp, dst);
// Convert
__ cvttps2dq(dst, dst);
__ Cvttps2dq(dst, dst);
// Set top bit if >=0 is now < 0
__ pand(tmp, dst);
__ psrad(tmp, 31);
__ Pand(tmp, dst);
__ Psrad(tmp, static_cast<byte>(31));
// Set positive overflow lanes to 0x7FFFFFFF
__ pxor(dst, tmp);
__ Pxor(dst, tmp);
break;
}
case kX64I32x4SConvertI16x8Low: {
......@@ -2966,7 +2966,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I32x4AddHoriz: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
__ phaddd(i.OutputSimd128Register(), i.InputSimd128Register(1));
__ Phaddd(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64I32x4Sub: {
......@@ -3018,26 +3018,26 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister tmp = i.TempSimd128Register(0);
XMMRegister tmp2 = i.TempSimd128Register(1);
// NAN->0, negative->0
__ pxor(tmp2, tmp2);
__ maxps(dst, tmp2);
__ Pxor(tmp2, tmp2);
__ Maxps(dst, tmp2);
// scratch: float representation of max_signed
__ pcmpeqd(tmp2, tmp2);
__ psrld(tmp2, 1); // 0x7fffffff
__ cvtdq2ps(tmp2, tmp2); // 0x4f000000
__ Pcmpeqd(tmp2, tmp2);
__ Psrld(tmp2, static_cast<uint8_t>(1)); // 0x7fffffff
__ Cvtdq2ps(tmp2, tmp2); // 0x4f000000
// tmp: convert (src-max_signed).
// Positive overflow lanes -> 0x7FFFFFFF
// Negative lanes -> 0
__ movaps(tmp, dst);
__ subps(tmp, tmp2);
__ cmpleps(tmp2, tmp);
__ cvttps2dq(tmp, tmp);
__ pxor(tmp, tmp2);
__ pxor(tmp2, tmp2);
__ pmaxsd(tmp, tmp2);
__ Movaps(tmp, dst);
__ Subps(tmp, tmp2);
__ Cmpleps(tmp2, tmp);
__ Cvttps2dq(tmp, tmp);
__ Pxor(tmp, tmp2);
__ Pxor(tmp2, tmp2);
__ Pmaxsd(tmp, tmp2);
// convert. Overflow lanes above max_signed will be 0x80000000
__ cvttps2dq(dst, dst);
__ Cvttps2dq(dst, dst);
// Add (src-max_signed) for overflow lanes.
__ paddd(dst, tmp);
__ Paddd(dst, tmp);
break;
}
case kX64I32x4UConvertI16x8Low: {
......
......@@ -1068,6 +1068,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
break;
case 0x5B:
// vcvttps2dq is printed with two operands only: the destination (regop)
// followed by the r/m XMM operand. Unlike the neighbouring cases, vvvv is
// not printed.
AppendToBuffer("vcvttps2dq %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x5C:
AppendToBuffer("vsubss %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
......
......@@ -635,6 +635,7 @@ TEST(DisasmX64) {
__ vcvtss2sd(xmm4, xmm9, xmm11);
__ vcvtss2sd(xmm4, xmm9, Operand(rbx, rcx, times_1, 10000));
__ vcvttps2dq(xmm4, xmm11);
__ vcvtlsi2sd(xmm5, xmm9, rcx);
__ vcvtlsi2sd(xmm9, xmm3, Operand(rbx, r9, times_4, 10000));
__ vcvtqsi2sd(xmm5, xmm9, r11);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment