Commit deb490dc authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] AVX codegen

Most i8x16 operations should be done now. Drive-by cleanup to remove
unnecessary CpuFeatureScope uses (the feature checks are in the macro
assembler now).

Bug: v8:9561
Change-Id: Ic4e9462eec62badfae7a5164562fdb167da76968
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2121169
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66916}
parent 5455998e
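
For context, the Psubb/Paddb/Psignb wrappers added below follow V8's AVX_OP pattern: each generated macro-assembler method checks for AVX at runtime and emits either the three-operand VEX encoding or the legacy SSE encoding. A simplified sketch of what such a method does (illustrative, not the exact macro expansion):

    // Roughly what AVX_OP(Pxor, pxor) provides: use the VEX-encoded form
    // when AVX is available, otherwise fall back to the SSE form.
    void TurboAssembler::Pxor(XMMRegister dst, XMMRegister src) {
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope avx_scope(this, AVX);
        vpxor(dst, dst, src);  // AVX: non-destructive three-operand encoding
      } else {
        pxor(dst, src);        // SSE2: destructive two-operand encoding
      }
    }

The AVX_OP_SSSE3 variant additionally guards the fallback with a CpuFeatureScope for SSSE3, which is why the per-case scopes removed in the code generator below became redundant.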
@@ -185,10 +185,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pand, pand)
   AVX_OP(Por, por)
   AVX_OP(Pxor, pxor)
+  AVX_OP(Psubb, psubb)
   AVX_OP(Psubw, psubw)
   AVX_OP(Psubd, psubd)
   AVX_OP(Psubq, psubq)
+  AVX_OP(Psubsb, psubsb)
   AVX_OP(Psubsw, psubsw)
+  AVX_OP(Psubusb, psubusb)
   AVX_OP(Psubusw, psubusw)
   AVX_OP(Pslld, pslld)
   AVX_OP(Pavgb, pavgb)
@@ -200,9 +203,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Psrlw, psrlw)
   AVX_OP(Psrld, psrld)
   AVX_OP(Psrlq, psrlq)
+  AVX_OP(Paddb, paddb)
   AVX_OP(Paddw, paddw)
   AVX_OP(Paddd, paddd)
   AVX_OP(Paddq, paddq)
+  AVX_OP(Paddsb, paddsb)
   AVX_OP(Paddsw, paddsw)
   AVX_OP(Paddusb, paddusb)
   AVX_OP(Paddusw, paddusw)
@@ -247,6 +252,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSSE3(Phaddd, phaddd)
   AVX_OP_SSSE3(Phaddw, phaddw)
   AVX_OP_SSSE3(Pshufb, pshufb)
+  AVX_OP_SSSE3(Psignb, psignb)
   AVX_OP_SSSE3(Psignw, psignw)
   AVX_OP_SSSE3(Psignd, psignd)
   AVX_OP_SSSE3(Palignr, palignr)
...
@@ -3274,7 +3274,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16Splat: {
-      CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
       if (HasRegisterInput(instr, 0)) {
         __ Movd(dst, i.InputRegister(0));
@@ -3286,20 +3285,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16ExtractLaneU: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       Register dst = i.OutputRegister();
       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
       break;
     }
     case kX64I8x16ExtractLaneS: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       Register dst = i.OutputRegister();
       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
       __ movsxbl(dst, dst);
       break;
     }
     case kX64I8x16ReplaceLane: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       if (HasRegisterInput(instr, 2)) {
         __ Pinsrb(i.OutputSimd128Register(), i.InputRegister(2),
                   i.InputInt8(1));
@@ -3314,15 +3310,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16Neg: {
-      CpuFeatureScope sse_scope(tasm(), SSSE3);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(0);
       if (dst == src) {
-        __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-        __ psignb(dst, kScratchDoubleReg);
+        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
+        __ Psignb(dst, kScratchDoubleReg);
       } else {
-        __ pxor(dst, dst);
-        __ psubb(dst, src);
+        __ Pxor(dst, dst);
+        __ Psubb(dst, src);
       }
       break;
     }
@@ -3346,17 +3341,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       } else {
         Register shift = i.InputRegister(1);
         // Mask off the unwanted bits before word-shifting.
-        __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+        __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
         // Take shift value modulo 8.
         __ andq(shift, Immediate(7));
         __ movq(tmp, shift);
         __ addq(tmp, Immediate(8));
-        __ movq(tmp_simd, tmp);
-        __ psrlw(kScratchDoubleReg, tmp_simd);
-        __ packuswb(kScratchDoubleReg, kScratchDoubleReg);
-        __ pand(dst, kScratchDoubleReg);
-        __ movq(tmp_simd, shift);
-        __ psllw(dst, tmp_simd);
+        __ Movq(tmp_simd, tmp);
+        __ Psrlw(kScratchDoubleReg, tmp_simd);
+        __ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
+        __ Pand(dst, kScratchDoubleReg);
+        __ Movq(tmp_simd, shift);
+        __ Psllw(dst, tmp_simd);
       }
       break;
     }
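
x64 has no 8-bit SIMD shift, so the register-shift path above shifts whole 16-bit lanes and first masks off the bits that would otherwise leak into the neighboring byte. A scalar model of one byte lane (illustrative only; the helper name is made up):

    #include <cstdint>

    // One lane of the i8x16 shift-left sequence: pcmpeqw/psrlw/packuswb
    // build the mask 0xFF >> shift, pand applies it, psllw does the shift.
    uint8_t I8x16ShlLane(uint8_t lane, int shift) {
      shift &= 7;                    // andq(shift, Immediate(7))
      uint8_t mask = 0xFF >> shift;  // 0xFFFF >> (shift + 8), repacked to bytes
      return static_cast<uint8_t>((lane & mask) << shift);
    }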
@@ -3364,45 +3359,45 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
       if (HasImmediateInput(instr, 1)) {
-        __ punpckhbw(kScratchDoubleReg, dst);
-        __ punpcklbw(dst, dst);
+        __ Punpckhbw(kScratchDoubleReg, dst);
+        __ Punpcklbw(dst, dst);
         uint8_t shift = i.InputInt3(1) + 8;
-        __ psraw(kScratchDoubleReg, shift);
-        __ psraw(dst, shift);
-        __ packsswb(dst, kScratchDoubleReg);
+        __ Psraw(kScratchDoubleReg, shift);
+        __ Psraw(dst, shift);
+        __ Packsswb(dst, kScratchDoubleReg);
       } else {
         // Temp registers for shift mask and additional moves to XMM registers.
         Register tmp = i.ToRegister(instr->TempAt(0));
         XMMRegister tmp_simd = i.TempSimd128Register(1);
         // Unpack the bytes into words, do arithmetic shifts, and repack.
-        __ punpckhbw(kScratchDoubleReg, dst);
-        __ punpcklbw(dst, dst);
+        __ Punpckhbw(kScratchDoubleReg, dst);
+        __ Punpcklbw(dst, dst);
         // Prepare shift value
         __ movq(tmp, i.InputRegister(1));
         // Take shift value modulo 8.
         __ andq(tmp, Immediate(7));
         __ addq(tmp, Immediate(8));
-        __ movq(tmp_simd, tmp);
-        __ psraw(kScratchDoubleReg, tmp_simd);
-        __ psraw(dst, tmp_simd);
-        __ packsswb(dst, kScratchDoubleReg);
+        __ Movq(tmp_simd, tmp);
+        __ Psraw(kScratchDoubleReg, tmp_simd);
+        __ Psraw(dst, tmp_simd);
+        __ Packsswb(dst, kScratchDoubleReg);
       }
       break;
     }
     case kX64I8x16Add: {
-      __ paddb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Paddb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16AddSaturateS: {
-      __ paddsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Paddsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16Sub: {
-      __ psubb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Psubb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16SubSaturateS: {
-      __ psubsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Psubsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16Mul: {
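
The arithmetic right shift above likewise has no byte-wide instruction: the bytes are widened to words, shifted, and repacked. Unpacking dst against itself places each byte in both halves of a 16-bit lane, so an arithmetic shift by shift + 8 leaves a correctly sign-extended byte in the low half. A scalar model of one lane (names are illustrative):

    #include <cstdint>

    int8_t I8x16ShrSLane(int8_t lane, int shift) {
      uint8_t b = static_cast<uint8_t>(lane);
      // punpckhbw/punpcklbw(dst, dst): each byte becomes (b << 8) | b.
      int16_t word = static_cast<int16_t>((b << 8) | b);
      // psraw by (shift mod 8) + 8; packsswb then keeps the low byte,
      // which is already in int8 range so saturation never triggers.
      return static_cast<int8_t>(word >> ((shift & 7) + 8));
    }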
@@ -3415,36 +3410,34 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       // right= BBbb BBbb ... BBbb BBbb
       // t = 00AA 00AA ... 00AA 00AA
       // s = 00BB 00BB ... 00BB 00BB
-      __ movaps(tmp, dst);
-      __ movaps(kScratchDoubleReg, right);
-      __ psrlw(tmp, 8);
-      __ psrlw(kScratchDoubleReg, 8);
+      __ Movaps(tmp, dst);
+      __ Movaps(kScratchDoubleReg, right);
+      __ Psrlw(tmp, static_cast<byte>(8));
+      __ Psrlw(kScratchDoubleReg, static_cast<byte>(8));
       // dst = left * 256
-      __ psllw(dst, 8);
+      __ Psllw(dst, static_cast<byte>(8));
       // t = I16x8Mul(t, s)
       //    => __PP __PP ... __PP __PP
-      __ pmullw(tmp, kScratchDoubleReg);
+      __ Pmullw(tmp, kScratchDoubleReg);
       // dst = I16x8Mul(left * 256, right)
       //    => pp__ pp__ ... pp__ pp__
-      __ pmullw(dst, right);
+      __ Pmullw(dst, right);
       // t = I16x8Shl(t, 8)
       //    => PP00 PP00 ... PP00 PP00
-      __ psllw(tmp, 8);
+      __ Psllw(tmp, static_cast<byte>(8));
       // dst = I16x8Shr(dst, 8)
       //    => 00pp 00pp ... 00pp 00pp
-      __ psrlw(dst, 8);
+      __ Psrlw(dst, static_cast<byte>(8));
       // dst = I16x8Or(dst, t)
       //    => PPpp PPpp ... PPpp PPpp
-      __ por(dst, tmp);
+      __ Por(dst, tmp);
       break;
     }
     case kX64I8x16MinS: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ Pminsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16MaxS: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ Pmaxsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
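
The multiply sequence above works around the missing byte-wide pmullb: each 16-bit lane holds two byte lanes, the high and low bytes are multiplied separately with pmullw, and the low bytes of the two products are recombined. A scalar model of one 16-bit lane (the helper name is made up):

    #include <cstdint>

    // left = AAaa, right = BBbb per 16-bit lane; result = PPpp, where
    // PP = (AA * BB) & 0xFF and pp = (aa * bb) & 0xFF.
    uint16_t I8x16MulLanePair(uint16_t left, uint16_t right) {
      uint16_t t = left >> 8;                               // psrlw(tmp, 8)
      uint16_t s = right >> 8;                              // psrlw(scratch, 8)
      uint16_t dst = static_cast<uint16_t>(left << 8);      // psllw(dst, 8)
      t = static_cast<uint16_t>(t * s);                     // pmullw(tmp, s)
      dst = static_cast<uint16_t>(dst * right);             // pmullw(dst, right)
      return static_cast<uint16_t>((dst >> 8) | (t << 8));  // psrlw, psllw, por
    }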
@@ -3464,7 +3457,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16GeS: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(1);
       __ Pminsb(dst, src);
@@ -3473,7 +3465,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64I8x16UConvertI16x8: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ Packuswb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
@@ -3496,17 +3487,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ Pshufd(tmp_simd, tmp_simd, static_cast<byte>(0));
         __ Pand(dst, tmp_simd);
       } else {
-        __ punpckhbw(kScratchDoubleReg, dst);
-        __ punpcklbw(dst, dst);
+        __ Punpckhbw(kScratchDoubleReg, dst);
+        __ Punpcklbw(dst, dst);
         // Prepare shift value
         __ movq(tmp, i.InputRegister(1));
         // Take shift value modulo 8.
         __ andq(tmp, Immediate(7));
         __ addq(tmp, Immediate(8));
-        __ movq(tmp_simd, tmp);
-        __ psrlw(kScratchDoubleReg, tmp_simd);
-        __ psrlw(dst, tmp_simd);
-        __ packuswb(dst, kScratchDoubleReg);
+        __ Movq(tmp_simd, tmp);
+        __ Psrlw(kScratchDoubleReg, tmp_simd);
+        __ Psrlw(dst, tmp_simd);
+        __ Packuswb(dst, kScratchDoubleReg);
       }
       break;
     }
@@ -3515,21 +3506,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16SubSaturateU: {
-      __ psubusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      __ Psubusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16MinU: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ Pminub(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16MaxU: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ Pmaxub(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
     case kX64I8x16GtU: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(1);
       XMMRegister tmp = i.TempSimd128Register(0);
@@ -3540,7 +3528,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
    }
     case kX64I8x16GeU: {
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src = i.InputSimd128Register(1);
       __ Pminub(dst, src);
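
The rest of the GeU case is collapsed by the viewer. The standard idiom, which I believe is what follows here, pairs Pminub with Pcmpeqb: with no unsigned byte comparison instruction, a >= b is computed as min(a, b) == b. One lane in scalar form (illustrative; the trailing Pcmpeqb is an assumption, not shown in this diff):

    #include <cstdint>

    bool I8x16GeULane(uint8_t a, uint8_t b) {
      uint8_t m = a < b ? a : b;  // Pminub(dst, src)
      return m == b;              // Pcmpeqb(dst, src): a >= b iff min == b
    }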
@@ -3599,7 +3586,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64S8x16Swizzle: {
-      CpuFeatureScope sse_scope(tasm(), SSSE3);
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister mask = i.TempSimd128Register(0);
...