Commit b415fa38 authored by Ng Zhi An, committed by V8 LUCI CQ

[ia32] Merge some SSE/AVX i32x4 and f32x4 ops

We also set these operations to explicitly require Register for the
second operand (rhs) even if AVX is supported. Although AVX instructions
support unaligned operands, there is potentially a performance hit,
especially on older hardware. This matches the x64 instruction selector
as well.

Bug: v8:11217
Change-Id: Iae11ec23cc607842a034250028f7667fb2fcb0d0
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114601
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76474}
parent 5d38a300
......@@ -323,6 +323,7 @@ class V8_EXPORT_PRIVATE TurboAssembler
} \
}
AVX_OP3_WITH_MOVE(Cmpeqps, cmpeqps, XMMRegister, XMMRegister)
AVX_OP3_WITH_MOVE(Cmpeqps, cmpeqps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Movlps, movlps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Movhps, movhps, XMMRegister, Operand)
AVX_OP3_WITH_MOVE(Pmaddwd, pmaddwd, XMMRegister, Operand)
......
......@@ -180,7 +180,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Cmpeqpd, cmpeqpd)
AVX_OP(Cmplepd, cmplepd)
AVX_OP(Cmpleps, cmpleps)
AVX_OP(Cmpltps, cmpltps)
AVX_OP(Cmpltpd, cmpltpd)
AVX_OP(Cmpneqps, cmpneqps)
AVX_OP(Cmpneqpd, cmpneqpd)
AVX_OP(Cmpunordpd, cmpunordpd)
AVX_OP(Cmpunordps, cmpunordps)
......@@ -228,6 +230,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw)
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pinsrw, pinsrw)
AVX_OP(Pmaxub, pmaxub)
......@@ -297,12 +300,14 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pmaxsb, pmaxsb)
AVX_OP_SSE4_1(Pmaxsd, pmaxsd)
AVX_OP_SSE4_1(Pminsb, pminsb)
AVX_OP_SSE4_1(Pminsd, pminsd)
AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
AVX_OP_SSE4_1(Pmulld, pmulld)
AVX_OP_SSE4_1(Ptest, ptest)
AVX_OP_SSE4_1(Roundpd, roundpd)
AVX_OP_SSE4_1(Roundps, roundps)
......
......@@ -2292,48 +2292,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vandnps(dst, dst, kScratchDoubleReg);
break;
}
case kSSEF32x4Eq: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ cmpeqps(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXF32x4Eq: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vcmpeqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEF32x4Ne: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ cmpneqps(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXF32x4Ne: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vcmpneqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEF32x4Lt: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ cmpltps(i.OutputSimd128Register(), i.InputOperand(1));
case kIA32F32x4Eq: {
__ Cmpeqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kAVXF32x4Lt: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vcmpltps(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32F32x4Ne: {
__ Cmpneqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEF32x4Le: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ cmpleps(i.OutputSimd128Register(), i.InputOperand(1));
case kIA32F32x4Lt: {
__ Cmpltps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kAVXF32x4Le: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vcmpleps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
case kIA32F32x4Le: {
__ Cmpleps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kIA32F32x4Pmin: {
......@@ -2396,117 +2372,63 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_SHIFT(Psrad, 5);
break;
}
case kSSEI32x4Add: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ paddd(i.OutputSimd128Register(), i.InputOperand(1));
case kIA32I32x4Add: {
__ Paddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kAVXI32x4Add: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
case kIA32I32x4Sub: {
__ Psubd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4Sub: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ psubd(i.OutputSimd128Register(), i.InputOperand(1));
case kIA32I32x4Mul: {
__ Pmulld(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kAVXI32x4Sub: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpsubd(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4MinS: {
__ Pminsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmulld(i.OutputSimd128Register(), i.InputOperand(1));
case kIA32I32x4MaxS: {
__ Pmaxsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kAVXI32x4Mul: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmulld(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4Eq: {
__ Pcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4MinS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminsd(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4MinS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpminsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4Ne: {
__ Pcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
__ Pxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
kScratchDoubleReg);
break;
}
case kSSEI32x4MaxS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pmaxsd(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4MaxS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpmaxsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
case kIA32I32x4GtS: {
__ Pcmpgtd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4Eq: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4Eq: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4Ne: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kAVXI32x4Ne: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
__ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
__ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
kScratchDoubleReg);
break;
}
case kSSEI32x4GtS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpgtd(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXI32x4GtS: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpcmpgtd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
break;
}
case kSSEI32x4GeS: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
case kIA32I32x4GeS: {
XMMRegister dst = i.OutputSimd128Register();
Operand src = i.InputOperand(1);
__ pminsd(dst, src);
__ pcmpeqd(dst, src);
break;
}
case kAVXI32x4GeS: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister src1 = i.InputSimd128Register(0);
Operand src2 = i.InputOperand(1);
__ vpminsd(kScratchDoubleReg, src1, src2);
__ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
XMMRegister src2 = i.InputSimd128Register(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpminsd(kScratchDoubleReg, src1, src2);
__ vpcmpeqd(dst, kScratchDoubleReg, src2);
} else {
DCHECK_EQ(dst, src1);
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ pminsd(dst, src2);
__ pcmpeqd(dst, src2);
}
break;
}
case kSSEI32x4UConvertF32x4: {
......
......@@ -165,14 +165,10 @@ namespace compiler {
V(AVXF32x4Min) \
V(SSEF32x4Max) \
V(AVXF32x4Max) \
V(SSEF32x4Eq) \
V(AVXF32x4Eq) \
V(SSEF32x4Ne) \
V(AVXF32x4Ne) \
V(SSEF32x4Lt) \
V(AVXF32x4Lt) \
V(SSEF32x4Le) \
V(AVXF32x4Le) \
V(IA32F32x4Eq) \
V(IA32F32x4Ne) \
V(IA32F32x4Lt) \
V(IA32F32x4Le) \
V(IA32F32x4Pmin) \
V(IA32F32x4Pmax) \
V(IA32F32x4Round) \
......@@ -185,24 +181,15 @@ namespace compiler {
V(IA32I32x4Neg) \
V(IA32I32x4Shl) \
V(IA32I32x4ShrS) \
V(SSEI32x4Add) \
V(AVXI32x4Add) \
V(SSEI32x4Sub) \
V(AVXI32x4Sub) \
V(SSEI32x4Mul) \
V(AVXI32x4Mul) \
V(SSEI32x4MinS) \
V(AVXI32x4MinS) \
V(SSEI32x4MaxS) \
V(AVXI32x4MaxS) \
V(SSEI32x4Eq) \
V(AVXI32x4Eq) \
V(SSEI32x4Ne) \
V(AVXI32x4Ne) \
V(SSEI32x4GtS) \
V(AVXI32x4GtS) \
V(SSEI32x4GeS) \
V(AVXI32x4GeS) \
V(IA32I32x4Add) \
V(IA32I32x4Sub) \
V(IA32I32x4Mul) \
V(IA32I32x4MinS) \
V(IA32I32x4MaxS) \
V(IA32I32x4Eq) \
V(IA32I32x4Ne) \
V(IA32I32x4GtS) \
V(IA32I32x4GeS) \
V(SSEI32x4UConvertF32x4) \
V(AVXI32x4UConvertF32x4) \
V(IA32I32x4UConvertI16x8Low) \
......
......@@ -150,14 +150,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kAVXF32x4Min:
case kSSEF32x4Max:
case kAVXF32x4Max:
case kSSEF32x4Eq:
case kAVXF32x4Eq:
case kSSEF32x4Ne:
case kAVXF32x4Ne:
case kSSEF32x4Lt:
case kAVXF32x4Lt:
case kSSEF32x4Le:
case kAVXF32x4Le:
case kIA32F32x4Eq:
case kIA32F32x4Ne:
case kIA32F32x4Lt:
case kIA32F32x4Le:
case kIA32F32x4Pmin:
case kIA32F32x4Pmax:
case kIA32F32x4Round:
......@@ -170,24 +166,15 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I32x4Neg:
case kIA32I32x4Shl:
case kIA32I32x4ShrS:
case kSSEI32x4Add:
case kAVXI32x4Add:
case kSSEI32x4Sub:
case kAVXI32x4Sub:
case kSSEI32x4Mul:
case kAVXI32x4Mul:
case kSSEI32x4MinS:
case kAVXI32x4MinS:
case kSSEI32x4MaxS:
case kAVXI32x4MaxS:
case kSSEI32x4Eq:
case kAVXI32x4Eq:
case kSSEI32x4Ne:
case kAVXI32x4Ne:
case kSSEI32x4GtS:
case kAVXI32x4GtS:
case kSSEI32x4GeS:
case kAVXI32x4GeS:
case kIA32I32x4Add:
case kIA32I32x4Sub:
case kIA32I32x4Mul:
case kIA32I32x4MinS:
case kIA32I32x4MaxS:
case kIA32I32x4Eq:
case kIA32I32x4Ne:
case kIA32I32x4GtS:
case kIA32I32x4GeS:
case kSSEI32x4UConvertF32x4:
case kAVXI32x4UConvertF32x4:
case kIA32I32x4UConvertI16x8Low:
......
......@@ -361,7 +361,7 @@ void VisitRROSimd(InstructionSelector* selector, Node* node,
InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
if (selector->IsSupported(AVX)) {
selector->Emit(avx_opcode, g.DefineAsRegister(node), operand0,
g.Use(node->InputAt(1)));
g.UseRegister(node->InputAt(1)));
} else {
selector->Emit(sse_opcode, g.DefineSameAsFirst(node), operand0,
g.UseRegister(node->InputAt(1)));
......@@ -2249,19 +2249,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
#define SIMD_BINOP_LIST(V) \
V(F32x4Min) \
V(F32x4Max) \
V(F32x4Eq) \
V(F32x4Ne) \
V(F32x4Lt) \
V(F32x4Le) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
V(I32x4MinS) \
V(I32x4MaxS) \
V(I32x4Eq) \
V(I32x4Ne) \
V(I32x4GtS) \
V(I32x4GeS) \
V(I32x4MinU) \
V(I32x4MaxU) \
V(I32x4GtU) \
......@@ -2298,10 +2285,23 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(F32x4Sub) \
V(F32x4Mul) \
V(F32x4Div) \
V(F32x4Eq) \
V(F32x4Ne) \
V(F32x4Lt) \
V(F32x4Le) \
V(I64x2Add) \
V(I64x2Sub) \
V(I64x2Eq) \
V(I64x2Ne) \
V(I32x4Add) \
V(I32x4Sub) \
V(I32x4Mul) \
V(I32x4MinS) \
V(I32x4MaxS) \
V(I32x4Eq) \
V(I32x4Ne) \
V(I32x4GtS) \
V(I32x4GeS) \
V(I32x4DotI16x8S) \
V(I16x8RoundingAverageU) \
V(I8x16Add) \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment