Commit aefa80ce authored by Ng Zhi An's avatar Ng Zhi An Committed by V8 LUCI CQ

[wasm-simd] Share F32x4 Min Max implementation

We move the implementation in Liftoff (which is the most general and
handles AVX/SSE and also register aliasing) into shared-macro-assembler.
Also consolidate SSE/AVX for ia32.

No functionality change is expected.

Bug: v8:11589
Bug: v8:11217
Change-Id: I64cc71791f04332dd3505055f4672430c2daf5ac
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3131373
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76805}
parent b586e795
...@@ -127,6 +127,65 @@ void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src, ...@@ -127,6 +127,65 @@ void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
} }
} }
void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  // minps is asymmetric: NaNs and +0 in the first operand do not propagate.
  // Compute min in both argument orders (scratch = min(lhs,rhs) flavor one,
  // dst = min(rhs,lhs) flavor two), then merge and canonicalize below.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vminps(scratch, lhs, rhs);
    vminps(dst, rhs, lhs);
  } else if (dst != lhs && dst != rhs) {
    // dst aliases neither input: stage one order in scratch, the other in dst.
    movaps(scratch, lhs);
    minps(scratch, rhs);
    movaps(dst, rhs);
    minps(dst, lhs);
  } else {
    // dst aliases one input; 'other' is whichever input dst is not.
    XMMRegister other = dst == lhs ? rhs : lhs;
    movaps(scratch, other);
    minps(scratch, dst);
    minps(dst, other);
  }
  // OR the two results so -0 and any (possibly non-canonical) NaNs survive.
  Orps(scratch, dst);
  // Quiet NaN lanes and strip their payload bits to canonicalize.
  Cmpunordps(dst, dst, scratch);
  Orps(scratch, dst);
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}
void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  // maxps is asymmetric: NaNs and +0 in the first operand do not propagate.
  // Compute max in both argument orders (scratch and dst hold the two
  // flavors), then reconcile the discrepancies below.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vmaxps(scratch, lhs, rhs);
    vmaxps(dst, rhs, lhs);
  } else if (dst != lhs && dst != rhs) {
    // dst aliases neither input: stage one order in scratch, the other in dst.
    movaps(scratch, lhs);
    maxps(scratch, rhs);
    movaps(dst, rhs);
    maxps(dst, lhs);
  } else {
    // dst aliases one input; 'other' is whichever input dst is not.
    XMMRegister other = dst == lhs ? rhs : lhs;
    movaps(scratch, other);
    maxps(scratch, dst);
    maxps(dst, other);
  }
  // XOR exposes lanes where the two orders disagreed.
  Xorps(dst, scratch);
  // Fold the disagreement back in so NaNs (possibly non-canonical) propagate.
  Orps(scratch, dst);
  // Subtraction propagates the sign discrepancy and (subtly) quiet NaNs.
  Subps(scratch, scratch, dst);
  // Quiet NaN lanes and clear their payload; resulting NaN sign is
  // non-deterministic.
  Cmpunordps(dst, dst, scratch);
  Psrld(dst, dst, byte{10});
  Andnps(dst, dst, scratch);
}
void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs, void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
XMMRegister rhs, XMMRegister scratch) { XMMRegister rhs, XMMRegister scratch) {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
......
...@@ -375,6 +375,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -375,6 +375,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
XMMRegister scratch); XMMRegister scratch);
void F32x4Splat(XMMRegister dst, DoubleRegister src); void F32x4Splat(XMMRegister dst, DoubleRegister src);
void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane); void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
void F32x4Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
XMMRegister scratch);
void F32x4Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
XMMRegister scratch);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx); void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch); void I8x16Splat(XMMRegister dst, Register src, XMMRegister scratch);
void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch); void I8x16Splat(XMMRegister dst, Operand src, XMMRegister scratch);
......
...@@ -2157,76 +2157,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2157,76 +2157,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputOperand(1)); i.InputOperand(1));
break; break;
} }
case kSSEF32x4Min: { case kIA32F32x4Min: {
XMMRegister src1 = i.InputSimd128Register(1), __ F32x4Min(i.OutputSimd128Register(), i.InputSimd128Register(0),
dst = i.OutputSimd128Register(); i.InputSimd128Register(1), kScratchDoubleReg);
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The minps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minps in both orders, merge the resuls, and adjust.
__ movaps(kScratchDoubleReg, src1);
__ minps(kScratchDoubleReg, dst);
__ minps(dst, src1);
// propagate -0's and NaNs, which may be non-canonical.
__ orps(kScratchDoubleReg, dst);
// Canonicalize NaNs by quieting and clearing the payload.
__ cmpps(dst, kScratchDoubleReg, 3);
__ orps(kScratchDoubleReg, dst);
__ psrld(dst, 10);
__ andnps(dst, kScratchDoubleReg);
break;
}
case kAVXF32x4Min: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src0 = i.InputSimd128Register(0);
Operand src1 = i.InputOperand(1);
// See comment above for correction of minps.
__ vmovups(kScratchDoubleReg, src1);
__ vminps(kScratchDoubleReg, kScratchDoubleReg, src0);
__ vminps(dst, src0, src1);
__ vorps(dst, dst, kScratchDoubleReg);
__ vcmpneqps(kScratchDoubleReg, dst, dst);
__ vorps(dst, dst, kScratchDoubleReg);
__ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 10);
__ vandnps(dst, kScratchDoubleReg, dst);
break;
}
case kSSEF32x4Max: {
XMMRegister src1 = i.InputSimd128Register(1),
dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the resuls, and adjust.
__ movaps(kScratchDoubleReg, src1);
__ maxps(kScratchDoubleReg, dst);
__ maxps(dst, src1);
// Find discrepancies.
__ xorps(dst, kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
__ orps(kScratchDoubleReg, dst);
// Propagate sign discrepancy and (subtle) quiet NaNs.
__ subps(kScratchDoubleReg, dst);
// Canonicalize NaNs by clearing the payload.
__ cmpps(dst, kScratchDoubleReg, 3);
__ psrld(dst, 10);
__ andnps(dst, kScratchDoubleReg);
break; break;
} }
case kAVXF32x4Max: { case kIA32F32x4Max: {
CpuFeatureScope avx_scope(tasm(), AVX); __ F32x4Max(i.OutputSimd128Register(), i.InputSimd128Register(0),
XMMRegister dst = i.OutputSimd128Register(); i.InputSimd128Register(1), kScratchDoubleReg);
XMMRegister src0 = i.InputSimd128Register(0);
Operand src1 = i.InputOperand(1);
// See comment above for correction of maxps.
__ vmovups(kScratchDoubleReg, src1);
__ vmaxps(kScratchDoubleReg, kScratchDoubleReg, src0);
__ vmaxps(dst, src0, src1);
__ vxorps(dst, dst, kScratchDoubleReg);
__ vorps(kScratchDoubleReg, kScratchDoubleReg, dst);
__ vsubps(kScratchDoubleReg, kScratchDoubleReg, dst);
__ vcmpneqps(dst, kScratchDoubleReg, kScratchDoubleReg);
__ vpsrld(dst, dst, 10);
__ vandnps(dst, dst, kScratchDoubleReg);
break; break;
} }
case kIA32F32x4Eq: { case kIA32F32x4Eq: {
......
...@@ -159,10 +159,8 @@ namespace compiler { ...@@ -159,10 +159,8 @@ namespace compiler {
V(IA32F32x4Sub) \ V(IA32F32x4Sub) \
V(IA32F32x4Mul) \ V(IA32F32x4Mul) \
V(IA32F32x4Div) \ V(IA32F32x4Div) \
V(SSEF32x4Min) \ V(IA32F32x4Min) \
V(AVXF32x4Min) \ V(IA32F32x4Max) \
V(SSEF32x4Max) \
V(AVXF32x4Max) \
V(IA32F32x4Eq) \ V(IA32F32x4Eq) \
V(IA32F32x4Ne) \ V(IA32F32x4Ne) \
V(IA32F32x4Lt) \ V(IA32F32x4Lt) \
......
...@@ -144,10 +144,8 @@ int InstructionScheduler::GetTargetInstructionFlags( ...@@ -144,10 +144,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32F32x4Sub: case kIA32F32x4Sub:
case kIA32F32x4Mul: case kIA32F32x4Mul:
case kIA32F32x4Div: case kIA32F32x4Div:
case kSSEF32x4Min: case kIA32F32x4Min:
case kAVXF32x4Min: case kIA32F32x4Max:
case kSSEF32x4Max:
case kAVXF32x4Max:
case kIA32F32x4Eq: case kIA32F32x4Eq:
case kIA32F32x4Ne: case kIA32F32x4Ne:
case kIA32F32x4Lt: case kIA32F32x4Lt:
......
...@@ -2252,8 +2252,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) { ...@@ -2252,8 +2252,6 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16) V(I8x16)
#define SIMD_BINOP_LIST(V) \ #define SIMD_BINOP_LIST(V) \
V(F32x4Min) \
V(F32x4Max) \
V(I32x4GtU) \ V(I32x4GtU) \
V(I32x4GeU) \ V(I32x4GeU) \
V(I16x8Ne) \ V(I16x8Ne) \
...@@ -2274,6 +2272,8 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) { ...@@ -2274,6 +2272,8 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(F32x4Ne) \ V(F32x4Ne) \
V(F32x4Lt) \ V(F32x4Lt) \
V(F32x4Le) \ V(F32x4Le) \
V(F32x4Min) \
V(F32x4Max) \
V(I64x2Add) \ V(I64x2Add) \
V(I64x2Sub) \ V(I64x2Sub) \
V(I64x2Eq) \ V(I64x2Eq) \
......
...@@ -2821,42 +2821,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2821,42 +2821,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64F32x4Min: { case kX64F32x4Min: {
XMMRegister src1 = i.InputSimd128Register(1), __ F32x4Min(i.OutputSimd128Register(), i.InputSimd128Register(0),
dst = i.OutputSimd128Register(); i.InputSimd128Register(1), kScratchDoubleReg);
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The minps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minps in both orders, merge the resuls, and adjust.
__ Movaps(kScratchDoubleReg, src1);
__ Minps(kScratchDoubleReg, dst);
__ Minps(dst, src1);
// propagate -0's and NaNs, which may be non-canonical.
__ Orps(kScratchDoubleReg, dst);
// Canonicalize NaNs by quieting and clearing the payload.
__ Cmpunordps(dst, kScratchDoubleReg);
__ Orps(kScratchDoubleReg, dst);
__ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg);
break; break;
} }
case kX64F32x4Max: { case kX64F32x4Max: {
XMMRegister src1 = i.InputSimd128Register(1), __ F32x4Max(i.OutputSimd128Register(), i.InputSimd128Register(0),
dst = i.OutputSimd128Register(); i.InputSimd128Register(1), kScratchDoubleReg);
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the resuls, and adjust.
__ Movaps(kScratchDoubleReg, src1);
__ Maxps(kScratchDoubleReg, dst);
__ Maxps(dst, src1);
// Find discrepancies.
__ Xorps(dst, kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
__ Orps(kScratchDoubleReg, dst);
// Propagate sign discrepancy and (subtle) quiet NaNs.
__ Subps(kScratchDoubleReg, dst);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmpunordps(dst, kScratchDoubleReg);
__ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg);
break; break;
} }
case kX64F32x4Eq: { case kX64F32x4Eq: {
......
...@@ -4002,61 +4002,12 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -4002,61 +4002,12 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) { LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and +0's in its first F32x4Min(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
minps(liftoff::kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
minps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Orps(liftoff::kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
} }
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) { LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first F32x4Max(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
maxps(liftoff::kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
maxps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), liftoff::kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
} }
void LiftoffAssembler::emit_f32x4_pmin(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_pmin(LiftoffRegister dst, LiftoffRegister lhs,
......
...@@ -3550,61 +3550,12 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs, ...@@ -3550,61 +3550,12 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) { LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and +0's in its first F32x4Min(dst.fp(), lhs.fp(), rhs.fp(), kScratchDoubleReg);
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
minps(kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
minps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordps(dst.fp(), kScratchDoubleReg);
Orps(kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
} }
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) { LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first F32x4Max(dst.fp(), lhs.fp(), rhs.fp(), kScratchDoubleReg);
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
maxps(kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
maxps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordps(dst.fp(), kScratchDoubleReg);
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
} }
void LiftoffAssembler::emit_f32x4_pmin(LiftoffRegister dst, LiftoffRegister lhs, void LiftoffAssembler::emit_f32x4_pmin(LiftoffRegister dst, LiftoffRegister lhs,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment