Commit 48592a45 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd][ia32][x64] Consolidate f64x2.min into shared code

Bug: v8:11589
Change-Id: I572dcc740f9974261521e239cd37c64af3bb0d7d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2883484
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74488}
parent 3f5ff1fb
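
Note for readers: wasm's f64x2.min must propagate NaNs (canonicalized) and treat -0 as smaller than +0, neither of which the bare minpd instruction guarantees. As a reference, here is a minimal scalar sketch of the per-lane semantics that the emitted sequences below implement (illustrative only, not V8 code; the function name and canonical-NaN constant are assumptions):

#include <cmath>
#include <cstdint>
#include <cstring>

// Per-lane reference semantics of wasm f64x2.min (illustrative sketch).
double wasm_f64_min(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    // Canonical quiet NaN: sign, exponent, and quiet bit set, payload
    // cleared -- the same pattern the psrlq/andnpd trick below produces.
    uint64_t canonical = 0xFFF8000000000000ULL;
    double r;
    std::memcpy(&r, &canonical, sizeof r);
    return r;
  }
  if (a == 0.0 && b == 0.0) {
    // -0 is the minimum of {-0, +0}; minpd alone would simply return its
    // second operand here.
    return std::signbit(a) ? a : b;
  }
  return a < b ? a : b;
}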
@@ -60,6 +60,43 @@ void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
   }
 }
 
+void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
+                                    XMMRegister rhs, XMMRegister scratch) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope scope(this, AVX);
+    // The minpd instruction doesn't propagate NaNs and +0's in its first
+    // operand. Perform minpd in both orders, merge the results, and adjust.
+    vminpd(scratch, lhs, rhs);
+    vminpd(dst, rhs, lhs);
+    // Propagate -0's and NaNs, which may be non-canonical.
+    vorpd(scratch, scratch, dst);
+    // Canonicalize NaNs by quieting and clearing the payload.
+    vcmpunordpd(dst, dst, scratch);
+    vorpd(scratch, scratch, dst);
+    vpsrlq(dst, dst, byte{13});
+    vandnpd(dst, dst, scratch);
+  } else {
+    // Compare lhs with rhs, and rhs with lhs, leaving the results in scratch
+    // and dst. If dst overlaps with lhs or rhs, we can save a move.
+    if (dst == lhs || dst == rhs) {
+      XMMRegister src = dst == lhs ? rhs : lhs;
+      movaps(scratch, src);
+      minpd(scratch, dst);
+      minpd(dst, src);
+    } else {
+      movaps(scratch, lhs);
+      movaps(dst, rhs);
+      minpd(scratch, rhs);
+      minpd(dst, lhs);
+    }
+    orpd(scratch, dst);
+    cmpunordpd(dst, scratch);
+    orpd(scratch, dst);
+    psrlq(dst, byte{13});
+    andnpd(dst, scratch);
+  }
+}
+
 void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
   if (CpuFeatures::IsSupported(AVX2)) {
     CpuFeatureScope avx2_scope(this, AVX2);
...
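
To see why this sequence works, here is the same blend written with SSE2 intrinsics (a standalone sketch under stated assumptions, not V8 code). minpd(a, b) returns its second operand whenever the lanes are unordered or compare equal (e.g. -0 vs +0), so running it in both orders and OR-ing the results propagates -0 and any NaN bits; the cmpunordpd mask then quiets and canonicalizes NaN lanes. The shift count 13 spans the sign bit, the 11 exponent bits, and the quiet bit, so the final andnot clears exactly the 51 payload bits:

#include <immintrin.h>

// Standalone SSE2 sketch of the F64x2Min blend (illustrative, not V8 code).
__m128d f64x2_min_sse2(__m128d lhs, __m128d rhs) {
  __m128d a = _mm_min_pd(lhs, rhs);  // returns rhs on NaN or +/-0 ties
  __m128d b = _mm_min_pd(rhs, lhs);  // returns lhs on NaN or +/-0 ties
  // Merge both orders: -0 | +0 == -0, and NaN bits from either side survive.
  __m128d merged = _mm_or_pd(a, b);
  // All-ones mask in lanes where an input was NaN (merged is NaN there).
  __m128d nan_mask = _mm_cmpunord_pd(b, merged);
  // Force NaN lanes of merged to all-ones so the masking below quiets them.
  merged = _mm_or_pd(merged, nan_mask);
  // mask >> 13 keeps only the 51 payload bits; andnot clears them, leaving
  // sign | exponent | quiet bit: a canonical NaN. Non-NaN lanes have a zero
  // mask and pass through unchanged.
  __m128i payload = _mm_srli_epi64(_mm_castpd_si128(nan_mask), 13);
  return _mm_andnot_pd(_mm_castsi128_pd(payload), merged);
}

Running minpd in a single order would silently drop a NaN or a -0 sitting in the first operand, which is why both orders are merged before canonicalization.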
@@ -277,6 +277,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSE4_1(Roundps, roundps)
 
   void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
+  void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
+                XMMRegister scratch);
   void F32x4Splat(XMMRegister dst, DoubleRegister src);
   void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
...
@@ -1938,21 +1938,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32F64x2Min: {
-      XMMRegister dst = i.OutputSimd128Register(),
-                  src0 = i.InputSimd128Register(0),
-                  src1 = i.InputSimd128Register(1);
-      // The minpd instruction doesn't propagate NaNs and +0's in its first
-      // operand. Perform minpd in both orders, merge the results, and adjust.
-      __ Movapd(kScratchDoubleReg, src1);
-      __ Minpd(kScratchDoubleReg, kScratchDoubleReg, src0);
-      __ Minpd(dst, src0, src1);
-      // Propagate -0's and NaNs, which may be non-canonical.
-      __ Orpd(kScratchDoubleReg, dst);
-      // Canonicalize NaNs by quieting and clearing the payload.
-      __ Cmpunordpd(dst, dst, kScratchDoubleReg);
-      __ Orpd(kScratchDoubleReg, dst);
-      __ Psrlq(dst, byte{13});
-      __ Andnpd(dst, kScratchDoubleReg);
+      __ F64x2Min(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                  i.InputSimd128Register(1), kScratchDoubleReg);
       break;
     }
     case kIA32F64x2Max: {
...
@@ -2415,21 +2415,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F64x2Min: {
-      XMMRegister src1 = i.InputSimd128Register(1),
-                  dst = i.OutputSimd128Register();
-      DCHECK_EQ(dst, i.InputSimd128Register(0));
-      // The minpd instruction doesn't propagate NaNs and +0's in its first
-      // operand. Perform minpd in both orders, merge the results, and adjust.
-      __ Movapd(kScratchDoubleReg, src1);
-      __ Minpd(kScratchDoubleReg, dst);
-      __ Minpd(dst, src1);
-      // Propagate -0's and NaNs, which may be non-canonical.
-      __ Orpd(kScratchDoubleReg, dst);
-      // Canonicalize NaNs by quieting and clearing the payload.
-      __ Cmpunordpd(dst, kScratchDoubleReg);
-      __ Orpd(kScratchDoubleReg, dst);
-      __ Psrlq(dst, byte{13});
-      __ Andnpd(dst, kScratchDoubleReg);
+      // Avoids a move in the no-AVX case if dst == src0.
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
+      __ F64x2Min(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                  i.InputSimd128Register(1), kScratchDoubleReg);
       break;
     }
     case kX64F64x2Max: {
...
@@ -4244,30 +4244,7 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
 
 void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
                                       LiftoffRegister rhs) {
-  // The minpd instruction doesn't propagate NaNs and +0's in its first
-  // operand. Perform minpd in both orders, merge the results, and adjust.
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vminpd(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
-    vminpd(dst.fp(), rhs.fp(), lhs.fp());
-  } else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
-    XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
-    movaps(liftoff::kScratchDoubleReg, src);
-    minpd(liftoff::kScratchDoubleReg, dst.fp());
-    minpd(dst.fp(), src);
-  } else {
-    movaps(liftoff::kScratchDoubleReg, lhs.fp());
-    minpd(liftoff::kScratchDoubleReg, rhs.fp());
-    movaps(dst.fp(), rhs.fp());
-    minpd(dst.fp(), lhs.fp());
-  }
-  // Propagate -0's and NaNs, which may be non-canonical.
-  Orpd(liftoff::kScratchDoubleReg, dst.fp());
-  // Canonicalize NaNs by quieting and clearing the payload.
-  Cmpunordpd(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
-  Orpd(liftoff::kScratchDoubleReg, dst.fp());
-  Psrlq(dst.fp(), byte{13});
-  Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
+  F64x2Min(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
...
@@ -3802,30 +3802,7 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
 
 void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
                                       LiftoffRegister rhs) {
-  // The minpd instruction doesn't propagate NaNs and +0's in its first
-  // operand. Perform minpd in both orders, merge the results, and adjust.
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vminpd(kScratchDoubleReg, lhs.fp(), rhs.fp());
-    vminpd(dst.fp(), rhs.fp(), lhs.fp());
-  } else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
-    XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
-    movaps(kScratchDoubleReg, src);
-    minpd(kScratchDoubleReg, dst.fp());
-    minpd(dst.fp(), src);
-  } else {
-    movaps(kScratchDoubleReg, lhs.fp());
-    minpd(kScratchDoubleReg, rhs.fp());
-    movaps(dst.fp(), rhs.fp());
-    minpd(dst.fp(), lhs.fp());
-  }
-  // Propagate -0's and NaNs, which may be non-canonical.
-  Orpd(kScratchDoubleReg, dst.fp());
-  // Canonicalize NaNs by quieting and clearing the payload.
-  Cmpunordpd(dst.fp(), kScratchDoubleReg);
-  Orpd(kScratchDoubleReg, dst.fp());
-  Psrlq(dst.fp(), byte{13});
-  Andnpd(dst.fp(), kScratchDoubleReg);
+  F64x2Min(dst.fp(), lhs.fp(), rhs.fp(), kScratchDoubleReg);
 }
 
 void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
...