Commit 9f41a584 authored by Fanchen Kong, committed by Commit Bot

[wasm-simd] [liftoff] Implement fp min/max on ia32 and x64

Bug: v8:9909
Change-Id: Ib97bcc7afe516a014cd91128aa3c59f1b8b0b0af
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2151999
Commit-Queue: Fanchen Kong <fanchen.kong@intel.com>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67301}
parent ec3cadc4
@@ -329,6 +329,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Xorpd, xorpd)
AVX_OP3_XO(Sqrtss, sqrtss)
AVX_OP3_XO(Sqrtsd, sqrtsd)
AVX_OP3_XO(Orps, orps)
AVX_OP3_XO(Orpd, orpd)
AVX_OP3_XO(Andnpd, andnpd)
@@ -351,6 +352,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, Operand)
AVX_PACKED_OP3(Addpd, addpd)
AVX_PACKED_OP3(Subps, subps)
AVX_PACKED_OP3(Subpd, subpd)
AVX_PACKED_OP3(Mulpd, mulpd)
AVX_PACKED_OP3(Divpd, divpd)
@@ -360,6 +362,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3(Cmplepd, cmplepd)
AVX_PACKED_OP3(Minpd, minpd)
AVX_PACKED_OP3(Maxpd, maxpd)
AVX_PACKED_OP3(Cmpunordps, cmpunordps)
AVX_PACKED_OP3(Cmpunordpd, cmpunordpd)
AVX_PACKED_OP3(Psllw, psllw)
AVX_PACKED_OP3(Pslld, pslld)
......
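Each hunk above adds one entry to an existing wrapper-macro list in the TurboAssembler header; no new expansion logic is introduced. As a rough sketch, not verbatim from the header, an AVX_PACKED_OP3 entry such as AVX_PACKED_OP3(Subpd, subpd) defines a wrapper that emits the three-operand VEX form when AVX is available and otherwise falls back to the destructive SSE form:

void Subpd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vsubpd(dst, src1, src2);  // Non-destructive VEX encoding.
  } else {
    DCHECK_EQ(dst, src1);  // The legacy SSE form overwrites its first operand.
    subpd(dst, src2);
  }
}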
@@ -1704,6 +1704,16 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
vdiv(dst.high_fp(), lhs.high_fp(), rhs.high_fp());
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2min");
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2max");
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
vdup(Neon32, liftoff::GetSimd128Register(dst), src.fp(), 0);
@@ -1787,6 +1797,16 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
vdiv(dst_high.high(), lhs_high.high(), rhs_high.high());
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4min");
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4max");
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Simd128Register dst_simd = liftoff::GetSimd128Register(dst);
......
@@ -1141,6 +1141,16 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
Fdiv(dst.fp().V2D(), lhs.fp().V2D(), rhs.fp().V2D());
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2min");
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2max");
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Dup(dst.fp().V4S(), src.fp().S(), 0);
@@ -1197,6 +1207,16 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
Fdiv(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4min");
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4max");
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Dup(dst.fp().V2D(), src.gp().X());
......
@@ -2506,6 +2506,65 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and -0's in its first
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
minps(liftoff::kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
minps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// Propagate -0's and NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Orps(liftoff::kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), liftoff::kScratchDoubleReg);
}
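// Illustrative only, not part of this change: a scalar sketch of the per-lane
// semantics the sequence above is meant to produce, assuming IEEE-754 float
// lanes and that <cmath>, <cstdint> and <cstring> are available. The helper
// name is made up for exposition; the vector code above yields the same
// canonical quiet NaN, up to its sign bit.
static float Float32MinLaneReference(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) {
    uint32_t canonical = 0x7FC00000;  // Quiet NaN with a cleared payload.
    float nan;
    std::memcpy(&nan, &canonical, sizeof(nan));
    return nan;
  }
  if (a == 0.0f && b == 0.0f) {
    return std::signbit(a) ? a : b;  // min(-0, +0) must be -0.
  }
  return a < b ? a : b;
}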
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
maxps(liftoff::kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
maxps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), liftoff::kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), liftoff::kScratchDoubleReg);
}
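// Illustrative walk-through, not part of this change (following the AVX path
// above): with lhs = +0.0 and rhs = -0.0 in a lane, maxps returns its second
// operand for equal inputs, so the scratch register ends up with -0.0 and dst
// with +0.0. Xorps leaves only the sign-bit discrepancy in dst, Orps folds it
// into the scratch lane (still -0.0), and the subtraction -0.0 - (-0.0)
// yields the +0.0 that Wasm max requires. If either input is NaN, the scratch
// lane holds a NaN after the Or/Sub steps, the unordered compare sets the dst
// lane to all ones, and the shift/andn pair turns it into a quiet NaN with a
// cleared payload.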
void LiftoffAssembler::emit_f64x2_abs(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() == src.fp()) {
@@ -2561,6 +2620,65 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minpd instruction doesn't propagate NaNs and -0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminpd(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vminpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(liftoff::kScratchDoubleReg, src);
minpd(liftoff::kScratchDoubleReg, dst.fp());
minpd(dst.fp(), src);
} else {
movapd(liftoff::kScratchDoubleReg, lhs.fp());
minpd(liftoff::kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
minpd(dst.fp(), lhs.fp());
}
// Propagate -0's and NaNs, which may be non-canonical.
Orpd(liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordpd(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Orpd(liftoff::kScratchDoubleReg, dst.fp());
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxpd(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(liftoff::kScratchDoubleReg, src);
maxpd(liftoff::kScratchDoubleReg, dst.fp());
maxpd(dst.fp(), src);
} else {
movapd(liftoff::kScratchDoubleReg, lhs.fp());
maxpd(liftoff::kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
maxpd(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorpd(dst.fp(), liftoff::kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orpd(liftoff::kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subpd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordpd(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
}
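// Illustrative only, not part of this change: the shift counts encode the
// lane layout. An f32 lane keeps its top 10 bits (sign, 8 exponent bits, and
// the quiet bit), so shifting an all-ones lane right by 10 leaves exactly the
// 22 low-order payload bits that the following andnps/andnpd clears; an f64
// lane keeps its top 13 bits (sign, 11 exponent bits, quiet bit), hence the
// shift by 13. Assuming <cstdint> is available:
static_assert((~uint32_t{0} >> 10) == uint32_t{0x003FFFFF},
              "f32 NaN payload bits cleared by the min/max epilogue");
static_assert((~uint64_t{0} >> 13) == uint64_t{0x0007FFFFFFFFFFFF},
              "f64 NaN payload bits cleared by the min/max epilogue");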
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
......
@@ -843,6 +843,10 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister rhs);
inline void emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_abs(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f64x2_neg(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f64x2_sqrt(LiftoffRegister dst, LiftoffRegister src);
@@ -854,6 +858,10 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister rhs);
inline void emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
......
@@ -2464,6 +2464,10 @@ class LiftoffCompiler {
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_mul);
case wasm::kExprF32x4Div:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_div);
case wasm::kExprF32x4Min:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_min);
case wasm::kExprF32x4Max:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_max);
case wasm::kExprF64x2Abs:
return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_abs);
case wasm::kExprF64x2Neg:
@@ -2478,6 +2482,10 @@ class LiftoffCompiler {
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_mul);
case wasm::kExprF64x2Div:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_div);
case wasm::kExprF64x2Min:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_min);
case wasm::kExprF64x2Max:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_max);
case wasm::kExprI8x16SConvertI16x8:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i8x16_sconvert_i16x8);
......
@@ -2468,6 +2468,65 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and -0's in its first
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
minps(kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
minps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// Propagate -0's and NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3});
Orps(kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
}
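// Note, not part of this change: the ia32 macro-assembler helpers used here
// are two-operand, so Orps above (and Subps in the max sequence below)
// overwrite their first operand, and the unordered comparison is written as
// Cmpps/Cmppd with immediate predicate 3, which selects the same unordered
// (NaN) comparison that the x64 code spells as Cmpunordps/Cmpunordpd.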
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
maxps(kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
maxps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3});
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_abs(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() == src.fp()) {
@@ -2523,6 +2582,65 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minpd instruction doesn't propagate NaNs and -0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminpd(kScratchDoubleReg, lhs.fp(), rhs.fp());
vminpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
minpd(kScratchDoubleReg, dst.fp());
minpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
minpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
minpd(dst.fp(), lhs.fp());
}
// Propagate -0's and NaNs, which may be non-canonical.
Orpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3});
Orpd(kScratchDoubleReg, dst.fp());
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxpd(kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
maxpd(kScratchDoubleReg, dst.fp());
maxpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
maxpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
maxpd(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorpd(dst.fp(), kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orpd(kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3});
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
......