Commit 9f41a584 authored by Kong, Fanchen, committed by Commit Bot

[wasm-simd] [liftoff] Implement fp min/max on ia32 and x64

Bug: v8:9909
Change-Id: Ib97bcc7afe516a014cd91128aa3c59f1b8b0b0af
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2151999
Commit-Queue: Fanchen Kong <fanchen.kong@intel.com>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67301}
parent ec3cadc4
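For context, the sequences added below implement the WebAssembly SIMD semantics for lane-wise floating-point min/max: a NaN in either input yields a NaN result, and -0.0 is ordered below +0.0. A minimal scalar sketch of those semantics follows (illustrative only, not code from this change; the function names are made up for the example):

// Scalar sketch of the per-lane semantics the SSE/AVX sequences below emulate.
// Not part of the commit; names are illustrative.
#include <cmath>
#include <limits>

float wasm_lane_min(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();  // propagate NaN (canonicalized)
  if (a == 0.0f && b == 0.0f)
    return std::signbit(a) ? a : b;  // -0.0 is the smaller zero
  return a < b ? a : b;
}

float wasm_lane_max(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  if (a == 0.0f && b == 0.0f)
    return std::signbit(a) ? b : a;  // +0.0 is the larger zero
  return a > b ? a : b;
}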
@@ -329,6 +329,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Xorpd, xorpd)
AVX_OP3_XO(Sqrtss, sqrtss)
AVX_OP3_XO(Sqrtsd, sqrtsd)
AVX_OP3_XO(Orps, orps)
AVX_OP3_XO(Orpd, orpd)
AVX_OP3_XO(Andnpd, andnpd)
@@ -351,6 +352,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, Operand)
AVX_PACKED_OP3(Addpd, addpd)
AVX_PACKED_OP3(Subps, subps)
AVX_PACKED_OP3(Subpd, subpd)
AVX_PACKED_OP3(Mulpd, mulpd)
AVX_PACKED_OP3(Divpd, divpd)
@@ -360,6 +362,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3(Cmplepd, cmplepd)
AVX_PACKED_OP3(Minpd, minpd)
AVX_PACKED_OP3(Maxpd, maxpd)
AVX_PACKED_OP3(Cmpunordps, cmpunordps)
AVX_PACKED_OP3(Cmpunordpd, cmpunordpd)
AVX_PACKED_OP3(Psllw, psllw)
AVX_PACKED_OP3(Pslld, pslld)
......
@@ -1704,6 +1704,16 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
  vdiv(dst.high_fp(), lhs.high_fp(), rhs.high_fp());
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2min");
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2max");
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon32, liftoff::GetSimd128Register(dst), src.fp(), 0);
@@ -1787,6 +1797,16 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
  vdiv(dst_high.high(), lhs_high.high(), rhs_high.high());
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4min");
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4max");
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Simd128Register dst_simd = liftoff::GetSimd128Register(dst);
......
@@ -1141,6 +1141,16 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
  Fdiv(dst.fp().V2D(), lhs.fp().V2D(), rhs.fp().V2D());
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2min");
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2max");
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V4S(), src.fp().S(), 0);
@@ -1197,6 +1207,16 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
  Fdiv(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4min");
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4max");
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V2D(), src.gp().X());
......
@@ -2506,6 +2506,65 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
      this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
minps(liftoff::kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
minps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Orps(liftoff::kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(liftoff::kScratchDoubleReg, src);
maxps(liftoff::kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(liftoff::kScratchDoubleReg, lhs.fp());
maxps(liftoff::kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), liftoff::kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(liftoff::kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordps(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Psrld(dst.fp(), dst.fp(), byte{10});
Andnps(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_abs(LiftoffRegister dst,
                                      LiftoffRegister src) {
  if (dst.fp() == src.fp()) {
@@ -2561,6 +2620,65 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
      this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminpd(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vminpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(liftoff::kScratchDoubleReg, src);
minpd(liftoff::kScratchDoubleReg, dst.fp());
minpd(dst.fp(), src);
} else {
movapd(liftoff::kScratchDoubleReg, lhs.fp());
minpd(liftoff::kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
minpd(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orpd(liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpunordpd(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Orpd(liftoff::kScratchDoubleReg, dst.fp());
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxpd(liftoff::kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(liftoff::kScratchDoubleReg, src);
maxpd(liftoff::kScratchDoubleReg, dst.fp());
maxpd(dst.fp(), src);
} else {
movapd(liftoff::kScratchDoubleReg, lhs.fp());
maxpd(liftoff::kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
maxpd(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorpd(dst.fp(), liftoff::kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orpd(liftoff::kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subpd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpunordpd(dst.fp(), dst.fp(), liftoff::kScratchDoubleReg);
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......
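A note on the constants used in the sequences above: the unordered-compare mask, shifted right by 10 bits for f32 lanes (13 for f64 lanes) and then fed through the andn instruction, clears everything except the sign, exponent, and quiet-NaN bits, so NaN lanes collapse to a canonical pattern. A standalone sketch of that bit manipulation for one f32 lane (illustrative only, not part of the diff):

#include <cstdint>
#include <cstdio>

int main() {
  // State of one NaN lane after Cmpunordps/Orps: both registers hold all ones.
  uint32_t mask = 0xFFFFFFFFu;        // dst: unordered-compare result
  uint32_t merged = 0xFFFFFFFFu;      // scratch: OR of both min/max results and the mask
  uint32_t shifted = mask >> 10;      // Psrld by 10: mask now covers the low 22 payload bits
  uint32_t lane = ~shifted & merged;  // Andnps: keep only sign | exponent | quiet bit
  std::printf("%08X\n", lane);        // prints FFC00000, a canonical quiet-NaN pattern
  return 0;
}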
@@ -843,6 +843,10 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
inline void emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
  inline void emit_f64x2_abs(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_f64x2_neg(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_f64x2_sqrt(LiftoffRegister dst, LiftoffRegister src);
@@ -854,6 +858,10 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
inline void emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
  inline void emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                        LiftoffRegister lhs,
                                        LiftoffRegister rhs);
......
@@ -2464,6 +2464,10 @@ class LiftoffCompiler {
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_mul);
    case wasm::kExprF32x4Div:
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_div);
case wasm::kExprF32x4Min:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_min);
case wasm::kExprF32x4Max:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_max);
    case wasm::kExprF64x2Abs:
      return EmitUnOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_abs);
    case wasm::kExprF64x2Neg:
@@ -2478,6 +2482,10 @@ class LiftoffCompiler {
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_mul);
    case wasm::kExprF64x2Div:
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_div);
case wasm::kExprF64x2Min:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_min);
case wasm::kExprF64x2Max:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_max);
    case wasm::kExprI8x16SConvertI16x8:
      return EmitBinOp<kS128, kS128>(
          &LiftoffAssembler::emit_i8x16_sconvert_i16x8);
......
@@ -2468,6 +2468,65 @@ void LiftoffAssembler::emit_f32x4_div(LiftoffRegister dst, LiftoffRegister lhs,
      this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vminps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
minps(kScratchDoubleReg, dst.fp());
minps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
minps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
minps(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3});
Orps(kScratchDoubleReg, dst.fp());
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f32x4_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxps(kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxps(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movaps(kScratchDoubleReg, src);
maxps(kScratchDoubleReg, dst.fp());
maxps(dst.fp(), src);
} else {
movaps(kScratchDoubleReg, lhs.fp());
maxps(kScratchDoubleReg, rhs.fp());
movaps(dst.fp(), rhs.fp());
maxps(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorps(dst.fp(), kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orps(kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subps(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmpps(dst.fp(), kScratchDoubleReg, int8_t{3});
Psrld(dst.fp(), byte{10});
Andnps(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_abs(LiftoffRegister dst,
                                      LiftoffRegister src) {
  if (dst.fp() == src.fp()) {
@@ -2523,6 +2582,65 @@ void LiftoffAssembler::emit_f64x2_div(LiftoffRegister dst, LiftoffRegister lhs,
      this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The minpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vminpd(kScratchDoubleReg, lhs.fp(), rhs.fp());
vminpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
minpd(kScratchDoubleReg, dst.fp());
minpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
minpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
minpd(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
Orpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by quieting and clearing the payload.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3});
Orpd(kScratchDoubleReg, dst.fp());
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
// The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vmaxpd(kScratchDoubleReg, lhs.fp(), rhs.fp());
vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
maxpd(kScratchDoubleReg, dst.fp());
maxpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
maxpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
maxpd(dst.fp(), lhs.fp());
}
// Find discrepancies.
Xorpd(dst.fp(), kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
Orpd(kScratchDoubleReg, dst.fp());
// Propagate sign discrepancy and (subtle) quiet NaNs.
Subpd(kScratchDoubleReg, dst.fp());
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
Cmppd(dst.fp(), kScratchDoubleReg, int8_t{3});
Psrlq(dst.fp(), 13);
Andnpd(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......
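The two-order min/max plus bitwise OR at the top of each sequence is what recovers the signed-zero result: SSE minps/maxps return the second operand when the inputs compare equal, so the two operand orders disagree on ±0, and OR-ing the bit patterns keeps the sign bit for min (max uses the xor/sub variant above instead). A small sketch of the min case (illustrative only; the lambda stands in for one minps lane):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // One lane of minps: on equal (or NaN) inputs the second operand is returned.
  auto sse_min = [](float a, float b) { return a < b ? a : b; };
  float r1 = sse_min(-0.0f, 0.0f);  // +0.0: -0.0 and +0.0 compare equal, so b wins
  float r2 = sse_min(0.0f, -0.0f);  // -0.0: again the second operand
  uint32_t b1, b2;
  std::memcpy(&b1, &r1, sizeof(b1));
  std::memcpy(&b2, &r2, sizeof(b2));
  std::printf("%08X\n", b1 | b2);   // 80000000: the OR restores -0.0, the wasm min result
  return 0;
}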