Commit 0f624aae authored by Fanchen Kong, committed by Commit Bot

[wasm-simd] [liftoff] Implement int-float/float-int conversion on x64 and ia32

Bug: v8:9909
Change-Id: I6224ce9ae3ac814ee33be71a67f1df02a398e0c5
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2208686
Commit-Queue: Fanchen Kong <fanchen.kong@intel.com>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68001}
parent 2d9313e3
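
This change implements the four Wasm SIMD conversion opcodes (i32x4.trunc_sat_f32x4_s, i32x4.trunc_sat_f32x4_u, f32x4.convert_i32x4_s, f32x4.convert_i32x4_u) in Liftoff on x64 and ia32; arm and arm64 keep bailing out below. As a reference for the lane-wise semantics the SSE/AVX sequences must reproduce, here is a minimal scalar sketch in plain C++ (illustration only, not part of the patch):

#include <cmath>
#include <cstdint>
#include <limits>

// i32x4.trunc_sat_f32x4_s: NaN -> 0, otherwise truncate and saturate.
int32_t TruncSatS(float x) {
  if (std::isnan(x)) return 0;
  if (x <= static_cast<float>(std::numeric_limits<int32_t>::min()))
    return std::numeric_limits<int32_t>::min();
  if (x >= static_cast<float>(std::numeric_limits<int32_t>::max()))
    return std::numeric_limits<int32_t>::max();
  return static_cast<int32_t>(x);
}

// i32x4.trunc_sat_f32x4_u: NaN and negatives -> 0, saturate at UINT32_MAX.
uint32_t TruncSatU(float x) {
  if (std::isnan(x) || x <= 0.0f) return 0;
  if (x >= static_cast<float>(std::numeric_limits<uint32_t>::max()))
    return std::numeric_limits<uint32_t>::max();
  return static_cast<uint32_t>(x);
}

// f32x4.convert_i32x4_s/u: round-to-nearest int -> float conversion.
float ConvertS(int32_t x) { return static_cast<float>(x); }
float ConvertU(uint32_t x) { return static_cast<float>(x); }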
@@ -286,6 +286,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP2_WITH_TYPE(Movd, movd, Register, XMMRegister)
AVX_OP2_WITH_TYPE(Movd, movd, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtdq2ps, cvtdq2ps, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Cvtdq2ps, cvtdq2ps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Cvttps2dq, cvttps2dq, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtps, sqrtps, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtpd, sqrtpd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Sqrtpd, sqrtpd, XMMRegister, const Operand&)
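
The two new entries give Cvtdq2ps and Cvttps2dq register-register forms. AVX_OP2_WITH_TYPE itself is defined earlier in this header and is not part of the diff; its expansion is roughly the following AVX-with-SSE-fallback wrapper (paraphrased sketch, details may differ):

void Cvttps2dq(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vcvttps2dq(dst, src);
  } else {
    cvttps2dq(dst, src);
  }
}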
@@ -357,6 +359,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, XMMRegister) \
AVX_PACKED_OP3_WITH_TYPE(macro_name, name, XMMRegister, Operand)
AVX_PACKED_OP3(Addps, addps)
AVX_PACKED_OP3(Addpd, addpd)
AVX_PACKED_OP3(Subps, subps)
AVX_PACKED_OP3(Subpd, subpd)
@@ -365,6 +368,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3(Cmpeqpd, cmpeqpd)
AVX_PACKED_OP3(Cmpneqpd, cmpneqpd)
AVX_PACKED_OP3(Cmpltpd, cmpltpd)
AVX_PACKED_OP3(Cmpleps, cmpleps)
AVX_PACKED_OP3(Cmplepd, cmplepd)
AVX_PACKED_OP3(Minps, minps)
AVX_PACKED_OP3(Minpd, minpd)
@@ -380,6 +384,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3(Psrlq, psrlq)
AVX_PACKED_OP3(Psraw, psraw)
AVX_PACKED_OP3(Psrad, psrad)
AVX_PACKED_OP3(Paddd, paddd)
AVX_PACKED_OP3(Paddq, paddq)
AVX_PACKED_OP3(Psubq, psubq)
AVX_PACKED_OP3(Pmuludq, pmuludq)
@@ -444,6 +449,30 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
#undef AVX_OP2_WITH_TYPE_SCOPE
#undef AVX_OP2_XO_SSE4
#define AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, dst_type, src_type, \
                                sse_scope)                             \
  void macro_name(dst_type dst, src_type src) {                        \
    if (CpuFeatures::IsSupported(AVX)) {                               \
      CpuFeatureScope scope(this, AVX);                                \
      v##name(dst, dst, src);                                          \
      return;                                                          \
    }                                                                  \
    if (CpuFeatures::IsSupported(sse_scope)) {                         \
      CpuFeatureScope scope(this, sse_scope);                          \
      name(dst, src);                                                  \
      return;                                                          \
    }                                                                  \
    UNREACHABLE();                                                     \
  }
#define AVX_OP3_XO_SSE4(macro_name, name)                                     \
  AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, SSE4_1) \
  AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1)
AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd)
#undef AVX_OP3_XO_SSE4
#undef AVX_OP3_WITH_TYPE_SCOPE
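
For example, AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd) above expands, for the XMMRegister/XMMRegister case, to the following wrapper: the non-destructive three-operand AVX encoding is preferred, with a fallback to the destructive two-operand SSE4.1 encoding, and UNREACHABLE() if neither is available:

void Pmaxsd(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpmaxsd(dst, dst, src);
    return;
  }
  if (CpuFeatures::IsSupported(SSE4_1)) {
    CpuFeatureScope scope(this, SSE4_1);
    pmaxsd(dst, src);
    return;
  }
  UNREACHABLE();
}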
void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, Operand(src)); }
void Pshufb(XMMRegister dst, Operand src);
void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
......
@@ -3068,6 +3068,26 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
       liftoff::GetSimd128Register(src2));
}

void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "i32x4_sconvert_f32x4");
}

void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "i32x4_uconvert_f32x4");
}

void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "f32x4_sconvert_i32x4");
}

void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "f32x4_uconvert_i32x4");
}

void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......
@@ -1968,6 +1968,26 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
  Bsl(dst.fp().V16B(), src1.fp().V16B(), src2.fp().V16B());
}

void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "i32x4_sconvert_f32x4");
}

void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "i32x4_uconvert_f32x4");
}

void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "f32x4_sconvert_i32x4");
}

void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  bailout(kSimd, "f32x4_uconvert_i32x4");
}

void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......
@@ -3280,6 +3280,97 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
  Andnpd(dst.fp(), liftoff::kScratchDoubleReg);
}

void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  // NAN->0
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vcmpeqps(liftoff::kScratchDoubleReg, src.fp(), src.fp());
    vpand(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
  } else {
    movaps(liftoff::kScratchDoubleReg, src.fp());
    cmpeqps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    pand(dst.fp(), liftoff::kScratchDoubleReg);
  }
  // Set top bit if >= 0 (but not -0.0!).
  Pxor(liftoff::kScratchDoubleReg, dst.fp());
  // Convert to int.
  Cvttps2dq(dst.fp(), dst.fp());
  // Set top bit if >= 0 is now < 0.
  Pand(liftoff::kScratchDoubleReg, dst.fp());
  Psrad(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{31});
  // Set positive overflow lanes to 0x7FFFFFFF.
  Pxor(dst.fp(), liftoff::kScratchDoubleReg);
}
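
The fixup relies on cvttps2dq producing 0x80000000 for NaN and out-of-range inputs: a lane that was non-negative before the conversion but has its sign bit set afterwards must have overflowed positively, and XOR-ing it with all-ones turns 0x80000000 into the saturated 0x7FFFFFFF. A single-lane trace in plain C++ (illustration only, mirroring the sequence above):

#include <cstdint>
#include <cstring>

int32_t I32x4SConvertF32x4Lane(float src) {
  uint32_t not_nan = (src == src) ? 0xFFFFFFFFu : 0u;  // cmpeqps(src, src)
  uint32_t bits;
  std::memcpy(&bits, &src, sizeof bits);
  bits &= not_nan;  // pand: NaN lanes -> +0.0f
  float f;
  std::memcpy(&f, &bits, sizeof f);
  uint32_t mask = not_nan ^ bits;  // pxor: top bit set iff lane >= +0.0
  // cvttps2dq yields 0x80000000 for out-of-range inputs.
  int32_t trunc = (f >= 2147483648.0f || f < -2147483648.0f)
                      ? INT32_MIN
                      : static_cast<int32_t>(f);
  mask &= static_cast<uint32_t>(trunc);            // pand: positive overflow?
  int32_t fix = static_cast<int32_t>(mask) >> 31;  // psrad 31: all-ones or 0
  return trunc ^ fix;  // pxor: 0x80000000 -> 0x7FFFFFFF on positive overflow
}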
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
  DoubleRegister tmp =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, src)).fp();
  // NAN->0, negative->0.
  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmaxps(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
  } else {
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    maxps(dst.fp(), liftoff::kScratchDoubleReg);
  }
  // scratch: float representation of max_signed.
  Pcmpeqd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
  Psrld(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg,
        uint8_t{1});  // 0x7fffffff
  Cvtdq2ps(liftoff::kScratchDoubleReg,
           liftoff::kScratchDoubleReg);  // 0x4f000000
  // tmp: convert (src-max_signed).
  // Set positive overflow lanes to 0x7FFFFFFF.
  // Set negative lanes to 0.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vsubps(tmp, dst.fp(), liftoff::kScratchDoubleReg);
  } else {
    movaps(tmp, dst.fp());
    subps(tmp, liftoff::kScratchDoubleReg);
  }
  Cmpleps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, tmp);
  Cvttps2dq(tmp, tmp);
  Pxor(tmp, liftoff::kScratchDoubleReg);
  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
  Pmaxsd(tmp, liftoff::kScratchDoubleReg);
  // Convert to int. Overflow lanes above max_signed will be 0x80000000.
  Cvttps2dq(dst.fp(), dst.fp());
  // Add (src-max_signed) for overflow lanes.
  Paddd(dst.fp(), dst.fp(), tmp);
}
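
There is no unsigned SIMD conversion instruction before AVX-512, so the sequence splits the range at max_signed: lanes below 2^31 convert directly, lanes in [2^31, 2^32) convert as src - 2^31 with the missing 2^31 supplied by the final Paddd (cvttps2dq on those lanes yields 0x80000000, which is exactly 2^31 reinterpreted as unsigned), and lanes at or above 2^32 end up as 0x80000000 + 0x7FFFFFFF = 0xFFFFFFFF. A single-lane trace in plain C++ (illustration only):

#include <cstdint>

uint32_t I32x4UConvertF32x4Lane(float src) {
  constexpr float kMaxSigned = 2147483648.0f;  // float(0x7fffffff) rounds to 2^31
  float clamped = (src != src || src < 0.0f) ? 0.0f : src;  // maxps with zeros
  float diff = clamped - kMaxSigned;                        // subps
  bool overflow = (kMaxSigned <= diff);                     // cmpleps
  // cvttps2dq yields 0x80000000 for out-of-range inputs.
  int32_t hi = (diff >= kMaxSigned || diff < -kMaxSigned)
                   ? INT32_MIN
                   : static_cast<int32_t>(diff);
  if (overflow) hi = ~hi;  // pxor with all-ones mask: -> 0x7FFFFFFF
  if (hi < 0) hi = 0;      // pmaxsd with zeros: drop negative lanes
  int32_t lo = (clamped >= kMaxSigned) ? INT32_MIN  // cvttps2dq on dst
                                       : static_cast<int32_t>(clamped);
  return static_cast<uint32_t>(lo) + static_cast<uint32_t>(hi);  // paddd
}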
void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  Cvtdq2ps(dst.fp(), src.fp());
}

void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);  // Zeros.
  Pblendw(liftoff::kScratchDoubleReg, src.fp(),
          uint8_t{0x55});  // Get lo 16 bits.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpsubd(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);  // Get hi 16 bits.
  } else {
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    psubd(dst.fp(), liftoff::kScratchDoubleReg);
  }
  Cvtdq2ps(liftoff::kScratchDoubleReg,
           liftoff::kScratchDoubleReg);   // Convert lo exactly.
  Psrld(dst.fp(), dst.fp(), byte{1});     // Divide by 2 to get in unsigned range.
  Cvtdq2ps(dst.fp(), dst.fp());           // Convert hi, exactly.
  Addps(dst.fp(), dst.fp(), dst.fp());    // Double hi, exactly.
  Addps(dst.fp(), dst.fp(),
        liftoff::kScratchDoubleReg);      // Add hi and lo, may round.
}
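
The 16/16 split is what keeps this exact until the very end: the low half always fits in a float mantissa, and the high half is a multiple of 2^16, so halving it (to bring it into signed cvtdq2ps range), converting, and doubling are all exact; only the closing Addps can round, which yields the correctly rounded uint32-to-float result. Lane-wise, in plain C++ (illustration only):

#include <cstdint>

float F32x4UConvertI32x4Lane(uint32_t x) {
  uint32_t lo = x & 0xFFFFu;  // pblendw 0x55: keep words 0, 2, 4, 6
  uint32_t hi = x - lo;       // psubd: high 16 bits, low half zero
  float f_lo = static_cast<float>(static_cast<int32_t>(lo));       // exact, < 2^16
  float f_hi = static_cast<float>(static_cast<int32_t>(hi >> 1));  // psrld 1; exact
  f_hi += f_hi;        // addps: double back, still exact
  return f_hi + f_lo;  // addps: the only step that may round
}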
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......
@@ -968,6 +968,14 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                        LiftoffRegister src);
  inline void emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
                                        LiftoffRegister src);
  inline void emit_f32x4_sconvert_i32x4(LiftoffRegister dst,
                                        LiftoffRegister src);
  inline void emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
                                        LiftoffRegister src);
  inline void emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                        LiftoffRegister lhs,
                                        LiftoffRegister rhs);
......
@@ -2723,6 +2723,18 @@ class LiftoffCompiler {
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_min);
    case wasm::kExprF64x2Max:
      return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_max);
    case wasm::kExprI32x4SConvertF32x4:
      return EmitUnOp<kS128, kS128>(
          &LiftoffAssembler::emit_i32x4_sconvert_f32x4);
    case wasm::kExprI32x4UConvertF32x4:
      return EmitUnOp<kS128, kS128>(
          &LiftoffAssembler::emit_i32x4_uconvert_f32x4);
    case wasm::kExprF32x4SConvertI32x4:
      return EmitUnOp<kS128, kS128>(
          &LiftoffAssembler::emit_f32x4_sconvert_i32x4);
    case wasm::kExprF32x4UConvertI32x4:
      return EmitUnOp<kS128, kS128>(
          &LiftoffAssembler::emit_f32x4_uconvert_i32x4);
    case wasm::kExprI8x16SConvertI16x8:
      return EmitBinOp<kS128, kS128>(
          &LiftoffAssembler::emit_i8x16_sconvert_i16x8);
......
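
EmitUnOp is the existing single-operand counterpart of EmitBinOp used above: it pops the operand, picks a destination register of the result class, invokes the given LiftoffAssembler member, and pushes the result. A rough sketch of the pattern (paraphrased; the exact helper in liftoff-compiler.cc may differ):

template <ValueType src_type, ValueType result_type, typename EmitFn>
void EmitUnOp(EmitFn fn) {
  constexpr RegClass result_rc = reg_class_for(result_type);
  LiftoffRegister src = __ PopToRegister();
  LiftoffRegister dst = __ GetUnusedRegister(result_rc, {src});
  CallEmitFn(fn, dst, src);
  __ PushRegister(result_type, dst);
}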
@@ -3301,6 +3301,89 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
  Andnpd(dst.fp(), kScratchDoubleReg);
}

void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  // NAN->0
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vcmpeqps(kScratchDoubleReg, src.fp(), src.fp());
    vpand(dst.fp(), src.fp(), kScratchDoubleReg);
  } else {
    movaps(kScratchDoubleReg, src.fp());
    cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    pand(dst.fp(), kScratchDoubleReg);
  }
  // Set top bit if >= 0 (but not -0.0!).
  Pxor(kScratchDoubleReg, dst.fp());
  // Convert to int.
  Cvttps2dq(dst.fp(), dst.fp());
  // Set top bit if >= 0 is now < 0.
  Pand(kScratchDoubleReg, dst.fp());
  Psrad(kScratchDoubleReg, byte{31});
  // Set positive overflow lanes to 0x7FFFFFFF.
  Pxor(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  // NAN->0, negative->0.
  Pxor(kScratchDoubleReg, kScratchDoubleReg);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmaxps(dst.fp(), src.fp(), kScratchDoubleReg);
  } else {
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    maxps(dst.fp(), kScratchDoubleReg);
  }
  // scratch: float representation of max_signed.
  Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
  Psrld(kScratchDoubleReg, uint8_t{1});            // 0x7fffffff
  Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
  // scratch2: convert (src-max_signed).
  // Set positive overflow lanes to 0x7FFFFFFF.
  // Set negative lanes to 0.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vsubps(liftoff::kScratchDoubleReg2, dst.fp(), kScratchDoubleReg);
  } else {
    movaps(liftoff::kScratchDoubleReg2, dst.fp());
    subps(liftoff::kScratchDoubleReg2, kScratchDoubleReg);
  }
  Cmpleps(kScratchDoubleReg, liftoff::kScratchDoubleReg2);
  Cvttps2dq(liftoff::kScratchDoubleReg2, liftoff::kScratchDoubleReg2);
  Pxor(liftoff::kScratchDoubleReg2, kScratchDoubleReg);
  Pxor(kScratchDoubleReg, kScratchDoubleReg);
  Pmaxsd(liftoff::kScratchDoubleReg2, kScratchDoubleReg);
  // Convert to int. Overflow lanes above max_signed will be 0x80000000.
  Cvttps2dq(dst.fp(), dst.fp());
  // Add (src-max_signed) for overflow lanes.
  Paddd(dst.fp(), liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_f32x4_sconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  Cvtdq2ps(dst.fp(), src.fp());
}

void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  Pxor(kScratchDoubleReg, kScratchDoubleReg);           // Zeros.
  Pblendw(kScratchDoubleReg, src.fp(), uint8_t{0x55});  // Get lo 16 bits.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpsubd(dst.fp(), src.fp(), kScratchDoubleReg);  // Get hi 16 bits.
  } else {
    if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
    psubd(dst.fp(), kScratchDoubleReg);
  }
  Cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // Convert lo exactly.
  Psrld(dst.fp(), byte{1});            // Divide by 2 to get in unsigned range.
  Cvtdq2ps(dst.fp(), dst.fp());        // Convert hi, exactly.
  Addps(dst.fp(), dst.fp());           // Double hi, exactly.
  Addps(dst.fp(), kScratchDoubleReg);  // Add hi and lo, may round.
}
void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 LiftoffRegister rhs) {
......