Commit d0aa5c03 authored by Ng Zhi An's avatar Ng Zhi An Committed by V8 LUCI CQ

[wasm-simd] Share I32x4SConvertF32x4 implementation

Move I32x4SConvertF32x4 into the shared implementation, which takes care
of both the AVX and no-AVX code paths. The instruction selector still
requires dst == src (when AVX is unsupported) to save a move in codegen.

Bug: v8:11589
Change-Id: Ie982682b3002192ab27700bf73f8c1e66aeba492
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3086732
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76243}
parent 339dde1c
...@@ -378,6 +378,31 @@ void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1, ...@@ -378,6 +378,31 @@ void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
} }
} }
// Shared (ia32/x64) lowering of the signed f32x4 -> i32x4 conversion
// (wasm i32x4.trunc_sat_f32x4_s): truncate each f32 lane to a signed i32,
// mapping NaN lanes to 0 and saturating positive overflow to INT32_MAX.
// Requires dst == src when AVX is not supported (enforced by the
// instruction selector); clobbers scratch.
void SharedTurboAssembler::I32x4SConvertF32x4(XMMRegister dst, XMMRegister src,
                                              XMMRegister scratch) {
  // Convert NAN to 0: cmpeqps builds an all-ones mask in lanes where
  // src == src (i.e. non-NaN; NaN never compares equal to itself), then
  // ANDing with src zeroes exactly the NaN lanes.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vcmpeqps(scratch, src, src);
    vpand(dst, src, scratch);
  } else {
    movaps(scratch, src);
    cmpeqps(scratch, src);
    if (dst != src) movaps(dst, src);
    andps(dst, scratch);
  }
  // Set top bit if >= 0 (but not -0.0!): scratch ^= dst flips dst under the
  // non-NaN mask, so scratch's sign bit is set exactly for lanes whose float
  // sign bit was clear (-0.0 has its sign bit set, so it is excluded).
  Pxor(scratch, dst);
  // Truncating conversion of packed single-precision floats to packed signed
  // doubleword integers; out-of-range (and NaN) lanes become 0x80000000.
  // (NaN lanes were already zeroed above, so only overflow produces it here.)
  Cvttps2dq(dst, dst);
  // Set top bit if >=0 is now < 0: a lane that was non-negative before the
  // conversion but reads as negative after it must be positive overflow
  // (0x80000000 from cvttps2dq).
  Pand(scratch, dst);
  // Arithmetic shift broadcasts that top bit, giving all-ones in positive
  // overflow lanes and zero elsewhere.
  Psrad(scratch, scratch, byte{31});
  // Set positive overflow lanes to 0x7FFFFFFF (0x80000000 ^ 0xFFFFFFFF).
  Pxor(dst, scratch);
}
void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
XMMRegister src) { XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
......
...@@ -310,6 +310,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase { ...@@ -310,6 +310,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
// Requires that dst == src1 if AVX is not supported. // Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2, void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed); XMMRegister scratch, bool low, bool is_signed);
// Signed saturating conversion of four f32 lanes to four i32 lanes
// (NaN -> 0, positive overflow -> INT32_MAX); clobbers scratch.
// Requires dst == src if AVX is not supported.
void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src,
                        XMMRegister scratch);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src); void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src, void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
XMMRegister scratch); XMMRegister scratch);
......
...@@ -2445,20 +2445,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2445,20 +2445,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kIA32I32x4SConvertF32x4: { case kIA32I32x4SConvertF32x4: {
XMMRegister dst = i.OutputSimd128Register(); __ I32x4SConvertF32x4(i.OutputSimd128Register(),
XMMRegister src = i.InputSimd128Register(0); i.InputSimd128Register(0), kScratchDoubleReg);
// NAN->0
__ Cmpeqps(kScratchDoubleReg, src, src);
__ Pand(dst, src, kScratchDoubleReg);
// Set top bit if >= 0 (but not -0.0!)
__ Pxor(kScratchDoubleReg, dst);
// Convert
__ Cvttps2dq(dst, dst);
// Set top bit if >=0 is now < 0
__ Pand(kScratchDoubleReg, dst);
__ Psrad(kScratchDoubleReg, kScratchDoubleReg, byte{31});
// Set positive overflow lanes to 0x7FFFFFFF
__ Pxor(dst, kScratchDoubleReg);
break; break;
} }
case kIA32I32x4SConvertI16x8Low: { case kIA32I32x4SConvertI16x8Low: {
......
...@@ -3084,21 +3084,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3084,21 +3084,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64I32x4SConvertF32x4: { case kX64I32x4SConvertF32x4: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); __ I32x4SConvertF32x4(i.OutputSimd128Register(),
XMMRegister dst = i.OutputSimd128Register(); i.InputSimd128Register(0), kScratchDoubleReg);
// NAN->0
__ Movaps(kScratchDoubleReg, dst);
__ Cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
__ Pand(dst, kScratchDoubleReg);
// Set top bit if >= 0 (but not -0.0!)
__ Pxor(kScratchDoubleReg, dst);
// Convert
__ Cvttps2dq(dst, dst);
// Set top bit if >=0 is now < 0
__ Pand(kScratchDoubleReg, dst);
__ Psrad(kScratchDoubleReg, byte{31});
// Set positive overflow lanes to 0x7FFFFFFF
__ Pxor(dst, kScratchDoubleReg);
break; break;
} }
case kX64I32x4SConvertI16x8Low: { case kX64I32x4SConvertI16x8Low: {
......
...@@ -3321,7 +3321,8 @@ void InstructionSelector::VisitI64x2Mul(Node* node) { ...@@ -3321,7 +3321,8 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) { void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
X64OperandGenerator g(this); X64OperandGenerator g(this);
Emit(kX64I32x4SConvertF32x4, g.DefineSameAsFirst(node), Emit(kX64I32x4SConvertF32x4,
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0))); g.UseRegister(node->InputAt(0)));
} }
......
...@@ -4300,26 +4300,7 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst, ...@@ -4300,26 +4300,7 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst, void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
// NAN->0 I32x4SConvertF32x4(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vcmpeqps(liftoff::kScratchDoubleReg, src.fp(), src.fp());
vpand(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
} else {
movaps(liftoff::kScratchDoubleReg, src.fp());
cmpeqps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
andps(dst.fp(), liftoff::kScratchDoubleReg);
}
// Set top bit if >= 0 (but not -0.0!).
Pxor(liftoff::kScratchDoubleReg, dst.fp());
// Convert to int.
Cvttps2dq(dst.fp(), dst.fp());
// Set top bit if >=0 is now < 0.
Pand(liftoff::kScratchDoubleReg, dst.fp());
Psrad(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, byte{31});
// Set positive overflow lanes to 0x7FFFFFFF.
Pxor(dst.fp(), liftoff::kScratchDoubleReg);
} }
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst, void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
......
...@@ -3852,26 +3852,7 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst, ...@@ -3852,26 +3852,7 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst, void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
// NAN->0 I32x4SConvertF32x4(dst.fp(), src.fp(), kScratchDoubleReg);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vcmpeqps(kScratchDoubleReg, src.fp(), src.fp());
vpand(dst.fp(), src.fp(), kScratchDoubleReg);
} else {
movaps(kScratchDoubleReg, src.fp());
cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
andps(dst.fp(), kScratchDoubleReg);
}
// Set top bit if >= 0 (but not -0.0!).
Pxor(kScratchDoubleReg, dst.fp());
// Convert to int.
Cvttps2dq(dst.fp(), dst.fp());
// Set top bit if >=0 is now < 0.
Pand(kScratchDoubleReg, dst.fp());
Psrad(kScratchDoubleReg, byte{31});
// Set positive overflow lanes to 0x7FFFFFFF.
Pxor(dst.fp(), kScratchDoubleReg);
} }
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst, void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment