Commit 593ab78f authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share i32x4.trunc_sat_f64x2 s,u zero implementation

Bug: v8:11589
Change-Id: I7b55efa76f60eacf31700a544f54042eec963f57
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3115545
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76495}
parent c604dcb5
@@ -1545,6 +1545,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmovdqa(XMMRegister dst, Operand src) {
vinstr(0x6F, dst, xmm0, src, k66, k0F, kWIG);
}
void vmovdqa(XMMRegister dst, XMMRegister src) {
vinstr(0x6F, dst, xmm0, src, k66, k0F, kWIG);
}
void vmovdqu(XMMRegister dst, Operand src) {
vinstr(0x6F, dst, xmm0, src, kF3, k0F, kWIG);
}
...
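The new register-to-register overload of vmovdqa reuses the same vinstr path as the existing memory form, so both emit the VEX-encoded 66 0F 6F (movdqa) instruction; only the ModRM operand differs. A minimal usage sketch (my own illustration, not part of this change; the helper name CopyXmm128 is hypothetical):

  // Copy a 128-bit value between XMM registers with the new overload.
  // AVX support is assumed to have been checked by the caller, as the
  // TurboAssembler code in this change does with CpuFeatureScope.
  void CopyXmm128(Assembler* assm, XMMRegister dst, XMMRegister src) {
    assm->vmovdqa(dst, src);  // disassembles as e.g. "vmovdqa xmm0,xmm7"
  }

The DisasmIa320 test at the end of this change exercises exactly this register-to-register form.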
@@ -701,88 +701,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch,
Register tmp) {
ASM_CODE_COMMENT(this);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
XMMRegister original_dst = dst;
// Make sure we don't overwrite src.
if (dst == src) {
DCHECK_NE(scratch, src);
dst = scratch;
}
// dst = 0 if src == NaN, else all ones.
vcmpeqpd(dst, src, src);
// dst = 0 if src == NaN, else INT32_MAX as double.
vandpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
// dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
vminpd(dst, src, dst);
// Values > INT32_MAX already saturated, values < INT32_MIN raises an
// exception, which is masked and returns 0x80000000.
vcvttpd2dq(dst, dst);
if (original_dst != dst) {
vmovaps(original_dst, dst);
}
} else {
if (dst != src) {
movaps(dst, src);
}
movaps(scratch, dst);
cmpeqpd(scratch, dst);
andps(scratch,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
minpd(dst, scratch);
cvttpd2dq(dst, dst);
}
}
void TurboAssembler::I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch,
Register tmp) {
ASM_CODE_COMMENT(this);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vxorpd(scratch, scratch, scratch);
// Saturate to 0.
vmaxpd(dst, src, scratch);
// Saturate to UINT32_MAX.
vminpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
// Truncate.
vroundpd(dst, dst, kRoundToZero);
// Add to special double where significant bits == uint32.
vaddpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52(), tmp));
// Extract low 32 bits of each double's significand, zero top lanes.
// dst = [dst[0], dst[2], 0, 0]
vshufps(dst, dst, scratch, 0x88);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst != src) {
movaps(dst, src);
}
xorps(scratch, scratch);
maxpd(dst, scratch);
minpd(dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
roundpd(dst, dst, kRoundToZero);
addpd(dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52(), tmp));
shufps(dst, scratch, 0x88);
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister tmp,
Register scratch) {
...
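The routines removed here (and re-added in the shared SharedTurboAssemblerBase further down) implement the per-lane semantics of wasm's i32x4.trunc_sat_f64x2_s_zero: NaN lanes become 0, out-of-range lanes saturate to INT32_MIN or INT32_MAX, and the two high i32 lanes of the result are zero. A scalar sketch of one signed lane, following the comments in the code above (the function name is mine, for illustration only):

  #include <cmath>
  #include <cstdint>

  // Per-lane behaviour of i32x4.trunc_sat_f64x2_s_zero (sketch).
  int32_t TruncSatF64ToI32(double x) {
    // cmpeqpd(x, x) is false for NaN, so the and-mask is 0.0 for NaN lanes,
    // and minpd then propagates that 0.0 (min returns its second operand
    // when the first operand is NaN).
    if (std::isnan(x)) return 0;
    // minpd against INT32_MAX stored as a double clamps the positive side.
    if (x >= 2147483647.0) return INT32_MAX;
    // cvttpd2dq truncates toward zero; lanes below INT32_MIN raise the
    // masked invalid exception and yield 0x80000000, i.e. INT32_MIN.
    if (x <= -2147483648.0) return INT32_MIN;
    return static_cast<int32_t>(x);
  }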
@@ -397,10 +397,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp);
void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp);
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister tmp, Register scratch);
void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
...
@@ -440,6 +440,81 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
ExternalReference::address_of_wasm_double_2_power_52(), scratch));
}
void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
XMMRegister original_dst = dst;
// Make sure we don't overwrite src.
if (dst == src) {
DCHECK_NE(src, scratch);
dst = scratch;
}
// dst = 0 if src == NaN, else all ones.
vcmpeqpd(dst, src, src);
// dst = 0 if src == NaN, else INT32_MAX as double.
vandpd(
dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
// dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
vminpd(dst, src, dst);
// Values > INT32_MAX already saturated, values < INT32_MIN raises an
// exception, which is masked and returns 0x80000000.
vcvttpd2dq(original_dst, dst);
} else {
if (dst != src) {
movaps(dst, src);
}
movaps(scratch, dst);
cmpeqpd(scratch, dst);
andps(scratch,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
minpd(dst, scratch);
cvttpd2dq(dst, dst);
}
}
void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vxorpd(scratch, scratch, scratch);
// Saturate to 0.
vmaxpd(dst, src, scratch);
// Saturate to UINT32_MAX.
vminpd(
dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
// Truncate.
vroundpd(dst, dst, kRoundToZero);
// Add to special double where significant bits == uint32.
vaddpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52(), tmp));
// Extract low 32 bits of each double's significand, zero top lanes.
// dst = [dst[0], dst[2], 0, 0]
vshufps(dst, dst, scratch, 0x88);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst != src) {
movaps(dst, src);
}
xorps(scratch, scratch);
maxpd(dst, scratch);
minpd(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double(),
tmp));
roundpd(dst, dst, kRoundToZero);
addpd(dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52(), tmp));
shufps(dst, scratch, 0x88);
}
}
private:
// All implementation-specific methods must be called through this.
Impl* impl() { return static_cast<Impl*>(this); }
...
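The unsigned variant added above relies on the 2^52 trick, since SSE/AVX provide no packed double-to-unsigned-int32 conversion: after a lane is clamped to [0, UINT32_MAX] and truncated toward zero, adding 2^52 places the integer value exactly in the low 32 bits of the double's mantissa, and shufps with selector 0x88 then gathers those 32-bit words from lanes 0 and 2 while filling the upper half of the result with zeros from scratch. A scalar sketch of one lane (illustration only; the function name is mine):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  // Per-lane behaviour of i32x4.trunc_sat_f64x2_u_zero (sketch).
  uint32_t TruncSatF64ToU32(double x) {
    // maxpd against 0.0 saturates negative lanes and also maps NaN to 0,
    // since maxpd returns its second operand when the first operand is NaN.
    if (std::isnan(x) || x <= 0.0) return 0;
    // minpd against UINT32_MAX stored as a double clamps the upper end.
    if (x >= 4294967295.0) return UINT32_MAX;
    double t = std::trunc(x);                // roundpd with kRoundToZero
    double biased = t + 4503599627370496.0;  // + 2^52, exact for t < 2^32
    // The integer now occupies the low 32 mantissa bits; shufps extracts
    // exactly this 32-bit word from each double lane.
    uint64_t bits;
    std::memcpy(&bits, &biased, sizeof bits);
    return static_cast<uint32_t>(bits);
  }

Unlike the x64-only versions removed below, these shared helpers take the scratch XMM register and the GP register used by ExternalReferenceAsOperand as explicit parameters, which is why the x64 code generator and Liftoff call sites later in this change now pass kScratchDoubleReg and kScratchRegister.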
@@ -2302,78 +2302,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
XMMRegister original_dst = dst;
// Make sure we don't overwrite src.
if (dst == src) {
DCHECK_NE(src, kScratchDoubleReg);
dst = kScratchDoubleReg;
}
// dst = 0 if src == NaN, else all ones.
vcmpeqpd(dst, src, src);
// dst = 0 if src == NaN, else INT32_MAX as double.
vandpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double()));
// dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
vminpd(dst, src, dst);
// Values > INT32_MAX already saturated, values < INT32_MIN raises an
// exception, which is masked and returns 0x80000000.
vcvttpd2dq(dst, dst);
if (original_dst != dst) {
Move(original_dst, dst);
}
} else {
if (dst != src) {
Move(dst, src);
}
Move(kScratchDoubleReg, dst);
cmpeqpd(kScratchDoubleReg, dst);
andps(kScratchDoubleReg,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_int32_max_as_double()));
minpd(dst, kScratchDoubleReg);
cvttpd2dq(dst, dst);
}
}
void TurboAssembler::I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vxorpd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
// Saturate to 0.
vmaxpd(dst, src, kScratchDoubleReg);
// Saturate to UINT32_MAX.
vminpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double()));
// Truncate.
vroundpd(dst, dst, kRoundToZero);
// Add to special double where significant bits == uint32.
vaddpd(dst, dst,
ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52()));
// Extract low 32 bits of each double's significand, zero top lanes.
// dst = [dst[0], dst[2], 0, 0]
vshufps(dst, dst, kScratchDoubleReg, 0x88);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst != src) {
Move(dst, src);
}
xorps(kScratchDoubleReg, kScratchDoubleReg);
maxpd(dst, kScratchDoubleReg);
minpd(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_uint32_max_as_double()));
roundpd(dst, dst, kRoundToZero);
addpd(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_wasm_double_2_power_52()));
shufps(dst, kScratchDoubleReg, 0x88);
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst,
XMMRegister src) {
// pmaddubsw treats the first operand as unsigned, so the external reference
...
@@ -481,9 +481,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
// Defined here to allow usage on both TurboFan and Liftoff.
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src);
void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src);
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
...
@@ -2635,12 +2635,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I32x4TruncSatF64x2SZero: {
__ I32x4TruncSatF64x2SZero(i.OutputSimd128Register(),
i.InputSimd128Register(0));
__ I32x4TruncSatF64x2SZero(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg,
kScratchRegister);
break;
}
case kX64I32x4TruncSatF64x2UZero: {
__ I32x4TruncSatF64x2UZero(i.OutputSimd128Register(),
i.InputSimd128Register(0));
__ I32x4TruncSatF64x2UZero(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg,
kScratchRegister);
break;
}
case kX64F32x4Splat: {
...
@@ -3904,12 +3904,14 @@ void LiftoffAssembler::emit_i32x4_uconvert_i16x8_high(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_trunc_sat_f64x2_s_zero(LiftoffRegister dst,
LiftoffRegister src) {
I32x4TruncSatF64x2SZero(dst.fp(), src.fp());
I32x4TruncSatF64x2SZero(dst.fp(), src.fp(), kScratchDoubleReg,
kScratchRegister);
}
void LiftoffAssembler::emit_i32x4_trunc_sat_f64x2_u_zero(LiftoffRegister dst,
LiftoffRegister src) {
I32x4TruncSatF64x2UZero(dst.fp(), src.fp());
I32x4TruncSatF64x2UZero(dst.fp(), src.fp(), kScratchDoubleReg,
kScratchRegister);
}
void LiftoffAssembler::emit_s128_and_not(LiftoffRegister dst,
...
@@ -825,6 +825,7 @@ TEST(DisasmIa320) {
__ vmovshdup(xmm1, xmm2);
__ vbroadcastss(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovdqa(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqa(xmm0, xmm7);
__ vmovdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vmovd(xmm0, edi);
...