Commit 684f3cee authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Optimize i32x4.trunc_sat_f32x4_s

Bug: v8:12094
Change-Id: Ibefce881cbfcd4445485197a4a2615bdf0599ada
Fixed: v8:12094
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3123638
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76706}
parent f2042e8c
......@@ -145,6 +145,19 @@ constexpr struct alignas(16) {
} wasm_uint32_max_as_double = {uint64_t{0x41efffffffe00000},
uint64_t{0x41efffffffe00000}};
// 2147483648.0f (INT32_MAX + 1) replicated across four 32-bit lanes; bit
// pattern 0x4f000000 per lane. Used to detect positive overflow when
// truncating f32x4 lanes to signed i32x4.
constexpr struct alignas(16) {
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
} wasm_int32_overflow_as_float = {uint32_t{0x4f00'0000}, uint32_t{0x4f00'0000},
                                  uint32_t{0x4f00'0000}, uint32_t{0x4f00'0000}};
// Implementation of ExternalReference
static ExternalReference::Type BuiltinCallTypeForResultSize(int result_size) {
......@@ -619,6 +632,11 @@ ExternalReference ExternalReference::address_of_wasm_uint32_max_as_double() {
reinterpret_cast<Address>(&wasm_uint32_max_as_double));
}
// Returns the address of the 4 x 2147483648.0f constant used by the
// i32x4.trunc_sat_f32x4_s lowering.
ExternalReference ExternalReference::address_of_wasm_int32_overflow_as_float() {
  Address addr = reinterpret_cast<Address>(&wasm_int32_overflow_as_float);
  return ExternalReference(addr);
}
ExternalReference
ExternalReference::address_of_enable_experimental_regexp_engine() {
return ExternalReference(&FLAG_enable_experimental_regexp_engine);
......
......@@ -111,13 +111,6 @@ class StatsCounter;
V(address_of_runtime_stats_flag, "TracingFlags::runtime_stats") \
V(address_of_the_hole_nan, "the_hole_nan") \
V(address_of_uint32_bias, "uint32_bias") \
V(address_of_wasm_i8x16_swizzle_mask, "wasm_i8x16_swizzle_mask") \
V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01") \
V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f") \
V(address_of_wasm_i8x16_splat_0x33, "wasm_i8x16_splat_0x33") \
V(address_of_wasm_i8x16_splat_0x55, "wasm_i8x16_splat_0x55") \
V(address_of_wasm_i16x8_splat_0x0001, "wasm_16x8_splat_0x0001") \
V(baseline_pc_for_bytecode_offset, "BaselinePCForBytecodeOffset") \
V(baseline_pc_for_next_executed_bytecode, \
"BaselinePCForNextExecutedBytecode") \
......@@ -248,12 +241,20 @@ class StatsCounter;
IF_WASM(V, wasm_memory_copy, "wasm::memory_copy") \
IF_WASM(V, wasm_memory_fill, "wasm::memory_fill") \
IF_WASM(V, wasm_array_copy, "wasm::array_copy") \
V(address_of_wasm_i8x16_swizzle_mask, "wasm_i8x16_swizzle_mask") \
V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01") \
V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f") \
V(address_of_wasm_i8x16_splat_0x33, "wasm_i8x16_splat_0x33") \
V(address_of_wasm_i8x16_splat_0x55, "wasm_i8x16_splat_0x55") \
V(address_of_wasm_i16x8_splat_0x0001, "wasm_16x8_splat_0x0001") \
V(address_of_wasm_f64x2_convert_low_i32x4_u_int_mask, \
"wasm_f64x2_convert_low_i32x4_u_int_mask") \
V(supports_wasm_simd_128_address, "wasm::supports_wasm_simd_128_address") \
V(address_of_wasm_double_2_power_52, "wasm_double_2_power_52") \
V(address_of_wasm_int32_max_as_double, "wasm_int32_max_as_double") \
V(address_of_wasm_uint32_max_as_double, "wasm_uint32_max_as_double") \
V(address_of_wasm_int32_overflow_as_float, "wasm_int32_overflow_as_float") \
V(write_barrier_marking_from_code_function, "WriteBarrier::MarkingFromCode") \
V(call_enqueue_microtask_function, "MicrotaskQueue::CallEnqueueMicrotask") \
V(call_enter_context_function, "call_enter_context_function") \
......
......@@ -1715,6 +1715,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
PACKED_CMP_LIST(AVX_CMP_P)
// vcmpgeps/vcmpgepd only in AVX.
AVX_CMP_P(cmpge, 0xd)
#undef AVX_CMP_P
#undef PACKED_CMP_LIST
......
......@@ -649,31 +649,6 @@ void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
}
}
// Saturating truncation of four f32 lanes to four signed i32 lanes.
// NaN lanes are zeroed first; cvttps2dq then yields 0x80000000 for any
// remaining out-of-range lane, and the trailing mask/xor sequence rewrites
// the positive-overflow lanes to INT32_MAX (0x7fffffff).
void SharedTurboAssembler::I32x4SConvertF32x4(XMMRegister dst, XMMRegister src,
                                              XMMRegister scratch) {
  // Convert NAN to 0.
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vcmpeqps(scratch, src, src);  // scratch = all-ones where src is not NaN.
    vpand(dst, src, scratch);     // Zero out the NaN lanes.
  } else {
    movaps(scratch, src);
    cmpeqps(scratch, src);        // scratch = all-ones where src is not NaN.
    if (dst != src) movaps(dst, src);
    andps(dst, scratch);          // Zero out the NaN lanes.
  }
  // Set top bit if >= 0 (but not -0.0!).
  // scratch is all-ones for every surviving lane, so this computes ~dst:
  // its sign bit ends up set exactly for the non-negative lanes.
  Pxor(scratch, dst);
  // Truncate to packed signed doubleword integers; out-of-range lanes
  // (including positive overflow) become 0x80000000.
  Cvttps2dq(dst, dst);
  // Set top bit if >=0 is now < 0.
  // Only a lane that was non-negative before conversion and is 0x80000000
  // after it (i.e. positive overflow) keeps the sign bit here.
  Pand(scratch, dst);
  Psrad(scratch, scratch, byte{31});  // Broadcast sign bit: overflow mask.
  // Set positive overflow lanes to 0x7FFFFFFF.
  Pxor(dst, scratch);
}
void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
XMMRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
......
......@@ -389,9 +389,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
// Requires dst == src if AVX is not supported.
void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
......@@ -465,6 +462,47 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
ExternalReference::address_of_wasm_double_2_power_52(), scratch));
}
  // Saturating truncation of four f32 lanes to four signed i32 lanes (wasm
  // i32x4.trunc_sat_f32x4_s). NaN lanes become 0; lanes >= 2147483648.0f
  // saturate to INT32_MAX; lanes below INT32_MIN come out of cvttps2dq as
  // INT32_MIN already. |tmp| is a SIMD scratch; |scratch| is a GP register
  // used only to materialize the external-reference operand.
  void I32x4SConvertF32x4(XMMRegister dst, XMMRegister src, XMMRegister tmp,
                          Register scratch) {
    // 2147483648.0f (INT32_MAX + 1) splatted across all four lanes.
    Operand op = ExternalReferenceAsOperand(
        ExternalReference::address_of_wasm_int32_overflow_as_float(), scratch);
    // This algorithm works by:
    // 1. lanes with NaNs are zero-ed
    // 2. lanes ge than 2147483648.0f (MAX_INT32+1) set to 0xffff'ffff
    // 3. cvttps2dq sets all out of range lanes to 0x8000'0000
    //   a. correct for underflows (< MIN_INT32)
    //   b. wrong for overflow, and we know which lanes overflow from 2.
    // 4. adjust for 3b by xor-ing 2 and 3
    //   a. 0x8000'0000 xor 0xffff'ffff = 0x7fff'ffff (MAX_INT32)
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope scope(this, AVX);
      vcmpeqps(tmp, src, src);  // Step 1: tmp = mask of non-NaN lanes.
      vandps(dst, src, tmp);    // Step 1: zero out NaN lanes.
      vcmpgeps(tmp, src, op);   // Step 2: tmp = overflow mask (src >= 2^31).
      vcvttps2dq(dst, dst);     // Step 3.
      vpxor(dst, dst, tmp);     // Step 4.
    } else {
      if (src == dst) {
        // Destructive SSE path: zero the NaN lanes in place first; the
        // overflow mask computed afterwards is still correct because the
        // zeroed lanes compare false against 2^31.
        movaps(tmp, src);
        cmpeqps(tmp, tmp);   // tmp = mask of non-NaN lanes.
        andps(dst, tmp);     // Step 1.
        movaps(tmp, op);
        cmpleps(tmp, dst);   // Step 2: 2^31 <= dst, i.e. dst >= 2^31.
        cvttps2dq(dst, dst); // Step 3.
        xorps(dst, tmp);     // Step 4.
      } else {
        // src stays intact, so compute the overflow mask from src up front
        // and clear the NaN lanes last: cvttps2dq turns NaN into
        // 0x8000'0000, the xor leaves it (NaN is not >= 2^31), and the
        // final and with the non-NaN mask zeroes it.
        movaps(tmp, op);
        cmpleps(tmp, src);   // Step 2.
        cvttps2dq(dst, src); // Step 3.
        xorps(dst, tmp);     // Step 4.
        movaps(tmp, src);
        cmpeqps(tmp, tmp);   // tmp = mask of non-NaN lanes.
        andps(dst, tmp);     // Step 1 (applied last on this path).
      }
    }
  }
void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
XMMRegister scratch, Register tmp) {
if (CpuFeatures::IsSupported(AVX)) {
......
......@@ -1606,6 +1606,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
AVX_CMP_P(vcmpneq, 0x4)
AVX_CMP_P(vcmpnlt, 0x5)
AVX_CMP_P(vcmpnle, 0x6)
AVX_CMP_P(vcmpge, 0xd)
#undef AVX_CMP_P
......
......@@ -2340,7 +2340,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I32x4SConvertF32x4: {
__ I32x4SConvertF32x4(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg);
i.InputSimd128Register(0), kScratchDoubleReg,
i.TempRegister(0));
break;
}
case kIA32I32x4SConvertI16x8Low: {
......
......@@ -2518,7 +2518,12 @@ void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
}
// Selects kIA32I32x4SConvertF32x4 for wasm i32x4.trunc_sat_f32x4_s.
// Defect fixed: the stale VisitRRSimd(...) call from the replaced lowering
// was still present alongside the new Emit(...), so the instruction was
// emitted twice; only the new emission is kept.
void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
  IA32OperandGenerator g(this);
  // One GP temp to hold the address of the wasm_int32_overflow_as_float
  // external reference used by the macro-assembler helper.
  InstructionOperand temps[] = {g.TempRegister()};
  // With AVX the helper uses non-destructive 3-operand forms, so dst may be
  // any register; pre-AVX it reuses the input register (destructive SSE
  // encodings), hence DefineSameAsFirst.
  InstructionOperand dst =
      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
  Emit(kIA32I32x4SConvertF32x4, dst, g.UseRegister(node->InputAt(0)),
       arraysize(temps), temps);
}
void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
......
......@@ -3101,7 +3101,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I32x4SConvertF32x4: {
__ I32x4SConvertF32x4(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg);
i.InputSimd128Register(0), kScratchDoubleReg,
kScratchRegister);
break;
}
case kX64I32x4SConvertI16x8Low: {
......
......@@ -4212,7 +4212,8 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst,
// ia32 Liftoff lowering for wasm i32x4.trunc_sat_f32x4_s.
// Defect fixed: the stale three-argument I32x4SConvertF32x4 call from the
// replaced implementation was still present before the new four-argument
// call, converting twice (and with the wrong arity); only the new call is
// kept.
void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  // The helper needs a GP scratch (to address the external-reference
  // constant) in addition to the SIMD scratch.
  Register tmp = GetUnusedRegister(kGpReg, {}).gp();
  I32x4SConvertF32x4(dst.fp(), src.fp(), liftoff::kScratchDoubleReg, tmp);
}
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
......
......@@ -3729,7 +3729,7 @@ void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst,
// x64 Liftoff lowering for wasm i32x4.trunc_sat_f32x4_s.
// Defect fixed: the stale three-argument I32x4SConvertF32x4 call from the
// replaced implementation was still present before the new four-argument
// call, converting twice (and with the wrong arity); only the new call is
// kept. kScratchRegister supplies the GP scratch the helper uses to
// address the external-reference constant.
void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
                                                 LiftoffRegister src) {
  I32x4SConvertF32x4(dst.fp(), src.fp(), kScratchDoubleReg, kScratchRegister);
}
void LiftoffAssembler::emit_i32x4_uconvert_f32x4(LiftoffRegister dst,
......
......@@ -740,6 +740,8 @@ TEST(DisasmIa320) {
__ vcmpunordps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpneqps(xmm5, xmm4, xmm1);
__ vcmpneqps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpgeps(xmm5, xmm4, xmm1);
__ vcmpgeps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vandpd(xmm0, xmm1, xmm2);
__ vandpd(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
......
......@@ -739,6 +739,8 @@ TEST(DisasmX64) {
__ vcmpnltps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpnleps(xmm5, xmm4, xmm1);
__ vcmpnleps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmpgeps(xmm5, xmm4, xmm1);
__ vcmpgeps(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000));
__ vcmppd(xmm5, xmm4, xmm1, 1);
__ vcmppd(xmm5, xmm4, Operand(rbx, rcx, times_4, 10000), 1);
__ vcmpeqpd(xmm5, xmm4, xmm1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment