Commit 9d1dda7e authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm] Improve codegen for any_true

The codegen uses a bunch of vpmax to try and keep set bits around. The
datatype for vpmax does not need to change for each instruction, since
vpmax U32 will persist set bits just as well. This simplifies the
instruction sequences for S1x8 and S1x16 anytrue.

I added a test to check a special case where an f64x2 contains -0.0 (top
bit set). A previous attempt to optimize codegen used a floating-point
compare, which does not distinguish between 0.0 and -0.0. So -0.0 would
compare equal to 0.0, and anytrue would incorrectly return 0.

Change-Id: I66013796af08a666009e6b2d774ea7ee7bdfe1ad
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2203113
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67875}
parent a43287f1
...@@ -3028,7 +3028,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3028,7 +3028,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vrev16(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0)); __ vrev16(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0));
break; break;
} }
case kArmS1x4AnyTrue: { case kArmS1x4AnyTrue:
case kArmS1x8AnyTrue:
case kArmS1x16AnyTrue: {
const QwNeonRegister& src = i.InputSimd128Register(0); const QwNeonRegister& src = i.InputSimd128Register(0);
UseScratchRegisterScope temps(tasm()); UseScratchRegisterScope temps(tasm());
DwVfpRegister scratch = temps.AcquireD(); DwVfpRegister scratch = temps.AcquireD();
...@@ -3050,18 +3052,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3050,18 +3052,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ mov(i.OutputRegister(), Operand(1), LeaveCC, ne); __ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
break; break;
} }
case kArmS1x8AnyTrue: {
// Any-true over the 128-bit input: leaves a non-zero input as 1 in the
// output register and an all-zero input as 0.
const QwNeonRegister& src = i.InputSimd128Register(0);
UseScratchRegisterScope temps(tasm());
DwVfpRegister scratch = temps.AcquireD();
// Fold the two 64-bit halves of src into scratch, then keep folding scratch
// onto itself so that any set bit ends up in lane 0 (NEON pairwise max;
// see the commit description: vpmax is used to keep set bits around).
__ vpmax(NeonU16, scratch, src.low(), src.high());
__ vpmax(NeonU16, scratch, scratch, scratch);
__ vpmax(NeonU16, scratch, scratch, scratch);
// Move lane 0 of the reduced value into the general-purpose output register.
__ ExtractLane(i.OutputRegister(), scratch, NeonS16, 0);
// Normalize to a boolean: if the extracted lane is non-zero, overwrite it
// with 1; otherwise the output already holds 0 and the mov is skipped.
__ cmp(i.OutputRegister(), Operand(0));
__ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
break;
}
case kArmS1x8AllTrue: { case kArmS1x8AllTrue: {
const QwNeonRegister& src = i.InputSimd128Register(0); const QwNeonRegister& src = i.InputSimd128Register(0);
UseScratchRegisterScope temps(tasm()); UseScratchRegisterScope temps(tasm());
...@@ -3074,22 +3064,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3074,22 +3064,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ mov(i.OutputRegister(), Operand(1), LeaveCC, ne); __ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
break; break;
} }
case kArmS1x16AnyTrue: {
// Any-true over the 128-bit input: leaves a non-zero input as 1 in the
// output register and an all-zero input as 0.
const QwNeonRegister& src = i.InputSimd128Register(0);
UseScratchRegisterScope temps(tasm());
// A Q scratch is acquired because vtst below operates on the full Q
// register; only its low D half is used to hold the reduced data.
QwNeonRegister q_scratch = temps.AcquireQ();
DwVfpRegister d_scratch = q_scratch.low();
// Fold the two 64-bit halves of src into d_scratch, then fold d_scratch
// onto itself once more (NEON pairwise max keeps set bits around).
__ vpmax(NeonU8, d_scratch, src.low(), src.high());
__ vpmax(NeonU8, d_scratch, d_scratch, d_scratch);
// vtst to detect any bits in the bottom 32 bits of d_scratch.
// This saves an instruction vs. the naive sequence of vpmax.
// kDoubleRegZero is not changed, since it is 0.
// NOTE(review): the line above implies q_scratch.high() may alias
// kDoubleRegZero; testing a zero register against itself yields zero, so
// it would stay intact -- confirm the aliasing against the scratch list.
__ vtst(Neon32, q_scratch, q_scratch, q_scratch);
// Move the 32-bit lane 0 of the test result into the output register.
__ ExtractLane(i.OutputRegister(), d_scratch, NeonS32, 0);
// Normalize to a boolean: if the extracted lane is non-zero, overwrite it
// with 1; otherwise the output already holds 0 and the mov is skipped.
__ cmp(i.OutputRegister(), Operand(0));
__ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
break;
}
case kArmS1x16AllTrue: { case kArmS1x16AllTrue: {
const QwNeonRegister& src = i.InputSimd128Register(0); const QwNeonRegister& src = i.InputSimd128Register(0);
UseScratchRegisterScope temps(tasm()); UseScratchRegisterScope temps(tasm());
......
...@@ -3503,6 +3503,18 @@ WASM_SIMD_ANYTRUE_TEST(32x4, 4, 0xffffffff, int32_t) ...@@ -3503,6 +3503,18 @@ WASM_SIMD_ANYTRUE_TEST(32x4, 4, 0xffffffff, int32_t)
WASM_SIMD_ANYTRUE_TEST(16x8, 8, 0xffff, int32_t) WASM_SIMD_ANYTRUE_TEST(16x8, 8, 0xffff, int32_t)
WASM_SIMD_ANYTRUE_TEST(8x16, 16, 0xff, int32_t) WASM_SIMD_ANYTRUE_TEST(8x16, 16, 0xff, int32_t)
// Special any-true test case that splats the bit pattern of a -0.0 double into
// an i64x2. This ensures that the implementation correctly treats 0.0 and -0.0
// differently in an anytrue: IEEE 754 says they compare equal, but -0.0 has
// its sign bit set, so anytrue must return 1 for it.
WASM_SIMD_TEST_NO_LOWERING(S1x4AnytrueWithNegativeZero) {
WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
byte simd = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(simd, WASM_SIMD_I64x2_SPLAT(WASM_GET_LOCAL(0))),
WASM_SIMD_UNOP(kExprS1x4AnyTrue, WASM_GET_LOCAL(simd)));
// Use CHECK_EQ rather than DCHECK_EQ: DCHECKs are compiled out of non-debug
// builds, which would turn this test into a no-op there.
// 0x8000000000000000 is the bit pattern of -0.0 (only the sign bit set).
CHECK_EQ(1, r.Call(0x8000000000000000));
// All-zero bits (+0.0) must report no lane set.
CHECK_EQ(0, r.Call(0x0000000000000000));
}
#define WASM_SIMD_ALLTRUE_TEST(format, lanes, max, param_type) \ #define WASM_SIMD_ALLTRUE_TEST(format, lanes, max, param_type) \
WASM_SIMD_TEST(S##format##AllTrue) { \ WASM_SIMD_TEST(S##format##AllTrue) { \
FLAG_SCOPE(wasm_simd_post_mvp); \ FLAG_SCOPE(wasm_simd_post_mvp); \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment