Commit a4ebf1f0 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][liftoff][arm][arm64] Implement mul

i64x2 mul is trickier; the algorithm is slightly modified (a scalar sketch of
the per-lane math follows this list):
- for arm64, we can only use 2 temporaries
- for arm, we only have 1 temporary, so we acquire another register manually
  if we need it (or modify lhs/rhs directly if they are not used)
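
For reference, a minimal scalar sketch (not part of this change; the helper
name I64x2MulLane is made up) of the per-lane decomposition that both NEON
sequences below implement:

```cpp
#include <cstdint>

// Low 64 bits of a 64x64-bit product, built from 32-bit halves:
//   a * b == ((a_lo*b_hi + a_hi*b_lo) << 32) + a_lo*b_lo   (mod 2^64)
uint64_t I64x2MulLane(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t cross = a_lo * b_hi + a_hi * b_lo;  // vmull+vmlal / Mul+Addp
  return (cross << 32) + a_lo * b_lo;          // vshl+vmlal / Shll+Add
}
```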

Bug: v8:9909
Change-Id: I0398e2c95348a8b49ca9773a78ccfb7af73e2eef
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2128606
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66971}
parent 4491da98
src/wasm/baseline/arm/liftoff-assembler-arm.h
@@ -6,6 +6,7 @@
#define V8_WASM_BASELINE_ARM_LIFTOFF_ASSEMBLER_ARM_H_
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/baseline/liftoff-register.h"
namespace v8 {
namespace internal {
@@ -1589,7 +1590,8 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2mul");
vmul(dst.low_fp(), lhs.low_fp(), rhs.low_fp());
vmul(dst.high_fp(), lhs.high_fp(), rhs.high_fp());
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
@@ -1629,7 +1631,9 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4mul");
vmul(liftoff::GetSimd128Register(dst.low_fp()),
liftoff::GetSimd128Register(lhs.low_fp()),
liftoff::GetSimd128Register(rhs.low_fp()));
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
@@ -1676,7 +1680,45 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i64x2mul");
UseScratchRegisterScope temps(this);
QwNeonRegister dst_neon = liftoff::GetSimd128Register(dst.low_fp());
QwNeonRegister left = liftoff::GetSimd128Register(lhs.low_fp());
QwNeonRegister right = liftoff::GetSimd128Register(rhs.low_fp());
// These temporary registers will be modified. We can directly modify lhs and
// rhs if they are not uesd, saving on temporaries.
QwNeonRegister tmp1 = left;
QwNeonRegister tmp2 = right;
if (cache_state()->is_used(lhs) && cache_state()->is_used(rhs)) {
tmp1 = temps.AcquireQ();
// We only have 1 scratch Q register, so acquire another ourselves.
LiftoffRegList pinned = LiftoffRegList::ForRegs(dst);
LiftoffRegister unused_pair = GetUnusedRegister(kFpRegPair, pinned);
tmp2 = liftoff::GetSimd128Register(unused_pair.low_fp());
} else if (cache_state()->is_used(lhs)) {
tmp1 = temps.AcquireQ();
} else if (cache_state()->is_used(rhs)) {
tmp2 = temps.AcquireQ();
}
// Algorithm from code-generator-arm.cc, refer to comments there for details.
if (tmp1 != left) {
vmov(tmp1, left);
}
if (tmp2 != right) {
vmov(tmp2, right);
}
vtrn(Neon32, tmp1.low(), tmp1.high());
vtrn(Neon32, tmp2.low(), tmp2.high());
vmull(NeonU32, dst_neon, tmp1.low(), tmp2.high());
vmlal(NeonU32, dst_neon, tmp1.high(), tmp2.low());
vshl(NeonU64, dst_neon, dst_neon, 32);
vmlal(NeonU32, dst_neon, tmp1.low(), tmp2.low());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
@@ -1716,7 +1758,9 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i32x4mul");
vmul(Neon32, liftoff::GetSimd128Register(dst.low_fp()),
liftoff::GetSimd128Register(lhs.low_fp()),
liftoff::GetSimd128Register(rhs.low_fp()));
}
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
@@ -1746,7 +1790,9 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i16x8mul");
vmul(Neon16, liftoff::GetSimd128Register(dst.low_fp()),
liftoff::GetSimd128Register(lhs.low_fp()),
liftoff::GetSimd128Register(rhs.low_fp()));
}
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
@@ -1828,7 +1874,9 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i8x16mul");
vmul(Neon8, liftoff::GetSimd128Register(dst.low_fp()),
liftoff::GetSimd128Register(lhs.low_fp()),
liftoff::GetSimd128Register(rhs.low_fp()));
}
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
......
src/wasm/baseline/arm64/liftoff-assembler-arm64.h
@@ -1117,7 +1117,7 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f64x2mul");
Fmul(dst.fp().V2D(), lhs.fp().V2D(), rhs.fp().V2D());
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
@@ -1153,7 +1153,7 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "f32x4mul");
Fmul(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
@@ -1189,7 +1189,23 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i64x2mul");
UseScratchRegisterScope temps(this);
VRegister tmp1 = temps.AcquireV(kFormat2D);
VRegister tmp2 = temps.AcquireV(kFormat2D);
// Algorithm copied from code-generator-arm64.cc with minor modifications:
// - 2 (max number of scratch registers in Liftoff) temporaries instead of 3
// - 1 more Umull instruction to calculate | cg | ae |,
// - so, we can no longer use Umlal in the last step, and use Add instead.
// Refer to comments there for details.
Xtn(tmp1.V2S(), lhs.fp().V2D());
Xtn(tmp2.V2S(), rhs.fp().V2D());
Umull(tmp1.V2D(), tmp1.V2S(), tmp2.V2S());
Rev64(tmp2.V4S(), rhs.fp().V4S());
Mul(tmp2.V4S(), tmp2.V4S(), lhs.fp().V4S());
Addp(tmp2.V4S(), tmp2.V4S(), tmp2.V4S());
Shll(dst.fp().V2D(), tmp2.V2S(), 32);
Add(dst.fp().V2D(), dst.fp().V2D(), tmp1.V2D());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
@@ -1225,7 +1241,7 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i32x4mul");
Mul(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
@@ -1273,7 +1289,7 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i16x8mul");
Mul(dst.fp().V8H(), lhs.fp().V8H(), rhs.fp().V8H());
}
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
@@ -1327,7 +1343,7 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "i8x16mul");
Mul(dst.fp().V16B(), lhs.fp().V16B(), rhs.fp().V16B());
}
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
......