Commit a3d2f7a4 authored by Zhou, Zhiguo, committed by Commit Bot

[wasm-simd][liftoff] Implement mul on x64 and ia32

Bug: v8:9909
Change-Id: Ie9a3098bcaa894266e850cc7094894d25709afd2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2113301
Commit-Queue: Zhiguo Zhou <zhiguo.zhou@intel.com>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66898}
parent 0c423a7a
@@ -1587,6 +1587,11 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
  vsub(dst.high_fp(), lhs.high_fp(), rhs.high_fp());
}

void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "f64x2mul");
}

void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon32, liftoff::GetSimd128Register(dst.low_fp()), src.fp(), 0);
@@ -1622,6 +1627,11 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
       liftoff::GetSimd128Register(rhs.low_fp()));
}

void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "f32x4mul");
}

void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Simd128Register dst_simd = liftoff::GetSimd128Register(dst.low_fp());
@@ -1664,6 +1674,11 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
       liftoff::GetSimd128Register(rhs.low_fp()));
}

void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i64x2mul");
}

void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon32, liftoff::GetSimd128Register(dst.low_fp()), src.gp());
@@ -1699,6 +1714,11 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
       liftoff::GetSimd128Register(rhs.low_fp()));
}

void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i32x4mul");
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  vdup(Neon16, liftoff::GetSimd128Register(dst.low_fp()), src.gp());
@@ -1718,6 +1738,11 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
       liftoff::GetSimd128Register(rhs.low_fp()));
}

void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i16x8mul");
}

void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst,
                                                 LiftoffRegister lhs,
                                                 uint8_t imm_lane_idx) {
@@ -1783,6 +1808,11 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
       liftoff::GetSimd128Register(rhs.low_fp()));
}

void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i8x16mul");
}

void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
  ldr(limit_address, MemOperand(limit_address));
  cmp(sp, limit_address);
......
@@ -1115,6 +1115,11 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Fsub(dst.fp().V2D(), lhs.fp().V2D(), rhs.fp().V2D());
}

void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "f64x2mul");
}

void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V4S(), src.fp().S(), 0);
@@ -1146,6 +1151,11 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Fsub(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}

void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "f32x4mul");
}

void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V2D(), src.gp().X());
@@ -1177,6 +1187,11 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Sub(dst.fp().V2D(), lhs.fp().V2D(), rhs.fp().V2D());
}

void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i64x2mul");
}

void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V4S(), src.gp().W());
@@ -1208,6 +1223,11 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Sub(dst.fp().V4S(), lhs.fp().V4S(), rhs.fp().V4S());
}

void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i32x4mul");
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V8H(), src.gp().W());
@@ -1245,6 +1265,11 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Sub(dst.fp().V8H(), lhs.fp().V8H(), rhs.fp().V8H());
}

void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i16x8mul");
}

void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Dup(dst.fp().V16B(), src.gp().W());
@@ -1282,6 +1307,11 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
  Sub(dst.fp().V16B(), lhs.fp().V16B(), rhs.fp().V16B());
}

void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  bailout(kSimd, "i8x16mul");
}

void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
  Ldr(limit_address, MemOperand(limit_address));
  Cmp(sp, limit_address);
......
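Note: on arm and arm64 the new `emit_*_mul` hooks above are stubs that call `bailout(kSimd, ...)`. A Liftoff bailout aborts baseline compilation of the function, which is then compiled by TurboFan instead, so the mul opcodes remain correct on those targets while only x64 and ia32 gain the fast paths below.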
@@ -1928,12 +1928,19 @@ void LiftoffAssembler::emit_f64_set_cond(Condition cond, Register dst,
 namespace liftoff {
 template <void (Assembler::*avx_op)(XMMRegister, XMMRegister, XMMRegister),
           void (Assembler::*sse_op)(XMMRegister, XMMRegister)>
-void EmitSimdCommutativeBinOp(LiftoffAssembler* assm, LiftoffRegister dst,
-                              LiftoffRegister lhs, LiftoffRegister rhs) {
+void EmitSimdCommutativeBinOp(
+    LiftoffAssembler* assm, LiftoffRegister dst, LiftoffRegister lhs,
+    LiftoffRegister rhs, base::Optional<CpuFeature> feature = base::nullopt) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(assm, AVX);
     (assm->*avx_op)(dst.fp(), lhs.fp(), rhs.fp());
-  } else if (dst.fp() == rhs.fp()) {
+    return;
+  }
+  base::Optional<CpuFeatureScope> sse_scope;
+  if (feature.has_value()) sse_scope.emplace(assm, *feature);
+  if (dst.fp() == rhs.fp()) {
     (assm->*sse_op)(dst.fp(), lhs.fp());
   } else {
     if (dst.fp() != lhs.fp()) assm->movaps(dst.fp(), lhs.fp());
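The refactored helper threads an optional `CpuFeature` through to the SSE path: `emit_i32x4_mul` passes `SSE4_1` because `pmulld` is an SSE4.1 instruction, while `pmullw`, `mulps`, and `mulpd` are plain SSE2 and pass nothing. A minimal standalone sketch of the same selection logic, with plain integers standing in for XMM registers (the names and the feature query here are illustrative, not V8 API):

```cpp
#include <optional>

enum CpuFeature { AVX, SSE4_1 };
// Stand-in for CpuFeatures::IsSupported; pretend only SSE4.1 is available.
static bool IsSupported(CpuFeature f) { return f == SSE4_1; }

// dst = lhs * rhs for a commutative op over an array of mock registers.
void EmitCommutativeMul(int* regs, int dst, int lhs, int rhs,
                        std::optional<CpuFeature> feature = std::nullopt) {
  if (IsSupported(AVX)) {
    regs[dst] = regs[lhs] * regs[rhs];  // AVX: non-destructive 3-operand form
    return;
  }
  // The real helper opens a CpuFeatureScope when `feature` is set; this
  // mock just checks availability.
  if (feature.has_value() && !IsSupported(*feature)) return;
  if (dst == rhs) {
    regs[dst] *= regs[lhs];  // commutativity makes "dst *= lhs" equivalent
  } else {
    if (dst != lhs) regs[dst] = regs[lhs];  // the movaps copy
    regs[dst] *= regs[rhs];
  }
}
```

The `dst == rhs` branch is why this helper is reserved for commutative ops: a destructive two-operand instruction overwrites `dst`, and folding `lhs` into it first is only safe when operand order does not matter.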
@@ -2017,6 +2024,12 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
@@ -2068,6 +2081,12 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Pinsrd(dst.fp(), src.low_gp(), 0);
@@ -2110,6 +2129,33 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
  LiftoffRegister tmp1 =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
  LiftoffRegister tmp2 =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
  Movaps(tmp1.fp(), lhs.fp());
  Movaps(tmp2.fp(), rhs.fp());
  // Multiply high dword of each qword of left with right.
  Psrlq(tmp1.fp(), 32);
  Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
  // Multiply high dword of each qword of right with left.
  Psrlq(tmp2.fp(), 32);
  Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
  Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
  Psllq(tmp2.fp(), tmp2.fp(), 32);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
  } else {
    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
    pmuludq(dst.fp(), rhs.fp());
  }
  Paddq(dst.fp(), dst.fp(), tmp2.fp());
}
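Neither SSE2 nor AVX has a packed 64×64→64-bit multiply, so `emit_i64x2_mul` builds each lane out of three `pmuludq` (32×32→64) products. A scalar model of the per-lane algebra, as a sketch to check the identity (plain C++, not V8 code):

```cpp
#include <cassert>
#include <cstdint>

// One lane of i64x2.mul, decomposed the way the emitted code does it:
//   tmp1 = hi(a) * lo(b)            (Psrlq + Pmuludq on the first temp)
//   tmp2 = hi(b) * lo(a)            (Psrlq + Pmuludq on the second temp)
//   cross = (tmp1 + tmp2) << 32     (Paddq + Psllq)
//   result = lo(a) * lo(b) + cross  (final pmuludq + Paddq)
// The hi(a)*hi(b) term shifts out past bit 64 and vanishes mod 2^64.
uint64_t I64x2MulLane(uint64_t a, uint64_t b) {
  uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
  uint64_t cross = (a_hi * b_lo + a_lo * b_hi) << 32;
  return a_lo * b_lo + cross;
}

int main() {
  uint64_t a = 0x0123456789ABCDEF, b = 0xFEDCBA9876543210;
  assert(I64x2MulLane(a, b) == a * b);  // unsigned wraparound matches
  return 0;
}
```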
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2148,6 +2194,12 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmulld, &Assembler::pmulld>(
      this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2193,6 +2245,12 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2239,6 +2297,68 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
  LiftoffRegister tmp =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // I16x8 view of I8x16
    // left = AAaa AAaa ... AAaa AAaa
    // right= BBbb BBbb ... BBbb BBbb
    // t = 00AA 00AA ... 00AA 00AA
    // s = 00BB 00BB ... 00BB 00BB
    vpsrlw(tmp.fp(), lhs.fp(), 8);
    vpsrlw(liftoff::kScratchDoubleReg, rhs.fp(), 8);
    // t = I16x8Mul(t, s)
    //    => __PP __PP ...  __PP  __PP
    vpmullw(tmp.fp(), tmp.fp(), liftoff::kScratchDoubleReg);
    // s = left * 256
    vpsllw(liftoff::kScratchDoubleReg, lhs.fp(), 8);
    // dst = I16x8Mul(left * 256, right)
    //    => pp__ pp__ ...  pp__  pp__
    vpmullw(dst.fp(), liftoff::kScratchDoubleReg, rhs.fp());
    // dst = I16x8Shr(dst, 8)
    //    => 00pp 00pp ...  00pp  00pp
    vpsrlw(dst.fp(), dst.fp(), 8);
    // t = I16x8Shl(t, 8)
    //    => PP00 PP00 ...  PP00  PP00
    vpsllw(tmp.fp(), tmp.fp(), 8);
    // dst = I16x8Or(dst, t)
    //    => PPpp PPpp ...  PPpp  PPpp
    vpor(dst.fp(), dst.fp(), tmp.fp());
  } else {
    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
    // I16x8 view of I8x16
    // left = AAaa AAaa ... AAaa AAaa
    // right= BBbb BBbb ... BBbb BBbb
    // t = 00AA 00AA ... 00AA 00AA
    // s = 00BB 00BB ... 00BB 00BB
    movaps(tmp.fp(), dst.fp());
    movaps(liftoff::kScratchDoubleReg, rhs.fp());
    psrlw(tmp.fp(), 8);
    psrlw(liftoff::kScratchDoubleReg, 8);
    // dst = left * 256
    psllw(dst.fp(), 8);
    // t = I16x8Mul(t, s)
    //    => __PP __PP ...  __PP  __PP
    pmullw(tmp.fp(), liftoff::kScratchDoubleReg);
    // dst = I16x8Mul(left * 256, right)
    //    => pp__ pp__ ...  pp__  pp__
    pmullw(dst.fp(), rhs.fp());
    // t = I16x8Shl(t, 8)
    //    => PP00 PP00 ...  PP00  PP00
    psllw(tmp.fp(), 8);
    // dst = I16x8Shr(dst, 8)
    //    => 00pp 00pp ...  00pp  00pp
    psrlw(dst.fp(), 8);
    // dst = I16x8Or(dst, t)
    //    => PPpp PPpp ...  PPpp  PPpp
    por(dst.fp(), tmp.fp());
  }
}
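Similarly, there is no packed 8-bit multiply in SSE or AVX, so `emit_i8x16_mul` treats the vector as eight 16-bit lanes and computes the high-byte and low-byte products separately with `pmullw` before recombining. A scalar model of one 16-bit lane ("AAaa" times "BBbb" in the comments above; a sketch, not V8 code):

```cpp
#include <cassert>
#include <cstdint>

// One 16-bit lane of i8x16.mul: PP = truncated product of the high bytes,
// pp = truncated product of the low bytes, result lane = PPpp.
uint16_t I8x16MulLane(uint16_t left, uint16_t right) {
  uint16_t t = left >> 8;   // 00AA
  uint16_t s = right >> 8;  // 00BB
  // High half: (t * s) << 8, truncated to 16 bits           => PP00
  uint16_t hi = uint16_t(t * s) << 8;
  // Low half: ((left << 8) * right) >> 8 keeps a*b mod 256  => 00pp
  uint16_t lo = uint16_t(uint16_t(left << 8) * right) >> 8;
  return hi | lo;  // PPpp
}

int main() {
  // A=3, a=4, B=5, b=6: the bytes multiply independently, each truncated
  // to 8 bits, exactly as wasm i8x16.mul requires.
  assert(I8x16MulLane(0x0304, 0x0506) == ((3 * 5) << 8 | (4 * 6)));
  return 0;
}
```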
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
  cmp(esp, Operand(limit_address, 0));
  j(below_equal, ool_code);
......
@@ -721,6 +721,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_f32x4_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_f32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
                                      uint8_t imm_lane_idx);
@@ -731,6 +733,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i64x2_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i64x2_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
                                      uint8_t imm_lane_idx);
@@ -741,6 +745,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
                                      uint8_t imm_lane_idx);
@@ -751,6 +757,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i16x8_extract_lane_u(LiftoffRegister dst,
                                        LiftoffRegister lhs,
@@ -765,6 +773,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
  inline void emit_i8x16_extract_lane_u(LiftoffRegister dst,
                                        LiftoffRegister lhs,
@@ -779,6 +789,8 @@ class LiftoffAssembler : public TurboAssembler {
                             LiftoffRegister rhs);
  inline void emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
                             LiftoffRegister rhs);
  inline void StackCheck(Label* ool_code, Register limit_address);
......
@@ -2312,6 +2312,12 @@ class LiftoffCompiler {
              __ emit_f64x2_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprF64x2Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_f64x2_mul(dst, lhs, rhs);
            });
        break;
      case wasm::kExprF32x4Splat:
        EmitUnOp<ValueType::kF32, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister src) {
@@ -2330,6 +2336,12 @@ class LiftoffCompiler {
              __ emit_f32x4_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprF32x4Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_f32x4_mul(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI64x2Splat:
        EmitUnOp<ValueType::kI64, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister src) {
@@ -2348,6 +2360,12 @@ class LiftoffCompiler {
              __ emit_i64x2_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI64x2Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_i64x2_mul(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI32x4Splat:
        EmitUnOp<ValueType::kI32, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister src) {
@@ -2366,6 +2384,12 @@ class LiftoffCompiler {
              __ emit_i32x4_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI32x4Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_i32x4_mul(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI16x8Splat:
        EmitUnOp<ValueType::kI32, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister src) {
@@ -2384,6 +2408,12 @@ class LiftoffCompiler {
              __ emit_i16x8_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI16x8Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_i16x8_mul(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI8x16Splat:
        EmitUnOp<ValueType::kI32, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister src) {
@@ -2402,6 +2432,12 @@ class LiftoffCompiler {
              __ emit_i8x16_sub(dst, lhs, rhs);
            });
        break;
      case wasm::kExprI8x16Mul:
        EmitBinOp<ValueType::kS128, ValueType::kS128>(
            [=](LiftoffRegister dst, LiftoffRegister lhs, LiftoffRegister rhs) {
              __ emit_i8x16_mul(dst, lhs, rhs);
            });
        break;
      default:
        unsupported(decoder, kSimd, "simd");
    }
......
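Every new opcode case is wired the same way: `EmitBinOp<kS128, kS128>` handles the Liftoff value-stack bookkeeping and the lambda emits the machine code. A toy model of that dispatch pattern (illustrative only; the mock `Reg`, stack, and allocator are stand-ins, not V8's actual `EmitBinOp`):

```cpp
#include <cassert>
#include <vector>

// Toy model: pop both operands off the value stack, allocate a
// destination register, run the emitter lambda, push the result.
struct Reg { int id; };
static std::vector<Reg> value_stack;
static int next_reg = 16;  // pretend registers 16+ are free

template <typename EmitFn>
void EmitBinOp(EmitFn fn) {
  Reg rhs = value_stack.back(); value_stack.pop_back();
  Reg lhs = value_stack.back(); value_stack.pop_back();
  Reg dst{next_reg++};  // the real code picks an unused register of the class
  fn(dst, lhs, rhs);    // e.g. emit i32x4.mul using dst/lhs/rhs
  value_stack.push_back(dst);
}

int main() {
  value_stack.push_back(Reg{0});  // lhs
  value_stack.push_back(Reg{1});  // rhs
  EmitBinOp([](Reg, Reg, Reg) { /* machine code would be emitted here */ });
  assert(value_stack.size() == 1 && value_stack.back().id == 16);
  return 0;
}
```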
@@ -1878,12 +1878,19 @@ void LiftoffAssembler::emit_f64_set_cond(Condition cond, Register dst,
 namespace liftoff {
 template <void (Assembler::*avx_op)(XMMRegister, XMMRegister, XMMRegister),
           void (Assembler::*sse_op)(XMMRegister, XMMRegister)>
-void EmitSimdCommutativeBinOp(LiftoffAssembler* assm, LiftoffRegister dst,
-                              LiftoffRegister lhs, LiftoffRegister rhs) {
+void EmitSimdCommutativeBinOp(
+    LiftoffAssembler* assm, LiftoffRegister dst, LiftoffRegister lhs,
+    LiftoffRegister rhs, base::Optional<CpuFeature> feature = base::nullopt) {
   if (CpuFeatures::IsSupported(AVX)) {
     CpuFeatureScope scope(assm, AVX);
     (assm->*avx_op)(dst.fp(), lhs.fp(), rhs.fp());
-  } else if (dst.fp() == rhs.fp()) {
+    return;
+  }
+  base::Optional<CpuFeatureScope> sse_scope;
+  if (feature.has_value()) sse_scope.emplace(assm, *feature);
+  if (dst.fp() == rhs.fp()) {
     (assm->*sse_op)(dst.fp(), lhs.fp());
   } else {
     if (dst.fp() != lhs.fp()) assm->movaps(dst.fp(), lhs.fp());
@@ -1957,6 +1964,12 @@ void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  if (dst.fp() != src.fp()) {
@@ -2003,6 +2016,12 @@ void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movq(dst.fp(), src.gp());
@@ -2041,6 +2060,33 @@ void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
  LiftoffRegister tmp1 =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
  LiftoffRegister tmp2 =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
  Movaps(tmp1.fp(), lhs.fp());
  Movaps(tmp2.fp(), rhs.fp());
  // Multiply high dword of each qword of left with right.
  Psrlq(tmp1.fp(), 32);
  Pmuludq(tmp1.fp(), rhs.fp());
  // Multiply high dword of each qword of right with left.
  Psrlq(tmp2.fp(), 32);
  Pmuludq(tmp2.fp(), lhs.fp());
  Paddq(tmp2.fp(), tmp1.fp());
  Psllq(tmp2.fp(), 32);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
  } else {
    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
    pmuludq(dst.fp(), rhs.fp());
  }
  Paddq(dst.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2079,6 +2125,12 @@ void LiftoffAssembler::emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmulld, &Assembler::pmulld>(
      this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}

void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2124,6 +2176,12 @@ void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
      this, dst, lhs, rhs);
}

void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
                                        LiftoffRegister src) {
  Movd(dst.fp(), src.gp());
@@ -2170,6 +2228,68 @@ void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
      rhs);
}

void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
                                      LiftoffRegister rhs) {
  static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
  LiftoffRegister tmp =
      GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    // I16x8 view of I8x16
    // left = AAaa AAaa ... AAaa AAaa
    // right= BBbb BBbb ... BBbb BBbb
    // t = 00AA 00AA ... 00AA 00AA
    // s = 00BB 00BB ... 00BB 00BB
    vpsrlw(tmp.fp(), lhs.fp(), 8);
    vpsrlw(kScratchDoubleReg, rhs.fp(), 8);
    // t = I16x8Mul(t, s)
    //    => __PP __PP ...  __PP  __PP
    vpmullw(tmp.fp(), tmp.fp(), kScratchDoubleReg);
    // s = left * 256
    vpsllw(kScratchDoubleReg, lhs.fp(), 8);
    // dst = I16x8Mul(left * 256, right)
    //    => pp__ pp__ ...  pp__  pp__
    vpmullw(dst.fp(), kScratchDoubleReg, rhs.fp());
    // dst = I16x8Shr(dst, 8)
    //    => 00pp 00pp ...  00pp  00pp
    vpsrlw(dst.fp(), dst.fp(), 8);
    // t = I16x8Shl(t, 8)
    //    => PP00 PP00 ...  PP00  PP00
    vpsllw(tmp.fp(), tmp.fp(), 8);
    // dst = I16x8Or(dst, t)
    //    => PPpp PPpp ...  PPpp  PPpp
    vpor(dst.fp(), dst.fp(), tmp.fp());
  } else {
    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
    // I16x8 view of I8x16
    // left = AAaa AAaa ... AAaa AAaa
    // right= BBbb BBbb ... BBbb BBbb
    // t = 00AA 00AA ... 00AA 00AA
    // s = 00BB 00BB ... 00BB 00BB
    movaps(tmp.fp(), dst.fp());
    movaps(kScratchDoubleReg, rhs.fp());
    psrlw(tmp.fp(), 8);
    psrlw(kScratchDoubleReg, 8);
    // dst = left * 256
    psllw(dst.fp(), 8);
    // t = I16x8Mul(t, s)
    //    => __PP __PP ...  __PP  __PP
    pmullw(tmp.fp(), kScratchDoubleReg);
    // dst = I16x8Mul(left * 256, right)
    //    => pp__ pp__ ...  pp__  pp__
    pmullw(dst.fp(), rhs.fp());
    // t = I16x8Shl(t, 8)
    //    => PP00 PP00 ...  PP00  PP00
    psllw(tmp.fp(), 8);
    // dst = I16x8Shr(dst, 8)
    //    => 00pp 00pp ...  00pp  00pp
    psrlw(dst.fp(), 8);
    // dst = I16x8Or(dst, t)
    //    => PPpp PPpp ...  PPpp  PPpp
    por(dst.fp(), tmp.fp());
  }
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
  cmpq(rsp, Operand(limit_address, 0));
  j(below_equal, ool_code);
......
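The x64 hunks mirror the ia32 ones nearly line for line. The visible differences: x64 reaches the scratch register as `kScratchDoubleReg` where ia32 spells it `liftoff::kScratchDoubleReg`, the x64 macro assembler uses two-operand forms (`Pmuludq(tmp1.fp(), rhs.fp())`) where ia32 passes the destination explicitly, and the stack check compares `rsp` with `cmpq` rather than `esp` with `cmp`.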