Commit 04b6e0de authored by Zhou, Zhiguo's avatar Zhou, Zhiguo Committed by Commit Bot

[wasm-simd][liftoff] Reorder SIMD opcodes

This reorders the SIMD opcodes together with their implementations
on x64, ia32 according to src/wasm/wasm-opcodes.h.

Bug: v8:9909
Change-Id: Ib2e75927b3b44ebc951005222c8641c256d8872c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2135074Reviewed-by: 's avatarZhi An Ng <zhin@chromium.org>
Commit-Queue: Zhiguo Zhou <zhiguo.zhou@intel.com>
Cr-Commit-Position: refs/heads/master@{#67051}
parent 1277b5fa
......@@ -1968,218 +1968,225 @@ void EmitSimdSub(LiftoffAssembler* assm, LiftoffRegister dst,
}
} // namespace liftoff
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movddup(dst.fp(), src.fp());
Movd(dst.fp(), src.gp());
Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufpd(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufpd(dst.fp(), dst.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshuflw(dst.fp(), dst.fp(), 0);
Pshufd(dst.fp(), dst.fp(), 0);
}
void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
// TODO(fanchenk): Use movlhps and blendpd
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshufd(dst.fp(), dst.fp(), 0);
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Pinsrd(dst.fp(), src.low_gp(), 0);
Pinsrd(dst.fp(), src.high_gp(), 1);
Pshufd(dst.fp(), dst.fp(), 0x44);
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00000000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01010000);
} else {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00100000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01110000);
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
insertps(dst.fp(), src2.fp(), 0b00000000);
insertps(dst.fp(), src2.fp(), 0b01010000);
vshufps(dst.fp(), src.fp(), src.fp(), 0);
} else {
insertps(dst.fp(), src2.fp(), 0b00100000);
insertps(dst.fp(), src2.fp(), 0b01110000);
if (dst.fp() != src.fp()) {
movss(dst.fp(), src.fp());
}
shufps(dst.fp(), src.fp(), 0);
}
}
void LiftoffAssembler::emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movddup(dst.fp(), src.fp());
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddpd, &Assembler::addpd>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddb, &Assembler::paddb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubpd, &Assembler::subpd>(this, dst, lhs,
rhs);
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsb, &Assembler::paddsb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusb, &Assembler::paddusb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufps(dst.fp(), src.fp(), src.fp(), 0);
} else {
if (dst.fp() != src.fp()) {
movss(dst.fp(), src.fp());
}
shufps(dst.fp(), src.fp(), 0);
}
void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubb, &Assembler::psubb>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
vpsrlw(tmp.fp(), lhs.fp(), 8);
vpsrlw(liftoff::kScratchDoubleReg, rhs.fp(), 8);
// t = I16x8Mul(t0, t1)
//    => __PP __PP ...  __PP  __PP
vpmullw(tmp.fp(), tmp.fp(), liftoff::kScratchDoubleReg);
// s = left * 256
vpsllw(liftoff::kScratchDoubleReg, lhs.fp(), 8);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
vpmullw(dst.fp(), liftoff::kScratchDoubleReg, rhs.fp());
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
vpsrlw(dst.fp(), dst.fp(), 8);
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
vpsllw(tmp.fp(), tmp.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
vpor(dst.fp(), dst.fp(), tmp.fp());
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
movaps(tmp.fp(), dst.fp());
movaps(liftoff::kScratchDoubleReg, rhs.fp());
psrlw(tmp.fp(), 8);
psrlw(liftoff::kScratchDoubleReg, 8);
// dst = left * 256
psllw(dst.fp(), 8);
// t = I16x8Mul(t, s)
//    => __PP __PP ...  __PP  __PP
pmullw(tmp.fp(), liftoff::kScratchDoubleReg);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
pmullw(dst.fp(), rhs.fp());
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
psllw(tmp.fp(), 8);
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
psrlw(dst.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
por(dst.fp(), tmp.fp());
}
}
void LiftoffAssembler::emit_f32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vinsertps(dst.fp(), src1.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
insertps(dst.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
}
void LiftoffAssembler::emit_i8x16_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsb, &Assembler::pminsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddps, &Assembler::addps>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminub, &Assembler::pminub>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubps, &Assembler::subps>(this, dst, lhs,
rhs);
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsb, &Assembler::pmaxsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxub, &Assembler::pmaxub>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Pinsrd(dst.fp(), src.low_gp(), 0);
Pinsrd(dst.fp(), src.high_gp(), 1);
Pshufd(dst.fp(), dst.fp(), 0x44);
void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddw, &Assembler::paddw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrd(dst.low_gp(), lhs.fp(), imm_lane_idx * 2);
Pextrd(dst.high_gp(), lhs.fp(), imm_lane_idx * 2 + 1);
}
void LiftoffAssembler::emit_i64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.low_gp(), imm_lane_idx * 2);
vpinsrd(dst.fp(), dst.fp(), src2.high_gp(), imm_lane_idx * 2 + 1);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.low_gp(), imm_lane_idx * 2);
pinsrd(dst.fp(), src2.high_gp(), imm_lane_idx * 2 + 1);
}
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsw, &Assembler::paddsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddq, &Assembler::paddq>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusw, &Assembler::paddusw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubq, &Assembler::psubq>(this, dst, lhs,
liftoff::EmitSimdSub<&Assembler::vpsubw, &Assembler::psubw>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp1 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), 32);
Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), 32);
Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), tmp2.fp(), 32);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pmuludq(dst.fp(), rhs.fp());
}
Paddq(dst.fp(), dst.fp(), tmp2.fp());
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshufd(dst.fp(), dst.fp(), 0);
void LiftoffAssembler::emit_i16x8_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsw, &Assembler::pminsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i32x4_extract_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrd(dst.gp(), lhs.fp(), imm_lane_idx);
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminuw, &Assembler::pminuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.gp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsw, &Assembler::pmaxsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxuw, &Assembler::pmaxuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
......@@ -2228,117 +2235,142 @@ void LiftoffAssembler::emit_i32x4_max_u(LiftoffRegister dst,
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshuflw(dst.fp(), dst.fp(), 0);
Pshufd(dst.fp(), dst.fp(), 0);
}
void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddq, &Assembler::paddq>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
movsx_w(dst.gp(), dst.gp());
void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubq, &Assembler::psubq>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp1 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), 32);
Pmuludq(tmp1.fp(), tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), 32);
Pmuludq(tmp2.fp(), tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), tmp2.fp(), 32);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrw(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
} else {
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrw(dst.fp(), src2.gp(), imm_lane_idx);
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pmuludq(dst.fp(), rhs.fp());
}
Paddq(dst.fp(), dst.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddps, &Assembler::addps>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubps, &Assembler::subps>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddw, &Assembler::paddw>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsw, &Assembler::paddsw>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddpd, &Assembler::addpd>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubw, &Assembler::psubw>(this, dst, lhs,
liftoff::EmitSimdSub<&Assembler::vsubpd, &Assembler::subpd>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusw, &Assembler::paddusw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
movsx_b(dst.gp(), dst.gp());
}
void LiftoffAssembler::emit_i16x8_min_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsw, &Assembler::pminsw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_min_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminuw, &Assembler::pminuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
movsx_w(dst.gp(), dst.gp());
}
void LiftoffAssembler::emit_i16x8_max_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsw, &Assembler::pmaxsw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxuw, &Assembler::pmaxuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
uint8_t imm_lane_idx) {
Pextrd(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pxor(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
Pshufb(dst.fp(), liftoff::kScratchDoubleReg);
void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrd(dst.low_gp(), lhs.fp(), imm_lane_idx * 2);
Pextrd(dst.high_gp(), lhs.fp(), imm_lane_idx * 2 + 1);
}
void LiftoffAssembler::emit_i8x16_extract_lane_u(LiftoffRegister dst,
void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
movsx_b(dst.gp(), dst.gp());
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufpd(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufpd(dst.fp(), dst.fp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_replace_lane(LiftoffRegister dst,
......@@ -2355,120 +2387,88 @@ void LiftoffAssembler::emit_i8x16_replace_lane(LiftoffRegister dst,
}
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddb, &Assembler::paddb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsb, &Assembler::paddsb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubb, &Assembler::psubb>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
vpsrlw(tmp.fp(), lhs.fp(), 8);
vpsrlw(liftoff::kScratchDoubleReg, rhs.fp(), 8);
// t = I16x8Mul(t0, t1)
//    => __PP __PP ...  __PP  __PP
vpmullw(tmp.fp(), tmp.fp(), liftoff::kScratchDoubleReg);
// s = left * 256
vpsllw(liftoff::kScratchDoubleReg, lhs.fp(), 8);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
vpmullw(dst.fp(), liftoff::kScratchDoubleReg, rhs.fp());
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
vpsrlw(dst.fp(), dst.fp(), 8);
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
vpsllw(tmp.fp(), tmp.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
vpor(dst.fp(), dst.fp(), tmp.fp());
vpinsrw(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
movaps(tmp.fp(), dst.fp());
movaps(liftoff::kScratchDoubleReg, rhs.fp());
psrlw(tmp.fp(), 8);
psrlw(liftoff::kScratchDoubleReg, 8);
// dst = left * 256
psllw(dst.fp(), 8);
// t = I16x8Mul(t, s)
//    => __PP __PP ...  __PP  __PP
pmullw(tmp.fp(), liftoff::kScratchDoubleReg);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
pmullw(dst.fp(), rhs.fp());
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
psllw(tmp.fp(), 8);
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
psrlw(dst.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
por(dst.fp(), tmp.fp());
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrw(dst.fp(), src2.gp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusb, &Assembler::paddusb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsb, &Assembler::pminsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
void LiftoffAssembler::emit_i32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.gp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminub, &Assembler::pminub>(
this, dst, lhs, rhs);
void LiftoffAssembler::emit_i64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.low_gp(), imm_lane_idx * 2);
vpinsrd(dst.fp(), dst.fp(), src2.high_gp(), imm_lane_idx * 2 + 1);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.low_gp(), imm_lane_idx * 2);
pinsrd(dst.fp(), src2.high_gp(), imm_lane_idx * 2 + 1);
}
}
void LiftoffAssembler::emit_i8x16_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsb, &Assembler::pmaxsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
void LiftoffAssembler::emit_f32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vinsertps(dst.fp(), src1.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
insertps(dst.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
}
}
void LiftoffAssembler::emit_i8x16_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxub, &Assembler::pmaxub>(
this, dst, lhs, rhs);
void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
// TODO(fanchenk): Use movlhps and blendpd
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00000000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01010000);
} else {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00100000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01110000);
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
insertps(dst.fp(), src2.fp(), 0b00000000);
insertps(dst.fp(), src2.fp(), 0b01010000);
} else {
insertps(dst.fp(), src2.fp(), 0b00100000);
insertps(dst.fp(), src2.fp(), 0b01110000);
}
}
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
......
......@@ -711,48 +711,52 @@ class LiftoffAssembler : public TurboAssembler {
inline void emit_f64_set_cond(Condition condition, Register dst,
DoubleRegister lhs, DoubleRegister rhs);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i64x2_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f32x4_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f64x2_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f64x2_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_f64x2_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_f32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_f32x4_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_min_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i64x2_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i64x2_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i64x2_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_min_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_max_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i8x16_max_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_min_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_min_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_max_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_max_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i32x4_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i32x4_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
......@@ -767,66 +771,62 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister rhs);
inline void emit_i32x4_max_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i16x8_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
inline void emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
inline void emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_min_s(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_min_u(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_max_s(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_max_u(LiftoffRegister dst, LiftoffRegister lhs,
inline void emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i8x16_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i8x16_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i8x16_extract_lane_s(LiftoffRegister dst,
inline void emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i64x2_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_f32x4_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_f64x2_extract_lane(LiftoffRegister dst, LiftoffRegister lhs,
uint8_t imm_lane_idx);
inline void emit_i8x16_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_min_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_min_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_max_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_max_u(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i16x8_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i32x4_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_i64x2_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_f32x4_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void emit_f64x2_replace_lane(LiftoffRegister dst, LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx);
inline void StackCheck(Label* ool_code, Register limit_address);
......
......@@ -2294,82 +2294,30 @@ class LiftoffCompiler {
return unsupported(decoder, kSimd, "simd");
}
switch (opcode) {
case wasm::kExprF64x2Splat:
return EmitUnOp<kF64, kS128>(&LiftoffAssembler::emit_f64x2_splat);
case wasm::kExprF64x2Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_add);
case wasm::kExprF64x2Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_sub);
case wasm::kExprF64x2Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_mul);
case wasm::kExprF32x4Splat:
return EmitUnOp<kF32, kS128>(&LiftoffAssembler::emit_f32x4_splat);
case wasm::kExprF32x4Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_add);
case wasm::kExprF32x4Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_sub);
case wasm::kExprF32x4Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_mul);
case wasm::kExprI64x2Splat:
return EmitUnOp<kI64, kS128>(&LiftoffAssembler::emit_i64x2_splat);
case wasm::kExprI64x2Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_add);
case wasm::kExprI64x2Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_sub);
case wasm::kExprI64x2Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_mul);
case wasm::kExprI32x4Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i32x4_splat);
case wasm::kExprI32x4Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_add);
case wasm::kExprI32x4Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_sub);
case wasm::kExprI32x4Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_mul);
case wasm::kExprI32x4MinS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_min_s);
case wasm::kExprI32x4MinU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_min_u);
case wasm::kExprI32x4MaxS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_max_s);
case wasm::kExprI32x4MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_max_u);
case wasm::kExprI16x8Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i16x8_splat);
case wasm::kExprI16x8Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_add);
case wasm::kExprI16x8AddSaturateS:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_add_saturate_s);
case wasm::kExprI16x8Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_sub);
case wasm::kExprI16x8Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_mul);
case wasm::kExprI16x8AddSaturateU:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_add_saturate_u);
case wasm::kExprI16x8MinS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_min_s);
case wasm::kExprI16x8MinU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_min_u);
case wasm::kExprI16x8MaxS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_s);
case wasm::kExprI16x8MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_u);
case wasm::kExprI8x16Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i8x16_splat);
case wasm::kExprI16x8Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i16x8_splat);
case wasm::kExprI32x4Splat:
return EmitUnOp<kI32, kS128>(&LiftoffAssembler::emit_i32x4_splat);
case wasm::kExprI64x2Splat:
return EmitUnOp<kI64, kS128>(&LiftoffAssembler::emit_i64x2_splat);
case wasm::kExprF32x4Splat:
return EmitUnOp<kF32, kS128>(&LiftoffAssembler::emit_f32x4_splat);
case wasm::kExprF64x2Splat:
return EmitUnOp<kF64, kS128>(&LiftoffAssembler::emit_f64x2_splat);
case wasm::kExprI8x16Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_add);
case wasm::kExprI8x16AddSaturateS:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i8x16_add_saturate_s);
case wasm::kExprI8x16AddSaturateU:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i8x16_add_saturate_u);
case wasm::kExprI8x16Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_sub);
case wasm::kExprI8x16Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_mul);
case wasm::kExprI8x16AddSaturateU:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i8x16_add_saturate_u);
case wasm::kExprI8x16MinS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_min_s);
case wasm::kExprI8x16MinU:
......@@ -2378,6 +2326,58 @@ class LiftoffCompiler {
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_max_s);
case wasm::kExprI8x16MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i8x16_max_u);
case wasm::kExprI16x8Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_add);
case wasm::kExprI16x8AddSaturateS:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_add_saturate_s);
case wasm::kExprI16x8AddSaturateU:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_add_saturate_u);
case wasm::kExprI16x8Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_sub);
case wasm::kExprI16x8Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_mul);
case wasm::kExprI16x8MinS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_min_s);
case wasm::kExprI16x8MinU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_min_u);
case wasm::kExprI16x8MaxS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_s);
case wasm::kExprI16x8MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i16x8_max_u);
case wasm::kExprI32x4Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_add);
case wasm::kExprI32x4Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_sub);
case wasm::kExprI32x4Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_mul);
case wasm::kExprI32x4MinS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_min_s);
case wasm::kExprI32x4MinU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_min_u);
case wasm::kExprI32x4MaxS:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_max_s);
case wasm::kExprI32x4MaxU:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i32x4_max_u);
case wasm::kExprI64x2Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_add);
case wasm::kExprI64x2Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_sub);
case wasm::kExprI64x2Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_i64x2_mul);
case wasm::kExprF32x4Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_add);
case wasm::kExprF32x4Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_sub);
case wasm::kExprF32x4Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f32x4_mul);
case wasm::kExprF64x2Add:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_add);
case wasm::kExprF64x2Sub:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_sub);
case wasm::kExprF64x2Mul:
return EmitBinOp<kS128, kS128>(&LiftoffAssembler::emit_f64x2_mul);
default:
unsupported(decoder, kSimd, "simd");
}
......@@ -2438,14 +2438,14 @@ class LiftoffCompiler {
}, \
imm); \
break;
CASE_SIMD_EXTRACT_LANE_OP(F64x2ExtractLane, F64, f64x2_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(F32x4ExtractLane, F32, f32x4_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(I64x2ExtractLane, I64, i64x2_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(I32x4ExtractLane, I32, i32x4_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(I16x8ExtractLaneU, I32, i16x8_extract_lane_u)
CASE_SIMD_EXTRACT_LANE_OP(I16x8ExtractLaneS, I32, i16x8_extract_lane_s)
CASE_SIMD_EXTRACT_LANE_OP(I8x16ExtractLaneU, I32, i8x16_extract_lane_u)
CASE_SIMD_EXTRACT_LANE_OP(I8x16ExtractLaneS, I32, i8x16_extract_lane_s)
CASE_SIMD_EXTRACT_LANE_OP(I8x16ExtractLaneU, I32, i8x16_extract_lane_u)
CASE_SIMD_EXTRACT_LANE_OP(I16x8ExtractLaneS, I32, i16x8_extract_lane_s)
CASE_SIMD_EXTRACT_LANE_OP(I16x8ExtractLaneU, I32, i16x8_extract_lane_u)
CASE_SIMD_EXTRACT_LANE_OP(I32x4ExtractLane, I32, i32x4_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(I64x2ExtractLane, I64, i64x2_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(F32x4ExtractLane, F32, f32x4_extract_lane)
CASE_SIMD_EXTRACT_LANE_OP(F64x2ExtractLane, F64, f64x2_extract_lane)
#undef CASE_SIMD_EXTRACT_LANE_OP
#define CASE_SIMD_REPLACE_LANE_OP(opcode, type, fn) \
case wasm::kExpr##opcode: \
......@@ -2456,12 +2456,12 @@ class LiftoffCompiler {
}, \
imm); \
break;
CASE_SIMD_REPLACE_LANE_OP(F64x2ReplaceLane, F64, f64x2_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(F32x4ReplaceLane, F32, f32x4_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I64x2ReplaceLane, I64, i64x2_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I32x4ReplaceLane, I32, i32x4_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I16x8ReplaceLane, I32, i16x8_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I8x16ReplaceLane, I32, i8x16_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I16x8ReplaceLane, I32, i16x8_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I32x4ReplaceLane, I32, i32x4_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(I64x2ReplaceLane, I64, i64x2_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(F32x4ReplaceLane, F32, f32x4_replace_lane)
CASE_SIMD_REPLACE_LANE_OP(F64x2ReplaceLane, F64, f64x2_replace_lane)
#undef CASE_SIMD_REPLACE_LANE_OP
default:
unsupported(decoder, kSimd, "simd");
......
......@@ -1918,199 +1918,219 @@ void EmitSimdSub(LiftoffAssembler* assm, LiftoffRegister dst,
}
} // namespace liftoff
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movddup(dst.fp(), src.fp());
Movd(dst.fp(), src.gp());
Pxor(kScratchDoubleReg, kScratchDoubleReg);
Pshufb(dst.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrq(kScratchRegister, lhs.fp(), static_cast<int8_t>(imm_lane_idx));
Movq(dst.fp(), kScratchRegister);
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshuflw(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
Pshufd(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
}
void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vpblendw(dst.fp(), src1.fp(), src2.fp(), 0b00001111);
} else {
vmovlhps(dst.fp(), src1.fp(), src2.fp());
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
pblendw(dst.fp(), src2.fp(), 0b00001111);
} else {
movlhps(dst.fp(), src2.fp());
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshufd(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movq(dst.fp(), src.gp());
Movddup(dst.fp(), dst.fp());
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() != src.fp()) {
Movss(dst.fp(), src.fp());
}
Shufps(dst.fp(), src.fp(), static_cast<byte>(0));
}
void LiftoffAssembler::emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movddup(dst.fp(), src.fp());
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddpd, &Assembler::addpd>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddb, &Assembler::paddb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubpd, &Assembler::subpd>(this, dst, lhs,
rhs);
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsb, &Assembler::paddsb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusb, &Assembler::paddusb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() != src.fp()) {
Movss(dst.fp(), src.fp());
}
Shufps(dst.fp(), src.fp(), static_cast<byte>(0));
void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubb, &Assembler::psubb>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
vpsrlw(tmp.fp(), lhs.fp(), 8);
vpsrlw(kScratchDoubleReg, rhs.fp(), 8);
// t = I16x8Mul(t0, t1)
//    => __PP __PP ...  __PP  __PP
vpmullw(tmp.fp(), tmp.fp(), kScratchDoubleReg);
// s = left * 256
vpsllw(kScratchDoubleReg, lhs.fp(), 8);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
vpmullw(dst.fp(), kScratchDoubleReg, rhs.fp());
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
vpsrlw(dst.fp(), dst.fp(), 8);
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
vpsllw(tmp.fp(), tmp.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
vpor(dst.fp(), dst.fp(), tmp.fp());
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
movaps(tmp.fp(), dst.fp());
movaps(kScratchDoubleReg, rhs.fp());
psrlw(tmp.fp(), 8);
psrlw(kScratchDoubleReg, 8);
// dst = left * 256
psllw(dst.fp(), 8);
// t = I16x8Mul(t, s)
//    => __PP __PP ...  __PP  __PP
pmullw(tmp.fp(), kScratchDoubleReg);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
pmullw(dst.fp(), rhs.fp());
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
psllw(tmp.fp(), 8);
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
psrlw(dst.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
por(dst.fp(), tmp.fp());
}
}
void LiftoffAssembler::emit_f32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vinsertps(dst.fp(), src1.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
insertps(dst.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
}
void LiftoffAssembler::emit_i8x16_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsb, &Assembler::pminsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddps, &Assembler::addps>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminub, &Assembler::pminub>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubps, &Assembler::subps>(this, dst, lhs,
rhs);
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsb, &Assembler::pmaxsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i8x16_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxub, &Assembler::pmaxub>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movq(dst.fp(), src.gp());
Movddup(dst.fp(), dst.fp());
void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddw, &Assembler::paddw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrq(dst.gp(), lhs.fp(), static_cast<int8_t>(imm_lane_idx));
}
void LiftoffAssembler::emit_i64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrq(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrq(dst.fp(), src2.gp(), imm_lane_idx);
}
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsw, &Assembler::paddsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddq, &Assembler::paddq>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusw, &Assembler::paddusw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubq, &Assembler::psubq>(this, dst, lhs,
liftoff::EmitSimdSub<&Assembler::vpsubw, &Assembler::psubw>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp1 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), 32);
Pmuludq(tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), 32);
Pmuludq(tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), 32);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pmuludq(dst.fp(), rhs.fp());
}
Paddq(dst.fp(), tmp2.fp());
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i32x4_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshufd(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
void LiftoffAssembler::emit_i16x8_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsw, &Assembler::pminsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i32x4_extract_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrd(dst.gp(), lhs.fp(), imm_lane_idx);
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminuw, &Assembler::pminuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.gp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsw, &Assembler::pmaxsw>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxuw, &Assembler::pmaxuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
......@@ -2159,117 +2179,136 @@ void LiftoffAssembler::emit_i32x4_max_u(LiftoffRegister dst,
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
}
void LiftoffAssembler::emit_i16x8_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pshuflw(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
Pshufd(dst.fp(), dst.fp(), static_cast<uint8_t>(0));
}
void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
void LiftoffAssembler::emit_i64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddq, &Assembler::paddq>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
movsxwl(dst.gp(), dst.gp());
void LiftoffAssembler::emit_i64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubq, &Assembler::psubq>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
void LiftoffAssembler::emit_i64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp1 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
LiftoffRegister tmp2 =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs, tmp1));
Movaps(tmp1.fp(), lhs.fp());
Movaps(tmp2.fp(), rhs.fp());
// Multiply high dword of each qword of left with right.
Psrlq(tmp1.fp(), 32);
Pmuludq(tmp1.fp(), rhs.fp());
// Multiply high dword of each qword of right with left.
Psrlq(tmp2.fp(), 32);
Pmuludq(tmp2.fp(), lhs.fp());
Paddq(tmp2.fp(), tmp1.fp());
Psllq(tmp2.fp(), 32);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrw(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
vpmuludq(dst.fp(), lhs.fp(), rhs.fp());
} else {
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrw(dst.fp(), src2.gp(), imm_lane_idx);
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pmuludq(dst.fp(), rhs.fp());
}
Paddq(dst.fp(), tmp2.fp());
}
void LiftoffAssembler::emit_i16x8_add(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddw, &Assembler::paddw>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddps, &Assembler::addps>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
void LiftoffAssembler::emit_f32x4_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsw, &Assembler::paddsw>(
liftoff::EmitSimdSub<&Assembler::vsubps, &Assembler::subps>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_f32x4_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulps, &Assembler::mulps>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_sub(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubw, &Assembler::psubw>(this, dst, lhs,
liftoff::EmitSimdCommutativeBinOp<&Assembler::vaddpd, &Assembler::addpd>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_f64x2_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vsubpd, &Assembler::subpd>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i16x8_mul(LiftoffRegister dst, LiftoffRegister lhs,
void LiftoffAssembler::emit_f64x2_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmullw, &Assembler::pmullw>(
liftoff::EmitSimdCommutativeBinOp<&Assembler::vmulpd, &Assembler::mulpd>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i16x8_add_saturate_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusw, &Assembler::paddusw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
movsxbl(dst.gp(), dst.gp());
}
void LiftoffAssembler::emit_i16x8_min_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsw, &Assembler::pminsw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_min_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminuw, &Assembler::pminuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
movsxwl(dst.gp(), dst.gp());
}
void LiftoffAssembler::emit_i16x8_max_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extract_lane_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsw, &Assembler::pmaxsw>(
this, dst, lhs, rhs);
uint8_t imm_lane_idx) {
Pextrw(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxuw, &Assembler::pmaxuw>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
uint8_t imm_lane_idx) {
Pextrd(dst.gp(), lhs.fp(), imm_lane_idx);
}
void LiftoffAssembler::emit_i8x16_splat(LiftoffRegister dst,
LiftoffRegister src) {
Movd(dst.fp(), src.gp());
Pxor(kScratchDoubleReg, kScratchDoubleReg);
Pshufb(dst.fp(), kScratchDoubleReg);
void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrq(dst.gp(), lhs.fp(), static_cast<int8_t>(imm_lane_idx));
}
void LiftoffAssembler::emit_i8x16_extract_lane_u(LiftoffRegister dst,
void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,
LiftoffRegister lhs,
uint8_t imm_lane_idx) {
Pextrb(dst.gp(), lhs.fp(), imm_lane_idx);
movsxbl(dst.gp(), dst.gp());
Pextrq(kScratchRegister, lhs.fp(), static_cast<int8_t>(imm_lane_idx));
Movq(dst.fp(), kScratchRegister);
}
void LiftoffAssembler::emit_i8x16_replace_lane(LiftoffRegister dst,
......@@ -2286,120 +2325,81 @@ void LiftoffAssembler::emit_i8x16_replace_lane(LiftoffRegister dst,
}
}
void LiftoffAssembler::emit_i8x16_add(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddb, &Assembler::paddb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_add_saturate_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddsb, &Assembler::paddsb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_sub(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdSub<&Assembler::vpsubb, &Assembler::psubb>(this, dst, lhs,
rhs);
}
void LiftoffAssembler::emit_i8x16_mul(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(ValueType::kS128);
LiftoffRegister tmp =
GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(dst, lhs, rhs));
void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
vpsrlw(tmp.fp(), lhs.fp(), 8);
vpsrlw(kScratchDoubleReg, rhs.fp(), 8);
// t = I16x8Mul(t0, t1)
//    => __PP __PP ...  __PP  __PP
vpmullw(tmp.fp(), tmp.fp(), kScratchDoubleReg);
// s = left * 256
vpsllw(kScratchDoubleReg, lhs.fp(), 8);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
vpmullw(dst.fp(), kScratchDoubleReg, rhs.fp());
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
vpsrlw(dst.fp(), dst.fp(), 8);
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
vpsllw(tmp.fp(), tmp.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
vpor(dst.fp(), dst.fp(), tmp.fp());
vpinsrw(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
movaps(tmp.fp(), dst.fp());
movaps(kScratchDoubleReg, rhs.fp());
psrlw(tmp.fp(), 8);
psrlw(kScratchDoubleReg, 8);
// dst = left * 256
psllw(dst.fp(), 8);
// t = I16x8Mul(t, s)
//    => __PP __PP ...  __PP  __PP
pmullw(tmp.fp(), kScratchDoubleReg);
// dst = I16x8Mul(left * 256, right)
//    => pp__ pp__ ...  pp__  pp__
pmullw(dst.fp(), rhs.fp());
// t = I16x8Shl(t, 8)
//    => PP00 PP00 ...  PP00  PP00
psllw(tmp.fp(), 8);
// dst = I16x8Shr(dst, 8)
//    => 00pp 00pp ...  00pp  00pp
psrlw(dst.fp(), 8);
// dst = I16x8Or(dst, t)
//    => PPpp PPpp ...  PPpp  PPpp
por(dst.fp(), tmp.fp());
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrw(dst.fp(), src2.gp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_add_saturate_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpaddusb, &Assembler::paddusb>(
this, dst, lhs, rhs);
}
void LiftoffAssembler::emit_i8x16_min_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminsb, &Assembler::pminsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
void LiftoffAssembler::emit_i32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrd(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrd(dst.fp(), src2.gp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_min_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpminub, &Assembler::pminub>(
this, dst, lhs, rhs);
void LiftoffAssembler::emit_i64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrq(dst.fp(), src1.fp(), src2.gp(), imm_lane_idx);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
pinsrq(dst.fp(), src2.gp(), imm_lane_idx);
}
}
void LiftoffAssembler::emit_i8x16_max_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxsb, &Assembler::pmaxsb>(
this, dst, lhs, rhs, base::Optional<CpuFeature>(SSE4_1));
void LiftoffAssembler::emit_f32x4_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vinsertps(dst.fp(), src1.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
insertps(dst.fp(), src2.fp(), (imm_lane_idx << 4) & 0x30);
}
}
void LiftoffAssembler::emit_i8x16_max_u(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
liftoff::EmitSimdCommutativeBinOp<&Assembler::vpmaxub, &Assembler::pmaxub>(
this, dst, lhs, rhs);
void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vpblendw(dst.fp(), src1.fp(), src2.fp(), 0b00001111);
} else {
vmovlhps(dst.fp(), src1.fp(), src2.fp());
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
pblendw(dst.fp(), src2.fp(), 0b00001111);
} else {
movlhps(dst.fp(), src2.fp());
}
}
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment