Commit 5e80730f authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share i16x8.q15mulr_sat_s implementation

Bug: v8:11589
Change-Id: Ie51cfd6cd6315f7f14f0c584f190a478ed565b0e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3114603
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76475}
parent b415fa38
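
For context: i16x8.q15mulr_sat_s multiplies each pair of signed 16-bit lanes as Q15 fixed-point values, rounds, and saturates the result. x86's pmulhrsw performs the rounding multiply but wraps instead of saturating in the one overflowing case (0x8000 * 0x8000), which is why the shared helper below follows it with a Pcmpeqw/Pxor against a splat of 0x8000. A scalar sketch of the per-lane semantics, for illustration only (the helper name here is hypothetical, not part of the commit):

#include <cstdint>

// Scalar reference for one lane: (a * b + 0x4000) >> 15, saturated to int16.
int16_t Q15MulRSatS(int16_t a, int16_t b) {
  int32_t product = (static_cast<int32_t>(a) * b + 0x4000) >> 15;
  // Only -32768 * -32768 overflows the int16 range (it yields +32768).
  if (product > INT16_MAX) product = INT16_MAX;
  return static_cast<int16_t>(product);
}
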
@@ -631,32 +631,6 @@ void TurboAssembler::Cvttsd2ui(Register dst, Operand src, XMMRegister tmp) {
add(dst, Immediate(0x80000000));
}
void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmulhrsw(dst, src1, src2);
} else {
if (dst != src1) {
movaps(dst, src1);
}
CpuFeatureScope sse_scope(this, SSSE3);
pmulhrsw(dst, src2);
}
}
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
ASM_CODE_COMMENT(this);
// k = i16x8.splat(0x8000)
Pcmpeqd(scratch, scratch);
Psllw(scratch, scratch, byte{15});
Pmulhrsw(dst, src1, src2);
Pcmpeqw(scratch, dst);
Pxor(dst, scratch);
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp1, XMMRegister tmp2,
Register scratch) {
...
@@ -392,14 +392,9 @@ class V8_EXPORT_PRIVATE TurboAssembler
}
void Cvttsd2ui(Register dst, Operand src, XMMRegister tmp);
// Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// These Wasm SIMD ops do not have direct lowerings on IA32. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
XMMRegister tmp2, Register scratch);
void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src, Register tmp);
...
@@ -570,6 +570,24 @@ void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
}
}
void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
XMMRegister src2,
XMMRegister scratch) {
ASM_CODE_COMMENT(this);
// k = i16x8.splat(0x8000)
Pcmpeqd(scratch, scratch);
Psllw(scratch, scratch, byte{15});
if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
movaps(dst, src1);
src1 = dst;
}
Pmulhrsw(dst, src1, src2);
Pcmpeqw(scratch, dst);
Pxor(dst, scratch);
}
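
Design note: without AVX, the two-operand SSSE3 pmulhrsw overwrites its first operand, so the shared helper copies src1 into dst up front; with AVX, the three-operand vpmulhrsw needs no move. The scratch register is now an explicit parameter because shared ia32/x64 code cannot hard-code kScratchDoubleReg the way the old x64-only helper did, so the TurboFan and Liftoff call sites below pass it in. The closing Pcmpeqw/Pxor against the 0x8000 splat flips any 0x8000 result lane to 0x7FFF, saturating the lone overflow case -32768 * -32768.
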
// 1. Multiply low word into scratch.
// 2. Multiply high word (can be signed or unsigned) into dst.
// 3. Unpack and interleave scratch and dst into dst.
...
@@ -232,6 +232,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pcmpgtb, pcmpgtb)
AVX_OP(Pcmpgtd, pcmpgtd)
AVX_OP(Pcmpeqd, pcmpeqd)
AVX_OP(Pcmpeqw, pcmpeqw)
AVX_OP(Pinsrw, pinsrw)
AVX_OP(Pmaxub, pmaxub)
AVX_OP(Pminub, pminub)
...
@@ -288,6 +289,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSSE3(Pabsd, pabsd)
AVX_OP_SSSE3(Pabsw, pabsw)
AVX_OP_SSSE3(Palignr, palignr)
AVX_OP_SSSE3(Pmulhrsw, pmulhrsw)
AVX_OP_SSSE3(Psignb, psignb)
AVX_OP_SSSE3(Psignd, psignd)
AVX_OP_SSSE3(Psignw, psignw)
...
@@ -347,6 +349,9 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
// Will move src1 to dst if AVX is not supported.
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
...
@@ -2243,31 +2243,6 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmulhrsw(dst, src1, src2);
} else {
if (dst != src1) {
Movdqa(dst, src1);
}
CpuFeatureScope sse_scope(this, SSSE3);
pmulhrsw(dst, src2);
}
}
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
// k = i16x8.splat(0x8000)
Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
Psllw(kScratchDoubleReg, byte{15});
Pmulhrsw(dst, src1, src2);
Pcmpeqw(kScratchDoubleReg, dst);
Pxor(dst, kScratchDoubleReg);
}
void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
XMMRegister tmp) {
DCHECK_NE(dst, tmp);
...
@@ -475,13 +475,10 @@ class V8_EXPORT_PRIVATE TurboAssembler
// Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen.
// Defined here to allow usage on both TurboFan and Liftoff.
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src);
...
@@ -3405,7 +3405,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I16x8Q15MulRSatS: {
__ I16x8Q15MulRSatS(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
i.InputSimd128Register(1), kScratchDoubleReg);
break;
}
case kX64I8x16Splat: {
...
@@ -3170,7 +3170,7 @@ void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8Q15MulRSatS(dst.fp(), src1.fp(), src2.fp());
I16x8Q15MulRSatS(dst.fp(), src1.fp(), src2.fp(), kScratchDoubleReg);
}
void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst,
...