Commit 71db74d8 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64][ia32] Optimize i16x8.extmul_high_i8x16_{s,u}

palignr has a false dependency on its destination register (the old
value is read), so use punpckhbw instead. Also optimize for AVX, which
needs one instruction fewer.

The logic for i16x8.extmul_high_u (in the SSE case) is slightly
complicated in order to handle register aliasing. This gives the
register allocator the most flexibility and lets us optimize as much
as possible.

Bug: v8:11468
Change-Id: I221b2d7a79009edb6c4060c136fc35ee7aff08fa
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2757224
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73415}
parent 5add956c
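To make the change concrete, here is a rough standalone sketch (SSE intrinsics rather than the assembler calls in the diff; the helper names are made up) of the old and new non-AVX lowerings of i16x8.extmul_high_i8x16_s:

#include <immintrin.h>

// Old lowering: palignr moves the high 8 bytes into the low half, then
// pmovsxbw sign-extends them to 16 bits. In the generated assembly, palignr
// also reads the old contents of its destination register, which is the
// false dependency.
__m128i extmul_high_s_old(__m128i a, __m128i b) {
  __m128i ah = _mm_cvtepi8_epi16(_mm_alignr_epi8(a, a, 8));
  __m128i bh = _mm_cvtepi8_epi16(_mm_alignr_epi8(b, b, 8));
  return _mm_mullo_epi16(ah, bh);
}

// New lowering: punpckhbw duplicates each high byte into both bytes of a
// 16-bit lane, and psraw by 8 sign-extends it; no stale destination value
// is read.
__m128i extmul_high_s_new(__m128i a, __m128i b) {
  __m128i ah = _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
  __m128i bh = _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8);
  return _mm_mullo_epi16(ah, bh);
}

Since the inputs are only 8 bits wide, the full product fits in 16 bits, so a plain pmullw already yields the exact extended multiply in both versions.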
@@ -696,19 +696,81 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
}
}
void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool low, bool is_signed) {
if (low) {
is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
Pmullw(dst, scratch);
void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch,
bool is_signed) {
is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
Pmullw(dst, scratch);
}
void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpunpckhbw(scratch, src1, src1);
vpsraw(scratch, scratch, 8);
vpunpckhbw(dst, src2, src2);
vpsraw(dst, dst, 8);
vpmullw(dst, dst, scratch);
} else {
Palignr(scratch, src1, uint8_t{8});
is_signed ? Pmovsxbw(scratch, scratch) : Pmovzxbw(scratch, scratch);
Palignr(dst, src2, uint8_t{8});
is_signed ? Pmovsxbw(dst, dst) : Pmovzxbw(dst, dst);
Pmullw(dst, scratch);
if (dst != src1) {
movaps(dst, src1);
}
movaps(scratch, src2);
punpckhbw(dst, dst);
psraw(dst, 8);
punpckhbw(scratch, scratch);
psraw(scratch, 8);
pmullw(dst, scratch);
}
}
void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
// The logic here is slightly complicated to handle all the cases of register
// aliasing. This allows flexibility for callers in TurboFan and Liftoff.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (src1 == src2) {
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpmullw(dst, dst, dst);
} else {
if (dst == src2) {
// We overwrite dst, then use src2, so swap src1 and src2.
std::swap(src1, src2);
}
vpxor(scratch, scratch, scratch);
vpunpckhbw(dst, src1, scratch);
vpunpckhbw(scratch, src2, scratch);
vpmullw(dst, dst, scratch);
}
} else {
if (src1 == src2) {
xorps(scratch, scratch);
if (dst != src1) {
movaps(dst, src1);
}
punpckhbw(dst, scratch);
pmullw(dst, dst);
} else {
// When dst == src1, nothing special needs to be done.
// When dst == src2, swap src1 and src2, since we overwrite dst.
// When dst is unique, copy src1 to dst first.
if (dst == src2) {
std::swap(src1, src2);
// Now, dst == src1.
} else if (dst != src1) {
// dst != src1 && dst != src2.
movaps(dst, src1);
}
xorps(scratch, scratch);
punpckhbw(dst, scratch);
punpckhbw(scratch, src2);
psrlw(scratch, 8);
pmullw(dst, scratch);
}
}
}
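For reference, the unsigned high-half lowering above boils down to the following illustrative intrinsics sketch (it assumes non-aliased inputs; the function name is made up):

#include <immintrin.h>

// i16x8.extmul_high_i8x16_u: zero-extend the high 8 bytes of each input by
// interleaving them with a zeroed register (punpckhbw against zero), then
// multiply the resulting 16-bit lanes.
__m128i extmul_high_u(__m128i a, __m128i b) {
  __m128i zero = _mm_setzero_si128();
  __m128i ah = _mm_unpackhi_epi8(a, zero);  // zero-extended high bytes of a
  __m128i bh = _mm_unpackhi_epi8(b, zero);  // zero-extended high bytes of b
  return _mm_mullo_epi16(ah, bh);
}

The aliasing checks in the real code exist because only one scratch register is available: when src1 == src2 a single unpack followed by squaring suffices, and in the non-AVX path the zeroed scratch is itself unpacked against src2, after which a psrlw by 8 completes that zero-extension without touching dst.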
@@ -716,8 +716,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool low, bool is_signed);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch, bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
// Requires dst == mask when AVX is not supported.
void S128Select(XMMRegister dst, XMMRegister mask, XMMRegister src1,
XMMRegister src2, XMMRegister scratch);
@@ -2296,20 +2296,81 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
}
}
void TurboAssembler::I16x8ExtMul(XMMRegister dst, XMMRegister src1,
XMMRegister src2, bool low, bool is_signed) {
if (low) {
is_signed ? Pmovsxbw(kScratchDoubleReg, src1)
: Pmovzxbw(kScratchDoubleReg, src1);
is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
Pmullw(dst, kScratchDoubleReg);
} else {
Palignr(kScratchDoubleReg, src1, uint8_t{8});
is_signed ? Pmovsxbw(kScratchDoubleReg, kScratchDoubleReg)
: Pmovzxbw(kScratchDoubleReg, kScratchDoubleReg);
Palignr(dst, src2, uint8_t{8});
is_signed ? Pmovsxbw(dst, dst) : Pmovzxbw(dst, dst);
Pmullw(dst, kScratchDoubleReg);
void TurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
XMMRegister src2, bool is_signed) {
is_signed ? Pmovsxbw(kScratchDoubleReg, src1)
: Pmovzxbw(kScratchDoubleReg, src1);
is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
Pmullw(dst, kScratchDoubleReg);
}
void TurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpunpckhbw(kScratchDoubleReg, src1, src1);
vpsraw(kScratchDoubleReg, kScratchDoubleReg, 8);
vpunpckhbw(dst, src2, src2);
vpsraw(dst, dst, 8);
vpmullw(dst, dst, kScratchDoubleReg);
} else {
if (dst != src1) {
movaps(dst, src1);
}
movaps(kScratchDoubleReg, src2);
punpckhbw(dst, dst);
psraw(dst, 8);
punpckhbw(kScratchDoubleReg, kScratchDoubleReg);
psraw(kScratchDoubleReg, 8);
pmullw(dst, kScratchDoubleReg);
}
}
void TurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
// The logic here is slightly complicated to handle all the cases of register
// aliasing. This allows flexibility for callers in TurboFan and Liftoff.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
if (src1 == src2) {
vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
vpunpckhbw(dst, src1, kScratchDoubleReg);
vpmullw(dst, dst, dst);
} else {
if (dst == src2) {
// We overwrite dst, then use src2, so swap src1 and src2.
std::swap(src1, src2);
}
vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
vpunpckhbw(dst, src1, kScratchDoubleReg);
vpunpckhbw(kScratchDoubleReg, src2, kScratchDoubleReg);
vpmullw(dst, dst, kScratchDoubleReg);
}
} else {
if (src1 == src2) {
xorps(kScratchDoubleReg, kScratchDoubleReg);
if (dst != src1) {
movaps(dst, src1);
}
punpckhbw(dst, kScratchDoubleReg);
pmullw(dst, dst);
} else {
// When dst == src1, nothing special needs to be done.
// When dst == src2, swap src1 and src2, since we overwrite dst.
// When dst is unique, copy src1 to dst first.
if (dst == src2) {
std::swap(src1, src2);
// Now, dst == src1.
} else if (dst != src1) {
// dst != src1 && dst != src2.
movaps(dst, src1);
}
xorps(kScratchDoubleReg, kScratchDoubleReg);
punpckhbw(dst, kScratchDoubleReg);
punpckhbw(kScratchDoubleReg, src2);
psrlw(kScratchDoubleReg, 8);
pmullw(dst, kScratchDoubleReg);
}
}
}
@@ -614,8 +614,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Requires that dst == src1 if AVX is not supported.
void I32x4ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
void I16x8ExtMul(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool low, bool is_signed);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
bool is_signed);
void I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2);
@@ -2118,27 +2118,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I16x8ExtMulLowI8x16S: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/true, /*is_signed=*/true);
__ I16x8ExtMulLow(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*is_signed=*/true);
break;
}
case kIA32I16x8ExtMulHighI8x16S: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false, /*is_signed=*/true);
__ I16x8ExtMulHighS(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg);
break;
}
case kIA32I16x8ExtMulLowI8x16U: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/true, /*is_signed=*/false);
__ I16x8ExtMulLow(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*is_signed=*/false);
break;
}
case kIA32I16x8ExtMulHighI8x16U: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
/*low=*/false, /*is_signed=*/false);
__ I16x8ExtMulHighU(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg);
break;
}
case kIA32I64x2SplatI32Pair: {
@@ -3313,27 +3313,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8ExtMulLowI8x16S: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/true);
__ I16x8ExtMulLow(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*is_signed=*/true);
break;
}
case kX64I16x8ExtMulHighI8x16S: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/true);
__ I16x8ExtMulHighS(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kX64I16x8ExtMulLowI8x16U: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/true,
/*is_signed=*/false);
__ I16x8ExtMulLow(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*is_signed=*/false);
break;
}
case kX64I16x8ExtMulHighI8x16U: {
__ I16x8ExtMul(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), /*low=*/false,
/*is_signed=*/false);
__ I16x8ExtMulHighU(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kX64I16x8ExtAddPairwiseI8x16S: {
@@ -3640,29 +3640,27 @@ void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*low=*/true, /*is_signed=*/true);
I16x8ExtMulLow(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*is_signed=*/true);
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*low=*/true, /*is_signed=*/false);
I16x8ExtMulLow(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*is_signed=*/false);
}
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*low=*/false, /*is_signed=*/true);
I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg,
/*low=*/false, /*is_signed=*/false);
I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,
@@ -3215,28 +3215,25 @@ void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true, /*is_signed=*/true);
I16x8ExtMulLow(dst.fp(), src1.fp(), src2.fp(), /*is_signed=*/true);
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/true,
/*is_signed=*/false);
I16x8ExtMulLow(dst.fp(), src1.fp(), src2.fp(), /*is_signed=*/false);
}
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
/*is_signed=*/true);
I16x8ExtMulHighS(dst.fp(), src1.fp(), src2.fp());
}
void LiftoffAssembler::emit_i16x8_extmul_high_i8x16_u(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2) {
I16x8ExtMul(dst.fp(), src1.fp(), src2.fp(), /*low=*/false,
/*is_signed=*/false);
I16x8ExtMulHighU(dst.fp(), src1.fp(), src2.fp());
}
void LiftoffAssembler::emit_i16x8_q15mulr_sat_s(LiftoffRegister dst,