Commit 6fb4893c authored by Ng Zhi An, committed by Commit Bot

Reland "[wasm-simd][ia32] Optimize some signed integer widening sequences"

This is a reland of 0ef2eea7

The fix adds the missing SSE4_1 scopes to ia32. I realized
the x64 codegen was missing the scopes too, so fix them as well.

Original change's description:
> [wasm-simd][ia32] Optimize some signed integer widening sequences
>
> Optimize ia32 code sequences. These are the same sequences as x64, which
> have been optimized based on supported extensions.
>
> Bug: v8:11464
> Change-Id: I10396a928a431cdd2de9b22bb8a395bc0adb4694
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2704897
> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#72926}

Bug: v8:11464
Change-Id: Ib66a63de26bcc3bb3626922b642fe5df6bff8bdb
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2713211
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72983}
parent b3088d63
......@@ -777,6 +777,97 @@ void TurboAssembler::I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
}
}
// Sign-extends the upper four 16-bit lanes of |src| into the four 32-bit lanes
// of |dst|. |dst| and |src| may be the same register.
void TurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Unpacking src with itself duplicates every upper 16-bit lane into both
    // halves of a 32-bit lane; the arithmetic shift right by 16 then leaves
    // the sign-extended value.
    vpunpckhwd(dst, src, src);
    vpsrad(dst, dst, 16);
    return;
  }

  CpuFeatureScope sse_scope(this, SSE4_1);
  // First bring the upper 64 bits of src into the lower 64 bits of dst.
  if (dst == src) {
    // 2 bytes shorter than pshufd, but has a dependency on dst.
    movhlps(dst, src);
  } else {
    // No dependency on dst.
    pshufd(dst, src, 0xEE);
  }
  // Then sign-extend the four 16-bit lanes now sitting in the lower half.
  pmovsxwd(dst, dst);
}
// Zero-extends the upper four 16-bit lanes of |src| into the four 32-bit lanes
// of |dst|. |scratch| is only written when |dst| aliases |src|; it is then
// used to hold the all-zero vector.
void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
                                            XMMRegister scratch) {
  const bool in_place = dst == src;

  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Zero a register that is not src (dst itself when it is distinct), then
    // interleave the upper 16-bit lanes of src with those zeros, which
    // zero-extends each of them to 32 bits.
    XMMRegister zero = in_place ? scratch : dst;
    vpxor(zero, zero, zero);
    vpunpckhwd(dst, src, zero);
    return;
  }

  if (in_place) {
    // xorps can be executed on more ports than pshufd.
    xorps(scratch, scratch);
    punpckhwd(dst, scratch);
    return;
  }

  CpuFeatureScope sse_scope(this, SSE4_1);
  // No dependency on dst: copy the upper half of src into the lower half of
  // dst, then zero-extend those four 16-bit lanes.
  pshufd(dst, src, 0xEE);
  pmovzxwd(dst, dst);
}
// Sign-extends the upper eight 8-bit lanes of |src| into the eight 16-bit
// lanes of |dst|. |dst| and |src| may be the same register.
void TurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Unpacking src with itself duplicates every upper byte into both halves
    // of a 16-bit lane; the arithmetic shift right by 8 then leaves the
    // sign-extended value.
    vpunpckhbw(dst, src, src);
    vpsraw(dst, dst, 8);
  } else {
    // One scope covers both branches below (each ends in pmovsxbw), so no
    // nested scope is needed inside them.
    CpuFeatureScope sse_scope(this, SSE4_1);
    if (dst == src) {
      // 2 bytes shorter than pshufd, but has a dependency on dst.
      movhlps(dst, src);
      pmovsxbw(dst, dst);
    } else {
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovsxbw(dst, dst);
    }
  }
}
// Zero-extends the upper eight 8-bit lanes of |src| into the eight 16-bit
// lanes of |dst|. |scratch| is only written when |dst| aliases |src|; it is
// then used to hold the all-zero vector.
void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
                                            XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Zero a register that is not src (dst itself when it is distinct), then
    // interleave the upper bytes of src with those zeros, which zero-extends
    // each of them to 16 bits.
    XMMRegister tmp = dst == src ? scratch : dst;
    vpxor(tmp, tmp, tmp);
    vpunpckhbw(dst, src, tmp);
  } else {
    if (dst == src) {
      // xorps can be executed on more ports than pshufd.
      // This path uses only xorps/punpckhbw, so it is kept outside the
      // SSE4_1 scope, matching I32x4UConvertI16x8High.
      xorps(scratch, scratch);
      punpckhbw(dst, scratch);
    } else {
      // Only pmovzxbw needs SSE4_1, so the scope is limited to this branch.
      CpuFeatureScope sse_scope(this, SSE4_1);
      // No dependency on dst.
      pshufd(dst, src, 0xEE);
      pmovzxbw(dst, dst);
    }
  }
}
void TurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
XMMRegister src2, XMMRegister scratch) {
// k = i16x8.splat(0x8000)
......
......@@ -677,6 +677,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I64x2SConvertI32x4High(XMMRegister dst, XMMRegister src);
void I64x2UConvertI32x4High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src,
XMMRegister scratch);
void I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scratch);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
......
......@@ -2137,6 +2137,7 @@ void TurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) {
vpunpckhwd(dst, src, src);
vpsrad(dst, dst, 16);
} else {
CpuFeatureScope sse_scope(this, SSE4_1);
if (dst == src) {
// 2 bytes shorter than pshufd, but has depdency on dst.
movhlps(dst, src);
......@@ -2159,6 +2160,7 @@ void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src) {
vpxor(scratch, scratch, scratch);
vpunpckhwd(dst, src, scratch);
} else {
CpuFeatureScope sse_scope(this, SSE4_1);
if (dst == src) {
// xorps can be executed on more ports than pshufd.
xorps(kScratchDoubleReg, kScratchDoubleReg);
......@@ -2179,6 +2181,7 @@ void TurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) {
vpunpckhbw(dst, src, src);
vpsraw(dst, dst, 8);
} else {
CpuFeatureScope sse_scope(this, SSE4_1);
if (dst == src) {
// 2 bytes shorter than pshufd, but has depdency on dst.
movhlps(dst, src);
......@@ -2206,6 +2209,7 @@ void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) {
xorps(kScratchDoubleReg, kScratchDoubleReg);
punpckhbw(dst, kScratchDoubleReg);
} else {
CpuFeatureScope sse_scope(this, SSE4_1);
// No dependency on dst.
pshufd(dst, src, 0xEE);
pmovzxbw(dst, dst);
......
......@@ -2666,9 +2666,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I32x4SConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputOperand(0), 8);
__ Pmovsxwd(dst, dst);
__ I32x4SConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kIA32I32x4Neg: {
......@@ -2876,9 +2875,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I32x4UConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputOperand(0), 8);
__ Pmovzxwd(dst, dst);
__ I32x4UConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg);
break;
}
case kIA32I32x4ShrU: {
......@@ -2979,9 +2977,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I16x8SConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputOperand(0), 8);
__ Pmovsxbw(dst, dst);
__ I16x8SConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kIA32I16x8Neg: {
......@@ -3163,9 +3160,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I16x8UConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputOperand(0), 8);
__ Pmovzxbw(dst, dst);
__ I16x8UConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg);
break;
}
case kIA32I16x8ShrU: {
......
......@@ -4551,8 +4551,7 @@ void LiftoffAssembler::emit_i16x8_sconvert_i8x16_low(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_sconvert_i8x16_high(LiftoffRegister dst,
LiftoffRegister src) {
Palignr(dst.fp(), src.fp(), static_cast<uint8_t>(8));
Pmovsxbw(dst.fp(), dst.fp());
I16x8SConvertI8x16High(dst.fp(), src.fp());
}
void LiftoffAssembler::emit_i16x8_uconvert_i8x16_low(LiftoffRegister dst,
......@@ -4562,8 +4561,7 @@ void LiftoffAssembler::emit_i16x8_uconvert_i8x16_low(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_uconvert_i8x16_high(LiftoffRegister dst,
LiftoffRegister src) {
Palignr(dst.fp(), src.fp(), static_cast<uint8_t>(8));
Pmovzxbw(dst.fp(), dst.fp());
I16x8UConvertI8x16High(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i32x4_sconvert_i16x8_low(LiftoffRegister dst,
......@@ -4573,8 +4571,7 @@ void LiftoffAssembler::emit_i32x4_sconvert_i16x8_low(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_sconvert_i16x8_high(LiftoffRegister dst,
LiftoffRegister src) {
Palignr(dst.fp(), src.fp(), static_cast<uint8_t>(8));
Pmovsxwd(dst.fp(), dst.fp());
I32x4SConvertI16x8High(dst.fp(), src.fp());
}
void LiftoffAssembler::emit_i32x4_uconvert_i16x8_low(LiftoffRegister dst,
......@@ -4584,8 +4581,7 @@ void LiftoffAssembler::emit_i32x4_uconvert_i16x8_low(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_uconvert_i16x8_high(LiftoffRegister dst,
LiftoffRegister src) {
Palignr(dst.fp(), src.fp(), static_cast<uint8_t>(8));
Pmovzxwd(dst.fp(), dst.fp());
I32x4UConvertI16x8High(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i32x4_trunc_sat_f64x2_s_zero(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment