Commit b145152d authored by Zhi An Ng's avatar Zhi An Ng Committed by Commit Bot

[wasm-simd][x64] Optimize some integer widen_high ops

Optimize:
- i32x4.widen_high_i16x8_s
- i32x4.widen_high_i16x8_u
- i16x8.widen_high_i8x16_s
- i16x8.widen_high_i8x16_u

These optimizations were suggested in http://b/175364869.

The main change is to move away from palignr, which has a dependency on
dst, and also the AVX version is 2 bytes longer than the punpckhqdq.

For the signed and unsigned variants, we have slightly different
optimizations. Unsigned variants can use an punpckh* instruction with a
zero-ed scratch register, that effectively zero-extends. Signed variants
use the movhlps instruction to move high half to low half of dst, then
use packed signed extension instructions.

The common fallback for these instructions is to use pshufd, which does
not have a dependency on dst, but is 1 byte longer than the punpckh*
instructions.

FIXED=b/175364869

Change-Id: If28da2aaa8f6e39a58e63b01cc9a81bbbb294606
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591853
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71856}
parent 8c480598
......@@ -14,6 +14,7 @@
#include "src/codegen/register-configuration.h"
#include "src/codegen/string-constants.h"
#include "src/codegen/x64/assembler-x64.h"
#include "src/codegen/x64/register-x64.h"
#include "src/common/external-pointer.h"
#include "src/common/globals.h"
#include "src/debug/debug.h"
......@@ -1988,6 +1989,88 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
}
}
// Sign-extends the four high 16-bit lanes of src into the four 32-bit
// lanes of dst (wasm i32x4.widen_high_i16x8_s). dst and src may alias.
void TurboAssembler::I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Duplicate the top 64 bits of src into both halves of dst, then
    // sign-extend the low four words to dwords.
    vpunpckhqdq(dst, src, src);
    vpmovsxwd(dst, dst);
    return;
  }
  // SSE: first move the high half of src into the low half of dst, then
  // sign-extend in place.
  if (dst == src) {
    // movhlps is 2 bytes shorter than pshufd, but has a dependency on dst.
    movhlps(dst, src);
  } else {
    // pshufd has no dependency on dst.
    pshufd(dst, src, 0xEE);
  }
  pmovsxwd(dst, dst);
}
// Zero-extends the four high 16-bit lanes of src into the four 32-bit
// lanes of dst (wasm i32x4.widen_high_i16x8_u). dst and src may alias.
void TurboAssembler::I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Interleaving the high words of src with zeroes performs the
    // zero-extension in a single unpack:
    //   zero = |0|0|0|0|0|0|0|0|
    //   src  = |a|b|c|d|e|f|g|h|
    //   dst  = |0|a|0|b|0|c|0|d|
    // dst itself can serve as the zero register unless it aliases src.
    XMMRegister zero = dst == src ? kScratchDoubleReg : dst;
    vpxor(zero, zero, zero);
    vpunpckhwd(dst, src, zero);
    return;
  }
  if (dst == src) {
    // xorps can be executed on more ports than pshufd.
    xorps(kScratchDoubleReg, kScratchDoubleReg);
    punpckhwd(dst, kScratchDoubleReg);
  } else {
    // pshufd has no dependency on dst.
    pshufd(dst, src, 0xEE);
    pmovzxwd(dst, dst);
  }
}
// Sign-extends the eight high 8-bit lanes of src into the eight 16-bit
// lanes of dst (wasm i16x8.widen_high_i8x16_s). dst and src may alias.
void TurboAssembler::I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Duplicate the top 64 bits of src into both halves of dst, then
    // sign-extend the low eight bytes to words.
    vpunpckhqdq(dst, src, src);
    vpmovsxbw(dst, dst);
    return;
  }
  // SSE: first move the high half of src into the low half of dst, then
  // sign-extend in place.
  if (dst == src) {
    // movhlps is 2 bytes shorter than pshufd, but has a dependency on dst.
    movhlps(dst, src);
  } else {
    // pshufd has no dependency on dst.
    pshufd(dst, src, 0xEE);
  }
  pmovsxbw(dst, dst);
}
// Zero-extends the eight high 8-bit lanes of src into the eight 16-bit
// lanes of dst (wasm i16x8.widen_high_i8x16_u). dst and src may alias.
void TurboAssembler::I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Interleaving the high bytes of src with zeroes performs the
    // zero-extension in a single unpack:
    //   zero = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
    //   src  = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
    //   dst  = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
    // dst itself can serve as the zero register unless it aliases src.
    XMMRegister zero = dst == src ? kScratchDoubleReg : dst;
    vpxor(zero, zero, zero);
    vpunpckhbw(dst, src, zero);
    return;
  }
  if (dst == src) {
    // xorps can be executed on more ports than pshufd.
    xorps(kScratchDoubleReg, kScratchDoubleReg);
    punpckhbw(dst, kScratchDoubleReg);
  } else {
    // pshufd has no dependency on dst.
    pshufd(dst, src, 0xEE);
    pmovzxbw(dst, dst);
  }
}
// Logical right-shift of the four 32-bit lanes of dst by imm8 bits,
// in place. Convenience overload delegating to the three-operand form
// with dst as both destination and source.
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
  Psrld(dst, dst, imm8);
}
......
......@@ -565,6 +565,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// These Wasm SIMD ops do not have direct lowerings on x64. These
// helpers are optimized to produce the fastest and smallest codegen.
void I16x8SConvertI8x16High(XMMRegister dst, XMMRegister src);
void I16x8UConvertI8x16High(XMMRegister dst, XMMRegister src);
void I32x4SConvertI16x8High(XMMRegister dst, XMMRegister src);
void I32x4UConvertI16x8High(XMMRegister dst, XMMRegister src);
void CompareRoot(Register with, RootIndex index);
void CompareRoot(Operand with, RootIndex index);
......
......@@ -2963,9 +2963,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4SConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovsxwd(dst, dst);
__ I32x4SConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I32x4Neg: {
......@@ -3069,9 +3068,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4UConvertI16x8High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovzxwd(dst, dst);
__ I32x4UConvertI16x8High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I32x4ShrU: {
......@@ -3188,9 +3186,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8SConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovsxbw(dst, dst);
__ I16x8SConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I16x8Neg: {
......@@ -3278,9 +3275,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8UConvertI8x16High: {
XMMRegister dst = i.OutputSimd128Register();
__ Palignr(dst, i.InputSimd128Register(0), uint8_t{8});
__ Pmovzxbw(dst, dst);
__ I16x8UConvertI8x16High(i.OutputSimd128Register(),
i.InputSimd128Register(0));
break;
}
case kX64I16x8ShrU: {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment