Commit 4068b3d2 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Optimize f32x4 splat and extract lanes

For splats, we can make use of vshufps to avoid a movss. Without
AVX, we specify dst to be the same as src in the instruction selector.
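
(As an illustration, not part of this CL: the same trick written with
SSE intrinsics. A single shufps with imm8 = 0 broadcasts lane 0 of src,
so no movss into dst is needed first. The two-operand SSE form takes its
low two result lanes from dst itself, which is why the non-AVX path must
constrain dst == src.)

  // Standalone sketch mirroring the emitted splat sequence.
  #include <immintrin.h>
  #include <cstdio>

  int main() {
    __m128 src = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes [1, 2, 3, 4]
    // shufps with imm8 = 0: every result lane = src lane 0.
    __m128 splat = _mm_shuffle_ps(src, src, 0x00);
    float out[4];
    _mm_storeu_ps(out, splat);
    std::printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);
    // prints: 1.0 1.0 1.0 1.0
  }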

For extract lane, we can use vshufps to extract a float into a dst xmm,
and leave junk in the higher bits.
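
(Again a sketch with intrinsics, not V8 code: bits [1:0] of the shufps
imm8 select which source lane lands in lane 0 of the result, so one
shuffle replaces the extractps + movd round-trip through a GPR. The
upper three lanes hold junk, but the consumer only reads the scalar.)

  // Standalone sketch: extract lane 2 of an f32x4 with a single shuffle.
  #include <immintrin.h>
  #include <cstdio>

  int main() {
    __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes [1, 2, 3, 4]
    // imm8 = 2: result lane 0 = v lane 2; lanes 1-3 are ignored junk.
    __m128 shuffled = _mm_shuffle_ps(v, v, 2);
    std::printf("%.1f\n", _mm_cvtss_f32(shuffled));  // prints: 3.0
  }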

On the meshopt_decoder.js benchmark in the linked bug, this removes
about 7 movss instructions that did nothing. Hardware can do register
renaming, but let's not rely on that :)

R=bbudge@chromium.org

Bug: v8:10116
Change-Id: I4d68c10536a79659de673060d537d58113308477
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2481473
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70628}
parent d0fb92f1
--- a/src/compiler/backend/x64/code-generator-x64.cc
+++ b/src/compiler/backend/x64/code-generator-x64.cc
@@ -2494,21 +2494,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
-    // TODO(gdeepti): Get rid of redundant moves for F32x4Splat/Extract below
     case kX64F32x4Splat: {
       XMMRegister dst = i.OutputSimd128Register();
-      if (instr->InputAt(0)->IsFPRegister()) {
-        __ Movss(dst, i.InputDoubleRegister(0));
+      XMMRegister src = i.InputDoubleRegister(0);
+      if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        __ vshufps(dst, src, src, byte{0x0});
       } else {
-        __ Movss(dst, i.InputOperand(0));
+        DCHECK_EQ(dst, src);
+        __ Shufps(dst, dst, byte{0x0});
       }
-      __ Shufps(dst, dst, byte{0x0});
       break;
     }
     case kX64F32x4ExtractLane: {
-      __ Extractps(kScratchRegister, i.InputSimd128Register(0),
-                   i.InputUint8(1));
-      __ Movd(i.OutputDoubleRegister(), kScratchRegister);
+      if (CpuFeatures::IsSupported(AVX)) {
+        CpuFeatureScope avx_scope(tasm(), AVX);
+        XMMRegister src = i.InputSimd128Register(0);
+        // vshufps and leave junk in the 3 high lanes.
+        __ vshufps(i.OutputDoubleRegister(), src, src, i.InputInt8(1));
+      } else {
+        __ extractps(kScratchRegister, i.InputSimd128Register(0),
+                     i.InputUint8(1));
+        __ movd(i.OutputDoubleRegister(), kScratchRegister);
+      }
       break;
     }
     case kX64F32x4ReplaceLane: {
--- a/src/compiler/backend/x64/instruction-selector-x64.cc
+++ b/src/compiler/backend/x64/instruction-selector-x64.cc
@@ -2796,14 +2796,6 @@ VISIT_ATOMIC_BINOP(Or)
 VISIT_ATOMIC_BINOP(Xor)
 #undef VISIT_ATOMIC_BINOP
 
-#define SIMD_TYPES(V) \
-  V(F64x2)            \
-  V(F32x4)            \
-  V(I64x2)            \
-  V(I32x4)            \
-  V(I16x8)            \
-  V(I8x16)
-
 #define SIMD_BINOP_SSE_AVX_LIST(V) \
   V(F32x4Add)                      \
   V(F32x4Sub)                      \
@@ -2967,14 +2959,29 @@ void InstructionSelector::VisitS128Zero(Node* node) {
   Emit(kX64S128Zero, g.DefineAsRegister(node));
 }
 
+#define SIMD_TYPES_FOR_SPLAT(V) \
+  V(F64x2)                      \
+  V(I64x2)                      \
+  V(I32x4)                      \
+  V(I16x8)                      \
+  V(I8x16)
+
 #define VISIT_SIMD_SPLAT(Type)                               \
   void InstructionSelector::Visit##Type##Splat(Node* node) { \
     X64OperandGenerator g(this);                             \
     Emit(kX64##Type##Splat, g.DefineAsRegister(node),        \
          g.Use(node->InputAt(0)));                           \
   }
-SIMD_TYPES(VISIT_SIMD_SPLAT)
+SIMD_TYPES_FOR_SPLAT(VISIT_SIMD_SPLAT)
 #undef VISIT_SIMD_SPLAT
+#undef SIMD_TYPES_FOR_SPLAT
+
+void InstructionSelector::VisitF32x4Splat(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand dst =
+      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
+  Emit(kX64F32x4Splat, dst, g.UseRegister(node->InputAt(0)));
+}
 
 #define SIMD_VISIT_EXTRACT_LANE(Type, Sign)                              \
   void InstructionSelector::Visit##Type##ExtractLane##Sign(Node* node) { \
@@ -3124,7 +3131,6 @@ SIMD_ANYTRUE_LIST(VISIT_SIMD_ANYTRUE)
 SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
 #undef VISIT_SIMD_ALLTRUE
 #undef SIMD_ALLTRUE_LIST
-#undef SIMD_TYPES
 
 void InstructionSelector::VisitS128Select(Node* node) {
   X64OperandGenerator g(this);