Commit 813ae013 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Don't force dst to be same as src on AVX

On AVX, many instructions have a 3-operand form, unlike SSE, which only
has 2-operand forms. So on SSE we use DefineSameAsFirst on the dst,
forcing the destination to reuse the first source register. On AVX,
keeping that restriction causes unnecessary register-to-register moves.
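To make the cost concrete, here is a sketch with hypothetical registers
(not taken from the actual generated code): the 2-operand SSE form
clobbers its first source, so a copy is needed whenever that value must
survive, while the 3-operand AVX form writes a separate destination:

  ; SSE, 2-operand: dst must double as the first source, so the
  ; register allocator may have to insert a copy first.
  movaps xmm2, xmm0        ; copy src0 into dst
  addps  xmm2, xmm1        ; xmm2 = xmm2 + xmm1

  ; AVX, 3-operand: no copy needed.
  vaddps xmm2, xmm0, xmm1  ; xmm2 = xmm0 + xmm1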

This patch changes a couple of F32x4 and S128 instructions to remove
this restriction when AVX is supported.

We can't use AvxHelper, since it duplicates the dst for the call to the
AVX instruction, which isn't what we want here. The alternative is to
redefine Mulps and the other affected functions, but other call sites
depend on the duplicated-dst behavior, so that is harder to change. We
can migrate these as we move more logic over to non-DefineSameAsFirst
for AVX.
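Sketch of the difference (hypothetical registers; simplified from what
the wrappers emit):

  ; What a wrapper like Mulps(dst, src) emits on AVX today,
  ; with dst duplicated as the first source:
  vmulps xmm0, xmm0, xmm1  ; xmm0 = xmm0 * xmm1
  ; What ASSEMBLE_SIMD_BINOP can emit instead, since dst no
  ; longer has to match the first src:
  vmulps xmm2, xmm0, xmm1  ; xmm2 = xmm0 * xmm1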

With the meshopt_decoder.js from the linked bug, this removes 8 SIMD
movs from a function that has 300+ lines of assembly.

Note that per Agner Fog's microarchitecture.pdf (page 127, "Elimination
of move instructions"), the processor can often eliminate such moves at
register rename. So this change won't speed up perf, but it helps a bit
with binary size and decoder pressure.
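For example (hypothetical registers), the copy in a pair like the
following is typically eliminated at rename and never executes, but it
still occupies code bytes and a decoder slot:

  movaps xmm2, xmm0  ; reg-to-reg move: eliminated at rename on
                     ; recent cores, but still fetched and decoded
  addps  xmm2, xmm1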

Bug: v8:10116,v8:9561
Change-Id: I125bfd44e728ef08312620bc00f6433f376e69e3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2465653
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70462}
parent 102b4b3c
@@ -565,6 +565,21 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
     __ j(not_equal, &binop);                                   \
   } while (false)
 
+// Handles both SSE and AVX codegen. For SSE we use DefineSameAsFirst, so the
+// dst and first src will be the same. For AVX we don't restrict it that way, so
+// we will omit unnecessary moves.
+#define ASSEMBLE_SIMD_BINOP(opcode)                                      \
+  do {                                                                   \
+    if (CpuFeatures::IsSupported(AVX)) {                                 \
+      CpuFeatureScope avx_scope(tasm(), AVX);                            \
+      __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
+                   i.InputSimd128Register(1));                           \
+    } else {                                                             \
+      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
+      __ opcode(i.OutputSimd128Register(), i.InputSimd128Register(1));   \
+    }                                                                    \
+  } while (false)
+
 #define ASSEMBLE_SIMD_INSTR(opcode, dst_operand, index) \
   do {                                                  \
     if (instr->InputAt(index)->IsSimd128Register()) {   \
@@ -2546,8 +2561,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F32x4Add: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ Addps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(addps);
       break;
     }
     case kX64F32x4AddHoriz: {
@@ -2556,18 +2570,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64F32x4Sub: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ Subps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(subps);
       break;
     }
     case kX64F32x4Mul: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ Mulps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(mulps);
       break;
     }
     case kX64F32x4Div: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ Divps(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(divps);
       break;
     }
     case kX64F32x4Min: {
@@ -3530,15 +3541,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64S128And: {
-      __ Pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(pand);
       break;
    }
     case kX64S128Or: {
-      __ Por(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(por);
       break;
     }
     case kX64S128Xor: {
-      __ Pxor(i.OutputSimd128Register(), i.InputSimd128Register(1));
+      ASSEMBLE_SIMD_BINOP(pxor);
       break;
     }
     case kX64S128Not: {
@@ -2774,6 +2774,15 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I16x8) \
   V(I8x16)
 
+#define SIMD_BINOP_SSE_AVX_LIST(V) \
+  V(F32x4Add)                      \
+  V(F32x4Sub)                      \
+  V(F32x4Mul)                      \
+  V(F32x4Div)                      \
+  V(S128And)                       \
+  V(S128Or)                        \
+  V(S128Xor)
+
 #define SIMD_BINOP_LIST(V) \
   V(F64x2Add)              \
   V(F64x2Sub)              \
@@ -2785,11 +2794,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(F64x2Ne)               \
   V(F64x2Lt)               \
   V(F64x2Le)               \
-  V(F32x4Add)              \
   V(F32x4AddHoriz)         \
-  V(F32x4Sub)              \
-  V(F32x4Mul)              \
-  V(F32x4Div)              \
   V(F32x4Min)              \
   V(F32x4Max)              \
   V(F32x4Eq)               \
@@ -2845,10 +2850,7 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I8x16MinU)             \
   V(I8x16MaxU)             \
   V(I8x16GeU)              \
-  V(I8x16RoundingAverageU) \
-  V(S128And)               \
-  V(S128Or)                \
-  V(S128Xor)
+  V(I8x16RoundingAverageU)
 
 #define SIMD_BINOP_ONE_TEMP_LIST(V) \
   V(I32x4Ne)                        \
@@ -3028,6 +3030,21 @@ SIMD_BINOP_LIST(VISIT_SIMD_BINOP)
 #undef VISIT_SIMD_BINOP
 #undef SIMD_BINOP_LIST
 
+#define VISIT_SIMD_BINOP(Opcode)                                              \
+  void InstructionSelector::Visit##Opcode(Node* node) {                       \
+    X64OperandGenerator g(this);                                              \
+    if (IsSupported(AVX)) {                                                   \
+      Emit(kX64##Opcode, g.DefineAsRegister(node),                            \
+           g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1))); \
+    } else {                                                                  \
+      Emit(kX64##Opcode, g.DefineSameAsFirst(node),                           \
+           g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1))); \
+    }                                                                         \
+  }
+SIMD_BINOP_SSE_AVX_LIST(VISIT_SIMD_BINOP)
+#undef VISIT_SIMD_BINOP
+#undef SIMD_BINOP_SSE_AVX_LIST
+
 #define VISIT_SIMD_BINOP_ONE_TEMP(Opcode)               \
   void InstructionSelector::Visit##Opcode(Node* node) { \
     X64OperandGenerator g(this);                        \