Commit 723d1af0 authored by Daan de Graaf, committed by V8 LUCI CQ

[wasm-simd][arm64] Fuse signed extadd_pairwise and add SIMD instructions.

The two instructions are fused into a single Sadalp instruction,
improving the performance of quantized neural network operator
implementations such as those in XNNPACK.
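
For reference, a minimal scalar sketch of what Sadalp computes for 32-bit
lanes (illustrative only; this helper is not part of the change):

  #include <cstdint>

  // Signed add and accumulate long pairwise: sign-extend each adjacent pair
  // of 16-bit source lanes, add the pair, and accumulate the sum into the
  // corresponding 32-bit destination lane.
  void SadalpI32x4Reference(int32_t acc[4], const int16_t src[8]) {
    for (int i = 0; i < 4; i++) {
      acc[i] += int32_t{src[2 * i]} + int32_t{src[2 * i + 1]};
    }
  }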

This change also includes some formatting changes to the unit
tests that were made automatically by clang-format, which I am
happy to revert if preferred.

Bug: v8:11546
Change-Id: I2afc8940a52186617cffd276c82733ad3020b728
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2878742
Commit-Queue: Daan de Graaf <daagra@google.com>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74952}
parent 7c54550e
@@ -1203,6 +1203,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kArm64Mul32:
__ Mul(i.OutputRegister32(), i.InputRegister32(0), i.InputRegister32(1));
break;
case kArm64Sadalp: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
VectorFormat dst_f = VectorFormatFillQ(LaneSizeField::decode(opcode));
VectorFormat src_f = VectorFormatHalfWidthDoubleLanes(dst_f);
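// For example, with a 32-bit lane size dst_f is 4S and src_f is 8H:
// Sadalp adds each adjacent pair of signed halfword lanes and accumulates
// the result into the corresponding word lane.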
__ Sadalp(i.OutputSimd128Register().Format(dst_f),
i.InputSimd128Register(1).Format(src_f));
break;
}
case kArm64Saddlp: {
VectorFormat dst_f = VectorFormatFillQ(LaneSizeField::decode(opcode));
VectorFormat src_f = VectorFormatHalfWidthDoubleLanes(dst_f);
@@ -2987,7 +2995,7 @@ void CodeGenerator::AssembleArchSelect(Instruction* instr,
FlagsCondition condition) {
Arm64OperandConverter i(this, instr);
MachineRepresentation rep =
LocationOperand::cast(instr->OutputAt(0))->representation();
Condition cc = FlagsConditionToCondition(condition);
// We don't know how many inputs were consumed by the condition, so we have to
// calculate the indices of the last two inputs.
......
@@ -35,6 +35,7 @@ namespace compiler {
V(Arm64Eor32) \
V(Arm64Eon) \
V(Arm64Eon32) \
V(Arm64Sadalp) \
V(Arm64Saddlp) \
V(Arm64Sub) \
V(Arm64Sub32) \
......
@@ -36,6 +36,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64Eor32:
case kArm64Eon:
case kArm64Eon32:
case kArm64Sadalp:
case kArm64Saddlp:
case kArm64Sub:
case kArm64Sub32:
......
@@ -2146,9 +2146,9 @@ void VisitCompare(InstructionSelector* selector, InstructionCode opcode,
FlagsContinuation* cont) {
if (cont->IsSelect()) {
Arm64OperandGenerator g(selector);
InstructionOperand inputs[] = {left, right,
g.UseRegister(cont->true_value()),
g.UseRegister(cont->false_value())};
selector->EmitWithContinuation(opcode, 0, nullptr, 4, inputs, cont);
} else {
selector->EmitWithContinuation(opcode, left, right, cont);
@@ -3721,7 +3721,7 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
arraysize(temps), temps);
}
#define VISIT_SIMD_ADD(Type, PairwiseType, LaneSize) \
void InstructionSelector::Visit##Type##Add(Node* node) { \
Arm64OperandGenerator g(this); \
Node* left = node->InputAt(0); \
@@ -3739,11 +3739,29 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
g.UseRegister(right->InputAt(1))); \
return; \
} \
/* Select Sadalp(x, y) for Add(x, ExtAddPairwiseS(y)). */ \
if (right->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, right)) { \
Emit(kArm64Sadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0))); \
return; \
} \
/* Select Sadalp(y, x) for Add(ExtAddPairwiseS(x), y). */ \
if (left->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, left)) { \
Emit(kArm64Sadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0))); \
return; \
} \
VisitRRR(this, kArm64##Type##Add, node); \
}
VISIT_SIMD_ADD(I32x4, I16x8, 32)
VISIT_SIMD_ADD(I16x8, I8x16, 16)
#undef VISIT_SIMD_ADD
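// Illustrative note (not part of the CL itself): for
//   i32x4.add(x, i32x4.extadd_pairwise_i16x8_s(y))
// the selector previously went through a separate pairwise instruction
// (Saddlp) followed by a vector Add; with the patterns above it emits a
// single Sadalp x.4S, y.8H, provided the add covers the pairwise node.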
#define VISIT_SIMD_SUB(Type) \
......
@@ -258,8 +258,8 @@ class FlagsContinuation final {
DCHECK_NOT_NULL(result);
}
FlagsContinuation(FlagsCondition condition, Node* result, Node* true_value,
                  Node* false_value)
: mode_(kFlags_select),
condition_(condition),
frame_state_or_result_(result),
@@ -446,6 +446,9 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
// Check if {node} can be covered while generating code for the current
// instruction. A node can be covered if the {user} of the node has the only
// edge and the two are in the same basic block.
// Before fusing two instructions a and b, it is useful to check that
// CanCover(a, b) holds. If this is not the case, code for b must still be
// generated for other users, and fusing is unlikely to improve performance.
bool CanCover(Node* user, Node* node) const;
// CanCover is not transitive. A counterexample is nodes A, B, C such that
// CanCover(A, B) and CanCover(B, C) and B is pure: the effect level of A
......
@@ -3312,6 +3312,83 @@ WASM_SIMD_TEST(I16x8ExtractLaneU_I8x16Splat) {
CHECK_EQ(0xfafa, r.Call(0xfa));
}
enum ExtAddSide { LEFT, RIGHT };
template <typename T, typename U>
void RunAddExtAddPairwiseTest(
TestExecutionTier execution_tier, ExtAddSide extAddSide,
WasmOpcode addOpcode,
const std::array<T, kSimd128Size / sizeof(T)> addInput,
WasmOpcode extAddOpcode,
const std::array<U, kSimd128Size / sizeof(U)> extAddInput,
const std::array<T, kSimd128Size / sizeof(T)> expectedOutput) {
WasmRunner<int32_t> r(execution_tier);
T* x = r.builder().AddGlobal<T>(kWasmS128);
for (size_t i = 0; i < addInput.size(); i++) {
WriteLittleEndianValue<T>(&x[i], addInput[i]);
}
U* y = r.builder().AddGlobal<U>(kWasmS128);
for (size_t i = 0; i < extAddInput.size(); i++) {
WriteLittleEndianValue<U>(&y[i], extAddInput[i]);
}
switch (extAddSide) {
case LEFT:
// x = add(extadd_pairwise_s(y), x)
BUILD(r,
WASM_GLOBAL_SET(
0,
WASM_SIMD_BINOP(
addOpcode, WASM_SIMD_UNOP(extAddOpcode, WASM_GLOBAL_GET(1)),
WASM_GLOBAL_GET(0))),
WASM_ONE);
break;
case RIGHT:
// x = add(x, extadd_pairwise_s(y))
BUILD(r,
WASM_GLOBAL_SET(
0, WASM_SIMD_BINOP(
addOpcode, WASM_GLOBAL_GET(0),
WASM_SIMD_UNOP(extAddOpcode, WASM_GLOBAL_GET(1)))),
WASM_ONE);
break;
}
r.Call();
for (size_t i = 0; i < expectedOutput.size(); i++) {
CHECK_EQ(expectedOutput[i], x[i]);
}
}
WASM_SIMD_TEST(AddExtAddPairwiseI32Right) {
RunAddExtAddPairwiseTest<int32_t, int16_t>(
execution_tier, RIGHT, kExprI32x4Add, {1, 2, 3, 4},
kExprI32x4ExtAddPairwiseI16x8S, {1, 2, 3, 4, 5, 6, 7, 8}, {4, 9, 14, 19});
}
WASM_SIMD_TEST(AddExtAddPairwiseI32Left) {
RunAddExtAddPairwiseTest<int32_t, int16_t>(
execution_tier, LEFT, kExprI32x4Add, {1, 2, 3, 4},
kExprI32x4ExtAddPairwiseI16x8S, {1, 2, 3, 4, 5, 6, 7, 8}, {4, 9, 14, 19});
}
WASM_SIMD_TEST(AddExtAddPairwiseI16Right) {
RunAddExtAddPairwiseTest<int16_t, int8_t>(
execution_tier, RIGHT, kExprI16x8Add, {1, 2, 3, 4, 5, 6, 7, 8},
kExprI16x8ExtAddPairwiseI8x16S,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{4, 9, 14, 19, 24, 29, 34, 39});
}
WASM_SIMD_TEST(AddExtAddPairwiseI16Left) {
RunAddExtAddPairwiseTest<int16_t, int8_t>(
execution_tier, LEFT, kExprI16x8Add, {1, 2, 3, 4, 5, 6, 7, 8},
kExprI16x8ExtAddPairwiseI8x16S,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{4, 9, 14, 19, 24, 29, 34, 39});
}
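// A quick check of the expected values above (exposition only): for the I32
// tests, extadd_pairwise_s({1,2,3,4,5,6,7,8}) = {1+2, 3+4, 5+6, 7+8} =
// {3, 7, 11, 15}, and adding {1, 2, 3, 4} lane-wise gives {4, 9, 14, 19}.
// The I16 tests follow the same pattern with 8-bit pairs.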
#define WASM_EXTRACT_I16x8_TEST(Sign, Type) \
WASM_SIMD_TEST(I16X8ExtractLane##Sign) { \
WasmRunner<int32_t, int32_t> r(execution_tier); \
......