Commit a5d45862 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm64] Optimize f32x4 dup + mul into fmul by element

Wasm SIMD does not have an opcode to multiply a vector by a scalar. In
these cases, Wasm code uses mul(x, shuffle(y, imms)), where the shuffle
is a dup of a single lane in y. Pattern match on this to emit a fmul
(element).

We can do similar pattern match on f64x2 too, that will come in a future
patch.

Bug: v8:11257
Change-Id: I61e8c46b56719a1179c8a6032dbf8a4cc03b40a9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2719083
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73141}
parent f4ec0f4d
......@@ -2175,6 +2175,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64F32x4Min, Fmin, 4S);
SIMD_BINOP_CASE(kArm64F32x4Max, Fmax, 4S);
SIMD_BINOP_CASE(kArm64F32x4Eq, Fcmeq, 4S);
case kArm64F32x4MulElement: {
__ Fmul(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).V4S(),
i.InputSimd128Register(1).S(), i.InputInt8(2));
break;
}
case kArm64F32x4Ne: {
VRegister dst = i.OutputSimd128Register().V4S();
__ Fcmeq(dst, i.InputSimd128Register(0).V4S(),
......
......@@ -215,6 +215,7 @@ namespace compiler {
V(Arm64F32x4AddHoriz) \
V(Arm64F32x4Sub) \
V(Arm64F32x4Mul) \
V(Arm64F32x4MulElement) \
V(Arm64F32x4Div) \
V(Arm64F32x4Min) \
V(Arm64F32x4Max) \
......
......@@ -180,6 +180,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64F32x4AddHoriz:
case kArm64F32x4Sub:
case kArm64F32x4Mul:
case kArm64F32x4MulElement:
case kArm64F32x4Div:
case kArm64F32x4Min:
case kArm64F32x4Max:
......
......@@ -9,6 +9,7 @@
#include "src/common/globals.h"
#include "src/compiler/backend/instruction-codes.h"
#include "src/compiler/backend/instruction-selector-impl.h"
#include "src/compiler/backend/instruction-selector.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/node-properties.h"
......@@ -3467,7 +3468,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(F32x4Add, kArm64F32x4Add) \
V(F32x4AddHoriz, kArm64F32x4AddHoriz) \
V(F32x4Sub, kArm64F32x4Sub) \
V(F32x4Mul, kArm64F32x4Mul) \
V(F32x4Div, kArm64F32x4Div) \
V(F32x4Min, kArm64F32x4Min) \
V(F32x4Max, kArm64F32x4Max) \
......@@ -3614,6 +3614,52 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
#undef SIMD_BINOP_LIST
// Matches an i8x16.shuffle node and exposes its 16-byte shuffle immediate.
using ShuffleMatcher =
    ValueMatcher<S128ImmediateParameter, IrOpcode::kI8x16Shuffle>;
// Binop matcher whose two inputs are both (potential) shuffle nodes.
using BinopWithShuffleMatcher = BinopMatcher<ShuffleMatcher, ShuffleMatcher>;
void InstructionSelector::VisitF32x4Mul(Node* node) {
  // Pattern match:
  //   f32x4.mul(x, shuffle(x, y, indices))
  // where the shuffle duplicates a single f32 lane of one of its inputs, and
  // lower it to an fmul-by-element instruction. f32x4.mul is commutative, so
  // the shuffle may appear as either multiplicand (hence BinopMatcher).
  //
  // TODO(zhin): We can canonicalize first to avoid checking lane < 4,
  // e.g. shuffle(x, y, [16, 17, 18, 19, ...]) => shuffle(y, y, [0, 1, 2, 3, ...]).
  // But doing so can mutate the inputs of the shuffle node without updating the
  // shuffle immediates themselves; fix that before canonicalizing here.
  //
  // Deliberately no CanCover check: in common Wasm code the shuffle is created
  // early in the function while the f32x4.mul sits inside a loop, i.e. in a
  // different basic block, so CanCover would reject the match.
  BinopWithShuffleMatcher matcher(node);
  ShuffleMatcher lhs = matcher.left();
  ShuffleMatcher rhs = matcher.right();

  Node* plain_input = nullptr;  // The non-shuffle multiplicand.
  Node* dup_src = nullptr;      // Shuffle input that the duplicated lane reads.
  int lane = 0;
  if (lhs.HasResolvedValue() &&
      wasm::SimdShuffle::TryMatchSplat<4>(lhs.ResolvedValue().data(), &lane)) {
    dup_src = lhs.node()->InputAt(lane < 4 ? 0 : 1);
    plain_input = rhs.node();
  } else if (rhs.HasResolvedValue() &&
             wasm::SimdShuffle::TryMatchSplat<4>(rhs.ResolvedValue().data(),
                                                 &lane)) {
    dup_src = rhs.node()->InputAt(lane < 4 ? 0 : 1);
    plain_input = lhs.node();
  }
  if (dup_src == nullptr) {
    // No dup pattern found; fall back to a plain vector multiply.
    return VisitRRR(this, kArm64F32x4Mul, node);
  }
  // Lane indices >= 4 refer to the shuffle's second input; reduce to 0..3.
  // (Canonicalization would get rid of this too.)
  lane %= 4;
  Arm64OperandGenerator g(this);
  Emit(kArm64F32x4MulElement, g.DefineAsRegister(node),
       g.UseRegister(plain_input), g.UseRegister(dup_src),
       g.UseImmediate(lane));
}
void InstructionSelector::VisitI64x2Mul(Node* node) {
Arm64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
......
......@@ -182,6 +182,7 @@ class S128ImmediateParameter {
// Wraps a 16-byte SIMD immediate (e.g. an i8x16.shuffle mask) by value.
explicit S128ImmediateParameter(const uint8_t immediate[16]) {
  std::copy(immediate, immediate + 16, immediate_.begin());
}
// Default construction leaves the bytes value-initialized; needed so the
// parameter can be used with matchers that default-construct it.
S128ImmediateParameter() = default;
const std::array<uint8_t, 16>& immediate() const { return immediate_; }
const uint8_t* data() const { return immediate_.data(); }
uint8_t operator[](int x) const { return immediate_[x]; }
......
......@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/common/globals.h"
#include "src/objects/objects-inl.h"
#include "test/unittests/compiler/backend/instruction-selector-unittest.h"
......@@ -2217,6 +2218,118 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDDPWithSIMDMulTest,
::testing::ValuesIn(kSIMDMulDPInstructions));
// One test case for the f32x4.mul(x, dup-shuffle) pattern match.
struct SIMDMulDupInst {
  const uint8_t shuffle[16];  // i8x16.shuffle immediate duplicating one f32 lane.
  int32_t lane;               // Lane index expected as the emitted immediate.
  int shuffle_input_index;    // Which shuffle input (0 or 1) the lane comes from.
};
// All eight f32x4 dup shuffles: lanes 0-3 of the first shuffle input
// (byte indices 0-15) and lanes 0-3 of the second input (byte indices 16-31).
const SIMDMulDupInst kSIMDF32x4MulDuplInstructions[] = {
    {
        {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
        0,
        0,
    },
    {
        {4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7},
        1,
        0,
    },
    {
        {8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11},
        2,
        0,
    },
    {
        {12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15},
        3,
        0,
    },
    {
        {16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19},
        0,
        1,
    },
    {
        {20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23},
        1,
        1,
    },
    {
        {24, 25, 26, 27, 24, 25, 26, 27, 24, 25, 26, 27, 24, 25, 26, 27},
        2,
        1,
    },
    {
        {28, 29, 30, 31, 28, 29, 30, 31, 28, 29, 30, 31, 28, 29, 30, 31},
        3,
        1,
    },
};
using InstructionSelectorSimdMulWithDupTest =
InstructionSelectorTestWithParam<SIMDMulDupInst>;
TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
  const SIMDMulDupInst param = GetParam();
  const MachineType type = MachineType::Simd128();
  // f32x4.mul is commutative, so the dup-shuffle must be recognized whether it
  // appears as the right (first iteration) or the left multiplicand.
  for (const bool shuffle_is_lhs : {false, true}) {
    StreamBuilder m(this, type, type, type, type);
    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
                              m.Parameter(0), m.Parameter(1));
    Node* mul =
        shuffle_is_lhs
            ? m.AddNode(m.machine()->F32x4Mul(), shuffle, m.Parameter(2))
            : m.AddNode(m.machine()->F32x4Mul(), m.Parameter(2), shuffle);
    m.Return(mul);
    Stream s = m.Build();
    // Exactly one instruction: the fmul-by-element with the dup folded in.
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kArm64F32x4MulElement, s[0]->arch_opcode());
    EXPECT_EQ(3U, s[0]->InputCount());
    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
    EXPECT_EQ(1U, s[0]->OutputCount());
    // Input 1 must be the shuffle source the duplicated lane was read from.
    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
              s.ToVreg(s[0]->InputAt(1)));
  }
}
// Instantiate the parameterized suite over all eight f32x4 dup shuffles.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
                         InstructionSelectorSimdMulWithDupTest,
                         ::testing::ValuesIn(kSIMDF32x4MulDuplInstructions));
TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
  const MachineType type = MachineType::Simd128();
  // Check that optimization does not match when the shuffle is not a f32x4.dup.
  // An all-zero mask duplicates byte 0 (an i8x16.dup), not a 4-byte f32 lane.
  const uint8_t mask[kSimd128Size] = {0};
  {
    StreamBuilder m(this, type, type, type, type);
    Node* shuffle = m.AddNode((m.machine()->I8x16Shuffle(mask)), m.Parameter(0),
                              m.Parameter(1));
    m.Return(m.AddNode(m.machine()->F32x4Mul(), m.Parameter(2), shuffle));
    Stream s = m.Build();
    // Two instructions: the shuffle is lowered on its own, followed by a
    // plain vector multiply (no kArm64F32x4MulElement).
    ASSERT_EQ(2U, s.size());
    // The shuffle is a i8x16.dup of lane 0.
    EXPECT_EQ(kArm64S128Dup, s[0]->arch_opcode());
    EXPECT_EQ(3U, s[0]->InputCount());
    EXPECT_EQ(kArm64F32x4Mul, s[1]->arch_opcode());
    EXPECT_EQ(1U, s[0]->OutputCount());
    EXPECT_EQ(2U, s[1]->InputCount());
    EXPECT_EQ(1U, s[1]->OutputCount());
  }
}
TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
// x * (2^k + 1) -> x + (x << k)
TRACED_FORRANGE(int32_t, k, 1, 30) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment