Commit a5d45862 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm64] Optimize f32x4 dup + mul into fmul by element

Wasm SIMD does not have an opcode to multiply a vector by a scalar. In
these cases, Wasm code uses mul(x, shuffle(y, imms)), where the shuffle
is a dup of a single lane in y. Pattern match on this to emit a fmul
(element).

We can do similar pattern match on f64x2 too, that will come in a future
patch.

Bug: v8:11257
Change-Id: I61e8c46b56719a1179c8a6032dbf8a4cc03b40a9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2719083
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73141}
parent f4ec0f4d
......@@ -2175,6 +2175,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64F32x4Min, Fmin, 4S);
SIMD_BINOP_CASE(kArm64F32x4Max, Fmax, 4S);
SIMD_BINOP_CASE(kArm64F32x4Eq, Fcmeq, 4S);
case kArm64F32x4MulElement: {
__ Fmul(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).V4S(),
i.InputSimd128Register(1).S(), i.InputInt8(2));
break;
}
case kArm64F32x4Ne: {
VRegister dst = i.OutputSimd128Register().V4S();
__ Fcmeq(dst, i.InputSimd128Register(0).V4S(),
......
......@@ -215,6 +215,7 @@ namespace compiler {
V(Arm64F32x4AddHoriz) \
V(Arm64F32x4Sub) \
V(Arm64F32x4Mul) \
V(Arm64F32x4MulElement) \
V(Arm64F32x4Div) \
V(Arm64F32x4Min) \
V(Arm64F32x4Max) \
......
......@@ -180,6 +180,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64F32x4AddHoriz:
case kArm64F32x4Sub:
case kArm64F32x4Mul:
case kArm64F32x4MulElement:
case kArm64F32x4Div:
case kArm64F32x4Min:
case kArm64F32x4Max:
......
......@@ -9,6 +9,7 @@
#include "src/common/globals.h"
#include "src/compiler/backend/instruction-codes.h"
#include "src/compiler/backend/instruction-selector-impl.h"
#include "src/compiler/backend/instruction-selector.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/node-properties.h"
......@@ -3467,7 +3468,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(F32x4Add, kArm64F32x4Add) \
V(F32x4AddHoriz, kArm64F32x4AddHoriz) \
V(F32x4Sub, kArm64F32x4Sub) \
V(F32x4Mul, kArm64F32x4Mul) \
V(F32x4Div, kArm64F32x4Div) \
V(F32x4Min, kArm64F32x4Min) \
V(F32x4Max, kArm64F32x4Max) \
......@@ -3614,6 +3614,52 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
#undef SIMD_BINOP_LIST
// Matches an i8x16.shuffle node and exposes its 16-byte shuffle immediate.
using ShuffleMatcher =
    ValueMatcher<S128ImmediateParameter, IrOpcode::kI8x16Shuffle>;
// Binop matcher whose two inputs are both (potential) shuffle nodes.
using BinopWithShuffleMatcher = BinopMatcher<ShuffleMatcher, ShuffleMatcher>;
void InstructionSelector::VisitF32x4Mul(Node* node) {
  // Pattern match:
  //   f32x4.mul(x, shuffle(x, y, indices))
  // where the shuffle duplicates a single f32 lane of one of its inputs, and
  // lower it to an fmul-by-element instruction. f32x4.mul is commutative, so
  // the shuffle may appear as either multiplicand (hence BinopMatcher).
  //
  // TODO(zhin): We can canonicalize first to avoid checking lane < 4,
  // e.g. shuffle(x, y, [16, 17, 18, 19, ...]) => shuffle(y, y, [0, 1, 2, 3, ...]).
  // But doing so can mutate the inputs of the shuffle node without updating the
  // shuffle immediates themselves; fix that before canonicalizing here.
  //
  // Deliberately no CanCover check: in common Wasm code the shuffle is created
  // early in the function while the f32x4.mul sits inside a loop, i.e. in a
  // different basic block, so CanCover would reject the match.
  BinopWithShuffleMatcher matcher(node);
  ShuffleMatcher lhs = matcher.left();
  ShuffleMatcher rhs = matcher.right();

  Node* plain_input = nullptr;  // The non-shuffle multiplicand.
  Node* dup_src = nullptr;      // Shuffle input that the duplicated lane reads.
  int lane = 0;
  if (lhs.HasResolvedValue() &&
      wasm::SimdShuffle::TryMatchSplat<4>(lhs.ResolvedValue().data(), &lane)) {
    dup_src = lhs.node()->InputAt(lane < 4 ? 0 : 1);
    plain_input = rhs.node();
  } else if (rhs.HasResolvedValue() &&
             wasm::SimdShuffle::TryMatchSplat<4>(rhs.ResolvedValue().data(),
                                                 &lane)) {
    dup_src = rhs.node()->InputAt(lane < 4 ? 0 : 1);
    plain_input = lhs.node();
  }
  if (dup_src == nullptr) {
    // No dup pattern found; fall back to a plain vector multiply.
    return VisitRRR(this, kArm64F32x4Mul, node);
  }
  // Lane indices >= 4 refer to the shuffle's second input; reduce to 0..3.
  // (Canonicalization would get rid of this too.)
  lane %= 4;
  Arm64OperandGenerator g(this);
  Emit(kArm64F32x4MulElement, g.DefineAsRegister(node),
       g.UseRegister(plain_input), g.UseRegister(dup_src),
       g.UseImmediate(lane));
}
void InstructionSelector::VisitI64x2Mul(Node* node) {
Arm64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
......
......@@ -182,6 +182,7 @@ class S128ImmediateParameter {
// Wraps a 16-byte SIMD immediate (e.g. an i8x16.shuffle mask) by value.
explicit S128ImmediateParameter(const uint8_t immediate[16]) {
  std::copy(immediate, immediate + 16, immediate_.begin());
}
// Default construction leaves the bytes value-initialized; needed so the
// parameter can be used with matchers that default-construct it.
S128ImmediateParameter() = default;
const std::array<uint8_t, 16>& immediate() const { return immediate_; }
const uint8_t* data() const { return immediate_.data(); }
uint8_t operator[](int x) const { return immediate_[x]; }
......
......@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/common/globals.h"
#include "src/objects/objects-inl.h"
#include "test/unittests/compiler/backend/instruction-selector-unittest.h"
......@@ -2217,6 +2218,118 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDDPWithSIMDMulTest,
::testing::ValuesIn(kSIMDMulDPInstructions));
// One test case for the f32x4.mul(x, dup-shuffle) pattern match.
struct SIMDMulDupInst {
  const uint8_t shuffle[16];  // i8x16.shuffle immediate duplicating one f32 lane.
  int32_t lane;               // Lane index expected as the emitted immediate.
  int shuffle_input_index;    // Which shuffle input (0 or 1) the lane comes from.
};
// All eight f32x4 dup shuffles: lanes 0-3 of the first shuffle input
// (byte indices 0-15) and lanes 0-3 of the second input (byte indices 16-31).
const SIMDMulDupInst kSIMDF32x4MulDuplInstructions[] = {
    {
        {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
        0,
        0,
    },
    {
        {4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7},
        1,
        0,
    },
    {
        {8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11},
        2,
        0,
    },
    {
        {12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15},
        3,
        0,
    },
    {
        {16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19},
        0,
        1,
    },
    {
        {20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23},
        1,
        1,
    },
    {
        {24, 25, 26, 27, 24, 25, 26, 27, 24, 25, 26, 27, 24, 25, 26, 27},
        2,
        1,
    },
    {
        {28, 29, 30, 31, 28, 29, 30, 31, 28, 29, 30, 31, 28, 29, 30, 31},
        3,
        1,
    },
};
using InstructionSelectorSimdMulWithDupTest =
InstructionSelectorTestWithParam<SIMDMulDupInst>;
TEST_P(InstructionSelectorSimdMulWithDupTest, MulWithDup) {
  const SIMDMulDupInst param = GetParam();
  const MachineType type = MachineType::Simd128();
  // f32x4.mul is commutative, so the dup-shuffle must be recognized whether it
  // appears as the right (first iteration) or the left multiplicand.
  for (const bool shuffle_is_lhs : {false, true}) {
    StreamBuilder m(this, type, type, type, type);
    Node* shuffle = m.AddNode(m.machine()->I8x16Shuffle(param.shuffle),
                              m.Parameter(0), m.Parameter(1));
    Node* mul =
        shuffle_is_lhs
            ? m.AddNode(m.machine()->F32x4Mul(), shuffle, m.Parameter(2))
            : m.AddNode(m.machine()->F32x4Mul(), m.Parameter(2), shuffle);
    m.Return(mul);
    Stream s = m.Build();
    // Exactly one instruction: the fmul-by-element with the dup folded in.
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kArm64F32x4MulElement, s[0]->arch_opcode());
    EXPECT_EQ(3U, s[0]->InputCount());
    EXPECT_EQ(param.lane, s.ToInt32(s[0]->InputAt(2)));
    EXPECT_EQ(1U, s[0]->OutputCount());
    // Input 1 must be the shuffle source the duplicated lane was read from.
    EXPECT_EQ(s.ToVreg(m.Parameter(param.shuffle_input_index)),
              s.ToVreg(s[0]->InputAt(1)));
  }
}
// Instantiate the parameterized suite over all eight f32x4 dup shuffles.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
                         InstructionSelectorSimdMulWithDupTest,
                         ::testing::ValuesIn(kSIMDF32x4MulDuplInstructions));
TEST_F(InstructionSelectorTest, SimdMulWithDupNegativeTest) {
  const MachineType type = MachineType::Simd128();
  // Check that optimization does not match when the shuffle is not a f32x4.dup.
  // An all-zero mask duplicates byte 0 (an i8x16.dup), not a 4-byte f32 lane.
  const uint8_t mask[kSimd128Size] = {0};
  {
    StreamBuilder m(this, type, type, type, type);
    Node* shuffle = m.AddNode((m.machine()->I8x16Shuffle(mask)), m.Parameter(0),
                              m.Parameter(1));
    m.Return(m.AddNode(m.machine()->F32x4Mul(), m.Parameter(2), shuffle));
    Stream s = m.Build();
    // Two instructions: the shuffle is lowered on its own, followed by a
    // plain vector multiply (no kArm64F32x4MulElement).
    ASSERT_EQ(2U, s.size());
    // The shuffle is a i8x16.dup of lane 0.
    EXPECT_EQ(kArm64S128Dup, s[0]->arch_opcode());
    EXPECT_EQ(3U, s[0]->InputCount());
    EXPECT_EQ(kArm64F32x4Mul, s[1]->arch_opcode());
    EXPECT_EQ(1U, s[0]->OutputCount());
    EXPECT_EQ(2U, s[1]->InputCount());
    EXPECT_EQ(1U, s[1]->OutputCount());
  }
}
TEST_F(InstructionSelectorTest, Int32MulWithImmediate) {
// x * (2^k + 1) -> x + (x << k)
TRACED_FORRANGE(int32_t, k, 1, 30) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment