Commit b7ade853 authored by Joey Gouly, committed by Commit Bot

[arm64][wasm] Implement I64x2 multiply

Implement I64x2 multiply using 32-bit multiplies.

This approach takes two fewer cycles (0.88x) on Cortex-A53 and three fewer cycles (0.86x)
on Cortex-A72 than moving the values to general-purpose registers and doing two 64-bit multiplies.

Based on a patch by Zhi An Ng.
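
For illustration, a minimal scalar sketch (hypothetical code, not part of this
commit; Mul64Via32 is a made-up name) of the identity the vector sequence
relies on: writing x = a + 2^32(b) and y = c + 2^32(d), the low 64 bits of
x * y are ac + 2^32(ad + bc), and only the low 32 bits of (ad + bc) matter.

// Minimal scalar sketch of the decomposition (illustrative only).
#include <cassert>
#include <cstdint>

uint64_t Mul64Via32(uint64_t x, uint64_t y) {
  uint32_t a = static_cast<uint32_t>(x);        // low half of x
  uint32_t b = static_cast<uint32_t>(x >> 32);  // high half of x
  uint32_t c = static_cast<uint32_t>(y);        // low half of y
  uint32_t d = static_cast<uint32_t>(y >> 32);  // high half of y
  uint64_t ac = static_cast<uint64_t>(a) * c;
  // Only the low 32 bits of (ad + bc) survive: the term is shifted up by
  // 32, so its upper bits fall off the top of the 64-bit result.
  uint32_t ad_bc = a * d + b * c;
  return ac + (static_cast<uint64_t>(ad_bc) << 32);
}

int main() {
  // Unsigned 64-bit multiplication wraps modulo 2^64, matching wasm
  // i64x2.mul lane semantics.
  assert(Mul64Via32(0x123456789abcdef0ull, 0xfedcba9876543210ull) ==
         0x123456789abcdef0ull * 0xfedcba9876543210ull);
  return 0;
}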

Bug: v8:8460
Change-Id: I9c8d3bb77f0d751eec2d85823522558b7f173628
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1781696
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63558}
parent a14e2f12
...@@ -1903,6 +1903,65 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_BINOP_CASE(kArm64I64x2Add, Add, 2D);
SIMD_BINOP_CASE(kArm64I64x2Sub, Sub, 2D);
case kArm64I64x2Mul: {
UseScratchRegisterScope scope(tasm());
VRegister dst = i.OutputSimd128Register();
VRegister src1 = i.InputSimd128Register(0);
VRegister src2 = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireSameSizeAs(dst);
VRegister tmp2 = scope.AcquireSameSizeAs(dst);
VRegister tmp3 = i.ToSimd128Register(instr->TempAt(0));
// This 2x64-bit multiplication is performed with several 32-bit
// multiplications.
// 64-bit numbers x and y can be represented as:
// x = a + 2^32(b)
// y = c + 2^32(d)
// A 64-bit multiplication is:
// x * y = ac + 2^32(ad + bc) + 2^64(bd)
// note: the `2^64(bd)` term can be ignored, as it lies entirely above
// the low 64 bits of the result.
// This sequence implements a 2x64-bit multiply, where the registers
// `src1` and `src2` are split up into 32-bit components:
// src1 = |d|c|b|a|
// src2 = |h|g|f|e|
//
// src1 * src2 = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
// Reverse the 32-bit elements in the 64-bit words.
// tmp2 = |g|h|e|f|
__ Rev64(tmp2.V4S(), src2.V4S());
// Calculate the high half components.
// tmp2 = |dg|ch|be|af|
__ Mul(tmp2.V4S(), tmp2.V4S(), src1.V4S());
// Extract the low half components of src1.
// tmp1 = |c|a|
__ Xtn(tmp1.V2S(), src1.V2D());
// Sum the respective high half components.
// tmp2 = |dg+ch|be+af|dg+ch|be+af|
__ Addp(tmp2.V4S(), tmp2.V4S(), tmp2.V4S());
// Extract the low half components of src2.
// tmp3 = |g|e|
__ Xtn(tmp3.V2S(), src2.V2D());
// Shift the high half components into the high half.
// dst = |(dg+ch) << 32|(be+af) << 32|
__ Shll(dst.V2D(), tmp2.V2S(), 32);
// Multiply the low components together, and accumulate with the high
// half.
// dst = |dst[1] + cg|dst[0] + ae|
__ Umlal(dst.V2D(), tmp3.V2S(), tmp1.V2S());
break;
}
SIMD_BINOP_CASE(kArm64I64x2Eq, Cmeq, 2D);
case kArm64I64x2Ne: {
VRegister dst = i.OutputSimd128Register().V2D();
......
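
Aside: a hypothetical lane-by-lane scalar simulation of the sequence above
(illustrative only; I64x2MulSim is a made-up name). Lanes are stored
low-to-high, so for src1 = |d|c|b|a| we have src1[0] = a through src1[3] = d.

// Hypothetical scalar simulation of the Rev64/Mul/Xtn/Addp/Shll/Umlal
// sequence (illustrative only, not V8 code).
#include <cstdint>

void I64x2MulSim(const uint32_t src1[4], const uint32_t src2[4],
                 uint64_t dst[2]) {
  // Rev64: swap the 32-bit elements within each 64-bit word of src2.
  // |h|g|f|e| -> |g|h|e|f|
  uint32_t tmp2[4] = {src2[1], src2[0], src2[3], src2[2]};
  // Mul (4S): elementwise 32-bit multiply with src1 -> |dg|ch|be|af|.
  for (int i = 0; i < 4; ++i) tmp2[i] *= src1[i];
  // Xtn: narrow each 64-bit word to its low 32 bits.
  uint32_t tmp1[2] = {src1[0], src1[2]};  // |c|a|
  uint32_t tmp3[2] = {src2[0], src2[2]};  // |g|e|
  // Addp: pairwise add; the Shll step consumes only the low two sums.
  uint32_t hi[2] = {tmp2[0] + tmp2[1], tmp2[2] + tmp2[3]};  // |dg+ch|be+af|
  // Shll widens the sums and shifts them into the high half; Umlal then
  // accumulates the widening product of the low 32-bit halves.
  dst[0] = (static_cast<uint64_t>(hi[0]) << 32) +
           static_cast<uint64_t>(tmp3[0]) * tmp1[0];  // ae + 2^32(af+be)
  dst[1] = (static_cast<uint64_t>(hi[1]) << 32) +
           static_cast<uint64_t>(tmp3[1]) * tmp1[1];  // cg + 2^32(ch+dg)
}

Each dst lane is the corresponding 64-bit lane product modulo 2^64, the
wraparound behaviour the I64x2Mul test below exercises via
base::MulWithWraparound.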
...@@ -213,6 +213,7 @@ namespace compiler {
V(Arm64I64x2ShrS) \
V(Arm64I64x2Add) \
V(Arm64I64x2Sub) \
V(Arm64I64x2Mul) \
V(Arm64I64x2Eq) \
V(Arm64I64x2Ne) \
V(Arm64I64x2GtS) \
......
...@@ -180,6 +180,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I64x2ShrS:
case kArm64I64x2Add:
case kArm64I64x2Sub:
case kArm64I64x2Mul:
case kArm64I64x2Eq:
case kArm64I64x2Ne:
case kArm64I64x2GtS:
......
...@@ -3236,6 +3236,14 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
#undef SIMD_BINOP_LIST
void InstructionSelector::VisitI64x2Mul(Node* node) {
Arm64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kArm64I64x2Mul, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
arraysize(temps), temps);
}
void InstructionSelector::VisitS128Select(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64S128Select, g.DefineSameAsFirst(node),
......
...@@ -2630,6 +2630,7 @@ void InstructionSelector::VisitI64x2Shl(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ShrS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Add(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Sub(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Ne(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ShrU(Node* node) { UNIMPLEMENTED(); }
...@@ -2640,7 +2641,6 @@ void InstructionSelector::VisitI64x2GeU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitS1x2AnyTrue(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitS1x2AllTrue(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
......
...@@ -1249,12 +1249,12 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {
RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
}
#if V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
base::MulWithWraparound);
}
#if V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I64x2MinS) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2MinS, Minimum);
}
......