PPC: [wasm-simd] Implement horizontal add

Change-Id: I8962c08329c57367ff82d4669880c7efb1db8875
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2229304
Reviewed-by: Junliang Yan <jyan@ca.ibm.com>
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#68158}

PPC: [wasm-simd] Implement horizontal add
Change-Id: I8962c08329c57367ff82d4669880c7efb1db8875
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2229304
Reviewed-by: Junliang Yan <jyan@ca.ibm.com>
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#68158}
7e89ba7f · Milad Farazmand · Commit Bot · 02ee6904 · 7e89ba7f · 7e89ba7f
Commit 7e89ba7f authored Jun 03, 2020 by Milad Farazmand Committed by Commit Bot Jun 03, 2020
7 changed files
--- a/src/codegen/ppc/assembler-ppc.cc
+++ b/src/codegen/ppc/assembler-ppc.cc
@@ -1758,12 +1758,12 @@ void Assembler::fmsub(const DoubleRegister frt, const DoubleRegister fra,
 }

 // Vector instructions
-void Assembler::mfvsrd(const Register ra, const DoubleRegister rs) {
+void Assembler::mfvsrd(const Register ra, const Simd128Register rs) {
  int SX = 1;
  emit(MFVSRD | rs.code() * B21 | ra.code() * B16 | SX);
 }

-void Assembler::mfvsrwz(const Register ra, const DoubleRegister rs) {
+void Assembler::mfvsrwz(const Register ra, const Simd128Register rs) {
  int SX = 1;
  emit(MFVSRWZ | rs.code() * B21 | ra.code() * B16 | SX);
 }

--- a/src/codegen/ppc/assembler-ppc.h
+++ b/src/codegen/ppc/assembler-ppc.h
@@ -974,8 +974,8 @@ class Assembler : public AssemblerBase {
             RCBit rc = LeaveRC);

  // Vector instructions
-  void mfvsrd(const Register ra, const DoubleRegister r);
-  void mfvsrwz(const Register ra, const DoubleRegister r);
+  void mfvsrd(const Register ra, const Simd128Register r);
+  void mfvsrwz(const Register ra, const Simd128Register r);
  void mtvsrd(const Simd128Register rt, const Register ra);

  // Pseudo instructions

--- a/src/codegen/ppc/constants-ppc.h
+++ b/src/codegen/ppc/constants-ppc.h
@@ -2244,7 +2244,15 @@ using Instr = uint32_t;
  /* Vector Multiply Even Unsigned Byte */                 \
  V(vmuleub, VMULEUB, 0x10000208)                          \
  /* Vector Multiply Odd Unsigned Byte */                  \
-  V(vmuloub, VMULOUB, 0x10000008)
+  V(vmuloub, VMULOUB, 0x10000008)                          \
+  /* Vector Sum across Quarter Signed Halfword Saturate */ \
+  V(vsum4shs, VSUM4SHS, 0x10000648)                        \
+  /* Vector Pack Unsigned Word Unsigned Saturate */        \
+  V(vpkuwus, VPKUWUS, 0x100000CE)                          \
+  /* Vector Sum across Half Signed Word Saturate */        \
+  V(vsum2sws, VSUM2SWS, 0x10000688)                        \
+  /* Vector Pack Unsigned Doubleword Unsigned Modulo */    \
+  V(vpkudum, VPKUDUM, 0x1000044E)

 #define PPC_VX_OPCODE_UNUSED_LIST(V)                                      \
  /* Decimal Add Modulo */                                                \
@@ -2401,16 +2409,12 @@ using Instr = uint32_t;
  V(vpkswss, VPKSWSS, 0x100001CE)                                         \
  /* Vector Pack Signed Word Unsigned Saturate */                         \
  V(vpkswus, VPKSWUS, 0x1000014E)                                         \
-  /* Vector Pack Unsigned Doubleword Unsigned Modulo */                   \
-  V(vpkudum, VPKUDUM, 0x1000044E)                                         \
  /* Vector Pack Unsigned Doubleword Unsigned Saturate */                 \
  V(vpkudus, VPKUDUS, 0x100004CE)                                         \
  /* Vector Pack Unsigned Halfword Unsigned Saturate */                   \
  V(vpkuhus, VPKUHUS, 0x1000008E)                                         \
  /* Vector Pack Unsigned Word Unsigned Modulo */                         \
  V(vpkuwum, VPKUWUM, 0x1000004E)                                         \
-  /* Vector Pack Unsigned Word Unsigned Saturate */                       \
-  V(vpkuwus, VPKUWUS, 0x100000CE)                                         \
  /* Vector Polynomial Multiply-Sum Byte */                               \
  V(vpmsumb, VPMSUMB, 0x10000408)                                         \
  /* Vector Polynomial Multiply-Sum Doubleword */                         \
@@ -2499,12 +2503,8 @@ using Instr = uint32_t;
  V(vsubuqm, VSUBUQM, 0x10000500)                                         \
  /* Vector Subtract Unsigned Word Saturate */                            \
  V(vsubuws, VSUBUWS, 0x10000680)                                         \
-  /* Vector Sum across Half Signed Word Saturate */                       \
-  V(vsum2sws, VSUM2SWS, 0x10000688)                                       \
  /* Vector Sum across Quarter Signed Byte Saturate */                    \
  V(vsum4sbs, VSUM4SBS, 0x10000708)                                       \
-  /* Vector Sum across Quarter Signed Halfword Saturate */                \
-  V(vsum4shs, VSUM4SHS, 0x10000648)                                       \
  /* Vector Sum across Quarter Unsigned Byte Saturate */                  \
  V(vsum4bus, VSUM4BUS, 0x10000608)                                       \
  /* Vector Sum across Signed Word Saturate */                            \

--- a/src/compiler/backend/ppc/code-generator-ppc.cc
+++ b/src/compiler/backend/ppc/code-generator-ppc.cc
@@ -2402,6 +2402,26 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                i.InputSimd128Register(1));
      break;
    }
+    case kPPC_F32x4AddHoriz: {
+      Simd128Register src0 = i.InputSimd128Register(0);
+      Simd128Register src1 = i.InputSimd128Register(1);
+      Simd128Register dst = i.OutputSimd128Register();
+      Simd128Register tempFPReg1 = i.ToSimd128Register(instr->TempAt(0));
+      Simd128Register tempFPReg2 = i.ToSimd128Register(instr->TempAt(1));
+      constexpr int shift_bits = 32;
+      // generate first operand
+      __ vpkudum(dst, src1, src0);
+      // generate second operand
+      __ li(ip, Operand(shift_bits));
+      __ mtvsrd(tempFPReg2, ip);
+      __ vspltb(tempFPReg2, tempFPReg2, Operand(7));
+      __ vsro(tempFPReg1, src0, tempFPReg2);
+      __ vsro(tempFPReg2, src1, tempFPReg2);
+      __ vpkudum(kScratchDoubleReg, tempFPReg2, tempFPReg1);
+      // add the operands
+      __ vaddfp(dst, kScratchDoubleReg, dst);
+      break;
+    }
    case kPPC_F32x4Sub: {
      __ vsubfp(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputSimd128Register(1));
@@ -2445,6 +2465,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                 i.InputSimd128Register(1));
      break;
    }
+    case kPPC_I32x4AddHoriz: {
+      Simd128Register src0 = i.InputSimd128Register(0);
+      Simd128Register src1 = i.InputSimd128Register(1);
+      Simd128Register dst = i.OutputSimd128Register();
+      __ vxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
+      __ vsum2sws(dst, src0, kScratchDoubleReg);
+      __ vsum2sws(kScratchDoubleReg, src1, kScratchDoubleReg);
+      __ vpkudum(dst, kScratchDoubleReg, dst);
+      break;
+    }
    case kPPC_I32x4Sub: {
      __ vsubuwm(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputSimd128Register(1));
@@ -2460,6 +2490,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                 i.InputSimd128Register(1));
      break;
    }
+    case kPPC_I16x8AddHoriz: {
+      Simd128Register src0 = i.InputSimd128Register(0);
+      Simd128Register src1 = i.InputSimd128Register(1);
+      Simd128Register dst = i.OutputSimd128Register();
+      __ vxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
+      __ vsum4shs(dst, src0, kScratchDoubleReg);
+      __ vsum4shs(kScratchDoubleReg, src1, kScratchDoubleReg);
+      __ vpkuwus(dst, kScratchDoubleReg, dst);
+      break;
+    }
    case kPPC_I16x8Sub: {
      __ vsubuhm(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputSimd128Register(1));

--- a/src/compiler/backend/ppc/instruction-codes-ppc.h
+++ b/src/compiler/backend/ppc/instruction-codes-ppc.h
@@ -200,6 +200,7 @@ namespace compiler {
  V(PPC_F32x4ExtractLane)            \
  V(PPC_F32x4ReplaceLane)            \
  V(PPC_F32x4Add)                    \
+  V(PPC_F32x4AddHoriz)               \
  V(PPC_F32x4Sub)                    \
  V(PPC_F32x4Mul)                    \
  V(PPC_I64x2Splat)                  \
@@ -212,6 +213,7 @@ namespace compiler {
  V(PPC_I32x4ExtractLane)            \
  V(PPC_I32x4ReplaceLane)            \
  V(PPC_I32x4Add)                    \
+  V(PPC_I32x4AddHoriz)               \
  V(PPC_I32x4Sub)                    \
  V(PPC_I32x4Mul)                    \
  V(PPC_I16x8Splat)                  \
@@ -219,6 +221,7 @@ namespace compiler {
  V(PPC_I16x8ExtractLaneS)           \
  V(PPC_I16x8ReplaceLane)            \
  V(PPC_I16x8Add)                    \
+  V(PPC_I16x8AddHoriz)               \
  V(PPC_I16x8Sub)                    \
  V(PPC_I16x8Mul)                    \
  V(PPC_I8x16Splat)                  \

--- a/src/compiler/backend/ppc/instruction-scheduler-ppc.cc
+++ b/src/compiler/backend/ppc/instruction-scheduler-ppc.cc
@@ -123,6 +123,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_F32x4ExtractLane:
    case kPPC_F32x4ReplaceLane:
    case kPPC_F32x4Add:
+    case kPPC_F32x4AddHoriz:
    case kPPC_F32x4Sub:
    case kPPC_F32x4Mul:
    case kPPC_I64x2Splat:
@@ -135,6 +136,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_I32x4ExtractLane:
    case kPPC_I32x4ReplaceLane:
    case kPPC_I32x4Add:
+    case kPPC_I32x4AddHoriz:
    case kPPC_I32x4Sub:
    case kPPC_I32x4Mul:
    case kPPC_I16x8Splat:
@@ -142,6 +144,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kPPC_I16x8ExtractLaneS:
    case kPPC_I16x8ReplaceLane:
    case kPPC_I16x8Add:
+    case kPPC_I16x8AddHoriz:
    case kPPC_I16x8Sub:
    case kPPC_I16x8Mul:
    case kPPC_I8x16Splat:

--- a/src/compiler/backend/ppc/instruction-selector-ppc.cc
+++ b/src/compiler/backend/ppc/instruction-selector-ppc.cc
@@ -2132,15 +2132,18 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
  V(F64x2Sub)              \
  V(F64x2Mul)              \
  V(F32x4Add)              \
+  V(F32x4AddHoriz)         \
  V(F32x4Sub)              \
  V(F32x4Mul)              \
  V(I64x2Add)              \
  V(I64x2Sub)              \
  V(I64x2Mul)              \
  V(I32x4Add)              \
+  V(I32x4AddHoriz)         \
  V(I32x4Sub)              \
  V(I32x4Mul)              \
  V(I16x8Add)              \
+  V(I16x8AddHoriz)         \
  V(I16x8Sub)              \
  V(I16x8Mul)              \
  V(I8x16Add)              \
@@ -2384,10 +2387,6 @@ void InstructionSelector::VisitF32x4RecipSqrtApprox(Node* node) {

 void InstructionSelector::VisitF32x4RecipApprox(Node* node) { UNIMPLEMENTED(); }

-void InstructionSelector::VisitF32x4AddHoriz(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitI32x4AddHoriz(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitI16x8AddHoriz(Node* node) { UNIMPLEMENTED(); }
-
 void InstructionSelector::VisitF32x4SConvertI32x4(Node* node) {
  UNIMPLEMENTED();
 }