Commit de88bfb2 authored by Deepti Gandluri, committed by Commit Bot

[wasm-simd] Implement remaining I8x16 SIMD ops on x64

 - Implement I8x16 shifts and I8x16 multiplication
 - Fix a convert bug: I32x4UConvertF32x4 selection was missing its temp register
 - Enable all previously arch-gated tests except the shuffle tests

Change-Id: Id1a469d2883c30ea782c51d21dc462d211f94420
Reviewed-on: https://chromium-review.googlesource.com/c/1318609
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57254}
parent 691dbd2f
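[Editor's note] Before the diff, a scalar reference for the new lanewise ops may help; this is an illustrative sketch, not part of the patch. Shift counts are masked to 0..7, matching the `& 0x7` in the code generator below; left shifts and products wrap to 8 bits.

#include <cstdint>

int8_t RefI8x16Shl(int8_t v, int shift) {
  // Shift in the unsigned domain, then truncate back to 8 bits.
  return static_cast<int8_t>(static_cast<uint8_t>(v) << (shift & 7));
}
int8_t RefI8x16ShrS(int8_t v, int shift) {
  return static_cast<int8_t>(v >> (shift & 7));  // arithmetic shift
}
int8_t RefI8x16ShrU(int8_t v, int shift) {
  return static_cast<int8_t>(static_cast<uint8_t>(v) >> (shift & 7));
}
int8_t RefI8x16Mul(int8_t a, int8_t b) {
  return static_cast<int8_t>(a * b);  // low 8 bits of the 16-bit product
}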
@@ -2481,14 +2481,6 @@ void InstructionSelector::VisitWord64AtomicCompareExchange(Node* node) {
 #if !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS && \
     !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
-void InstructionSelector::VisitI8x16Shl(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitI8x16ShrS(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitI8x16ShrU(Node* node) { UNIMPLEMENTED(); }
-void InstructionSelector::VisitI8x16Mul(Node* node) { UNIMPLEMENTED(); }
 void InstructionSelector::VisitS8x16Shuffle(Node* node) { UNIMPLEMENTED(); }
 #endif  // !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_MIPS
         // && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_IA32
...
@@ -2683,6 +2683,37 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
+    case kX64I8x16Shl: {
+      XMMRegister dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      int8_t shift = i.InputInt8(1) & 0x7;
+      if (shift < 4) {
+        // For small shifts, doubling is faster.
+        for (int i = 0; i < shift; ++i) {
+          __ paddb(dst, dst);
+        }
+      } else {
+        // Mask off the unwanted bits before word-shifting.
+        __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+        __ psrlw(kScratchDoubleReg, 8 + shift);
+        __ packuswb(kScratchDoubleReg, kScratchDoubleReg);
+        __ pand(dst, kScratchDoubleReg);
+        __ psllw(dst, shift);
+      }
+      break;
+    }
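// [Editor's sketch, not part of this patch] The masking trick above, written
// with SSE2 intrinsics: x64 has no per-byte shift (psllb), so the bits that
// would spill into the neighboring byte are cleared first and a 16-bit shift
// does the rest. The helper name is hypothetical.
#include <emmintrin.h>
static __m128i I8x16ShlSketch(__m128i v, int shift) {
  // Keep only the low (8 - shift) bits of each byte, mirroring the
  // pcmpeqw/psrlw/packuswb mask construction above.
  __m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
  return _mm_slli_epi16(_mm_and_si128(v, mask), shift);
}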
+    case kX64I8x16ShrS: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
+      int8_t shift = i.InputInt8(1) & 0x7;
+      // Unpack the bytes into words, do arithmetic shifts, and repack.
+      __ punpckhbw(kScratchDoubleReg, src);
+      __ punpcklbw(dst, src);
+      __ psraw(kScratchDoubleReg, 8 + shift);
+      __ psraw(dst, 8 + shift);
+      __ packsswb(dst, kScratchDoubleReg);
+      break;
+    }
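// [Editor's sketch, not part of this patch] The unpack idiom above in SSE2
// intrinsics: interleaving each byte with itself puts it in the high half of
// a 16-bit lane, so an arithmetic word shift by 8 + shift yields the
// sign-extended byte result. The helper name is hypothetical.
#include <emmintrin.h>
static __m128i I8x16ShrSSketch(__m128i v, int shift) {
  __m128i lo = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8 + shift);
  __m128i hi = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8 + shift);
  // The shifted values fit in int8, so the saturating pack is exact.
  return _mm_packs_epi16(lo, hi);
}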
     case kX64I8x16Add: {
       __ paddb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
@@ -2699,6 +2730,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ psubsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
     }
+    case kX64I8x16Mul: {
+      XMMRegister dst = i.OutputSimd128Register();
+      DCHECK_EQ(dst, i.InputSimd128Register(0));
+      XMMRegister right = i.InputSimd128Register(1);
+      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
+      // I16x8 view of I8x16
+      // left  = AAaa AAaa ... AAaa AAaa
+      // right = BBbb BBbb ... BBbb BBbb
+      // t     = 00AA 00AA ... 00AA 00AA
+      // s     = 00BB 00BB ... 00BB 00BB
+      __ movaps(tmp, dst);
+      __ movaps(kScratchDoubleReg, right);
+      __ psrlw(tmp, 8);
+      __ psrlw(kScratchDoubleReg, 8);
+      // dst = left * 256
+      __ psllw(dst, 8);
+      // t = I16x8Mul(t, s)
+      //   => __PP __PP ... __PP __PP
+      __ pmullw(tmp, kScratchDoubleReg);
+      // dst = I16x8Mul(left * 256, right)
+      //   => pp__ pp__ ... pp__ pp__
+      __ pmullw(dst, right);
+      // t = I16x8Shl(t, 8)
+      //   => PP00 PP00 ... PP00 PP00
+      __ psllw(tmp, 8);
+      // dst = I16x8Shr(dst, 8)
+      //   => 00pp 00pp ... 00pp 00pp
+      __ psrlw(dst, 8);
+      // dst = I16x8Or(dst, t)
+      //   => PPpp PPpp ... PPpp PPpp
+      __ por(dst, tmp);
+      break;
+    }
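// [Editor's sketch, not part of this patch] The same even/odd decomposition
// with SSE2 intrinsics. There is no pmullb, so odd bytes are multiplied in
// the low half of each word and even bytes in the high half, then the two
// halves are recombined. The helper name is hypothetical.
#include <emmintrin.h>
static __m128i I8x16MulSketch(__m128i a, __m128i b) {
  // Odd bytes: move them into the low byte of each word and multiply.
  __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srli_epi16(b, 8));
  // Even bytes: pre-shifting a by 8 makes the word multiply deposit the
  // product's low 8 bits in the high byte, truncating the overflow for us.
  __m128i even = _mm_srli_epi16(_mm_mullo_epi16(_mm_slli_epi16(a, 8), b), 8);
  // Recombine: odd products go back to the high byte of each word.
  return _mm_or_si128(_mm_slli_epi16(odd, 8), even);
}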
     case kX64I8x16MinS: {
       CpuFeatureScope sse_scope(tasm(), SSE4_1);
       __ pminsb(i.OutputSimd128Register(), i.InputSimd128Register(1));
@@ -2743,6 +2807,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ packuswb(dst, kScratchDoubleReg);
       break;
     }
+    case kX64I8x16ShrU: {
+      XMMRegister dst = i.OutputSimd128Register();
+      XMMRegister src = i.InputSimd128Register(0);
+      int8_t shift = i.InputInt8(1) & 0x7;
+      // Unpack the bytes into words, do logical shifts, and repack.
+      __ punpckhbw(kScratchDoubleReg, src);
+      __ punpcklbw(dst, src);
+      __ psrlw(kScratchDoubleReg, 8 + shift);
+      __ psrlw(dst, 8 + shift);
+      __ packuswb(dst, kScratchDoubleReg);
+      break;
+    }
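// [Editor's sketch, not part of this patch] Identical to the signed variant
// but with logical shifts and an unsigned pack; the shifted words are at
// most 255, so packuswb never saturates. The helper name is hypothetical.
#include <emmintrin.h>
static __m128i I8x16ShrUSketch(__m128i v, int shift) {
  __m128i lo = _mm_srli_epi16(_mm_unpacklo_epi8(v, v), 8 + shift);
  __m128i hi = _mm_srli_epi16(_mm_unpackhi_epi8(v, v), 8 + shift);
  return _mm_packus_epi16(lo, hi);
}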
     case kX64I8x16AddSaturateU: {
       __ paddusb(i.OutputSimd128Register(), i.InputSimd128Register(1));
       break;
...
@@ -230,10 +230,13 @@ namespace compiler {
   V(X64I8x16ReplaceLane)   \
   V(X64I8x16SConvertI16x8) \
   V(X64I8x16Neg)           \
+  V(X64I8x16Shl)           \
+  V(X64I8x16ShrS)          \
   V(X64I8x16Add)           \
   V(X64I8x16AddSaturateS)  \
   V(X64I8x16Sub)           \
   V(X64I8x16SubSaturateS)  \
+  V(X64I8x16Mul)           \
   V(X64I8x16MinS)          \
   V(X64I8x16MaxS)          \
   V(X64I8x16Eq)            \
@@ -243,16 +246,17 @@ namespace compiler {
   V(X64I8x16UConvertI16x8) \
   V(X64I8x16AddSaturateU)  \
   V(X64I8x16SubSaturateU)  \
+  V(X64I8x16ShrU)          \
   V(X64I8x16MinU)          \
   V(X64I8x16MaxU)          \
   V(X64I8x16GtU)           \
   V(X64I8x16GeU)           \
-  V(X64S128Zero)           \
-  V(X64S128Not)            \
   V(X64S128And)            \
   V(X64S128Or)             \
   V(X64S128Xor)            \
+  V(X64S128Not)            \
   V(X64S128Select)         \
+  V(X64S128Zero)           \
   V(X64S1x4AnyTrue)        \
   V(X64S1x4AllTrue)        \
   V(X64S1x8AnyTrue)        \
...
@@ -207,10 +207,13 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16ReplaceLane:
     case kX64I8x16SConvertI16x8:
     case kX64I8x16Neg:
+    case kX64I8x16Shl:
+    case kX64I8x16ShrS:
     case kX64I8x16Add:
     case kX64I8x16AddSaturateS:
     case kX64I8x16Sub:
     case kX64I8x16SubSaturateS:
+    case kX64I8x16Mul:
     case kX64I8x16MinS:
     case kX64I8x16MaxS:
     case kX64I8x16Eq:
@@ -220,6 +223,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64I8x16UConvertI16x8:
     case kX64I8x16AddSaturateU:
     case kX64I8x16SubSaturateU:
+    case kX64I8x16ShrU:
     case kX64I8x16MinU:
     case kX64I8x16MaxU:
     case kX64I8x16GtU:
...
@@ -2655,7 +2655,10 @@ VISIT_ATOMIC_BINOP(Xor)
   V(I32x4ShrU) \
   V(I16x8Shl)  \
   V(I16x8ShrS) \
-  V(I16x8ShrU)
+  V(I16x8ShrU) \
+  V(I8x16Shl)  \
+  V(I8x16ShrS) \
+  V(I8x16ShrU)

 #define SIMD_ANYTRUE_LIST(V) \
   V(S1x4AnyTrue)             \
@@ -2777,8 +2780,9 @@ void InstructionSelector::VisitI32x4SConvertF32x4(Node* node) {
 void InstructionSelector::VisitI32x4UConvertF32x4(Node* node) {
   X64OperandGenerator g(this);
+  InstructionOperand temps[] = {g.TempSimd128Register()};
   Emit(kX64I32x4UConvertF32x4, g.DefineSameAsFirst(node),
-       g.UseRegister(node->InputAt(0)));
+       g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
 }

 void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
@@ -2793,6 +2797,14 @@ void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
        g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
 }
+void InstructionSelector::VisitI8x16Mul(Node* node) {
+  X64OperandGenerator g(this);
+  InstructionOperand temps[] = {g.TempSimd128Register()};
+  // Unique registers keep the second input from aliasing the output or the
+  // temp, both of which the code generator overwrites while forming the
+  // partial products.
+  Emit(kX64I8x16Mul, g.DefineSameAsFirst(node),
+       g.UseUniqueRegister(node->InputAt(0)),
+       g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
+}
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
   UNREACHABLE();
 }
...
@@ -438,8 +438,6 @@ WASM_SIMD_TEST(F32x4ReplaceLane) {
   CHECK_EQ(1, r.Call(3.14159f, -1.5f));
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 // Tests both signed and unsigned conversion.
 WASM_SIMD_TEST(F32x4ConvertI32x4) {
   WasmRunner<int32_t, int32_t, float, float> r(execution_tier, lower_simd);
@@ -463,8 +461,6 @@ WASM_SIMD_TEST(F32x4ConvertI32x4) {
                static_cast<float>(static_cast<uint32_t>(*i))));
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunF32x4UnOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, FloatUnOp expected_op,
@@ -819,9 +815,6 @@ WASM_SIMD_TEST(I8x16ReplaceLane) {
   CHECK_EQ(1, r.Call(1, 2));
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 int32_t ConvertToInt(double val, bool unsigned_integer) {
   if (std::isnan(val)) return 0;
   if (unsigned_integer) {
@@ -900,8 +893,6 @@ WASM_SIMD_TEST(I32x4ConvertI16x8) {
     CHECK_EQ(1, r.Call(*i, unpacked_signed, unpacked_unsigned, 0));
   }
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI32x4UnOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                       WasmOpcode simd_op, Int32UnOp expected_op) {
@@ -1542,13 +1533,9 @@ WASM_SIMD_TEST(I8x16LeU) {
                     UnsignedLessEqual);
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 WASM_SIMD_TEST(I8x16Mul) {
   RunI8x16BinOpTest(execution_tier, lower_simd, kExprI8x16Mul, Mul);
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 void RunI8x16ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                          WasmOpcode simd_op, Int8ShiftOp expected_op) {
@@ -1566,8 +1553,6 @@ void RunI8x16ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
   }
 }

-#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS || \
-    V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32
 WASM_SIMD_TEST(I8x16Shl) {
   RunI8x16ShiftOpTest(execution_tier, lower_simd, kExprI8x16Shl,
                       LogicalShiftLeft);
@@ -1582,8 +1567,6 @@ WASM_SIMD_TEST(I8x16ShrU) {
   RunI8x16ShiftOpTest(execution_tier, lower_simd, kExprI8x16ShrU,
                       LogicalShiftRight);
 }
-#endif  // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_MIPS ||
-        // V8_TARGET_ARCH_MIPS64 || V8_TARGET_ARCH_IA32

 // Test Select by making a mask where the 0th and 3rd lanes are true and the
 // rest false, and comparing for non-equality with zero to convert to a boolean
...