Commit 2975ead6 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Optimize codegen when shift is constant

Define macros to identify constant shift values so that we can emit
better codegen in those cases. Otherwise we need to mask the shift
value before shifting. We also need separate macros for left and right
shifts, since right shifts require the shift value to be negated.

Bug: v8:10115
Change-Id: I9a032901d03d59cfaa871eefbc58f7f144fd521f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2041709
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66325}
parent 35afef86
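For context: WebAssembly defines SIMD shifts to take the shift count modulo the lane width, which is why the non-constant path below has to mask the count before shifting. A minimal scalar model of that semantics (illustrative C++ only, not V8 code):

  #include <array>
  #include <cstdint>

  // i32x4.shl semantics: each lane shifts by count mod 32, so a count that
  // is a multiple of the lane width leaves the vector unchanged.
  std::array<uint32_t, 4> I32x4Shl(std::array<uint32_t, 4> v, uint32_t count) {
    count &= 31;  // lane width 32, mask = 31
    for (uint32_t& lane : v) lane <<= count;
    return v;
  }

This is also why the instruction selector changes further down can drop a constant shift that is a multiple of the lane width entirely.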
@@ -494,6 +494,45 @@ void EmitMaybePoisonedFPLoad(CodeGenerator* codegen, InstructionCode opcode,
__ CallCFunction(ExternalReference::ieee754_##name##_function(), 0, 1); \
} while (0)
+
+// If the shift value is an immediate, we can call asm_imm, taking the shift
+// value modulo 2^width. Otherwise, emit code to perform the modulus operation
+// and call asm_shl.
+#define ASSEMBLE_SIMD_SHIFT_LEFT(asm_imm, width, format, asm_shl, gp)       \
+  do {                                                                      \
+    if (instr->InputAt(1)->IsImmediate()) {                                 \
+      __ asm_imm(i.OutputSimd128Register().format(),                        \
+                 i.InputSimd128Register(0).format(), i.InputInt##width(1)); \
+    } else {                                                                \
+      VRegister tmp = i.TempSimd128Register(0);                             \
+      Register shift = i.TempRegister(1).gp();                              \
+      constexpr int mask = (1 << width) - 1;                                \
+      __ And(shift, i.InputRegister32(1), mask);                            \
+      __ Dup(tmp.format(), shift);                                          \
+      __ asm_shl(i.OutputSimd128Register().format(),                        \
+                 i.InputSimd128Register(0).format(), tmp.format());         \
+    }                                                                       \
+  } while (0)
+
+// If the shift value is an immediate, we can call asm_imm, taking the shift
+// value modulo 2^width. Otherwise, emit code to perform the modulus operation
+// and call asm_shl, passing in the negated shift value (treated as a right
+// shift).
+#define ASSEMBLE_SIMD_SHIFT_RIGHT(asm_imm, width, format, asm_shl, gp)      \
+  do {                                                                      \
+    if (instr->InputAt(1)->IsImmediate()) {                                 \
+      __ asm_imm(i.OutputSimd128Register().format(),                        \
+                 i.InputSimd128Register(0).format(), i.InputInt##width(1)); \
+    } else {                                                                \
+      VRegister tmp = i.TempSimd128Register(0);                             \
+      Register shift = i.TempRegister(1).gp();                              \
+      constexpr int mask = (1 << width) - 1;                                \
+      __ And(shift, i.InputRegister32(1), mask);                            \
+      __ Dup(tmp.format(), shift);                                          \
+      __ Neg(tmp.format(), tmp.format());                                   \
+      __ asm_shl(i.OutputSimd128Register().format(),                        \
+                 i.InputSimd128Register(0).format(), tmp.format());         \
+    }                                                                       \
+  } while (0)
void CodeGenerator::AssembleDeconstructFrame() {
__ Mov(sp, fp);
__ Pop<TurboAssembler::kAuthLR>(fp, lr);
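To make the macros above concrete, here is roughly what ASSEMBLE_SIMD_SHIFT_LEFT(Shl, 6, V2D, Sshl, X), used by the kArm64I64x2Shl case below, expands to (hand-expanded from the macro; whitespace adjusted):

  if (instr->InputAt(1)->IsImmediate()) {
    // Constant shift: one shift-by-immediate instruction.
    __ Shl(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
           i.InputInt6(1));
  } else {
    // Variable shift: mask to [0, 63], splat into a vector, then shift.
    VRegister tmp = i.TempSimd128Register(0);
    Register shift = i.TempRegister(1).X();
    constexpr int mask = (1 << 6) - 1;
    __ And(shift, i.InputRegister32(1), mask);
    __ Dup(tmp.V2D(), shift);
    __ Sshl(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
            tmp.V2D());
  }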
@@ -1946,24 +1985,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_UNOP_CASE(kArm64I64x2Neg, Neg, 2D);
case kArm64I64x2Shl: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister(1);
-      // Take shift value modulo 64.
-      __ And(shift, i.InputRegister64(1), 63);
-      __ Dup(tmp.V2D(), shift);
-      __ Sshl(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
-              tmp.V2D());
+      ASSEMBLE_SIMD_SHIFT_LEFT(Shl, 6, V2D, Sshl, X);
break;
}
case kArm64I64x2ShrS: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister(1);
-      // Take shift value modulo 64.
-      __ And(shift, i.InputRegister64(1), 63);
-      __ Dup(tmp.V2D(), shift);
-      __ Neg(tmp.V2D(), tmp.V2D());
-      __ Sshl(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
-              tmp.V2D());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Sshr, 6, V2D, Sshl, X);
break;
}
SIMD_BINOP_CASE(kArm64I64x2Add, Add, 2D);
@@ -2038,14 +2064,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I64x2GtS, Cmgt, 2D);
SIMD_BINOP_CASE(kArm64I64x2GeS, Cmge, 2D);
case kArm64I64x2ShrU: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister(1);
-      // Take shift value modulo 64.
-      __ And(shift, i.InputRegister64(1), 63);
-      __ Dup(tmp.V2D(), shift);
-      __ Neg(tmp.V2D(), tmp.V2D());
-      __ Ushl(i.OutputSimd128Register().V2D(), i.InputSimd128Register(0).V2D(),
-              tmp.V2D());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Ushr, 6, V2D, Ushl, X);
break;
}
SIMD_BINOP_CASE(kArm64I64x2GtU, Cmhi, 2D);
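The right-shift macro negates the splatted count because AArch64 has no variable-count vector right shift: SSHL and USHL shift left for a positive per-lane count and right for a negative one. A simplified per-lane model (assumes the count was already masked to the lane width, as the macro guarantees):

  // Per-lane behavior of SSHL with a register shift operand (simplified).
  int64_t SshlLane(int64_t lane, int8_t count) {
    return count >= 0 ? lane << count : lane >> -count;  // arithmetic shift
  }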
@@ -2073,24 +2092,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_WIDENING_UNOP_CASE(kArm64I32x4SConvertI16x8High, Sxtl2, 4S, 8H);
SIMD_UNOP_CASE(kArm64I32x4Neg, Neg, 4S);
case kArm64I32x4Shl: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 32.
-      __ And(shift, i.InputRegister32(1), 31);
-      __ Dup(tmp.V4S(), shift);
-      __ Sshl(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).V4S(),
-              tmp.V4S());
+      ASSEMBLE_SIMD_SHIFT_LEFT(Shl, 5, V4S, Sshl, W);
break;
}
case kArm64I32x4ShrS: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 32.
-      __ And(shift, i.InputRegister32(1), 31);
-      __ Dup(tmp.V4S(), shift);
-      __ Neg(tmp.V4S(), tmp.V4S());
-      __ Sshl(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).V4S(),
-              tmp.V4S());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Sshr, 5, V4S, Sshl, W);
break;
}
SIMD_BINOP_CASE(kArm64I32x4Add, Add, 4S);
@@ -2113,14 +2119,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_WIDENING_UNOP_CASE(kArm64I32x4UConvertI16x8Low, Uxtl, 4S, 4H);
SIMD_WIDENING_UNOP_CASE(kArm64I32x4UConvertI16x8High, Uxtl2, 4S, 8H);
case kArm64I32x4ShrU: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 32.
-      __ And(shift, i.InputRegister32(1), 31);
-      __ Dup(tmp.V4S(), shift);
-      __ Neg(tmp.V4S(), tmp.V4S());
-      __ Ushl(i.OutputSimd128Register().V4S(), i.InputSimd128Register(0).V4S(),
-              tmp.V4S());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Ushr, 5, V4S, Ushl, W);
break;
}
SIMD_BINOP_CASE(kArm64I32x4MinU, Umin, 4S);
@@ -2154,24 +2153,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_WIDENING_UNOP_CASE(kArm64I16x8SConvertI8x16High, Sxtl2, 8H, 16B);
SIMD_UNOP_CASE(kArm64I16x8Neg, Neg, 8H);
case kArm64I16x8Shl: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 16.
-      __ And(shift, i.InputRegister32(1), 15);
-      __ Dup(tmp.V8H(), shift);
-      __ Sshl(i.OutputSimd128Register().V8H(), i.InputSimd128Register(0).V8H(),
-              tmp.V8H());
+      ASSEMBLE_SIMD_SHIFT_LEFT(Shl, 4, V8H, Sshl, W);
break;
}
case kArm64I16x8ShrS: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 16.
-      __ And(shift, i.InputRegister32(1), 15);
-      __ Dup(tmp.V8H(), shift);
-      __ Neg(tmp.V8H(), tmp.V8H());
-      __ Sshl(i.OutputSimd128Register().V8H(), i.InputSimd128Register(0).V8H(),
-              tmp.V8H());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Sshr, 4, V8H, Sshl, W);
break;
}
case kArm64I16x8SConvertI32x4: {
@@ -2216,14 +2202,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kArm64I16x8ShrU: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 16.
-      __ And(shift, i.InputRegister32(1), 15);
-      __ Dup(tmp.V8H(), shift);
-      __ Neg(tmp.V8H(), tmp.V8H());
-      __ Ushl(i.OutputSimd128Register().V8H(), i.InputSimd128Register(0).V8H(),
-              tmp.V8H());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Ushr, 4, V8H, Ushl, W);
break;
}
case kArm64I16x8UConvertI32x4: {
@@ -2272,24 +2251,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_UNOP_CASE(kArm64I8x16Neg, Neg, 16B);
case kArm64I8x16Shl: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 8.
-      __ And(shift, i.InputRegister32(1), 7);
-      __ Dup(tmp.V16B(), shift);
-      __ Sshl(i.OutputSimd128Register().V16B(),
-              i.InputSimd128Register(0).V16B(), tmp.V16B());
+      ASSEMBLE_SIMD_SHIFT_LEFT(Shl, 3, V16B, Sshl, W);
break;
}
case kArm64I8x16ShrS: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 8.
-      __ And(shift, i.InputRegister32(1), 7);
-      __ Dup(tmp.V16B(), shift);
-      __ Neg(tmp.V16B(), tmp.V16B());
-      __ Sshl(i.OutputSimd128Register().V16B(),
-              i.InputSimd128Register(0).V16B(), tmp.V16B());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Sshr, 3, V16B, Sshl, W);
break;
}
case kArm64I8x16SConvertI16x8: {
@@ -2324,14 +2290,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64I8x16GtS, Cmgt, 16B);
SIMD_BINOP_CASE(kArm64I8x16GeS, Cmge, 16B);
case kArm64I8x16ShrU: {
-      VRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.TempRegister32(1);
-      // Take shift value modulo 8.
-      __ And(shift, i.InputRegister32(1), 7);
-      __ Dup(tmp.V16B(), shift);
-      __ Neg(tmp.V16B(), tmp.V16B());
-      __ Ushl(i.OutputSimd128Register().V16B(),
-              i.InputSimd128Register(0).V16B(), tmp.V16B());
+      ASSEMBLE_SIMD_SHIFT_RIGHT(Ushr, 3, V16B, Ushl, W);
break;
}
case kArm64I8x16UConvertI16x8: {
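Net effect for a constant count, using i8x16.shr_u as an example: the old path always emitted the four-instruction mask/splat/negate/shift sequence, while the new path folds it into a single shift by immediate (mnemonics sketched from the code above; register names illustrative, not captured output):

  // Before (count in w1):   and  w2, w1, #7
  //                         dup  v1.16b, w2
  //                         neg  v1.16b, v1.16b
  //                         ushl v0.16b, v0.16b, v1.16b
  // After (constant count): ushr v0.16b, v0.16b, #count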
@@ -2566,6 +2525,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
#undef SIMD_WIDENING_UNOP_CASE
#undef SIMD_BINOP_CASE
#undef SIMD_REDUCE_OP_CASE
+#undef ASSEMBLE_SIMD_SHIFT_LEFT
+#undef ASSEMBLE_SIMD_SHIFT_RIGHT
// Assemble branches after this instruction.
void CodeGenerator::AssembleArchBranch(Instruction* instr, BranchInfo* branch) {
@@ -152,12 +152,22 @@ void VisitRRR(InstructionSelector* selector, ArchOpcode opcode, Node* node) {
}
void VisitSimdShiftRRR(InstructionSelector* selector, ArchOpcode opcode,
-                       Node* node) {
+                       Node* node, int width) {
   Arm64OperandGenerator g(selector);
-  InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
-  selector->Emit(opcode, g.DefineAsRegister(node),
-                 g.UseRegister(node->InputAt(0)),
-                 g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
+  if (g.IsIntegerConstant(node->InputAt(1))) {
+    if (g.GetIntegerConstantValue(node->InputAt(1)) % width == 0) {
+      selector->EmitIdentity(node);
+    } else {
+      selector->Emit(opcode, g.DefineAsRegister(node),
+                     g.UseRegister(node->InputAt(0)),
+                     g.UseImmediate(node->InputAt(1)));
+    }
+  } else {
+    InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
+    selector->Emit(opcode, g.DefineAsRegister(node),
+                   g.UseRegister(node->InputAt(0)),
+                   g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
+  }
}
void VisitRRI(InstructionSelector* selector, ArchOpcode opcode, Node* node) {
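VisitSimdShiftRRR above now distinguishes three cases for the shift operand: a constant that is a multiple of the lane width (a wasm-level no-op, forwarded via EmitIdentity), any other constant (emitted as an immediate so the code generator can pick the shift-by-immediate form), and a non-constant (the original register path with temps). A compact model of that decision (hypothetical helper for illustration, not V8 API):

  enum class ShiftKind { kIdentity, kImmediate, kRegister };

  ShiftKind ClassifyShift(bool is_constant, int64_t value, int lane_width) {
    if (!is_constant) return ShiftKind::kRegister;  // needs runtime masking
    return value % lane_width == 0 ? ShiftKind::kIdentity
                                   : ShiftKind::kImmediate;
  }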
@@ -3174,18 +3184,18 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(S1x16AllTrue, kArm64S1x16AllTrue)
#define SIMD_SHIFT_OP_LIST(V) \
-  V(I64x2Shl) \
-  V(I64x2ShrS) \
-  V(I64x2ShrU) \
-  V(I32x4Shl) \
-  V(I32x4ShrS) \
-  V(I32x4ShrU) \
-  V(I16x8Shl) \
-  V(I16x8ShrS) \
-  V(I16x8ShrU) \
-  V(I8x16Shl) \
-  V(I8x16ShrS) \
-  V(I8x16ShrU)
+  V(I64x2Shl, 64) \
+  V(I64x2ShrS, 64) \
+  V(I64x2ShrU, 64) \
+  V(I32x4Shl, 32) \
+  V(I32x4ShrS, 32) \
+  V(I32x4ShrU, 32) \
+  V(I16x8Shl, 16) \
+  V(I16x8ShrS, 16) \
+  V(I16x8ShrU, 16) \
+  V(I8x16Shl, 8) \
+  V(I8x16ShrS, 8) \
+  V(I8x16ShrU, 8)
#define SIMD_BINOP_LIST(V) \
V(F64x2Add, kArm64F64x2Add) \
@@ -3319,9 +3329,9 @@ SIMD_UNOP_LIST(SIMD_VISIT_UNOP)
#undef SIMD_VISIT_UNOP
#undef SIMD_UNOP_LIST
-#define SIMD_VISIT_SHIFT_OP(Name)                     \
-  void InstructionSelector::Visit##Name(Node* node) { \
-    VisitSimdShiftRRR(this, kArm64##Name, node);      \
+#define SIMD_VISIT_SHIFT_OP(Name, width)              \
+  void InstructionSelector::Visit##Name(Node* node) { \
+    VisitSimdShiftRRR(this, kArm64##Name, node, width); \
}
SIMD_SHIFT_OP_LIST(SIMD_VISIT_SHIFT_OP)
#undef SIMD_VISIT_SHIFT_OP
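For reference, SIMD_VISIT_SHIFT_OP(I32x4Shl, 32) from the list above now expands to (hand-expanded):

  void InstructionSelector::VisitI32x4Shl(Node* node) {
    VisitSimdShiftRRR(this, kArm64I32x4Shl, node, 32);
  }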
@@ -351,6 +351,8 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
size_t input_count, InstructionOperand* inputs, size_t temp_count,
InstructionOperand* temps, FlagsContinuation* cont);
+  void EmitIdentity(Node* node);
+
// ===========================================================================
// ===== Architecture-independent deoptimization exit emission methods. ======
// ===========================================================================
@@ -655,7 +657,6 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
void EmitPrepareResults(ZoneVector<compiler::PushParameter>* results,
const CallDescriptor* call_descriptor, Node* node);
-  void EmitIdentity(Node* node);
bool CanProduceSignalingNaN(Node* node);
// ===========================================================================