[wasm-simd][arm] Remove some usages of TempSimd128Register

We can use UseScratchRegisterScope instead of requiring a TempSimd128Register in the instruction-selector. This reduces register pressure a little bit (when combined with unique register constraints). Drive-by cleanup of some variable names in code-generator, s/tmp2/tmp/ when there is only 1 tmp. Bug: v8:11384 Change-Id: I00a365624cbabeaeeaf78d1d08f0eb284c7e44ac Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2705523 Reviewed-by: Bill Budge <bbudge@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#72851}

[wasm-simd][arm] Remove some usages of TempSimd128Register
We can use UseScratchRegisterScope instead of requiring a TempSimd128Register in the instruction-selector. This reduces register pressure a little bit (when combined with unique register constraints). Drive-by cleanup of some variable names in code-generator, s/tmp2/tmp/ when there is only 1 tmp. Bug: v8:11384 Change-Id: I00a365624cbabeaeeaf78d1d08f0eb284c7e44ac Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2705523 Reviewed-by: Bill Budge <bbudge@chromium.org> Commit-Queue: Zhi An Ng <zhin@chromium.org> Cr-Commit-Position: refs/heads/master@{#72851}
3b34eeaf · Ng Zhi An · Commit Bot · 0fecb303 · 3b34eeaf · 3b34eeaf
Commit 3b34eeaf authored Feb 18, 2021 by Ng Zhi An Committed by Commit Bot Feb 19, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 39 additions and 38 deletions

code-generator-arm.cc src/compiler/backend/arm/code-generator-arm.cc +36 -30

instruction-selector-arm.cc src/compiler/backend/arm/instruction-selector-arm.cc +3 -8

No files found.
--- a/src/compiler/backend/arm/code-generator-arm.cc
+++ b/src/compiler/backend/arm/code-generator-arm.cc
@@ -515,8 +515,9 @@ void ComputePoisonedAddressForLoad(CodeGenerator* codegen,
    if (instr->InputAt(1)->IsImmediate()) {              \
      __ asm_imm(dt, dst, src, i.InputInt##width(1));    \
    } else {                                             \
-      QwNeonRegister tmp = i.TempSimd128Register(0);     \
-      Register shift = i.TempRegister(1);                \
+      UseScratchRegisterScope temps(tasm());             \
+      Simd128Register tmp = temps.AcquireQ();            \
+      Register shift = temps.Acquire();                  \
      constexpr int mask = (1 << width) - 1;             \
      __ and_(shift, i.InputRegister(1), Operand(mask)); \
      __ vdup(sz, tmp, shift);                           \
@@ -534,8 +535,9 @@ void ComputePoisonedAddressForLoad(CodeGenerator* codegen,
    if (instr->InputAt(1)->IsImmediate()) {               \
      __ asm_imm(dt, dst, src, i.InputInt##width(1));     \
    } else {                                              \
-      QwNeonRegister tmp = i.TempSimd128Register(0);      \
-      Register shift = i.TempRegister(1);                 \
+      UseScratchRegisterScope temps(tasm());              \
+      Simd128Register tmp = temps.AcquireQ();             \
+      Register shift = temps.Acquire();                   \
      constexpr int mask = (1 << width) - 1;              \
      __ and_(shift, i.InputRegister(1), Operand(mask));  \
      __ vdup(sz, tmp, shift);                            \
@@ -2111,11 +2113,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kArmI64x2Mul: {
+      UseScratchRegisterScope temps(tasm());
      QwNeonRegister dst = i.OutputSimd128Register();
      QwNeonRegister left = i.InputSimd128Register(0);
      QwNeonRegister right = i.InputSimd128Register(1);
      QwNeonRegister tmp1 = i.TempSimd128Register(0);
-      QwNeonRegister tmp2 = i.TempSimd128Register(1);
+      QwNeonRegister tmp2 = temps.AcquireQ();

      // This algorithm uses vector operations to perform 64-bit integer
      // multiplication by splitting it into a high and low 32-bit integers.
@@ -2543,19 +2546,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kArmI32x4BitMask: {
      Register dst = i.OutputRegister();
+      UseScratchRegisterScope temps(tasm());
      Simd128Register src = i.InputSimd128Register(0);
-      Simd128Register tmp2 = i.TempSimd128Register(0);
-      Simd128Register mask = i.TempSimd128Register(1);
+      Simd128Register tmp = temps.AcquireQ();
+      Simd128Register mask = i.TempSimd128Register(0);

-      __ vshr(NeonS32, tmp2, src, 31);
+      __ vshr(NeonS32, tmp, src, 31);
      // Set i-th bit of each lane i. When AND with tmp, the lanes that
      // are signed will have i-th bit set, unsigned will be 0.
      __ vmov(mask.low(), Double(uint64_t{0x0000'0002'0000'0001}));
      __ vmov(mask.high(), Double(uint64_t{0x0000'0008'0000'0004}));
-      __ vand(tmp2, mask, tmp2);
-      __ vpadd(Neon32, tmp2.low(), tmp2.low(), tmp2.high());
-      __ vpadd(Neon32, tmp2.low(), tmp2.low(), kDoubleRegZero);
-      __ VmovLow(dst, tmp2.low());
+      __ vand(tmp, mask, tmp);
+      __ vpadd(Neon32, tmp.low(), tmp.low(), tmp.high());
+      __ vpadd(Neon32, tmp.low(), tmp.low(), kDoubleRegZero);
+      __ VmovLow(dst, tmp.low());
      break;
    }
    case kArmI32x4DotI16x8S: {
@@ -2748,21 +2752,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kArmI16x8BitMask: {
+      UseScratchRegisterScope temps(tasm());
      Register dst = i.OutputRegister();
      Simd128Register src = i.InputSimd128Register(0);
-      Simd128Register tmp2 = i.TempSimd128Register(0);
-      Simd128Register mask = i.TempSimd128Register(1);
+      Simd128Register tmp = temps.AcquireQ();
+      Simd128Register mask = i.TempSimd128Register(0);

-      __ vshr(NeonS16, tmp2, src, 15);
+      __ vshr(NeonS16, tmp, src, 15);
      // Set i-th bit of each lane i. When AND with tmp, the lanes that
      // are signed will have i-th bit set, unsigned will be 0.
      __ vmov(mask.low(), Double(uint64_t{0x0008'0004'0002'0001}));
      __ vmov(mask.high(), Double(uint64_t{0x0080'0040'0020'0010}));
-      __ vand(tmp2, mask, tmp2);
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.high());
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
-      __ vmov(NeonU16, dst, tmp2.low(), 0);
+      __ vand(tmp, mask, tmp);
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+      __ vmov(NeonU16, dst, tmp.low(), 0);
      break;
    }
    case kArmI16x8Q15MulRSatS: {
@@ -2907,23 +2912,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kArmI8x16BitMask: {
+      UseScratchRegisterScope temps(tasm());
      Register dst = i.OutputRegister();
      Simd128Register src = i.InputSimd128Register(0);
-      Simd128Register tmp2 = i.TempSimd128Register(0);
-      Simd128Register mask = i.TempSimd128Register(1);
+      Simd128Register tmp = temps.AcquireQ();
+      Simd128Register mask = i.TempSimd128Register(0);

-      __ vshr(NeonS8, tmp2, src, 7);
+      __ vshr(NeonS8, tmp, src, 7);
      // Set i-th bit of each lane i. When AND with tmp, the lanes that
      // are signed will have i-th bit set, unsigned will be 0.
      __ vmov(mask.low(), Double(uint64_t{0x8040'2010'0804'0201}));
      __ vmov(mask.high(), Double(uint64_t{0x8040'2010'0804'0201}));
-      __ vand(tmp2, mask, tmp2);
-      __ vext(mask, tmp2, tmp2, 8);
-      __ vzip(Neon8, mask, tmp2);
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.high());
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
-      __ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
-      __ vmov(NeonU16, dst, tmp2.low(), 0);
+      __ vand(tmp, mask, tmp);
+      __ vext(mask, tmp, tmp, 8);
+      __ vzip(Neon8, mask, tmp);
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.high());
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+      __ vpadd(Neon16, tmp.low(), tmp.low(), tmp.low());
+      __ vmov(NeonU16, dst, tmp.low(), 0);
      break;
    }
    case kArmSignSelect: {

--- a/src/compiler/backend/arm/instruction-selector-arm.cc
+++ b/src/compiler/backend/arm/instruction-selector-arm.cc
@@ -108,10 +108,7 @@ void VisitSimdShiftRRR(InstructionSelector* selector, ArchOpcode opcode,
                     g.UseImmediate(node->InputAt(1)));
    }
  } else {
-    InstructionOperand temps[] = {g.TempSimd128Register(), g.TempRegister()};
-    selector->Emit(opcode, g.DefineAsRegister(node),
-                   g.UseUniqueRegister(node->InputAt(0)),
-                   g.UseRegister(node->InputAt(1)), arraysize(temps), temps);
+    VisitRRR(selector, opcode, node);
  }
 }

@@ -2816,8 +2813,7 @@ void InstructionSelector::VisitI64x2Neg(Node* node) {

 void InstructionSelector::VisitI64x2Mul(Node* node) {
  ArmOperandGenerator g(this);
-  InstructionOperand temps[] = {g.TempSimd128Register(),
-                                g.TempSimd128Register()};
+  InstructionOperand temps[] = {g.TempSimd128Register()};
  Emit(kArmI64x2Mul, g.DefineAsRegister(node),
       g.UseUniqueRegister(node->InputAt(0)),
       g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
@@ -3025,8 +3021,7 @@ namespace {
 template <ArchOpcode opcode>
 void VisitBitMask(InstructionSelector* selector, Node* node) {
  ArmOperandGenerator g(selector);
-  InstructionOperand temps[] = {g.TempSimd128Register(),
-                                g.TempSimd128Register()};
+  InstructionOperand temps[] = {g.TempSimd128Register()};
  selector->Emit(opcode, g.DefineAsRegister(node),
                 g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
 }