Commit 91ec654b authored by danno, committed by Commit bot

[turbofan]: Use "leal" more pervasively on x64

Only use "addl" and "subl" in cases that have been measured to be
faster (currently only operations with immediate operands).

Review URL: https://codereview.chromium.org/735293004

Cr-Commit-Position: refs/heads/master@{#25580}
parent 82d0f800
@@ -590,10 +590,26 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
         __ movsd(operand, i.InputDoubleRegister(index));
       }
       break;
-    case kX64Lea32:
-      __ leal(i.OutputRegister(), i.MemoryOperand());
+    case kX64Lea32: {
+      AddressingMode mode = AddressingModeField::decode(instr->opcode());
+      // Shorten "leal" to "addl" or "subl" if the register allocation just
+      // happens to work out for operations with immediate operands where the
+      // non-constant input register is the same as the output register. The
+      // "addl"/"subl" forms in these cases are faster based on empirical
+      // measurements.
+      if (mode == kMode_MRI && i.InputRegister(0).is(i.OutputRegister())) {
+        int32_t constant_summand = i.InputInt32(1);
+        if (constant_summand > 0) {
+          __ addl(i.OutputRegister(), Immediate(constant_summand));
+        } else if (constant_summand < 0) {
+          __ subl(i.OutputRegister(), Immediate(-constant_summand));
+        }
+      } else {
+        __ leal(i.OutputRegister(), i.MemoryOperand());
+      }
       __ AssertZeroExtended(i.OutputRegister());
       break;
+    }
     case kX64Lea:
       __ leaq(i.OutputRegister(), i.MemoryOperand());
       break;
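For illustration, here is a minimal standalone sketch of the peephole above; the names (Op, SelectLea32Form) are hypothetical and not part of V8:

#include <cstdint>

// Given a request for "leal dst, [src + imm]", pick the cheaper two-operand
// form when the register allocator happened to assign dst == src.
enum class Op { kLeal, kAddl, kSubl, kNone };

Op SelectLea32Form(int dst, int src, int32_t imm) {
  if (dst == src) {
    if (imm > 0) return Op::kAddl;  // addl dst, imm
    if (imm < 0) return Op::kSubl;  // subl dst, -imm (like the V8 code, this
                                    // would overflow for INT32_MIN)
    return Op::kNone;               // "leal dst, [dst + 0]" is a no-op
  }
  return Op::kLeal;                 // distinct registers need the 3-operand leal
}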
@@ -430,33 +430,6 @@ void InstructionSelector::VisitInt32Add(Node* node) {
   // case that there are only two operands to the add and one of them isn't
   // live, use a plain "addl".
   if (m.matches() && (m.constant() == NULL || g.CanBeImmediate(m.constant()))) {
-    if (m.offset() != NULL) {
-      if (m.constant() == NULL) {
-        if (m.scaled() != NULL && m.scale_exponent() == 0) {
-          if (!IsLive(m.offset())) {
-            Emit(kX64Add32, g.DefineSameAsFirst(node),
-                 g.UseRegister(m.offset()), g.Use(m.scaled()));
-            return;
-          } else if (!IsLive(m.scaled())) {
-            Emit(kX64Add32, g.DefineSameAsFirst(node),
-                 g.UseRegister(m.scaled()), g.Use(m.offset()));
-            return;
-          }
-        }
-      }
-    } else {
-      if (m.scale_exponent() == 0) {
-        if (m.scaled() == NULL || m.offset() == NULL) {
-          Node* non_constant = m.scaled() == NULL ? m.offset() : m.scaled();
-          if (!IsLive(non_constant)) {
-            Emit(kX64Add32, g.DefineSameAsFirst(node),
-                 g.UseRegister(non_constant), g.UseImmediate(m.constant()));
-            return;
-          }
-        }
-      }
-    }
-  }
     InstructionOperand* inputs[4];
     size_t input_count = 0;
     AddressingMode mode = GenerateMemoryOperandInputs(
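With the special cases gone, every matched 32-bit add flows into the generic path above, where the shape of the operands picks the addressing mode. A rough sketch of that classification, reusing V8's kMode_* naming but with hypothetical selection logic (the real work happens in GenerateMemoryOperandInputs):

// Rough sketch only: map the shape of a matched add (base plus optional
// index and/or immediate displacement) onto a leal addressing mode.
enum class Mode { kMR1, kMRI, kMR1I };

Mode ClassifyAdd(bool has_index, bool has_immediate) {
  if (has_index && has_immediate) return Mode::kMR1I;  // leal d, [b + i*1 + imm]
  if (has_index) return Mode::kMR1;                    // leal d, [b + i*1]
  return Mode::kMRI;                                   // leal d, [b + imm]
}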
@@ -491,15 +464,12 @@ void InstructionSelector::VisitInt32Sub(Node* node) {
     Emit(kX64Neg32, g.DefineSameAsFirst(node), g.UseRegister(m.right().node()));
   } else {
     if (m.right().HasValue() && g.CanBeImmediate(m.right().node())) {
-      if (IsLive(m.left().node())) {
-        // Special handling for subtraction of constants where the non-constant
-        // input is used elsewhere. To eliminate the gap move before the sub to
-        // copy the destination register, use a "leal" instead.
-        Emit(kX64Lea32 | AddressingModeField::encode(kMode_MRI),
-             g.DefineAsRegister(node), g.UseRegister(m.left().node()),
-             g.TempImmediate(-m.right().Value()));
-        return;
-      }
+      // Turn subtractions of constant values into immediate "leal" instructions
+      // by negating the value.
+      Emit(kX64Lea32 | AddressingModeField::encode(kMode_MRI),
+           g.DefineAsRegister(node), g.UseRegister(m.left().node()),
+           g.TempImmediate(-m.right().Value()));
+      return;
     }
     VisitBinop(this, node, kX64Sub32);
   }
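The selector side of the change leans on the identity x - c == x + (-c): a subtraction by a constant is emitted as a "leal" with a negated displacement (kMode_MRI), which the codegen peephole shown earlier may shorten back to a "subl". A small illustration; LeaDisplacementForSub is a hypothetical helper, not V8 code:

#include <cstdint>
#include <cstdio>

// "x - c" is encoded as "leal dst, [x + (-c)]". Note -c overflows for
// INT32_MIN; whether that value can reach this point depends on upstream
// checks this sketch does not model.
int32_t LeaDisplacementForSub(int32_t c) { return -c; }

int main() {
  // e.g. Int32Sub(p0, 15) -> kX64Lea32 | kMode_MRI with inputs (p0, -15)
  std::printf("displacement for 'x - 15': %d\n", LeaDisplacementForSub(15));
  return 0;
}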
@@ -265,14 +265,18 @@ TEST_F(InstructionSelectorTest, Int32AddConstantAsLeaSingle) {
   StreamBuilder m(this, kMachInt32, kMachInt32);
   Node* const p0 = m.Parameter(0);
   Node* const c0 = m.Int32Constant(15);
-  // If there is only a single use of an add's input, use an "addl" not a
-  // "leal", it is faster.
+  // If one of the add's operands is only used once, use a "leal" even though
+  // an "addl" could be used. The "leal" has proven faster; our best guess is
+  // that it gives the register allocator more freedom and doesn't set flags,
+  // reducing pressure in the CPU's pipeline. If we're lucky with register
+  // allocation, code generation will later select an "addl" in the cases
+  // that have been measured to be faster.
   Node* const v0 = m.Int32Add(p0, c0);
   m.Return(v0);
   Stream s = m.Build();
   ASSERT_EQ(1U, s.size());
-  EXPECT_EQ(kX64Add32, s[0]->arch_opcode());
-  EXPECT_EQ(kMode_None, s[0]->addressing_mode());
+  EXPECT_EQ(kX64Lea32, s[0]->arch_opcode());
+  EXPECT_EQ(kMode_MRI, s[0]->addressing_mode());
   ASSERT_EQ(2U, s[0]->InputCount());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
   EXPECT_TRUE(s[0]->InputAt(1)->IsImmediate());
@@ -284,12 +288,13 @@ TEST_F(InstructionSelectorTest, Int32AddConstantAsAdd) {
   Node* const p0 = m.Parameter(0);
   Node* const c0 = m.Int32Constant(1);
   // If there is only a single use of an add's input and the immediate constant
-  // for the add is 1, use inc.
+  // for the add is 1, don't use an "inc". It is much slower on modern Intel
+  // architectures.
   m.Return(m.Int32Add(p0, c0));
   Stream s = m.Build();
   ASSERT_EQ(1U, s.size());
-  EXPECT_EQ(kX64Add32, s[0]->arch_opcode());
-  EXPECT_EQ(kMode_None, s[0]->addressing_mode());
+  EXPECT_EQ(kX64Lea32, s[0]->arch_opcode());
+  EXPECT_EQ(kMode_MRI, s[0]->addressing_mode());
   ASSERT_EQ(2U, s[0]->InputCount());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
   EXPECT_TRUE(s[0]->InputAt(1)->IsImmediate());
@@ -317,12 +322,17 @@ TEST_F(InstructionSelectorTest, Int32AddCommutedConstantAsLeaSingle) {
   StreamBuilder m(this, kMachInt32, kMachInt32);
   Node* const p0 = m.Parameter(0);
   Node* const c0 = m.Int32Constant(15);
-  // If there is only a single use of an add's input, use "addl"
+  // If one of the add's operands is only used once, use a "leal" even though
+  // an "addl" could be used. The "leal" has proven faster; our best guess is
+  // that it gives the register allocator more freedom and doesn't set flags,
+  // reducing pressure in the CPU's pipeline. If we're lucky with register
+  // allocation, code generation will later select an "addl" in the cases
+  // that have been measured to be faster.
   m.Return(m.Int32Add(c0, p0));
   Stream s = m.Build();
   ASSERT_EQ(1U, s.size());
-  EXPECT_EQ(kX64Add32, s[0]->arch_opcode());
-  EXPECT_EQ(kMode_None, s[0]->addressing_mode());
+  EXPECT_EQ(kX64Lea32, s[0]->arch_opcode());
+  EXPECT_EQ(kMode_MRI, s[0]->addressing_mode());
   ASSERT_EQ(2U, s[0]->InputCount());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
   EXPECT_TRUE(s[0]->InputAt(1)->IsImmediate());
@@ -351,12 +361,17 @@ TEST_F(InstructionSelectorTest, Int32AddSimpleAsAdd) {
   StreamBuilder m(this, kMachInt32, kMachInt32, kMachInt32);
   Node* const p0 = m.Parameter(0);
   Node* const p1 = m.Parameter(1);
-  // If one of the add's operands is only used once, use an "addl".
+  // If one of the add's operands is only used once, use a "leal" even though
+  // an "addl" could be used. The "leal" has proven faster; our best guess is
+  // that it gives the register allocator more freedom and doesn't set flags,
+  // reducing pressure in the CPU's pipeline. If we're lucky with register
+  // allocation, code generation will later select an "addl" in the cases
+  // that have been measured to be faster.
   m.Return(m.Int32Add(p0, p1));
   Stream s = m.Build();
   ASSERT_EQ(1U, s.size());
-  EXPECT_EQ(kX64Add32, s[0]->arch_opcode());
-  EXPECT_EQ(kMode_None, s[0]->addressing_mode());
+  EXPECT_EQ(kX64Lea32, s[0]->arch_opcode());
+  EXPECT_EQ(kMode_MR1, s[0]->addressing_mode());
   ASSERT_EQ(2U, s[0]->InputCount());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
   EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
@@ -715,8 +730,8 @@ TEST_F(InstructionSelectorTest, Int32SubConstantAsSub) {
   m.Return(m.Int32Sub(p0, c0));
   Stream s = m.Build();
   ASSERT_EQ(1U, s.size());
-  EXPECT_EQ(kX64Sub32, s[0]->arch_opcode());
-  EXPECT_EQ(kMode_None, s[0]->addressing_mode());
+  EXPECT_EQ(kX64Lea32, s[0]->arch_opcode());
+  EXPECT_EQ(kMode_MRI, s[0]->addressing_mode());
   ASSERT_EQ(2U, s[0]->InputCount());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[0]->InputAt(0)));
   EXPECT_TRUE(s[0]->InputAt(1)->IsImmediate());
@@ -759,7 +774,7 @@ TEST_F(InstructionSelectorTest, Int32AddScaled2Other) {
   EXPECT_EQ(s.ToVreg(p1), s.ToVreg(s[0]->InputAt(1)));
   EXPECT_EQ(s.ToVreg(a0), s.ToVreg(s[0]->OutputAt(0)));
   ASSERT_EQ(2U, s[1]->InputCount());
-  EXPECT_EQ(kX64Add32, s[1]->arch_opcode());
+  EXPECT_EQ(kX64Lea32, s[1]->arch_opcode());
   EXPECT_EQ(s.ToVreg(p0), s.ToVreg(s[1]->InputAt(0)));
   EXPECT_EQ(s.ToVreg(a0), s.ToVreg(s[1]->InputAt(1)));
   EXPECT_EQ(s.ToVreg(a1), s.ToVreg(s[1]->OutputAt(0)));