Commit 5641559d authored by Miran Karic, committed by Commit Bot

MIPS[64]: Add optimizations to memory load/store helper.

This CL replaces several helper functions for memory loads/stores using a
base register and offset with a single helper function that contains
several optimizations.
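
In outline, the calling pattern changes as sketched below (a minimal sketch
using a word load as the example; treat it as illustrative rather than the
CL's exact code):

    // Before: when the offset did not fit into int16_t, a helper
    // materialized base + offset in 'at' and the caller used a zero offset.
    LoadRegPlusOffsetToAt(rs);
    GenInstrImmediate(LW, at, rd, 0);

    // After: one helper rewrites the operand in place so that its offset
    // fits into int16_t, often in fewer instructions; the caller then uses
    // the (possibly rewritten) base register and offset directly.
    MemOperand source = rs;
    AdjustBaseAndOffset(source);
    GenInstrImmediate(LW, source.rm(), rd, source.offset());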

BUG=

Change-Id: I187e7e882131552abd9a0b3a0070d78adefd25b6
Reviewed-on: https://chromium-review.googlesource.com/552119
Commit-Queue: Miran Karić <Miran.Karic@imgtec.com>
Reviewed-by: Ivica Bogosavljevic <ivica.bogosavljevic@imgtec.com>
Cr-Commit-Position: refs/heads/master@{#46420}
parent a7e5abff
@@ -1866,10 +1866,18 @@ class Assembler : public AssemblerBase {
   // Load Scaled Address instruction.
   void lsa(Register rd, Register rt, Register rs, uint8_t sa);
 
   // Helpers.
-  void LoadRegPlusOffsetToAt(const MemOperand& src);
-  int32_t LoadRegPlusUpperOffsetPartToAt(const MemOperand& src);
-  int32_t LoadUpperOffsetForTwoMemoryAccesses(const MemOperand& src);
+  // Readable constants for the base and offset adjustment helper; these
+  // indicate whether, aside from 'offset', another value such as
+  // 'offset + 4' must also fit into int16_t.
+  enum class OffsetAccessType : bool {
+    SINGLE_ACCESS = false,
+    TWO_ACCESSES = true
+  };
+
+  // Helper function for memory load/store using base register and offset.
+  void AdjustBaseAndOffset(
+      MemOperand& src,
+      OffsetAccessType access_type = OffsetAccessType::SINGLE_ACCESS,
+      int second_access_add_to_offset = 4);
 
   int32_t buffer_space() const { return reloc_info_writer.pos() - pc_; }
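
A hedged sketch of how the two access types are intended to be used
(the wrapper code and register names here are illustrative, not from this
CL; only the helper and its enum are):

    // Single access: only 'offset' itself must end up in int16_t range.
    MemOperand mem = src;
    AdjustBaseAndOffset(mem);  // Defaults to OffsetAccessType::SINGLE_ACCESS.
    lw(rd, mem);

    // Doubleword accessed as two word-sized loads: both 'offset' and
    // 'offset + 4' must fit into int16_t, so request TWO_ACCESSES.
    MemOperand mem2 = src;
    AdjustBaseAndOffset(mem2, OffsetAccessType::TWO_ACCESSES);
    lw(rd_lo, mem2);
    lw(rd_hi, MemOperand(mem2.rm(), mem2.offset() + 4));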
@@ -2090,31 +2090,130 @@ void Assembler::dlsa(Register rd, Register rt, Register rs, uint8_t sa) {
 
 // ------------Memory-instructions-------------
 
-// Helper for base-reg + offset, when offset is larger than int16.
-void Assembler::LoadRegPlusOffsetToAt(const MemOperand& src) {
-  DCHECK(!src.rm().is(at));
-  DCHECK(is_int32(src.offset_));
-
-  if (kArchVariant == kMips64r6) {
-    int32_t hi = (src.offset_ >> kLuiShift) & kImm16Mask;
-    if (src.offset_ & kNegOffset) {
-      if ((hi & kNegOffset) != ((hi + 1) & kNegOffset)) {
-        lui(at, (src.offset_ >> kLuiShift) & kImm16Mask);
-        ori(at, at, src.offset_ & kImm16Mask);  // Load 32-bit offset.
-        daddu(at, at, src.rm());  // Add base register.
-        return;
-      }
-      hi += 1;
-    }
-    daui(at, src.rm(), hi);
-    daddiu(at, at, src.offset_ & kImm16Mask);
-  } else {
-    lui(at, (src.offset_ >> kLuiShift) & kImm16Mask);
-    ori(at, at, src.offset_ & kImm16Mask);  // Load 32-bit offset.
-    daddu(at, at, src.rm());  // Add base register.
-  }
-}
+void Assembler::AdjustBaseAndOffset(MemOperand& src,
+                                    OffsetAccessType access_type,
+                                    int second_access_add_to_offset) {
+  // This method is used to adjust the base register and offset pair
+  // for a load/store when the offset doesn't fit into int16_t.
+  // It is assumed that 'base + offset' is sufficiently aligned for memory
+  // operands that are machine word in size or smaller. For doubleword-sized
+  // operands it's assumed that 'base' is a multiple of 8, while 'offset'
+  // may be a multiple of 4 (e.g. 4-byte-aligned long and double arguments
+  // and spilled variables on the stack accessed relative to the stack
+  // pointer register).
+  // We preserve the "alignment" of 'offset' by adjusting it by a multiple of 8.
+
+  bool doubleword_aligned = (src.offset() & (kDoubleSize - 1)) == 0;
+  bool two_accesses = static_cast<bool>(access_type) || !doubleword_aligned;
+  DCHECK(second_access_add_to_offset <= 7);  // Must be <= 7.
+
+  // is_int16 must be passed a signed value, hence the static cast below.
+  if (is_int16(src.offset()) &&
+      (!two_accesses || is_int16(static_cast<int32_t>(
+                            src.offset() + second_access_add_to_offset)))) {
+    // Nothing to do: 'offset' (and, if needed, 'offset + 4', or other
+    // specified value) fits into int16_t.
+    return;
+  }
+
+  DCHECK(!src.rm().is(
+      at));  // Must not overwrite the register 'base' while loading 'offset'.
+
+#ifdef DEBUG
+  // Remember the "(mis)alignment" of 'offset', it will be checked at the end.
+  uint32_t misalignment = src.offset() & (kDoubleSize - 1);
+#endif
+
+  // Do not load the whole 32-bit 'offset' if it can be represented as
+  // a sum of two 16-bit signed offsets. This can save an instruction or two.
+  // To simplify matters, only do this for a symmetric range of offsets from
+  // about -64KB to about +64KB, allowing further addition of 4 when accessing
+  // 64-bit variables with two 32-bit accesses.
+  constexpr int32_t kMinOffsetForSimpleAdjustment =
+      0x7ff8;  // Max int16_t that's a multiple of 8.
+  constexpr int32_t kMaxOffsetForSimpleAdjustment =
+      2 * kMinOffsetForSimpleAdjustment;
+
+  if (0 <= src.offset() && src.offset() <= kMaxOffsetForSimpleAdjustment) {
+    daddiu(at, src.rm(), kMinOffsetForSimpleAdjustment);
+    src.offset_ -= kMinOffsetForSimpleAdjustment;
+  } else if (-kMaxOffsetForSimpleAdjustment <= src.offset() &&
+             src.offset() < 0) {
+    daddiu(at, src.rm(), -kMinOffsetForSimpleAdjustment);
+    src.offset_ += kMinOffsetForSimpleAdjustment;
+  } else if (kArchVariant == kMips64r6) {
+    // On r6 take advantage of the daui instruction, e.g.:
+    //    daui   AT, base, offset_high
+    //   [dahi   AT, 1]                       // When `offset` is close to +2GB.
+    //    lw     reg_lo, offset_low(AT)
+    //   [lw     reg_hi, (offset_low+4)(AT)]  // If misaligned 64-bit load.
+    // or when offset_low+4 overflows int16_t:
+    //    daui   AT, base, offset_high
+    //    daddiu AT, AT, 8
+    //    lw     reg_lo, (offset_low-8)(AT)
+    //    lw     reg_hi, (offset_low-4)(AT)
+    int16_t offset_low = static_cast<uint16_t>(src.offset());
+    int32_t offset_low32 = offset_low;
+    int16_t offset_high = static_cast<uint16_t>(src.offset() >> 16);
+    bool increment_hi16 = offset_low < 0;
+    bool overflow_hi16 = false;
+
+    if (increment_hi16) {
+      offset_high++;
+      overflow_hi16 = (offset_high == -32768);
+    }
+    daui(at, src.rm(), offset_high);
+
+    if (overflow_hi16) {
+      dahi(at, 1);
+    }
+
+    if (two_accesses && !is_int16(static_cast<int32_t>(
+                            offset_low32 + second_access_add_to_offset))) {
+      // Avoid overflow in the 16-bit offset of the load/store instruction
+      // when adding 4.
+      daddiu(at, at, kDoubleSize);
+      offset_low32 -= kDoubleSize;
+    }
+
+    src.offset_ = offset_low32;
+  } else {
+    // Do not load the whole 32-bit 'offset' if it can be represented as
+    // a sum of three 16-bit signed offsets. This can save an instruction.
+    // To simplify matters, only do this for a symmetric range of offsets from
+    // about -96KB to about +96KB, allowing further addition of 4 when
+    // accessing 64-bit variables with two 32-bit accesses.
+    constexpr int32_t kMinOffsetForMediumAdjustment =
+        2 * kMinOffsetForSimpleAdjustment;
+    constexpr int32_t kMaxOffsetForMediumAdjustment =
+        3 * kMinOffsetForSimpleAdjustment;
+    if (0 <= src.offset() && src.offset() <= kMaxOffsetForMediumAdjustment) {
+      daddiu(at, src.rm(), kMinOffsetForMediumAdjustment / 2);
+      daddiu(at, at, kMinOffsetForMediumAdjustment / 2);
+      src.offset_ -= kMinOffsetForMediumAdjustment;
+    } else if (-kMaxOffsetForMediumAdjustment <= src.offset() &&
+               src.offset() < 0) {
+      daddiu(at, src.rm(), -kMinOffsetForMediumAdjustment / 2);
+      daddiu(at, at, -kMinOffsetForMediumAdjustment / 2);
+      src.offset_ += kMinOffsetForMediumAdjustment;
+    } else {
+      // Now that all shorter options have been exhausted, load the full
+      // 32-bit offset.
+      int32_t loaded_offset = RoundDown(src.offset(), kDoubleSize);
+      lui(at, (loaded_offset >> kLuiShift) & kImm16Mask);
+      ori(at, at, loaded_offset & kImm16Mask);  // Load 32-bit offset.
+      daddu(at, at, src.rm());
+      src.offset_ -= loaded_offset;
+    }
+  }
+  src.rm_ = at;
+
+  DCHECK(is_int16(src.offset()));
+  if (two_accesses) {
+    DCHECK(is_int16(
+        static_cast<int32_t>(src.offset() + second_access_add_to_offset)));
+  }
+  DCHECK(misalignment == (src.offset() & (kDoubleSize - 1)));
+}
 
 void Assembler::lb(Register rd, const MemOperand& rs) {
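
To make the ranges concrete, here is a small standalone simulation of the
pre-r6 decision tree for a few positive sample offsets. The constants are
copied from the function above; everything else, including the restriction
to positive offsets, is illustrative and not V8 code:

    #include <cstdint>
    #include <cstdio>

    int main() {
      constexpr int32_t kMinOffsetForSimpleAdjustment = 0x7ff8;
      constexpr int32_t kMaxOffsetForSimpleAdjustment =
          2 * kMinOffsetForSimpleAdjustment;
      constexpr int32_t kMinOffsetForMediumAdjustment =
          2 * kMinOffsetForSimpleAdjustment;
      constexpr int32_t kMaxOffsetForMediumAdjustment =
          3 * kMinOffsetForSimpleAdjustment;

      // ~40KB: one daddiu; ~84KB: two daddiu; 2MB: full lui/ori/daddu.
      for (int32_t offset : {0x9ff0, 0x14ff0, 0x200000}) {
        if (offset <= kMaxOffsetForSimpleAdjustment) {
          // daddiu(at, base, 0x7ff8); the residual offset stays 8-aligned.
          std::printf("0x%x: 1 daddiu, residual 0x%x\n", offset,
                      offset - kMinOffsetForSimpleAdjustment);
        } else if (offset <= kMaxOffsetForMediumAdjustment) {
          // daddiu(at, base, 0x7ff8); daddiu(at, at, 0x7ff8);
          std::printf("0x%x: 2 daddiu, residual 0x%x\n", offset,
                      offset - kMinOffsetForMediumAdjustment);
        } else {
          // lui/ori load RoundDown(offset, 8); daddu adds the base register.
          int32_t loaded_offset = offset & ~7;
          std::printf("0x%x: lui/ori/daddu of 0x%x, residual 0x%x\n", offset,
                      loaded_offset, offset - loaded_offset);
        }
      }
      return 0;
    }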
@@ -3285,14 +3384,17 @@ MSA_BRANCH_LIST(MSA_BRANCH)
   V(st_w, ST_W)  \
   V(st_d, ST_D)
 
-#define MSA_LD_ST(name, opcode)                                \
-  void Assembler::name(MSARegister wd, const MemOperand& rs) { \
-    if (is_int10(rs.offset())) {                               \
-      GenInstrMsaMI10(opcode, rs.offset(), rs.rm(), wd);       \
-    } else {                                                   \
-      LoadRegPlusOffsetToAt(rs);                               \
-      GenInstrMsaMI10(opcode, 0, at, wd);                      \
-    }                                                          \
+#define MSA_LD_ST(name, opcode)                                  \
+  void Assembler::name(MSARegister wd, const MemOperand& rs) {   \
+    MemOperand source = rs;                                      \
+    AdjustBaseAndOffset(source);                                 \
+    if (is_int10(source.offset())) {                             \
+      GenInstrMsaMI10(opcode, source.offset(), source.rm(), wd); \
+    } else {                                                     \
+      DCHECK(!rs.rm().is(at));                                   \
+      daddiu(at, source.rm(), source.offset());                  \
+      GenInstrMsaMI10(opcode, 0, at, wd);                        \
+    }                                                            \
   }
 
 MSA_LD_ST_LIST(MSA_LD_ST)
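
MSA's MI10 format encodes only a 10-bit signed offset, so an adjusted
offset, though guaranteed to fit int16_t, may still miss the int10_t range;
hence the daddiu fallback above. A standalone illustration of the boundary
(this is_int10 is a local stand-in for V8's helper, not V8 code):

    #include <cstdint>
    #include <cstdio>

    static bool is_int10(int32_t v) { return -512 <= v && v <= 511; }

    int main() {
      // 496 (0x1f0) fits int10_t, 512 (0x200) does not.
      for (int32_t offset : {496, 512}) {
        std::printf("%d: %s\n", offset,
                    is_int10(offset)
                        ? "encoded directly in the MI10 load/store"
                        : "daddiu(at, base, offset), then MI10 with offset 0");
      }
      return 0;
    }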
@@ -1918,8 +1918,18 @@ class Assembler : public AssemblerBase {
   void lsa(Register rd, Register rt, Register rs, uint8_t sa);
   void dlsa(Register rd, Register rt, Register rs, uint8_t sa);
 
   // Helpers.
-  void LoadRegPlusOffsetToAt(const MemOperand& src);
+  // Readable constants for the base and offset adjustment helper; these
+  // indicate whether, aside from 'offset', another value such as
+  // 'offset + 4' must also fit into int16_t.
+  enum class OffsetAccessType : bool {
+    SINGLE_ACCESS = false,
+    TWO_ACCESSES = true
+  };
+
+  // Helper function for memory load/store using base register and offset.
+  void AdjustBaseAndOffset(
+      MemOperand& src,
+      OffsetAccessType access_type = OffsetAccessType::SINGLE_ACCESS,
+      int second_access_add_to_offset = 4);
 
   inline static void set_target_internal_reference_encoded_at(Address pc,
                                                               Address target);
@@ -1873,10 +1873,6 @@ const Operand& rt = Operand(zero_reg), BranchDelaySlot bd = PROTECT
   MemOperand SafepointRegisterSlot(Register reg);
   MemOperand SafepointRegistersAndDoublesSlot(Register reg);
 
-  // Helpers.
-  void LoadRegPlusOffsetToAt(const MemOperand& src);
-  int32_t LoadRegPlusUpperOffsetPartToAt(const MemOperand& src);
-
   bool has_frame_;
   bool has_double_zero_reg_set_;
   Isolate* isolate_;