S390: [wasm-simd] Fix the simulator to correctly represent Simd lanes

Vector register lane numbers on IBM machines are reversed compared to x64. For example, doing an I32x4 extract_lane with lane number 0 on x64 will be equal to lane number 3 on IBM machines. Vector registers are only used for compiling Wasm code at the moment. Wasm is also little endian enforced. On s390 native, we manually do a reverse byte whenever values are loaded/stored from memory to a Simd register. On the simulator however, we do not reverse the bytes and data is just copied as is from one memory location to another location which represents a register. To keep the Wasm simulation accurate, we need to make sure accessing a lane is correctly simulated and as such we reverse the lane number on the getters and setters. We need to be careful when getting/setting values on the Low or High side of a simulated register. In the simulation, "Low" is equal to the MSB and "High" is equal to the LSB on memory. As a result, many of the "#ifdef V8_TARGET_BIG_ENDIAN" blocks on Simd opcodes are not needed anymore as we are now simulating native behaviour. Change-Id: Idfa80cdef7382febb4311c75eb6d3e1d110141fa Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2687756 Commit-Queue: Milad Fa <mfarazma@redhat.com> Reviewed-by: Junliang Yan <junyan@redhat.com> Reviewed-by: Joran Siu <joransiu@ca.ibm.com> Reviewed-by: Milad Fa <mfarazma@redhat.com> Cr-Commit-Position: refs/heads/master@{#72642}

S390: [wasm-simd] Fix the simulator to correctly represent Simd lanes
Vector register lane numbers on IBM machines are reversed compared to x64. For example, doing an I32x4 extract_lane with lane number 0 on x64 will be equal to lane number 3 on IBM machines. Vector registers are only used for compiling Wasm code at the moment. Wasm is also little endian enforced. On s390 native, we manually do a reverse byte whenever values are loaded/stored from memory to a Simd register. On the simulator however, we do not reverse the bytes and data is just copied as is from one memory location to another location which represents a register. To keep the Wasm simulation accurate, we need to make sure accessing a lane is correctly simulated and as such we reverse the lane number on the getters and setters. We need to be careful when getting/setting values on the Low or High side of a simulated register. In the simulation, "Low" is equal to the MSB and "High" is equal to the LSB on memory. As a result, many of the "#ifdef V8_TARGET_BIG_ENDIAN" blocks on Simd opcodes are not needed anymore as we are now simulating native behaviour. Change-Id: Idfa80cdef7382febb4311c75eb6d3e1d110141fa Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2687756 Commit-Queue: Milad Fa <mfarazma@redhat.com> Reviewed-by: Junliang Yan <junyan@redhat.com> Reviewed-by: Joran Siu <joransiu@ca.ibm.com> Reviewed-by: Milad Fa <mfarazma@redhat.com> Cr-Commit-Position: refs/heads/master@{#72642}
efed4036 · Milad Fa · Commit Bot · 2367a714 · efed4036 · efed4036
Commit efed4036 authored Feb 10, 2021 by Milad Fa Committed by Commit Bot Feb 10, 2021
4 changed files
--- a/src/compiler/backend/s390/code-generator-s390.cc
+++ b/src/compiler/backend/s390/code-generator-s390.cc
--- a/src/compiler/backend/s390/instruction-selector-s390.cc
+++ b/src/compiler/backend/s390/instruction-selector-s390.cc
@@ -2711,7 +2711,6 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
  S390OperandGenerator g(this);
  Node* input0 = node->InputAt(0);
  Node* input1 = node->InputAt(1);
-#ifdef V8_TARGET_BIG_ENDIAN
  // Remap the shuffle indices to match IBM lane numbering.
  int max_index = 15;
  int total_lane_count = 2 * kSimd128Size;
@@ -2723,7 +2722,6 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
                               : total_lane_count - current_index + max_index);
  }
  shuffle_p = &shuffle_remapped[0];
-#endif
  Emit(kS390_I8x16Shuffle, g.DefineAsRegister(node),
       g.UseUniqueRegister(input0), g.UseUniqueRegister(input1),
       g.UseImmediate(wasm::SimdShuffle::Pack4Lanes(shuffle_p)),

--- a/src/execution/s390/simulator-s390.cc
+++ b/src/execution/s390/simulator-s390.cc
@@ -3473,11 +3473,10 @@ EVALUATE(VPKLS) {
 template <class S, class D>
 void VectorUnpackHigh(Simulator* sim, int dst, int src) {
  constexpr size_t kItemCount = kSimd128Size / sizeof(D);
-  D value = 0;
-  for (size_t i = 0; i < kItemCount; i++) {
-    value = sim->get_simd_register_by_lane<S>(src, i + kItemCount);
-    sim->set_simd_register_by_lane<D>(dst, i, value);
-  }
+  D temps[kItemCount] = {0};
+  // Avoid overwriting if src and dst are the same register.
+  FOR_EACH_LANE(i, D) { temps[i] = sim->get_simd_register_by_lane<S>(src, i); }
+  FOR_EACH_LANE(i, D) { sim->set_simd_register_by_lane<D>(dst, i, temps[i]); }
 }

 #define CASE(i, S, D)                     \
@@ -3623,8 +3622,14 @@ void VectorUnpackLow(Simulator* sim, int dst, int src) {
  constexpr size_t kItemCount = kSimd128Size / sizeof(D);
  D temps[kItemCount] = {0};
  // Avoid overwriting if src and dst are the same register.
-  FOR_EACH_LANE(i, D) { temps[i] = sim->get_simd_register_by_lane<S>(src, i); }
-  FOR_EACH_LANE(i, D) { sim->set_simd_register_by_lane<D>(dst, i, temps[i]); }
+  // Using the "false" argument here to make sure we use the "Low" side of the
+  // Simd register, being simulated by the LSB in memory.
+  FOR_EACH_LANE(i, D) {
+    temps[i] = sim->get_simd_register_by_lane<S>(src, i, false);
+  }
+  FOR_EACH_LANE(i, D) {
+    sim->set_simd_register_by_lane<D>(dst, i, temps[i], false);
+  }
 }

 #define CASE(i, S, D)                    \
@@ -3871,6 +3876,7 @@ EVALUATE(VPERM) {
  DECODE_VRR_E_INSTRUCTION(r1, r2, r3, r4, m6, m5);
  USE(m5);
  USE(m6);
+  int8_t temp[kSimd128Size] = {0};
  for (int i = 0; i < kSimd128Size; i++) {
    int8_t lane_num = get_simd_register_by_lane<int8_t>(r4, i);
    // Get the five least significant bits.
@@ -3880,8 +3886,10 @@ EVALUATE(VPERM) {
      lane_num = lane_num - kSimd128Size;
      reg = r3;
    }
-    int8_t result = get_simd_register_by_lane<int8_t>(reg, lane_num);
-    set_simd_register_by_lane<int8_t>(r1, i, result);
+    temp[i] = get_simd_register_by_lane<int8_t>(reg, lane_num);
+  }
+  for (int i = 0; i < kSimd128Size; i++) {
+    set_simd_register_by_lane<int8_t>(r1, i, temp[i]);
  }
  return length;
 }

--- a/src/execution/s390/simulator-s390.h
+++ b/src/execution/s390/simulator-s390.h
@@ -137,26 +137,21 @@ class Simulator : public SimulatorBase {
  void set_high_register(int reg, uint32_t value);

  double get_double_from_register_pair(int reg);
+
+  // Unlike integer values, floating point values are located on the leftmost
+  // side of a native 64-bit register. As FP registers are a subset of vector
+  // registers, 64- and 32-bit FP values need to be located on the first lane
+  // (lane number 0) of a vector register.
  template <class T>
  T get_fpr(int dreg) {
    DCHECK(dreg >= 0 && dreg < kNumFPRs);
-    if (sizeof(T) == 8) {
-      return get_simd_register_by_lane<T>(dreg, 0);
-    } else {
-      DCHECK_EQ(sizeof(T), 4);
-      return get_simd_register_by_lane<T>(dreg, 1);
-    }
+    return get_simd_register_by_lane<T>(dreg, 0);
  }

  template <class T>
  void set_fpr(int dreg, const T val) {
    DCHECK(dreg >= 0 && dreg < kNumFPRs);
-    if (sizeof(T) == 8) {
-      set_simd_register_by_lane(dreg, 0, val);
-    } else {
-      DCHECK_EQ(sizeof(T), 4);
-      set_simd_register_by_lane(dreg, 1, val);
-    }
+    set_simd_register_by_lane<T>(dreg, 0, val);
  }

  // Special case of set_register and get_register to access the raw PC value.
@@ -412,8 +407,27 @@ class Simulator : public SimulatorBase {
    set_simd_register_by_lane(reg, 0, v);
  }

+  // Vector register lane numbers on IBM machines are reversed compared to
+  // x64. For example, doing an I32x4 extract_lane with lane number 0 on x64
+  // will be equal to lane number 3 on IBM machines. Vector registers are only
+  // used for compiling Wasm code at the moment. Wasm is also little endian
+  // enforced. On s390 native, we manually do a reverse byte whenever values are
+  // loaded/stored from memory to a Simd register. On the simulator however, we
+  // do not reverse the bytes and data is just copied as is from one memory
+  // location to another location which represents a register. To keep the Wasm
+  // simulation accurate, we need to make sure accessing a lane is correctly
+  // simulated and as such we reverse the lane number on the getters and setters
+  // below. We need to be careful when getting/setting values on the Low or High
+  // side of a simulated register. In the simulation, "Low" is equal to the MSB
+  // and "High" is equal to the LSB on memory. Passing false for
+  // "force_ibm_lane_numbering" disables the automatic lane number reversal and
+  // helps with accessing the Low or High side of a simulated register.
  template <class T>
-  T get_simd_register_by_lane(int reg, int lane) {
+  T get_simd_register_by_lane(int reg, int lane,
+                              bool force_ibm_lane_numbering = true) {
+    if (force_ibm_lane_numbering) {
+      lane = (kSimd128Size / sizeof(T)) - 1 - lane;
+    }
    CHECK_LE(lane, kSimd128Size / sizeof(T));
    CHECK_LT(reg, kNumFPRs);
    CHECK_GE(lane, 0);
@@ -422,7 +436,11 @@ class Simulator : public SimulatorBase {
  }

  template <class T>
-  void set_simd_register_by_lane(int reg, int lane, const T& value) {
+  void set_simd_register_by_lane(int reg, int lane, const T& value,
+                                 bool force_ibm_lane_numbering = true) {
+    if (force_ibm_lane_numbering) {
+      lane = (kSimd128Size / sizeof(T)) - 1 - lane;
+    }
    CHECK_LE(lane, kSimd128Size / sizeof(T));
    CHECK_LT(reg, kNumFPRs);
    CHECK_GE(lane, 0);