Commit 1b2e0ddf authored by Milad Farazmand, committed by Commit Bot

s390: [wasm-simd] Add simulator support for simd operations


Change-Id: I0352ef9e4213d6dc0f50a5406d8e167784408452
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2095755
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Reviewed-by: Joran Siu <joransiu@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#66731}
parent 371580bb
......@@ -2333,6 +2333,13 @@ class VRR_E_Instruction : SixByteInstruction {
DECLARE_FIELD_FOR_SIX_BYTE_INSTR(M5Value, uint32_t, 28, 32)
};
class VRR_F_Instruction : SixByteInstruction {
public:
DECLARE_FIELD_FOR_SIX_BYTE_INSTR(R1Value, int, 8, 12)
DECLARE_FIELD_FOR_SIX_BYTE_INSTR(R2Value, int, 12, 16)
DECLARE_FIELD_FOR_SIX_BYTE_INSTR(R3Value, int, 16, 20)
};
class VRX_Instruction : SixByteInstruction {
public:
DECLARE_FIELD_FOR_SIX_BYTE_INSTR(R1Value, int, 8, 12)
......
......@@ -2941,44 +2941,84 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
// vector extract element
case kS390_F64x2ExtractLane: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vrep(i.OutputDoubleRegister(), i.InputSimd128Register(0),
Operand(1 - i.InputInt8(1)), Condition(3));
#else
__ vrep(i.OutputDoubleRegister(), i.InputSimd128Register(0),
Operand(i.InputInt8(1)), Condition(3));
#endif
break;
}
case kS390_F32x4ExtractLane: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vrep(i.OutputDoubleRegister(), i.InputSimd128Register(0),
Operand(3 - i.InputInt8(1)), Condition(2));
#else
__ vrep(i.OutputDoubleRegister(), i.InputSimd128Register(0),
Operand(i.InputInt8(1)), Condition(2));
#endif
break;
}
case kS390_I64x2ExtractLane: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, 1 - i.InputInt8(1)), Condition(3));
#else
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(3));
#endif
break;
}
case kS390_I32x4ExtractLane: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, 3 - i.InputInt8(1)), Condition(2));
#else
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(2));
#endif
break;
}
case kS390_I16x8ExtractLaneU: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, 7 - i.InputInt8(1)), Condition(1));
#else
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(1));
#endif
break;
}
case kS390_I16x8ExtractLaneS: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(kScratchReg, i.InputSimd128Register(0),
MemOperand(r0, 7 - i.InputInt8(1)), Condition(1));
#else
__ vlgv(kScratchReg, i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(1));
#endif
__ lghr(i.OutputRegister(), kScratchReg);
break;
}
case kS390_I8x16ExtractLaneU: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, 15 - i.InputInt8(1)), Condition(0));
#else
__ vlgv(i.OutputRegister(), i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(0));
#endif
break;
}
case kS390_I8x16ExtractLaneS: {
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(kScratchReg, i.InputSimd128Register(0),
MemOperand(r0, 15 - i.InputInt8(1)), Condition(0));
#else
__ vlgv(kScratchReg, i.InputSimd128Register(0),
MemOperand(r0, i.InputInt8(1)), Condition(0));
#endif
__ lgbr(i.OutputRegister(), kScratchReg);
break;
}
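Editorial note, not part of the patch: every big-endian branch in the extract-lane cases above applies the same index remapping, since WebAssembly numbers lanes starting from the low-order end while the s390 vector element numbering runs the other way on big-endian targets. A minimal standalone sketch of that mapping, using a hypothetical helper name:

// Hypothetical helper: lanes_per_vector is 2, 4, 8 or 16 for a 128-bit vector.
constexpr int RemapLaneForBigEndian(int lanes_per_vector, int lane) {
  return lanes_per_vector - 1 - lane;
}

static_assert(RemapLaneForBigEndian(2, 1) == 0, "F64x2/I64x2 lane 1 -> element 0");
static_assert(RemapLaneForBigEndian(4, 0) == 3, "F32x4/I32x4 lane 0 -> element 3");
static_assert(RemapLaneForBigEndian(16, 15) == 0, "I8x16 lane 15 -> element 0");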
......@@ -2989,8 +3029,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vlr(kScratchDoubleReg, src, Condition(0), Condition(0), Condition(0));
__ vlgv(kScratchReg, i.InputDoubleRegister(2), MemOperand(r0, 0),
Condition(3));
#ifdef V8_TARGET_BIG_ENDIAN
__ vlvg(kScratchDoubleReg, kScratchReg,
MemOperand(r0, 1 - i.InputInt8(1)), Condition(3));
#else
__ vlvg(kScratchDoubleReg, kScratchReg, MemOperand(r0, i.InputInt8(1)),
Condition(3));
#endif
__ vlr(dst, kScratchDoubleReg, Condition(0), Condition(0), Condition(0));
break;
}
......@@ -2998,10 +3043,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Simd128Register src = i.InputSimd128Register(0);
Simd128Register dst = i.OutputSimd128Register();
__ vlr(kScratchDoubleReg, src, Condition(0), Condition(0), Condition(0));
#ifdef V8_TARGET_BIG_ENDIAN
__ vlgv(kScratchReg, i.InputDoubleRegister(2), MemOperand(r0, 0),
Condition(2));
__ vlvg(kScratchDoubleReg, kScratchReg,
MemOperand(r0, 3 - i.InputInt8(1)), Condition(2));
#else
__ vlgv(kScratchReg, i.InputDoubleRegister(2), MemOperand(r0, 1),
Condition(2));
__ vlvg(kScratchDoubleReg, kScratchReg, MemOperand(r0, i.InputInt8(1)),
Condition(2));
#endif
__ vlr(dst, kScratchDoubleReg, Condition(0), Condition(0), Condition(0));
break;
}
......@@ -3011,8 +3063,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
if (src != dst) {
__ vlr(dst, src, Condition(0), Condition(0), Condition(0));
}
#ifdef V8_TARGET_BIG_ENDIAN
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, 1 - i.InputInt8(1)), Condition(3));
#else
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, i.InputInt8(1)), Condition(3));
#endif
break;
}
case kS390_I32x4ReplaceLane: {
......@@ -3021,8 +3078,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
if (src != dst) {
__ vlr(dst, src, Condition(0), Condition(0), Condition(0));
}
#ifdef V8_TARGET_BIG_ENDIAN
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, 3 - i.InputInt8(1)), Condition(2));
#else
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, i.InputInt8(1)), Condition(2));
#endif
break;
}
case kS390_I16x8ReplaceLane: {
......@@ -3031,8 +3093,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
if (src != dst) {
__ vlr(dst, src, Condition(0), Condition(0), Condition(0));
}
#ifdef V8_TARGET_BIG_ENDIAN
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, 7 - i.InputInt8(1)), Condition(1));
#else
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, i.InputInt8(1)), Condition(1));
#endif
break;
}
case kS390_I8x16ReplaceLane: {
......@@ -3041,8 +3108,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
if (src != dst) {
__ vlr(dst, src, Condition(0), Condition(0), Condition(0));
}
#ifdef V8_TARGET_BIG_ENDIAN
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, 15 - i.InputInt8(1)), Condition(0));
#else
__ vlvg(i.OutputSimd128Register(), i.InputRegister(2),
MemOperand(r0, i.InputInt8(1)), Condition(0));
#endif
break;
}
// vector binops
......@@ -3104,6 +3176,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Condition(2));
break;
}
#define FLOAT_ADD_HORIZ(src0, src1, scratch0, scratch1, add0, add1) \
__ vpk(dst, src0, src1, Condition(0), Condition(0), Condition(3)); \
__ vesrl(scratch0, src0, MemOperand(r0, shift_bits), Condition(3)); \
__ vesrl(scratch1, src1, MemOperand(r0, shift_bits), Condition(3)); \
__ vpk(kScratchDoubleReg, scratch0, scratch1, Condition(0), Condition(0), \
Condition(3)); \
__ vfa(dst, add0, add1, Condition(0), Condition(0), Condition(2));
case kS390_F32x4AddHoriz: {
Simd128Register src0 = i.InputSimd128Register(0);
Simd128Register src1 = i.InputSimd128Register(1);
......@@ -3111,16 +3190,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DoubleRegister tempFPReg1 = i.ToSimd128Register(instr->TempAt(0));
DoubleRegister tempFPReg2 = i.ToSimd128Register(instr->TempAt(1));
constexpr int shift_bits = 32;
// generate first operand
__ vpk(dst, src1, src0, Condition(0), Condition(0), Condition(3));
// generate second operand
__ vesrl(tempFPReg1, src0, MemOperand(r0, shift_bits), Condition(3));
__ vesrl(tempFPReg2, src1, MemOperand(r0, shift_bits), Condition(3));
__ vpk(kScratchDoubleReg, tempFPReg2, tempFPReg1, Condition(0),
Condition(0), Condition(3));
// add the operands
__ vfa(dst, kScratchDoubleReg, dst, Condition(0), Condition(0),
Condition(2));
#ifdef V8_TARGET_BIG_ENDIAN
FLOAT_ADD_HORIZ(src1, src0, tempFPReg2, tempFPReg1, kScratchDoubleReg,
dst)
#else
FLOAT_ADD_HORIZ(src0, src1, tempFPReg1, tempFPReg2, dst,
kScratchDoubleReg)
#endif
#undef FLOAT_ADD_HORIZ
break;
}
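For reference, and as an assumption about the intended f32x4 horizontal-add semantics rather than code from this change: the pack/shift sequence above arranges the vectors so each output lane receives the sum of one adjacent input pair. A scalar sketch:

#include <array>

// Scalar reference: result lanes are [a0+a1, a2+a3, b0+b1, b2+b3].
std::array<float, 4> F32x4AddHorizRef(const std::array<float, 4>& a,
                                      const std::array<float, 4>& b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
}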
case kS390_F32x4Sub: {
......@@ -3212,8 +3289,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Condition(2));
__ vsumg(kScratchDoubleReg, src1, kScratchDoubleReg, Condition(0),
Condition(0), Condition(2));
#ifdef V8_TARGET_BIG_ENDIAN
__ vpk(dst, kScratchDoubleReg, dst, Condition(0), Condition(0),
Condition(3));
#else
__ vpk(dst, dst, kScratchDoubleReg, Condition(0), Condition(0),
Condition(3));
#endif
break;
}
case kS390_I32x4Sub: {
......@@ -3244,8 +3326,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Condition(1));
__ vsum(kScratchDoubleReg, src1, kScratchDoubleReg, Condition(0),
Condition(0), Condition(1));
#ifdef V8_TARGET_BIG_ENDIAN
__ vpk(dst, kScratchDoubleReg, dst, Condition(0), Condition(0),
Condition(2));
#else
__ vpk(dst, dst, kScratchDoubleReg, Condition(0), Condition(0),
Condition(2));
#endif
break;
}
case kS390_I16x8Sub: {
......@@ -3710,7 +3797,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kS390_F32x4RecipApprox: {
__ lgfi(kScratchReg, Operand(1));
__ ConvertIntToFloat(kScratchDoubleReg, kScratchReg);
#ifdef V8_TARGET_BIG_ENDIAN
__ vrep(kScratchDoubleReg, kScratchDoubleReg, Operand(0), Condition(2));
#else
__ vrep(kScratchDoubleReg, kScratchDoubleReg, Operand(1), Condition(2));
#endif
__ vfd(i.OutputSimd128Register(), kScratchDoubleReg,
i.InputSimd128Register(0), Condition(0), Condition(0),
Condition(2));
......@@ -3722,7 +3813,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Condition(2));
__ lgfi(kScratchReg, Operand(1));
__ ConvertIntToFloat(kScratchDoubleReg, kScratchReg);
#ifdef V8_TARGET_BIG_ENDIAN
__ vrep(kScratchDoubleReg, kScratchDoubleReg, Operand(0), Condition(2));
#else
__ vrep(kScratchDoubleReg, kScratchDoubleReg, Operand(1), Condition(2));
#endif
__ vfd(i.OutputSimd128Register(), kScratchDoubleReg, tempFPReg1,
Condition(0), Condition(0), Condition(2));
break;
......@@ -3841,17 +3936,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
// vector conversions
#define CONVERT_FLOAT_TO_INT32(convert) \
for (int index = 0; index < 4; index++) { \
__ vlgv(kScratchReg, kScratchDoubleReg, MemOperand(r0, index), \
Condition(2)); \
__ vlvg(kScratchDoubleReg, kScratchReg, MemOperand(r0, 0), Condition(2)); \
__ convert(kScratchReg, kScratchDoubleReg, kRoundToZero); \
__ vlvg(dst, kScratchReg, MemOperand(r0, index), Condition(2)); \
#define CONVERT_FLOAT_TO_INT32(convert) \
for (int index = 0; index < 4; index++) { \
__ vlgv(kScratchReg, kScratchDoubleReg, MemOperand(r0, index), \
Condition(2)); \
__ MovIntToFloat(tempFPReg1, kScratchReg); \
__ convert(kScratchReg, tempFPReg1, kRoundToZero); \
__ vlvg(dst, kScratchReg, MemOperand(r0, index), Condition(2)); \
}
case kS390_I32x4SConvertF32x4: {
Simd128Register src = i.InputSimd128Register(0);
Simd128Register dst = i.OutputSimd128Register();
Simd128Register tempFPReg1 = i.ToSimd128Register(instr->TempAt(0));
// NaN to 0
__ vlr(kScratchDoubleReg, src, Condition(0), Condition(0), Condition(0));
__ vfce(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg,
......@@ -3864,6 +3960,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kS390_I32x4UConvertF32x4: {
Simd128Register src = i.InputSimd128Register(0);
Simd128Register dst = i.OutputSimd128Register();
Simd128Register tempFPReg1 = i.ToSimd128Register(instr->TempAt(0));
// NaN to 0, negative to 0
__ vx(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg,
Condition(0), Condition(0), Condition(0));
......@@ -3873,21 +3970,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
#undef CONVERT_FLOAT_TO_INT32
#define CONVERT_INT32_TO_FLOAT(convert) \
Simd128Register src = i.InputSimd128Register(0); \
Simd128Register dst = i.OutputSimd128Register(); \
for (int index = 0; index < 4; index++) { \
__ vlgv(kScratchReg, src, MemOperand(r0, index), Condition(2)); \
__ convert(kScratchDoubleReg, kScratchReg); \
__ vlgv(kScratchReg, kScratchDoubleReg, MemOperand(r0, 0), Condition(2)); \
__ vlvg(dst, kScratchReg, MemOperand(r0, index), Condition(2)); \
#define CONVERT_INT32_TO_FLOAT(convert, double_index) \
Simd128Register src = i.InputSimd128Register(0); \
Simd128Register dst = i.OutputSimd128Register(); \
for (int index = 0; index < 4; index++) { \
__ vlgv(kScratchReg, src, MemOperand(r0, index), Condition(2)); \
__ convert(kScratchDoubleReg, kScratchReg); \
__ MovFloatToInt(kScratchReg, kScratchDoubleReg); \
__ vlvg(dst, kScratchReg, MemOperand(r0, index), Condition(2)); \
}
case kS390_F32x4SConvertI32x4: {
CONVERT_INT32_TO_FLOAT(ConvertIntToFloat)
#ifdef V8_TARGET_BIG_ENDIAN
CONVERT_INT32_TO_FLOAT(ConvertIntToFloat, 0)
#else
CONVERT_INT32_TO_FLOAT(ConvertIntToFloat, 1)
#endif
break;
}
case kS390_F32x4UConvertI32x4: {
CONVERT_INT32_TO_FLOAT(ConvertUnsignedIntToFloat)
#ifdef V8_TARGET_BIG_ENDIAN
CONVERT_INT32_TO_FLOAT(ConvertUnsignedIntToFloat, 0)
#else
CONVERT_INT32_TO_FLOAT(ConvertUnsignedIntToFloat, 1)
#endif
break;
}
#undef CONVERT_INT32_TO_FLOAT
......@@ -4044,8 +4149,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputInt32(4), i.InputInt32(5)};
// create 2 * 8 byte inputs indicating new indices
for (int i = 0, j = 0; i < 2; i++, j += 2) {
#ifdef V8_TARGET_BIG_ENDIAN
__ lgfi(i < 1 ? ip : r0, Operand(k8x16_indices[j + 1]));
__ aih(i < 1 ? ip : r0, Operand(k8x16_indices[j]));
#else
__ lgfi(i < 1 ? ip : r0, Operand(k8x16_indices[j]));
__ aih(i < 1 ? ip : r0, Operand(k8x16_indices[j + 1]));
#endif
}
__ vlvgp(kScratchDoubleReg, ip, r0);
__ vperm(dst, src0, src1, kScratchDoubleReg, Condition(0), Condition(0));
......@@ -4055,6 +4165,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Simd128Register dst = i.OutputSimd128Register(),
src0 = i.InputSimd128Register(0),
src1 = i.InputSimd128Register(1);
#ifdef V8_TARGET_BIG_ENDIAN
// input needs to be reversed
__ vlgv(r0, src0, MemOperand(r0, 0), Condition(3));
__ vlgv(r1, src0, MemOperand(r0, 1), Condition(3));
......@@ -4064,6 +4175,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// clear src0
__ vx(src0, src0, src0, Condition(0), Condition(0), Condition(0));
__ vperm(dst, kScratchDoubleReg, src0, src1, Condition(0), Condition(0));
#else
__ vx(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg,
Condition(0), Condition(0), Condition(0));
__ vperm(dst, src0, kScratchDoubleReg, src1, Condition(0), Condition(0));
#endif
break;
}
default:
......
......@@ -2750,11 +2750,12 @@ SIMD_BOOL_LIST(SIMD_VISIT_BOOL)
#undef SIMD_VISIT_BOOL
#undef SIMD_BOOL_LIST
#define SIMD_VISIT_CONVERSION(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
S390OperandGenerator g(this); \
Emit(kS390_##Opcode, g.DefineAsRegister(node), \
g.UseRegister(node->InputAt(0))); \
#define SIMD_VISIT_CONVERSION(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
S390OperandGenerator g(this); \
InstructionOperand temps[] = {g.TempSimd128Register()}; \
Emit(kS390_##Opcode, g.DefineAsRegister(node), \
g.UseRegister(node->InputAt(0)), arraysize(temps), temps); \
}
SIMD_CONVERSION_LIST(SIMD_VISIT_CONVERSION)
#undef SIMD_VISIT_CONVERSION
......@@ -2782,6 +2783,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
S390OperandGenerator g(this);
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
#ifdef V8_TARGET_BIG_ENDIAN
// input registers are each in reverse order, so we have to remap the
// shuffle indices
int max_index = 15;
......@@ -2801,12 +2803,21 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
g.UseImmediate(Pack4Lanes(shuffle_remapped + 8)),
g.UseImmediate(Pack4Lanes(shuffle_remapped + 4)),
g.UseImmediate(Pack4Lanes(shuffle_remapped)));
#else
Emit(kS390_S8x16Shuffle, g.DefineAsRegister(node),
g.UseUniqueRegister(input0), g.UseUniqueRegister(input1),
g.UseImmediate(Pack4Lanes(shuffle)),
g.UseImmediate(Pack4Lanes(shuffle + 4)),
g.UseImmediate(Pack4Lanes(shuffle + 8)),
g.UseImmediate(Pack4Lanes(shuffle + 12)));
#endif
}
void InstructionSelector::VisitS8x16Swizzle(Node* node) {
S390OperandGenerator g(this);
Emit(kS390_S8x16Swizzle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
g.UseUniqueRegister(node->InputAt(0)),
g.UseUniqueRegister(node->InputAt(1)));
}
void InstructionSelector::VisitS128Zero(Node* node) {
......
......@@ -747,17 +747,21 @@ void Simulator::EvalTableInit() {
#define S390_SUPPORTED_VECTOR_OPCODE_LIST(V) \
V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
V(vl, VL, 0xE706) /* type = VRX VECTOR LOAD */ \
V(vlp, VLP, 0xE7DF) /* type = VRR_A VECTOR LOAD POSITIVE */ \
V(vlgv, VLGV, 0xE721) /* type = VRS_C VECTOR LOAD GR FROM VR ELEMENT */ \
V(vlvg, VLVG, 0xE722) /* type = VRS_B VECTOR LOAD VR ELEMENT FROM GR */ \
V(vlvgp, VLVGP, 0xE762) /* type = VRR_F VECTOR LOAD VR FROM GRS DISJOINT */ \
V(vrep, VREP, 0xE74D) /* type = VRI_C VECTOR REPLICATE */ \
V(vlrep, VLREP, 0xE705) /* type = VRX VECTOR LOAD AND REPLICATE */ \
V(vrepi, VREPI, 0xE745) /* type = VRI_A VECTOR REPLICATE IMMEDIATE */ \
V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
V(vavgl, VAVGL, 0xE7F0) /* type = VRR_C VECTOR AVERAGE LOGICAL */ \
V(va, VA, 0xE7F3) /* type = VRR_C VECTOR ADD */ \
V(vs, VS, 0xE7F7) /* type = VRR_C VECTOR SUBTRACT */ \
V(vml, VML, 0xE7A2) /* type = VRR_C VECTOR MULTIPLY LOW */ \
V(vnc, VNC, 0xE769) /* type = VRR_C VECTOR AND WITH COMPLEMENT */ \
V(vsum, VSUM, 0xE764) /* type = VRR_C VECTOR SUM ACROSS WORD */ \
V(vsumg, VSUMG, 0xE765) /* type = VRR_C VECTOR SUM ACROSS DOUBLEWORD */ \
V(vpk, VPK, 0xE794) /* type = VRR_C VECTOR PACK */ \
......@@ -777,14 +781,21 @@ void Simulator::EvalTableInit() {
V(vch, VCH, 0xE7FB) /* type = VRR_B VECTOR COMPARE HIGH */ \
V(vo, VO, 0xE76A) /* type = VRR_C VECTOR OR */ \
V(vn, VN, 0xE768) /* type = VRR_C VECTOR AND */ \
V(vno, VNO, 0xE76B) /* type = VRR_C VECTOR NOR */ \
V(vlc, VLC, 0xE7DE) /* type = VRR_A VECTOR LOAD COMPLEMENT */ \
V(vsel, VSEL, 0xE78D) /* type = VRR_E VECTOR SELECT */ \
V(vperm, VPERM, 0xE78C) /* type = VRR_E VECTOR PERMUTE */ \
V(vtm, VTM, 0xE7D8) /* type = VRR_A VECTOR TEST UNDER MASK */ \
V(vesl, VESL, 0xE730) /* type = VRS_A VECTOR ELEMENT SHIFT LEFT */ \
V(veslv, VESLV, 0xE770) /* type = VRR_C VECTOR ELEMENT SHIFT LEFT */ \
V(vesrl, VESRL, \
0xE738) /* type = VRS_A VECTOR ELEMENT SHIFT RIGHT LOGICAL */ \
V(vesrlv, VESRLV, \
0xE778) /* type = VRR_C VECTOR ELEMENT SHIFT RIGHT LOGICAL */ \
V(vesra, VESRA, \
0xE73A) /* type = VRS_A VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ \
V(vesrav, VESRAV, \
0xE77A) /* type = VRR_C VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ \
V(vfsq, VFSQ, 0xE7CE) /* type = VRR_A VECTOR FP SQUARE ROOT */ \
V(vfmax, VFMAX, 0xE7EF) /* type = VRR_C VECTOR FP MAXIMUM */ \
V(vfmin, VFMIN, 0xE7EE) /* type = VRR_C VECTOR FP MINIMUM */ \
......@@ -796,7 +807,10 @@ void Simulator::EvalTableInit() {
V(vfs, VFS, 0xE7E2) /* type = VRR_C VECTOR FP SUBTRACT */ \
V(vfa, VFA, 0xE7E3) /* type = VRR_C VECTOR FP ADD */ \
V(vfd, VFD, 0xE7E5) /* type = VRR_C VECTOR FP DIVIDE */ \
V(vfm, VFM, 0xE7E7) /* type = VRR_C VECTOR FP MULTIPLY */
V(vfm, VFM, 0xE7E7) /* type = VRR_C VECTOR FP MULTIPLY */ \
V(vfma, VFMA, 0xE78F) /* type = VRR_E VECTOR FP MULTIPLY AND ADD */ \
V(vfnms, VFNMS, \
0xE79E) /* type = VRR_E VECTOR FP NEGATIVE MULTIPLY AND SUBTRACT */
#define CREATE_EVALUATE_TABLE(name, op_name, op_value) \
EvalTable[op_name] = &Simulator::Evaluate_##op_name;
......@@ -2870,6 +2884,12 @@ uintptr_t Simulator::PopAddress() {
int m5 = AS(VRR_E_Instruction)->M5Value(); \
int length = 6;
#define DECODE_VRR_F_INSTRUCTION(r1, r2, r3) \
int r1 = AS(VRR_F_Instruction)->R1Value(); \
int r2 = AS(VRR_F_Instruction)->R2Value(); \
int r3 = AS(VRR_F_Instruction)->R3Value(); \
int length = 6;
#define DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3) \
int r1 = AS(VRX_Instruction)->R1Value(); \
int x2 = AS(VRX_Instruction)->X2Value(); \
......@@ -2927,6 +2947,78 @@ EVALUATE(VL) {
return length;
}
#define VECTOR_LOAD_POSITIVE(r1, r2, type) \
for (size_t i = 0, j = 0; j < kSimd128Size; i++, j += sizeof(type)) { \
set_simd_register_by_lane<type>( \
r1, i, std::abs(get_simd_register_by_lane<type>(r2, i))); \
}
EVALUATE(VLP) {
DCHECK_OPCODE(VLP);
DECODE_VRR_A_INSTRUCTION(r1, r2, m5, m4, m3);
USE(m5);
USE(m4);
switch (m3) {
case 0: {
VECTOR_LOAD_POSITIVE(r1, r2, int8_t)
break;
}
case 1: {
VECTOR_LOAD_POSITIVE(r1, r2, int16_t)
break;
}
case 2: {
VECTOR_LOAD_POSITIVE(r1, r2, int32_t)
break;
}
case 3: {
VECTOR_LOAD_POSITIVE(r1, r2, int64_t)
break;
}
default:
UNREACHABLE();
}
return length;
}
#undef VECTOR_LOAD_POSITIVE
#define VECTOR_AVERAGE_U(r1, r2, r3, type) \
for (size_t i = 0, j = 0; j < kSimd128Size; i++, j += sizeof(type)) { \
type src0 = get_simd_register_by_lane<type>(r2, i); \
type src1 = get_simd_register_by_lane<type>(r3, i); \
set_simd_register_by_lane<type>( \
r1, i, (static_cast<type>(src0) + static_cast<type>(src1) + 1) >> 1); \
}
EVALUATE(VAVGL) {
DCHECK_OPCODE(VAVGL);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
switch (m4) {
case 0: {
VECTOR_AVERAGE_U(r1, r2, r3, uint8_t)
break;
}
case 1: {
VECTOR_AVERAGE_U(r1, r2, r3, uint16_t)
break;
}
case 2: {
VECTOR_AVERAGE_U(r1, r2, r3, uint32_t)
break;
}
case 3: {
VECTOR_AVERAGE_U(r1, r2, r3, uint64_t)
break;
}
default:
UNREACHABLE();
}
return length;
}
#undef VECTOR_AVERAGE_U
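Editorial sketch, not part of the patch: per lane, the VAVGL handler above computes the rounding average (a + b + 1) >> 1. Widening first makes the intent explicit for the 8-bit case:

#include <cstdint>

// Scalar form of the rounding average applied to each unsigned lane.
uint8_t RoundingAverageU8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((static_cast<uint16_t>(a) + b + 1) >> 1);
}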
EVALUATE(VLGV) {
DCHECK_OPCODE(VLGV);
DECODE_VRS_INSTRUCTION(r1, r3, b2, d2, m4);
......@@ -2950,6 +3042,14 @@ EVALUATE(VLVG) {
return length;
}
EVALUATE(VLVGP) {
DCHECK_OPCODE(VLVGP);
DECODE_VRR_F_INSTRUCTION(r1, r2, r3);
set_simd_register_by_lane<int64_t>(r1, 0, get_register(r2));
set_simd_register_by_lane<int64_t>(r1, 1, get_register(r3));
return length;
}
EVALUATE(VREP) {
DCHECK_OPCODE(VREP);
DECODE_VRI_C_INSTRUCTION(r1, r3, i2, m4);
......@@ -3084,6 +3184,20 @@ EVALUATE(VML) {
return length;
}
EVALUATE(VNC) {
DCHECK_OPCODE(VNC);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
USE(m4);
for (int i = 0; i < 2; i++) {
uint64_t lane_1 = get_simd_register_by_lane<uint64_t>(r2, i);
uint64_t lane_2 = get_simd_register_by_lane<uint64_t>(r3, i);
set_simd_register_by_lane<uint64_t>(r1, i, lane_1 & ~lane_2);
}
return length;
}
template <class S, class D>
void VectorSum(void* dst, void* src1, void* src2) {
D value = 0;
......@@ -3490,6 +3604,42 @@ EVALUATE(VX) {
return length;
}
#define VECTOR_NOR(r1, r2, r3, type) \
for (size_t i = 0, j = 0; j < kSimd128Size; i++, j += sizeof(type)) { \
type src0 = get_simd_register_by_lane<type>(r2, i); \
type src1 = get_simd_register_by_lane<type>(r3, i); \
set_simd_register_by_lane<type>(r1, i, ~(src0 | src1)); \
}
EVALUATE(VNO) {
DCHECK_OPCODE(VNO);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
switch (m4) {
case 0: {
VECTOR_NOR(r1, r2, r3, int8_t)
break;
}
case 1: {
VECTOR_NOR(r1, r2, r3, int16_t)
break;
}
case 2: {
VECTOR_NOR(r1, r2, r3, int32_t)
break;
}
case 3: {
VECTOR_NOR(r1, r2, r3, int64_t)
break;
}
default:
UNREACHABLE();
}
return length;
}
#undef VECTOR_NOR
template <class T>
void VectorLoadComplement(void* dst, void* src) {
int8_t* src_ptr = reinterpret_cast<int8_t*>(src);
......@@ -3530,6 +3680,27 @@ EVALUATE(VLC) {
return length;
}
EVALUATE(VPERM) {
DCHECK_OPCODE(VPERM);
DECODE_VRR_E_INSTRUCTION(r1, r2, r3, r4, m6, m5);
USE(m5);
USE(m6);
for (int i = 0; i < kSimd128Size; i++) {
int8_t lane_num = get_simd_register_by_lane<int8_t>(r4, i);
int reg = r2;
if (lane_num >= kSimd128Size) {
lane_num = lane_num - kSimd128Size;
reg = r3;
}
int8_t result = 0;
if (lane_num >= 0 && lane_num < kSimd128Size * 2) {
result = get_simd_register_by_lane<int8_t>(reg, lane_num);
}
set_simd_register_by_lane<int8_t>(r1, i, result);
}
return length;
}
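A scalar restatement of the VPERM loop above (editorial, names are illustrative): each result byte selects from the 32-byte concatenation of the two sources, and any index the handler treats as out of range yields zero.

#include <array>
#include <cstdint>

// Reference: idx[i] in [0,15] picks a byte of a, [16,31] picks a byte of b,
// anything else yields 0, matching the handler above.
std::array<uint8_t, 16> VPermRef(const std::array<uint8_t, 16>& a,
                                 const std::array<uint8_t, 16>& b,
                                 const std::array<uint8_t, 16>& idx) {
  std::array<uint8_t, 16> out{};
  for (int i = 0; i < 16; i++) {
    const int k = idx[i];
    out[i] = static_cast<uint8_t>((k < 16) ? a[k] : (k < 32) ? b[k - 16] : 0);
  }
  return out;
}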
EVALUATE(VSEL) {
DCHECK_OPCODE(VSEL);
DECODE_VRR_E_INSTRUCTION(r1, r2, r3, r4, m6, m5);
......@@ -3606,6 +3777,65 @@ EVALUATE(VESRL) {
return length;
}
#define VECTOR_SHIFT_WITH_OPERAND_TYPE(r1, r2, r3, type, op) \
for (size_t i = 0, j = 0; j < kSimd128Size; i++, j += sizeof(type)) { \
type src0 = get_simd_register_by_lane<type>(r2, i); \
type src1 = get_simd_register_by_lane<type>(r3, i); \
set_simd_register_by_lane<type>(r1, i, \
src0 op(src1 % (sizeof(type) * 8))); \
}
#define VECTOR_SHIFT_WITH_OPERAND(r1, r2, r3, op, sign) \
switch (m4) { \
case 0: { \
VECTOR_SHIFT_WITH_OPERAND_TYPE(r1, r2, r3, sign##int8_t, op) \
break; \
} \
case 1: { \
VECTOR_SHIFT_WITH_OPERAND_TYPE(r1, r2, r3, sign##int16_t, op) \
break; \
} \
case 2: { \
VECTOR_SHIFT_WITH_OPERAND_TYPE(r1, r2, r3, sign##int32_t, op) \
break; \
} \
case 3: { \
VECTOR_SHIFT_WITH_OPERAND_TYPE(r1, r2, r3, sign##int64_t, op) \
break; \
} \
default: \
UNREACHABLE(); \
}
EVALUATE(VESLV) {
DCHECK_OPCODE(VESLV);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
VECTOR_SHIFT_WITH_OPERAND(r1, r2, r3, <<, )
return length;
}
EVALUATE(VESRAV) {
DCHECK_OPCODE(VESRAV);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
VECTOR_SHIFT_WITH_OPERAND(r1, r2, r3, >>, )
return length;
}
EVALUATE(VESRLV) {
DCHECK_OPCODE(VESRLV);
DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
USE(m6);
USE(m5);
VECTOR_SHIFT_WITH_OPERAND(r1, r2, r3, >>, u)
return length;
}
#undef VECTOR_SHIFT_WITH_OPERAND
#undef VECTOR_SHIFT_WITH_OPERAND_TYPE
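The variable-shift handlers above (VESLV, VESRLV, VESRAV) reduce each lane's shift count modulo the element width before shifting. A minimal scalar sketch of one 32-bit lane (editorial):

#include <cstdint>

// One VESLV lane: the count from the second source is taken modulo 32.
uint32_t VeslvLane32(uint32_t value, uint32_t count) {
  return value << (count % 32);
}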
EVALUATE(VTM) {
DCHECK_OPCODE(VTM);
DECODE_VRR_A_INSTRUCTION(r1, r2, m5, m4, m3);
......@@ -3688,6 +3918,63 @@ EVALUATE(VFD) {
return length;
}
#define VECTOR_FP_MULTIPLY_QFMS_OPERATION(type, op, sign, first_lane_only) \
for (size_t i = 0, j = 0; j < kSimd128Size; i++, j += sizeof(type)) { \
type src0 = get_simd_register_by_lane<type>(r2, i); \
type src1 = get_simd_register_by_lane<type>(r3, i); \
type src2 = get_simd_register_by_lane<type>(r4, i); \
type result = sign * (src0 * src1 op src2); \
if (isinf(src0)) result = src0; \
if (isinf(src1)) result = src1; \
if (isinf(src2)) result = src2; \
set_simd_register_by_lane<type>(r1, i, result); \
if (first_lane_only) break; \
}
#define VECTOR_FP_MULTIPLY_QFMS(op, sign) \
switch (m6) { \
case 2: \
DCHECK(CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_1)); \
if (m5 == 8) { \
VECTOR_FP_MULTIPLY_QFMS_OPERATION(float, op, sign, true) \
} else { \
DCHECK_EQ(m5, 0); \
VECTOR_FP_MULTIPLY_QFMS_OPERATION(float, op, sign, false) \
} \
break; \
case 3: \
if (m5 == 8) { \
VECTOR_FP_MULTIPLY_QFMS_OPERATION(double, op, sign, true) \
} else { \
DCHECK_EQ(m5, 0); \
VECTOR_FP_MULTIPLY_QFMS_OPERATION(double, op, sign, false) \
} \
break; \
default: \
UNREACHABLE(); \
break; \
}
EVALUATE(VFMA) {
DCHECK_OPCODE(VFMA);
DECODE_VRR_E_INSTRUCTION(r1, r2, r3, r4, m6, m5);
USE(m5);
USE(m6);
VECTOR_FP_MULTIPLY_QFMS(+, 1)
return length;
}
EVALUATE(VFNMS) {
DCHECK_OPCODE(VFNMS);
DECODE_VRR_E_INSTRUCTION(r1, r2, r3, r4, m6, m5);
USE(m5);
USE(m6);
VECTOR_FP_MULTIPLY_QFMS(-, -1)
return length;
}
#undef VECTOR_FP_MULTIPLY_QFMS
#undef VECTOR_FP_MULTIPLY_QFMS_OPERATION
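Per lane, the macro above evaluates sign * (a * b op c), so VFMA yields a*b + c and VFNMS yields -(a*b - c), with infinities passed through from any operand. A scalar sketch with hypothetical names (editorial):

// VFMA lane: fused multiply-add; VFNMS lane: negated multiply-subtract.
double VfmaLane(double a, double b, double c) { return a * b + c; }
double VfnmsLane(double a, double b, double c) { return -(a * b - c); }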
template <class T, class Operation>
void VectorFPMaxMin(void* dst, void* src1, void* src2, Operation op) {
T* dst_ptr = reinterpret_cast<T*>(dst);
......@@ -3707,8 +3994,13 @@ void VectorFPMaxMin(void* dst, void* src1, void* src2, Operation op) {
#define VECTOR_FP_MAX_MIN_FOR_TYPE(type, op) \
VectorFPMaxMin<type>(&get_simd_register(r1), &get_simd_register(r2), \
&get_simd_register(r3), \
[](type a, type b) { return (a op b) ? a : b; });
&get_simd_register(r3), [](type a, type b) { \
if (signbit(b) op signbit(a)) \
return a; \
else if (signbit(b) != signbit(a)) \
return b; \
return (a op b) ? a : b; \
});
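The signbit checks added to the comparison lambda let max/min order signed zeros: when the operands differ only in the sign of zero, a plain a > b comparison cannot separate them. A standalone sketch of the max direction (editorial):

#include <cmath>

// If the sign bits differ, the non-negative operand wins for max; otherwise
// fall back to the ordinary comparison, so Max(+0.0, -0.0) is +0.0.
double MaxWithSignedZero(double a, double b) {
  if (std::signbit(b) > std::signbit(a)) return a;
  if (std::signbit(b) != std::signbit(a)) return b;
  return (a > b) ? a : b;
}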
#define VECTOR_FP_MAX_MIN(op) \
switch (m4) { \
......@@ -7090,14 +7382,19 @@ EVALUATE(CFEBRA) {
break;
}
case ROUND_TOWARD_0: {
// check for overflow, cast r2_fval to 64bit integer
// check for overflow, cast r2_fval to double
// then check value within the range of INT_MIN and INT_MAX
// and set condition code accordingly
int64_t temp = static_cast<int64_t>(r2_fval);
if (temp < INT_MIN || temp > INT_MAX) {
double temp = static_cast<double>(r2_fval);
if (temp < INT_MIN) {
r1_val = kMinInt;
condition_reg_ = CC_OF;
} else if (temp > INT_MAX) {
r1_val = kMaxInt;
condition_reg_ = CC_OF;
} else {
r1_val = static_cast<int32_t>(r2_fval);
}
r1_val = static_cast<int32_t>(r2_fval);
break;
}
case ROUND_TOWARD_PLUS_INFINITE: {
......@@ -7217,8 +7514,11 @@ EVALUATE(CLFEBR) {
DECODE_RRE_INSTRUCTION(r1, r2);
float r2_val = get_float32_from_d_register(r2);
uint32_t r1_val = static_cast<uint32_t>(r2_val);
set_low_register(r1, r1_val);
SetS390ConvertConditionCode<double>(r2_val, r1_val, UINT32_MAX);
double temp = static_cast<double>(r2_val);
if (temp < 0) r1_val = 0;
if (temp > kMaxUInt32) r1_val = kMaxUInt32;
set_low_register(r1, r1_val);
return length;
}
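Both conversion fixes above clamp out-of-range inputs instead of relying on the raw cast: CFEBRA (round toward zero) saturates to kMinInt/kMaxInt and sets the overflow condition code, and CLFEBR clamps to the unsigned 32-bit range. A simplified sketch of the unsigned direction (editorial; mapping NaN to zero here is an assumption, not taken from the patch):

#include <cstdint>
#include <limits>

// Illustrative only: clamp to [0, UINT32_MAX] before converting.
uint32_t SaturatingFloatToUint32(float v) {
  const double d = static_cast<double>(v);
  if (!(d > 0.0)) return 0;  // negatives, zero and NaN map to 0 in this sketch
  if (d >= static_cast<double>(std::numeric_limits<uint32_t>::max()))
    return std::numeric_limits<uint32_t>::max();
  return static_cast<uint32_t>(d);
}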
......@@ -10900,6 +11200,7 @@ EVALUATE(CXZT) {
#undef DECODE_VRR_B_INSTRUCTION
#undef DECODE_VRR_C_INSTRUCTION
#undef DECODE_VRR_E_INSTRUCTION
#undef DECODE_VRR_F_INSTRUCTION
#undef DECODE_VRX_INSTRUCTION
#undef DECODE_VRS_INSTRUCTION
#undef DECODE_VRI_A_INSTRUCTION
......
......@@ -503,6 +503,7 @@ class Simulator : public SimulatorBase {
S390_VRR_A_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRR_C_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRR_E_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRR_F_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRX_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRS_A_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
S390_VRS_B_OPCODE_LIST(EVALUATE_VR_INSTRUCTIONS)
......