Commit 3489bdf8 authored by Milad Fa, committed by V8 LUCI CQ

S390 [simd]: Implement vector load and splat

This CL takes advantage of the z15 `load byte reversed element and
replicate` (vlbrrep) instruction to optimize Simd LoadSplat opcodes.

On the simulator we only emit `load and replicate` (vlrep), as byte
reversal is not required.

We will need to implement the rest of the `load transform` ops
before enabling this from the wasm compiler on BE machines.
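For reference, the big-endian path added here expands roughly as follows for
the 64x2 case. This is a hand expansion of the LOAD_SPLAT macro in the
TurboAssembler diff below, not the literal generated source; r1 is the scratch
register the fallback uses:

  void TurboAssembler::LoadAndSplat64x2LE(Simd128Register dst,
                                          const MemOperand& mem) {
    if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&
        is_uint12(mem.offset())) {
      // z15 fast path: byte-reverse the loaded doubleword and replicate it
      // across both lanes in a single instruction.
      vlbrrep(dst, mem, Condition(3));
      return;
    }
    // Fallback: byte-reversed scalar load, insert into lane 0, replicate.
    LoadU64LE(r1, mem);
    vlvg(dst, r1, MemOperand(r0, 0), Condition(3));
    vrep(dst, dst, Operand(0), Condition(3));
  }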

Change-Id: I81ffedf51c3d35dbbc2a6455a2756cad25434127
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3115142
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76490}
parent 409e02c1
...@@ -1559,8 +1559,10 @@ using SixByteInstr = uint64_t;
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vst, VST, 0xE70E) /* type = VRX VECTOR STORE */ \
V(vlbr, VLBR, 0xE606) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENTS */ \
V(vstbr, VSTBR, \
  0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
V(vlbrrep, VLBRREP, \
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */
#define S390_RIE_G_OPCODE_LIST(V) \
V(lochi, LOCHI, \
......
...@@ -3924,6 +3924,31 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
}
}
void TurboAssembler::LoadAndSplat8x16LE(Simd128Register dst,
const MemOperand& mem) {
vlrep(dst, mem, Condition(0));
}
#define LOAD_SPLAT_LIST(V) \
V(64x2, LoadU64LE, 3) \
V(32x4, LoadU32LE, 2) \
V(16x8, LoadU16LE, 1)
#define LOAD_SPLAT(name, scalar_instr, condition) \
void TurboAssembler::LoadAndSplat##name##LE(Simd128Register dst, \
const MemOperand& mem) { \
if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \
is_uint12(mem.offset())) { \
vlbrrep(dst, mem, Condition(condition)); \
return; \
} \
scalar_instr(r1, mem); \
vlvg(dst, r1, MemOperand(r0, 0), Condition(condition)); \
vrep(dst, dst, Operand(0), Condition(condition)); \
}
LOAD_SPLAT_LIST(LOAD_SPLAT)
#undef LOAD_SPLAT
#undef LOAD_SPLAT_LIST
#else
void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
Register scratch) {
...@@ -3996,6 +4021,21 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
StoreV128(src, mem, scratch1);
}
#define LOAD_SPLAT_LIST(V) \
V(64x2, 3) \
V(32x4, 2) \
V(16x8, 1) \
V(8x16, 0)
#define LOAD_SPLAT(name, condition) \
void TurboAssembler::LoadAndSplat##name##LE(Simd128Register dst, \
const MemOperand& mem) { \
vlrep(dst, mem, Condition(condition)); \
}
LOAD_SPLAT_LIST(LOAD_SPLAT)
#undef LOAD_SPLAT
#undef LOAD_SPLAT_LIST
#endif
// Load And Test (Reg <- Reg)
......
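On little-endian targets (for example the simulator running on an LE host),
the macro in the hunk above reduces to a single `vlrep`. Hand-expanded for the
32x4 case, i.e. LOAD_SPLAT(32x4, 2), it is roughly:

  void TurboAssembler::LoadAndSplat32x4LE(Simd128Register dst,
                                          const MemOperand& mem) {
    // No byte reversal needed; load the word and replicate it into all lanes.
    vlrep(dst, mem, Condition(2));
  }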
...@@ -392,6 +392,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Register scratch1);
void LoadF64LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
void LoadF32LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
void LoadAndSplat64x2LE(Simd128Register dst, const MemOperand& mem);
void LoadAndSplat32x4LE(Simd128Register dst, const MemOperand& mem);
void LoadAndSplat16x8LE(Simd128Register dst, const MemOperand& mem);
void LoadAndSplat8x16LE(Simd128Register dst, const MemOperand& mem);
// Load And Test
void LoadAndTest32(Register dst, Register src);
......
...@@ -2218,6 +2218,28 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vl(i.OutputSimd128Register(), operand, Condition(0));
break;
}
#define LOAD_SPLAT(type) \
AddressingMode mode = kMode_None; \
MemOperand operand = i.MemoryOperand(&mode); \
Simd128Register dst = i.OutputSimd128Register(); \
__ LoadAndSplat##type##LE(dst, operand);
case kS390_S128Load8Splat: {
LOAD_SPLAT(8x16);
break;
}
case kS390_S128Load16Splat: {
LOAD_SPLAT(16x8);
break;
}
case kS390_S128Load32Splat: {
LOAD_SPLAT(32x4);
break;
}
case kS390_S128Load64Splat: {
LOAD_SPLAT(64x2);
break;
}
#undef LOAD_SPLAT
case kS390_StoreWord8:
ASSEMBLE_STORE_INTEGER(StoreU8);
break;
......
...@@ -372,6 +372,10 @@ namespace compiler {
V(S390_S128Not) \
V(S390_S128Select) \
V(S390_S128AndNot) \
V(S390_S128Load8Splat) \
V(S390_S128Load16Splat) \
V(S390_S128Load32Splat) \
V(S390_S128Load64Splat) \
V(S390_StoreSimd128) \
V(S390_LoadSimd128) \
V(S390_StoreCompressTagged) \
......
...@@ -359,6 +359,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kS390_LoadDecompressTaggedSigned:
case kS390_LoadDecompressTaggedPointer:
case kS390_LoadDecompressAnyTagged:
case kS390_S128Load8Splat:
case kS390_S128Load16Splat:
case kS390_S128Load32Splat:
case kS390_S128Load64Splat:
return kIsLoadOperation;
case kS390_StoreWord8:
......
...@@ -2793,8 +2793,25 @@ void InstructionSelector::VisitLoadLane(Node* node) {
}
void InstructionSelector::VisitLoadTransform(Node* node) {
LoadTransformParameters params = LoadTransformParametersOf(node->op());
ArchOpcode opcode;
switch (params.transformation) {
case LoadTransformation::kS128Load8Splat:
opcode = kS390_S128Load8Splat;
break;
case LoadTransformation::kS128Load16Splat:
opcode = kS390_S128Load16Splat;
break;
case LoadTransformation::kS128Load32Splat:
opcode = kS390_S128Load32Splat;
break;
case LoadTransformation::kS128Load64Splat:
opcode = kS390_S128Load64Splat;
break;
default:
UNREACHABLE();
}
VisitLoad(node, node, opcode);
}
void InstructionSelector::VisitStoreLane(Node* node) {
......