Commit 9cc41406 authored by Milad Fa, committed by V8 LUCI CQ

S390 [simd]: Implement vector load and zero

This CL takes advantage of the z15 `load byte reverse element`
instruction to optimize Simd Load and Zero opcodes.

On the simulator we only run `load element`, as byte reversal
is not required.

Change-Id: I868bda865249cdc525f804c8ddf4d45df5977a86
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3132965
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76610}
parent 2e5e2f15
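The opcodes added here implement the WebAssembly `v128.load32_zero` and `v128.load64_zero` operations: load a single 32- or 64-bit value from memory into lane 0 of a 128-bit vector and zero all remaining lanes. As a reference point for the diff below, here is a minimal scalar sketch of that semantics; it is only an illustration, and the function names are mine, not part of this CL.

#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

// v128.load32_zero: read 4 bytes into lane 0, clear lanes 1-3.
std::array<uint8_t, 16> LoadV32Zero(const void* mem) {
  std::array<uint8_t, 16> result{};    // all 16 bytes start as zero
  std::memcpy(result.data(), mem, 4);  // fill only the low 4 bytes (lane 0)
  return result;
}

// v128.load64_zero: read 8 bytes into lane 0, clear lane 1.
std::array<uint8_t, 16> LoadV64Zero(const void* mem) {
  std::array<uint8_t, 16> result{};
  std::memcpy(result.data(), mem, 8);  // fill only the low 8 bytes (lane 0)
  return result;
}

int main() {
  uint32_t value = 0x11223344;
  std::array<uint8_t, 16> v = LoadV32Zero(&value);
  std::printf("byte 0 = 0x%02x, byte 4 = 0x%02x\n", v[0], v[4]);  // 0x44, 0x00 on LE hosts
  return 0;
}

In the CL itself, z15 hardware does this in one step with the `vlebrf`/`vlebrg` (load byte reverse element) instructions, older hardware falls back to a reversed scalar load plus `vlvg`, and the big-endian/simulator build uses plain `vlef`/`vleg` since no byte reversal is needed there.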
@@ -3910,6 +3910,7 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
   }
 }
 
+// Vector LE Load and Transform instructions.
 void TurboAssembler::LoadAndSplat8x16LE(Simd128Register dst,
                                         const MemOperand& mem) {
   vlrep(dst, mem, Condition(0));
@@ -3960,6 +3961,26 @@ LOAD_EXTEND_LIST(LOAD_EXTEND)
 #undef LOAD_EXTEND
 #undef LOAD_EXTEND_LIST
 
+void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2)) {
+    vlebrf(dst, mem, Condition(3));
+    return;
+  }
+  LoadU32LE(r1, mem);
+  vlvg(dst, r1, MemOperand(r0, 3), Condition(2));
+}
+
+void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2)) {
+    vlebrg(dst, mem, Condition(1));
+    return;
+  }
+  LoadU64LE(r1, mem);
+  vlvg(dst, r1, MemOperand(r0, 1), Condition(3));
+}
+
 #else
 void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
                                Register scratch) {
@@ -4032,6 +4053,7 @@ void TurboAssembler::StoreV128LE(Simd128Register src, const MemOperand& mem,
   StoreV128(src, mem, scratch1);
 }
 
+// Vector LE Load and Transform instructions.
 #define LOAD_SPLAT_LIST(V) \
   V(64x2, 3)               \
   V(32x4, 2)               \
@@ -4066,6 +4088,16 @@ LOAD_EXTEND_LIST(LOAD_EXTEND)
 #undef LOAD_EXTEND
 #undef LOAD_EXTEND_LIST
 
+void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  vlef(dst, mem, Condition(3));
+}
+
+void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
+  vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
+  vleg(dst, mem, Condition(1));
+}
+
 #endif
 
 // Load And Test (Reg <- Reg)
......
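One detail worth spelling out from the hunks above: the loaded value is inserted at element index 3 in the 32-bit case and index 1 in the 64-bit case (`Condition(3)` / `Condition(1)`). z/Architecture numbers vector elements from the most-significant end of the register, so in the little-endian lane order used for Wasm, lane 0 lands in the last element, i.e. `16 / lane_size - 1`. A tiny standalone sketch of that index calculation (my own illustration, not code from the CL):

#include <cstdio>

// Index of the least-significant element of a 16-byte vector register,
// counting elements from the most-significant end as z/Architecture does.
int LastElementIndex(int lane_size_bytes) {
  return 16 / lane_size_bytes - 1;
}

int main() {
  std::printf("32-bit lanes -> element %d\n", LastElementIndex(4));  // prints 3
  std::printf("64-bit lanes -> element %d\n", LastElementIndex(8));  // prints 1
  return 0;
}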
@@ -392,6 +392,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
                  Register scratch1);
   void LoadF64LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
   void LoadF32LE(DoubleRegister dst, const MemOperand& opnd, Register scratch);
+  // Vector LE Load and Transform instructions.
   void LoadAndSplat64x2LE(Simd128Register dst, const MemOperand& mem);
   void LoadAndSplat32x4LE(Simd128Register dst, const MemOperand& mem);
   void LoadAndSplat16x8LE(Simd128Register dst, const MemOperand& mem);
@@ -402,6 +403,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void LoadAndExtend16x4SLE(Simd128Register dst, const MemOperand& mem);
   void LoadAndExtend32x2ULE(Simd128Register dst, const MemOperand& mem);
   void LoadAndExtend32x2SLE(Simd128Register dst, const MemOperand& mem);
+  void LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem);
+  void LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem);
 
   // Load And Test
   void LoadAndTest32(Register dst, Register src);
......
@@ -3439,6 +3439,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
 #undef LOAD_EXTEND
+#define LOAD_AND_ZERO(type)                            \
+  AddressingMode mode = kMode_None;                    \
+  MemOperand operand = i.MemoryOperand(&mode);         \
+  Simd128Register dst = i.OutputSimd128Register();     \
+  __ LoadV##type##ZeroLE(dst, operand);
+    case kS390_S128Load32Zero: {
+      LOAD_AND_ZERO(32);
+      break;
+    }
+    case kS390_S128Load64Zero: {
+      LOAD_AND_ZERO(64);
+      break;
+    }
+#undef LOAD_AND_ZERO
     case kS390_StoreCompressTagged: {
       CHECK(!instr->HasOutput());
       size_t index = 0;
......
@@ -382,6 +382,8 @@ namespace compiler {
   V(S390_S128Load16x4U)       \
   V(S390_S128Load32x2S)       \
   V(S390_S128Load32x2U)       \
+  V(S390_S128Load32Zero)      \
+  V(S390_S128Load64Zero)      \
   V(S390_StoreSimd128)        \
   V(S390_LoadSimd128)         \
   V(S390_StoreCompressTagged) \
......
@@ -369,6 +369,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kS390_S128Load16x4U:
     case kS390_S128Load32x2S:
     case kS390_S128Load32x2U:
+    case kS390_S128Load32Zero:
+    case kS390_S128Load64Zero:
       return kIsLoadOperation;
 
     case kS390_StoreWord8:
......
@@ -2825,6 +2825,12 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
     case LoadTransformation::kS128Load32x2U:
      opcode = kS390_S128Load32x2U;
      break;
+    case LoadTransformation::kS128Load32Zero:
+      opcode = kS390_S128Load32Zero;
+      break;
+    case LoadTransformation::kS128Load64Zero:
+      opcode = kS390_S128Load64Zero;
+      break;
     default:
      UNREACHABLE();
  }
......