Commit ff340d80 authored by Milad Fa, committed by V8 LUCI CQ

S390 [simd]: Implement vector load and extend

This CL takes advantage of the z15 `load byte reverse element`
instruction to optimize Simd LoadExtend opcodes.

On the simulator we only run `load element` as reversing is
not required.
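
For context, a SIMD load-and-extend opcode reads 64 bits from memory and widens each narrow lane to twice its width. The following is a minimal, hypothetical C++ sketch of the Load8x8S case (8 signed bytes widened to 8 int16 lanes); it illustrates the semantics only and is not the V8 implementation:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical sketch (not V8 code): Load8x8S reads 64 bits from memory and
// sign-extends each of the 8 bytes into a 16-bit lane.
void Load8x8S(const void* mem, int16_t out[8]) {
  int8_t bytes[8];
  std::memcpy(bytes, mem, sizeof(bytes));     // the single 64-bit memory read
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<int16_t>(bytes[i]);  // widen (sign-extend) each lane
  }
}

int main() {
  const int8_t input[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  int16_t lanes[8];
  Load8x8S(input, lanes);
  for (int16_t lane : lanes) std::printf("%d ", lane);  // -1 2 -3 4 -5 6 -7 8
  std::printf("\n");
  return 0;
}

On big-endian s390 the loaded doubleword must also be byte-reversed so the lanes match Wasm's little-endian memory layout; z15's `vlebrg` performs the reversed load directly, while the fallback path in the diff below goes through `LoadU64LE` followed by `vlvg` before unpacking.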

Change-Id: Ia34ac86f93e987656596b3116771a30f64009416
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3119048
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76517}
parent ba25a52e
@@ -1562,7 +1562,13 @@ using SixByteInstr = uint64_t;
V(vstbr, VSTBR, \
0xE60E) /* type = VRX VECTOR STORE BYTE REVERSED ELEMENTS */ \
V(vlbrrep, VLBRREP, \
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */
0xE605) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */ \
V(vlebrh, VLEBRH, \
0xE601) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (16) */ \
V(vlebrf, VLEBRF, \
0xE603) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (32) */ \
V(vlebrg, VLEBRG, \
0xE602) /* type = VRX VECTOR LOAD BYTE REVERSED ELEMENT (64) */
#define S390_RIE_G_OPCODE_LIST(V) \
V(lochi, LOCHI, \
@@ -3949,6 +3949,31 @@ LOAD_SPLAT_LIST(LOAD_SPLAT)
#undef LOAD_SPLAT
#undef LOAD_SPLAT_LIST
#define LOAD_EXTEND_LIST(V) \
V(32x2U, vuplh, 2) \
V(32x2S, vuph, 2) \
V(16x4U, vuplh, 1) \
V(16x4S, vuph, 1) \
V(8x8U, vuplh, 0) \
V(8x8S, vuph, 0)
#define LOAD_EXTEND(name, unpack_instr, condition) \
void TurboAssembler::LoadAndExtend##name##LE(Simd128Register dst, \
const MemOperand& mem) { \
if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \
is_uint12(mem.offset())) { \
vlebrg(kScratchDoubleReg, mem, Condition(0)); \
} else { \
LoadU64LE(r1, mem); \
vlvg(kScratchDoubleReg, r1, MemOperand(r0, 0), Condition(3)); \
} \
unpack_instr(dst, kScratchDoubleReg, Condition(0), Condition(0), \
Condition(condition)); \
}
LOAD_EXTEND_LIST(LOAD_EXTEND)
#undef LOAD_EXTEND
#undef LOAD_EXTEND_LIST
#else
void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
Register scratch) {
@@ -4036,6 +4061,25 @@ LOAD_SPLAT_LIST(LOAD_SPLAT)
#undef LOAD_SPLAT
#undef LOAD_SPLAT_LIST
#define LOAD_EXTEND_LIST(V) \
V(32x2U, vuplh, 2) \
V(32x2S, vuph, 2) \
V(16x4U, vuplh, 1) \
V(16x4S, vuph, 1) \
V(8x8U, vuplh, 0) \
V(8x8S, vuph, 0)
#define LOAD_EXTEND(name, unpack_instr, condition) \
void TurboAssembler::LoadAndExtend##name##LE(Simd128Register dst, \
const MemOperand& mem) { \
vleg(kScratchDoubleReg, mem, Condition(0)); \
unpack_instr(dst, kScratchDoubleReg, Condition(0), Condition(0), \
Condition(condition)); \
}
LOAD_EXTEND_LIST(LOAD_EXTEND)
#undef LOAD_EXTEND
#undef LOAD_EXTEND_LIST
#endif
// Load And Test (Reg <- Reg)
@@ -396,6 +396,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void LoadAndSplat32x4LE(Simd128Register dst, const MemOperand& mem);
void LoadAndSplat16x8LE(Simd128Register dst, const MemOperand& mem);
void LoadAndSplat8x16LE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend8x8ULE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend8x8SLE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend16x4ULE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend16x4SLE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend32x2ULE(Simd128Register dst, const MemOperand& mem);
void LoadAndExtend32x2SLE(Simd128Register dst, const MemOperand& mem);
// Load And Test
void LoadAndTest32(Register dst, Register src);
@@ -2218,28 +2218,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vl(i.OutputSimd128Register(), operand, Condition(0));
break;
}
#define LOAD_SPLAT(type) \
AddressingMode mode = kMode_None; \
MemOperand operand = i.MemoryOperand(&mode); \
Simd128Register dst = i.OutputSimd128Register(); \
__ LoadAndSplat##type##LE(dst, operand);
case kS390_S128Load8Splat: {
LOAD_SPLAT(8x16);
break;
}
case kS390_S128Load16Splat: {
LOAD_SPLAT(16x8);
break;
}
case kS390_S128Load32Splat: {
LOAD_SPLAT(32x4);
break;
}
case kS390_S128Load64Splat: {
LOAD_SPLAT(64x2);
break;
}
#undef LOAD_SPLAT
case kS390_StoreWord8:
ASSEMBLE_STORE_INTEGER(StoreU8);
break;
@@ -3409,6 +3387,58 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vpkls(dst, dst, kScratchDoubleReg, Condition(0), Condition(3));
break;
}
#define LOAD_SPLAT(type) \
AddressingMode mode = kMode_None; \
MemOperand operand = i.MemoryOperand(&mode); \
Simd128Register dst = i.OutputSimd128Register(); \
__ LoadAndSplat##type##LE(dst, operand);
case kS390_S128Load64Splat: {
LOAD_SPLAT(64x2);
break;
}
case kS390_S128Load32Splat: {
LOAD_SPLAT(32x4);
break;
}
case kS390_S128Load16Splat: {
LOAD_SPLAT(16x8);
break;
}
case kS390_S128Load8Splat: {
LOAD_SPLAT(8x16);
break;
}
#undef LOAD_SPLAT
#define LOAD_EXTEND(type) \
AddressingMode mode = kMode_None; \
MemOperand operand = i.MemoryOperand(&mode); \
Simd128Register dst = i.OutputSimd128Register(); \
__ LoadAndExtend##type##LE(dst, operand);
case kS390_S128Load32x2U: {
LOAD_EXTEND(32x2U);
break;
}
case kS390_S128Load32x2S: {
LOAD_EXTEND(32x2S);
break;
}
case kS390_S128Load16x4U: {
LOAD_EXTEND(16x4U);
break;
}
case kS390_S128Load16x4S: {
LOAD_EXTEND(16x4S);
break;
}
case kS390_S128Load8x8U: {
LOAD_EXTEND(8x8U);
break;
}
case kS390_S128Load8x8S: {
LOAD_EXTEND(8x8S);
break;
}
#undef LOAD_EXTEND
case kS390_StoreCompressTagged: {
CHECK(!instr->HasOutput());
size_t index = 0;
@@ -376,6 +376,12 @@ namespace compiler {
V(S390_S128Load16Splat) \
V(S390_S128Load32Splat) \
V(S390_S128Load64Splat) \
V(S390_S128Load8x8S) \
V(S390_S128Load8x8U) \
V(S390_S128Load16x4S) \
V(S390_S128Load16x4U) \
V(S390_S128Load32x2S) \
V(S390_S128Load32x2U) \
V(S390_StoreSimd128) \
V(S390_LoadSimd128) \
V(S390_StoreCompressTagged) \
@@ -363,6 +363,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kS390_S128Load16Splat:
case kS390_S128Load32Splat:
case kS390_S128Load64Splat:
case kS390_S128Load8x8S:
case kS390_S128Load8x8U:
case kS390_S128Load16x4S:
case kS390_S128Load16x4U:
case kS390_S128Load32x2S:
case kS390_S128Load32x2U:
return kIsLoadOperation;
case kS390_StoreWord8:
@@ -2808,6 +2808,24 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
case LoadTransformation::kS128Load64Splat:
opcode = kS390_S128Load64Splat;
break;
case LoadTransformation::kS128Load8x8S:
opcode = kS390_S128Load8x8S;
break;
case LoadTransformation::kS128Load8x8U:
opcode = kS390_S128Load8x8U;
break;
case LoadTransformation::kS128Load16x4S:
opcode = kS390_S128Load16x4S;
break;
case LoadTransformation::kS128Load16x4U:
opcode = kS390_S128Load16x4U;
break;
case LoadTransformation::kS128Load32x2S:
opcode = kS390_S128Load32x2S;
break;
case LoadTransformation::kS128Load32x2U:
opcode = kS390_S128Load32x2U;
break;
default:
UNREACHABLE();
}
@@ -760,6 +760,7 @@ void Simulator::EvalTableInit() {
V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
V(vleg, VLEG, 0xE702) /* type = VRX VECTOR LOAD ELEMENT (64) */ \
V(vavgl, VAVGL, 0xE7F0) /* type = VRR_C VECTOR AVERAGE LOGICAL */ \
V(va, VA, 0xE7F3) /* type = VRR_C VECTOR ADD */ \
V(vs, VS, 0xE7F7) /* type = VRR_C VECTOR SUBTRACT */ \
@@ -3205,6 +3206,15 @@ EVALUATE(VLEF) {
return length;
}
EVALUATE(VLEG) {
DCHECK_OPCODE(VLEG);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
intptr_t addr = GET_ADDRESS(x2, b2, d2);
uint64_t value = ReadDW(addr);
set_simd_register_by_lane<uint64_t>(r1, m3, value);
return length;
}
// TODO(john): unify most fp binary operations
template <class T, class Operation>
inline static void VectorBinaryOp(Simulator* sim, int dst, int src1, int src2,