Commit cd5830d8 authored by Milad Fa, committed by V8 LUCI CQ

S390 [simd]: Implement vector load lane

This CL takes advantage of the z15 `load byte reverse element`
instruction to optimize Simd LoadLane opcodes.

On the simulator we only run `load element` as reversing is
not required.

Change-Id: I038535f7e038bed7972844806644f50519d4919c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3138212
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/main@{#76648}
parent 0508f277
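As a reading aid before the diff: on the big-endian S390 target, each LoadLane helper has to load a little-endian scalar from WASM memory and place it into one element of the 128-bit register, where element 0 is the leftmost (most significant) element. A minimal host-side sketch of that intended effect for the 64-bit case, with illustrative names that are not part of V8:

#include <cstdint>

// Illustrative model only, not V8 code: `element` is already the
// big-endian element number (0 = leftmost doubleword of the register).
void LoadLane64LEModel(uint8_t reg[16], const uint8_t* mem, int element) {
  // Little-endian 64-bit load from WASM linear memory.
  uint64_t value = 0;
  for (int i = 0; i < 8; ++i) {
    value |= static_cast<uint64_t>(mem[i]) << (8 * i);
  }
  // Big-endian store into the selected doubleword element; the other
  // doubleword of the register image is left untouched.
  for (int i = 0; i < 8; ++i) {
    reg[element * 8 + i] = static_cast<uint8_t>(value >> (8 * (7 - i)));
  }
}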
@@ -3981,6 +3981,30 @@ void TurboAssembler::LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem) {
vlvg(dst, r1, MemOperand(r0, 1), Condition(3));
}
void TurboAssembler::LoadLane8LE(Simd128Register dst, const MemOperand& mem,
int index) {
vleb(dst, mem, Condition(index));
}
#define LOAD_LANE_LIST(V) \
V(64, vlebrg, LoadU64LE, 3) \
V(32, vlebrf, LoadU32LE, 2) \
V(16, vlebrh, LoadU16LE, 1)
#define LOAD_LANE(name, vector_instr, scalar_instr, condition) \
void TurboAssembler::LoadLane##name##LE(Simd128Register dst, \
const MemOperand& mem, int lane) { \
if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) && \
is_uint12(mem.offset())) { \
vector_instr(dst, mem, Condition(lane)); \
return; \
} \
scalar_instr(r1, mem); \
vlvg(dst, r1, MemOperand(r0, lane), Condition(condition)); \
}
LOAD_LANE_LIST(LOAD_LANE)
#undef LOAD_LANE
#undef LOAD_LANE_LIST
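Shown here only as a reading aid, not as part of the patch: the 64-bit instantiation of the macro above, LOAD_LANE(64, vlebrg, LoadU64LE, 3), expands to roughly the following.

// Expanded form of LOAD_LANE(64, vlebrg, LoadU64LE, 3).
void TurboAssembler::LoadLane64LE(Simd128Register dst, const MemOperand& mem,
                                  int lane) {
  if (CpuFeatures::IsSupported(VECTOR_ENHANCE_FACILITY_2) &&
      is_uint12(mem.offset())) {
    // z15 fast path: byte-reversed element load directly into the lane.
    vlebrg(dst, mem, Condition(lane));
    return;
  }
  // Fallback: byte-reversed scalar load, then insert into the lane;
  // Condition(3) selects doubleword elements for vlvg.
  LoadU64LE(r1, mem);
  vlvg(dst, r1, MemOperand(r0, lane), Condition(3));
}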
#else
void TurboAssembler::LoadU64LE(Register dst, const MemOperand& mem,
Register scratch) {
@@ -4088,6 +4112,22 @@ LOAD_EXTEND_LIST(LOAD_EXTEND)
#undef LOAD_EXTEND
#undef LOAD_EXTEND_LIST
#define LOAD_LANE_LIST(V) \
V(64, vleg) \
V(32, vlef) \
V(16, vleh) \
V(8, vleb)
#define LOAD_LANE(name, vector_instr) \
void TurboAssembler::LoadLane##name##LE(Simd128Register dst, \
const MemOperand& mem, int lane) { \
DCHECK(is_uint12(mem.offset())); \
vector_instr(dst, mem, Condition(lane)); \
}
LOAD_LANE_LIST(LOAD_LANE)
#undef LOAD_LANE
#undef LOAD_LANE_LIST
void TurboAssembler::LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem) {
vx(dst, dst, dst, Condition(0), Condition(0), Condition(0));
vlef(dst, mem, Condition(3));
@@ -405,6 +405,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void LoadAndExtend32x2SLE(Simd128Register dst, const MemOperand& mem);
void LoadV32ZeroLE(Simd128Register dst, const MemOperand& mem);
void LoadV64ZeroLE(Simd128Register dst, const MemOperand& mem);
void LoadLane8LE(Simd128Register dst, const MemOperand& mem, int lane);
void LoadLane16LE(Simd128Register dst, const MemOperand& mem, int lane);
void LoadLane32LE(Simd128Register dst, const MemOperand& mem, int lane);
void LoadLane64LE(Simd128Register dst, const MemOperand& mem, int lane);
// Load And Test
void LoadAndTest32(Register dst, Register src);
@@ -3453,6 +3453,31 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
#undef LOAD_AND_ZERO
#undef LOAD_EXTEND
#define LOAD_LANE(type, lane) \
AddressingMode mode = kMode_None; \
size_t index = 2; \
MemOperand operand = i.MemoryOperand(&mode, &index); \
Simd128Register dst = i.OutputSimd128Register(); \
DCHECK_EQ(dst, i.InputSimd128Register(0)); \
__ LoadLane##type##LE(dst, operand, lane);
case kS390_S128Load8Lane: {
LOAD_LANE(8, 15 - i.InputUint8(1));
break;
}
case kS390_S128Load16Lane: {
LOAD_LANE(16, 7 - i.InputUint8(1));
break;
}
case kS390_S128Load32Lane: {
LOAD_LANE(32, 3 - i.InputUint8(1));
break;
}
case kS390_S128Load64Lane: {
LOAD_LANE(64, 1 - i.InputUint8(1));
break;
}
#undef LOAD_LANE
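The `15 - lane`, `7 - lane`, `3 - lane`, and `1 - lane` expressions above convert the little-endian WASM lane index into the big-endian S390 element number (element 0 is the leftmost element of the vector register). A standalone sketch of that mapping, using a hypothetical helper name:

// Illustrative helper, not part of V8: little-endian WASM lane index ->
// big-endian S390 element number for a 16-byte vector register.
constexpr int WasmLaneToElement(int lane, int lane_size_in_bytes) {
  return (16 / lane_size_in_bytes - 1) - lane;
}

static_assert(WasmLaneToElement(0, 1) == 15);  // Load8Lane:  15 - lane
static_assert(WasmLaneToElement(0, 2) == 7);   // Load16Lane:  7 - lane
static_assert(WasmLaneToElement(3, 4) == 0);   // Load32Lane:  3 - lane
static_assert(WasmLaneToElement(1, 8) == 0);   // Load64Lane:  1 - lane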
case kS390_StoreCompressTagged: {
CHECK(!instr->HasOutput());
size_t index = 0;
@@ -384,6 +384,10 @@ namespace compiler {
V(S390_S128Load32x2U) \
V(S390_S128Load32Zero) \
V(S390_S128Load64Zero) \
V(S390_S128Load8Lane) \
V(S390_S128Load16Lane) \
V(S390_S128Load32Lane) \
V(S390_S128Load64Lane) \
V(S390_StoreSimd128) \
V(S390_LoadSimd128) \
V(S390_StoreCompressTagged) \
@@ -371,6 +371,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kS390_S128Load32x2U:
case kS390_S128Load32Zero:
case kS390_S128Load64Zero:
case kS390_S128Load8Lane:
case kS390_S128Load16Lane:
case kS390_S128Load32Lane:
case kS390_S128Load64Lane:
return kIsLoadOperation;
case kS390_StoreWord8:
@@ -2787,8 +2787,32 @@ void InstructionSelector::EmitPrepareResults(
}
void InstructionSelector::VisitLoadLane(Node* node) {
// We should never reach here, see http://crrev.com/c/2577820
UNREACHABLE();
LoadLaneParameters params = LoadLaneParametersOf(node->op());
InstructionCode opcode;
if (params.rep == MachineType::Int8()) {
opcode = kS390_S128Load8Lane;
} else if (params.rep == MachineType::Int16()) {
opcode = kS390_S128Load16Lane;
} else if (params.rep == MachineType::Int32()) {
opcode = kS390_S128Load32Lane;
} else if (params.rep == MachineType::Int64()) {
opcode = kS390_S128Load64Lane;
} else {
UNREACHABLE();
}
S390OperandGenerator g(this);
InstructionOperand outputs[] = {g.DefineSameAsFirst(node)};
InstructionOperand inputs[5];
size_t input_count = 0;
inputs[input_count++] = g.UseRegister(node->InputAt(2));
inputs[input_count++] = g.UseImmediate(params.laneidx);
AddressingMode mode =
g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
opcode |= AddressingModeField::encode(mode);
Emit(opcode, 1, outputs, input_count, inputs);
}
void InstructionSelector::VisitLoadTransform(Node* node) {
@@ -758,6 +758,8 @@ void Simulator::EvalTableInit() {
V(vrepi, VREPI, 0xE745) /* type = VRI_A VECTOR REPLICATE IMMEDIATE */ \
V(vlr, VLR, 0xE756) /* type = VRR_A VECTOR LOAD */ \
V(vstef, VSTEF, 0xE70B) /* type = VRX VECTOR STORE ELEMENT (32) */ \
V(vleb, VLEB, 0xE700) /* type = VRX VECTOR LOAD ELEMENT (8) */ \
V(vleh, VLEH, 0xE701) /* type = VRX VECTOR LOAD ELEMENT (16) */ \
V(vlef, VLEF, 0xE703) /* type = VRX VECTOR LOAD ELEMENT (32) */ \
V(vleg, VLEG, 0xE702) /* type = VRX VECTOR LOAD ELEMENT (64) */ \
V(vavgl, VAVGL, 0xE7F0) /* type = VRR_C VECTOR AVERAGE LOGICAL */ \
@@ -3193,6 +3195,24 @@ EVALUATE(VSTEF) {
return length;
}
EVALUATE(VLEB) {
DCHECK_OPCODE(VLEB);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
intptr_t addr = GET_ADDRESS(x2, b2, d2);
int8_t value = ReadB(addr);
set_simd_register_by_lane<int8_t>(r1, m3, value);
return length;
}
EVALUATE(VLEH) {
DCHECK_OPCODE(VLEH);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);
intptr_t addr = GET_ADDRESS(x2, b2, d2);
int16_t value = ReadH(addr);
set_simd_register_by_lane<int16_t>(r1, m3, value);
return length;
}
EVALUATE(VLEF) {
DCHECK_OPCODE(VLEF);
DECODE_VRX_INSTRUCTION(r1, x2, b2, d2, m3);