Commit 660e87d2 authored by Milad Fa, committed by Commit Bot

S390 [simd]: optimize vector multiply extend on codegen

The implementation now uses a combination of
multiply even and multiply odd, followed by a vector merge low or high.

Vector merge instructions are also added to the simulator.

Change-Id: I144c5d07e5e6bd978788a70aacabd61463f93289
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2815562
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#73868}
parent a6a27731
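The old lowering unpacked each input to double-width lanes and multiplied them one lane at a time through scalar registers; the new one issues a widening multiply of the even elements, a widening multiply of the odd elements, and a single merge. Below is a minimal scalar model of the i64x2-from-i32x4 case, using big-endian element numbering (element 0 is the leftmost lane, as on S390); the types and helper names are illustrative, not V8 code:

#include <cstdint>
#include <cstdio>

struct I32x4 { int32_t lane[4]; };
struct I64x2 { int64_t lane[2]; };

// vme: widening multiply of the even-numbered elements (0 and 2).
I64x2 MultiplyEven(const I32x4& a, const I32x4& b) {
  return {{int64_t{a.lane[0]} * b.lane[0], int64_t{a.lane[2]} * b.lane[2]}};
}

// vmo: widening multiply of the odd-numbered elements (1 and 3).
I64x2 MultiplyOdd(const I32x4& a, const I32x4& b) {
  return {{int64_t{a.lane[1]} * b.lane[1], int64_t{a.lane[3]} * b.lane[3]}};
}

// vmrh / vmrl: interleave the leftmost (high) or rightmost (low)
// elements of the two inputs.
I64x2 MergeHigh(const I64x2& a, const I64x2& b) { return {{a.lane[0], b.lane[0]}}; }
I64x2 MergeLow(const I64x2& a, const I64x2& b) { return {{a.lane[1], b.lane[1]}}; }

int main() {
  I32x4 a = {{1, 2, 3, 4}}, b = {{10, 20, 30, 40}};
  I64x2 even = MultiplyEven(a, b);  // {1*10, 3*30} = {10, 90}
  I64x2 odd = MultiplyOdd(a, b);    // {2*20, 4*40} = {40, 160}
  // Merging the high halves pairs up the products of elements 0 and 1;
  // merging the low halves pairs up the products of elements 2 and 3.
  I64x2 high = MergeHigh(even, odd);  // {10, 40}
  I64x2 low = MergeLow(even, odd);    // {90, 160}
  std::printf("high = {%lld, %lld}\n", (long long)high.lane[0],
              (long long)high.lane[1]);
  std::printf("low  = {%lld, %lld}\n", (long long)low.lane[0],
              (long long)low.lane[1]);
  return 0;
}

This is the shape the EXT_MUL macro in the diff below emits: mul_even into dst, mul_odd into the scratch register, then vmrh or vmrl at the next-wider element size (mode + 1) to select the high or low half of the products.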
@@ -3672,80 +3672,65 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
              Condition(0), Condition(0), Condition(2));
       break;
     }
-#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR)                              \
-  __ UNPACK_INSTR(kScratchDoubleReg, i.InputSimd128Register(0), Condition(0),  \
-                  Condition(0), Condition(2));                                 \
-  __ UNPACK_INSTR(i.OutputSimd128Register(), i.InputSimd128Register(1),        \
-                  Condition(0), Condition(0), Condition(2));                   \
-  Register scratch_0 = r0;                                                     \
-  Register scratch_1 = r1;                                                     \
-  for (int lane = 0; lane < 2; lane++) {                                       \
-    __ vlgv(scratch_0, kScratchDoubleReg, MemOperand(r0, lane), Condition(3)); \
-    __ vlgv(scratch_1, i.OutputSimd128Register(), MemOperand(r0, lane),        \
-            Condition(3));                                                     \
-    __ MulS64(scratch_0, scratch_1);                                           \
-    scratch_0 = r1;                                                            \
-    scratch_1 = ip;                                                            \
-  }                                                                            \
-  __ vlvgp(i.OutputSimd128Register(), r0, r1);
+#define EXT_MUL(mul_even, mul_odd, merge, mode)                                \
+  Simd128Register dst = i.OutputSimd128Register(),                             \
+                  src0 = i.InputSimd128Register(0),                            \
+                  src1 = i.InputSimd128Register(1);                            \
+  __ mul_even(dst, src0, src1, Condition(0), Condition(0), Condition(mode));   \
+  __ mul_odd(kScratchDoubleReg, src0, src1, Condition(0), Condition(0),        \
+             Condition(mode));                                                 \
+  __ merge(dst, dst, kScratchDoubleReg, Condition(0), Condition(0),            \
+           Condition(mode + 1));
     case kS390_I64x2ExtMulLowI32x4S: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vupl)
+      EXT_MUL(vme, vmo, vmrl, 2)
       break;
     }
     case kS390_I64x2ExtMulHighI32x4S: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vuph)
+      EXT_MUL(vme, vmo, vmrh, 2)
       break;
     }
     case kS390_I64x2ExtMulLowI32x4U: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vupll)
+      EXT_MUL(vmle, vmlo, vmrl, 2)
       break;
     }
     case kS390_I64x2ExtMulHighI32x4U: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vuplh)
-      break;
-    }
-#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
-#define ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(UNPACK_INSTR, MODE)                  \
-  __ UNPACK_INSTR(kScratchDoubleReg, i.InputSimd128Register(0), Condition(0),  \
-                  Condition(0), Condition(MODE));                              \
-  __ UNPACK_INSTR(i.OutputSimd128Register(), i.InputSimd128Register(1),        \
-                  Condition(0), Condition(0), Condition(MODE));                \
-  __ vml(i.OutputSimd128Register(), kScratchDoubleReg,                         \
-         i.OutputSimd128Register(), Condition(0), Condition(0),                \
-         Condition(MODE + 1));
+      EXT_MUL(vmle, vmlo, vmrh, 2)
+      break;
+    }
     case kS390_I32x4ExtMulLowI16x8S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupl, 1)
+      EXT_MUL(vme, vmo, vmrl, 1)
       break;
     }
     case kS390_I32x4ExtMulHighI16x8S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuph, 1)
+      EXT_MUL(vme, vmo, vmrh, 1)
       break;
     }
     case kS390_I32x4ExtMulLowI16x8U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupll, 1)
+      EXT_MUL(vmle, vmlo, vmrl, 1)
       break;
     }
     case kS390_I32x4ExtMulHighI16x8U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuplh, 1)
+      EXT_MUL(vmle, vmlo, vmrh, 1)
       break;
     }
     case kS390_I16x8ExtMulLowI8x16S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupl, 0)
+      EXT_MUL(vme, vmo, vmrl, 0)
       break;
     }
     case kS390_I16x8ExtMulHighI8x16S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuph, 0)
+      EXT_MUL(vme, vmo, vmrh, 0)
       break;
     }
     case kS390_I16x8ExtMulLowI8x16U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupll, 0)
+      EXT_MUL(vmle, vmlo, vmrl, 0)
       break;
     }
     case kS390_I16x8ExtMulHighI8x16U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuplh, 0)
+      EXT_MUL(vmle, vmlo, vmrh, 0)
       break;
     }
-#undef ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL
+#undef EXT_MUL
 #define EXT_ADD_PAIRWISE(lane_size, mul_even, mul_odd)                        \
   Simd128Register src = i.InputSimd128Register(0);                            \
   Simd128Register dst = i.OutputSimd128Register();                            \
...
@@ -772,6 +772,8 @@ void Simulator::EvalTableInit() {
   V(vsum, VSUM, 0xE764)   /* type = VRR_C VECTOR SUM ACROSS WORD */          \
   V(vsumg, VSUMG, 0xE765) /* type = VRR_C VECTOR SUM ACROSS DOUBLEWORD */    \
   V(vpk, VPK, 0xE794)     /* type = VRR_C VECTOR PACK */                     \
+  V(vmrl, VMRL, 0xE760)   /* type = VRR_C VECTOR MERGE LOW */                \
+  V(vmrh, VMRH, 0xE761)   /* type = VRR_C VECTOR MERGE HIGH */               \
   V(vpks, VPKS, 0xE797)   /* type = VRR_B VECTOR PACK SATURATE */            \
   V(vpkls, VPKLS, 0xE795) /* type = VRR_B VECTOR PACK LOGICAL SATURATE */    \
   V(vupll, VUPLL, 0xE7D4) /* type = VRR_A VECTOR UNPACK LOGICAL LOW */       \
...
@@ -3397,6 +3399,53 @@ EVALUATE(VSUMG) {
 }
 #undef CASE
+#define VECTOR_MERGE(type, is_low_side)                                      \
+  constexpr size_t index_limit = (kSimd128Size / sizeof(type)) / 2;          \
+  for (size_t i = 0, source_index = is_low_side ? i + index_limit : i;       \
+       i < index_limit; i++, source_index++) {                               \
+    set_simd_register_by_lane<type>(                                         \
+        r1, 2 * i, get_simd_register_by_lane<type>(r2, source_index));       \
+    set_simd_register_by_lane<type>(                                         \
+        r1, (2 * i) + 1, get_simd_register_by_lane<type>(r3, source_index)); \
+  }
+#define CASE(i, type, is_low_side)  \
+  case i: {                         \
+    VECTOR_MERGE(type, is_low_side) \
+  } break;
+
+EVALUATE(VMRL) {
+  DCHECK_OPCODE(VMRL);
+  DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
+  USE(m6);
+  USE(m5);
+  switch (m4) {
+    CASE(0, int8_t, true);
+    CASE(1, int16_t, true);
+    CASE(2, int32_t, true);
+    CASE(3, int64_t, true);
+    default:
+      UNREACHABLE();
+  }
+  return length;
+}
+
+EVALUATE(VMRH) {
+  DCHECK_OPCODE(VMRH);
+  DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
+  USE(m6);
+  USE(m5);
+  switch (m4) {
+    CASE(0, int8_t, false);
+    CASE(1, int16_t, false);
+    CASE(2, int32_t, false);
+    CASE(3, int64_t, false);
+    default:
+      UNREACHABLE();
+  }
+  return length;
+}
+#undef CASE
+#undef VECTOR_MERGE
 template <class S, class D>
 void VectorPack(Simulator* sim, int dst, int src1, int src2, bool saturate,
                 const D& max = 0, const D& min = 0) {
...
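For reference, the simulator's VECTOR_MERGE macro writes one half of each source register, interleaved, into the destination: merge high takes the first halves, merge low the second halves. A standalone sketch of that loop, assuming plain arrays in place of the simulator's set/get_simd_register_by_lane accessors:

#include <array>
#include <cstddef>
#include <cstdio>

// Mirrors VECTOR_MERGE: r1[2i] comes from the first source, r1[2i+1]
// from the second, reading either the first or second half of each.
template <typename T, size_t N>
std::array<T, N> VectorMerge(const std::array<T, N>& r2,
                             const std::array<T, N>& r3, bool is_low_side) {
  std::array<T, N> r1{};
  const size_t index_limit = N / 2;
  for (size_t i = 0; i < index_limit; i++) {
    const size_t source_index = is_low_side ? i + index_limit : i;
    r1[2 * i] = r2[source_index];      // element from the first source
    r1[2 * i + 1] = r3[source_index];  // element from the second source
  }
  return r1;
}

int main() {
  std::array<int, 4> a{0, 1, 2, 3}, b{4, 5, 6, 7};
  auto high = VectorMerge(a, b, /*is_low_side=*/false);  // {0, 4, 1, 5}
  auto low = VectorMerge(a, b, /*is_low_side=*/true);    // {2, 6, 3, 7}
  for (int v : high) std::printf("%d ", v);
  std::printf("/ ");
  for (int v : low) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}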