Commit 660e87d2 authored by Milad Fa, committed by Commit Bot

S390 [simd]: optimize vector multiply extend on codegen

The implementation now uses a combination of vector multiply
even and odd followed by a vector merge low or high.

Vector merge instructions are also added to the simulator.
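
For intuition, here is a minimal scalar model of the new lowering, shown
for the word-to-doubleword case (EXT_MUL mode 2) in machine element order
(element 0 is the leftmost on S390). ExtMulModel and the raw arrays are
illustrative names for this sketch, not part of the patch:

#include <cstdint>

// vme/vmo compute widening products of the even/odd-indexed elements;
// vmrh/vmrl then interleave the two product vectors so that each result
// half pairs up the products of adjacent source elements.
void ExtMulModel(const int32_t a[4], const int32_t b[4],
                 int64_t merged_high[2], int64_t merged_low[2]) {
  // vme: widening products of the even-indexed word elements (0 and 2).
  int64_t even[2] = {static_cast<int64_t>(a[0]) * b[0],
                     static_cast<int64_t>(a[2]) * b[2]};
  // vmo: widening products of the odd-indexed word elements (1 and 3).
  int64_t odd[2] = {static_cast<int64_t>(a[1]) * b[1],
                    static_cast<int64_t>(a[3]) * b[3]};
  // vmrh at doubleword size pairs the first even/odd products, i.e. the
  // products of elements 0 and 1: the ExtMulHigh result.
  merged_high[0] = even[0];
  merged_high[1] = odd[0];
  // vmrl pairs the remaining products (elements 2 and 3): ExtMulLow.
  merged_low[0] = even[1];
  merged_low[1] = odd[1];
}

The unsigned opcodes substitute vmle/vmlo for vme/vmo, and the halfword
and byte modes (1 and 0) apply the same pattern with more lanes per step,
which is why a single EXT_MUL macro covers all twelve opcodes.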

Change-Id: I144c5d07e5e6bd978788a70aacabd61463f93289
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2815562
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#73868}
parent a6a27731
@@ -3672,80 +3672,65 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               Condition(0), Condition(0), Condition(2));
       break;
     }
-#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR)                              \
-  __ UNPACK_INSTR(kScratchDoubleReg, i.InputSimd128Register(0), Condition(0),  \
-                  Condition(0), Condition(2));                                 \
-  __ UNPACK_INSTR(i.OutputSimd128Register(), i.InputSimd128Register(1),        \
-                  Condition(0), Condition(0), Condition(2));                   \
-  Register scratch_0 = r0;                                                     \
-  Register scratch_1 = r1;                                                     \
-  for (int lane = 0; lane < 2; lane++) {                                       \
-    __ vlgv(scratch_0, kScratchDoubleReg, MemOperand(r0, lane), Condition(3)); \
-    __ vlgv(scratch_1, i.OutputSimd128Register(), MemOperand(r0, lane),        \
-            Condition(3));                                                     \
-    __ MulS64(scratch_0, scratch_1);                                           \
-    scratch_0 = r1;                                                            \
-    scratch_1 = ip;                                                            \
-  }                                                                            \
-  __ vlvgp(i.OutputSimd128Register(), r0, r1);
+#define EXT_MUL(mul_even, mul_odd, merge, mode)                               \
+  Simd128Register dst = i.OutputSimd128Register(),                            \
+                  src0 = i.InputSimd128Register(0),                           \
+                  src1 = i.InputSimd128Register(1);                           \
+  __ mul_even(dst, src0, src1, Condition(0), Condition(0), Condition(mode));  \
+  __ mul_odd(kScratchDoubleReg, src0, src1, Condition(0), Condition(0),       \
+             Condition(mode));                                                \
+  __ merge(dst, dst, kScratchDoubleReg, Condition(0), Condition(0),           \
+           Condition(mode + 1));
     case kS390_I64x2ExtMulLowI32x4S: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vupl)
+      EXT_MUL(vme, vmo, vmrl, 2)
       break;
     }
     case kS390_I64x2ExtMulHighI32x4S: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vuph)
+      EXT_MUL(vme, vmo, vmrh, 2)
       break;
     }
     case kS390_I64x2ExtMulLowI32x4U: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vupll)
+      EXT_MUL(vmle, vmlo, vmrl, 2)
      break;
    }
     case kS390_I64x2ExtMulHighI32x4U: {
-      ASSEMBLE_SIMD_I64X2_EXT_MUL(vuplh)
+      EXT_MUL(vmle, vmlo, vmrh, 2)
       break;
     }
-#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
-#define ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(UNPACK_INSTR, MODE)                  \
-  __ UNPACK_INSTR(kScratchDoubleReg, i.InputSimd128Register(0), Condition(0),  \
-                  Condition(0), Condition(MODE));                              \
-  __ UNPACK_INSTR(i.OutputSimd128Register(), i.InputSimd128Register(1),        \
-                  Condition(0), Condition(0), Condition(MODE));                \
-  __ vml(i.OutputSimd128Register(), kScratchDoubleReg,                         \
-         i.OutputSimd128Register(), Condition(0), Condition(0),                \
-         Condition(MODE + 1));
     case kS390_I32x4ExtMulLowI16x8S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupl, 1)
+      EXT_MUL(vme, vmo, vmrl, 1)
       break;
     }
     case kS390_I32x4ExtMulHighI16x8S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuph, 1)
+      EXT_MUL(vme, vmo, vmrh, 1)
       break;
     }
     case kS390_I32x4ExtMulLowI16x8U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupll, 1)
+      EXT_MUL(vmle, vmlo, vmrl, 1)
       break;
     }
     case kS390_I32x4ExtMulHighI16x8U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuplh, 1)
+      EXT_MUL(vmle, vmlo, vmrh, 1)
       break;
     }
     case kS390_I16x8ExtMulLowI8x16S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupl, 0)
+      EXT_MUL(vme, vmo, vmrl, 0)
       break;
     }
     case kS390_I16x8ExtMulHighI8x16S: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuph, 0)
+      EXT_MUL(vme, vmo, vmrh, 0)
       break;
     }
     case kS390_I16x8ExtMulLowI8x16U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vupll, 0)
+      EXT_MUL(vmle, vmlo, vmrl, 0)
       break;
     }
     case kS390_I16x8ExtMulHighI8x16U: {
-      ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL(vuplh, 0)
+      EXT_MUL(vmle, vmlo, vmrh, 0)
       break;
     }
-#undef ASSEMBLE_SIMD_I32X4_I16X8_EXT_MUL
+#undef EXT_MUL
 #define EXT_ADD_PAIRWISE(lane_size, mul_even, mul_odd)                        \
   Simd128Register src = i.InputSimd128Register(0);                            \
   Simd128Register dst = i.OutputSimd128Register();                            \
...
@@ -772,6 +772,8 @@ void Simulator::EvalTableInit() {
   V(vsum, VSUM, 0xE764)   /* type = VRR_C VECTOR SUM ACROSS WORD */           \
   V(vsumg, VSUMG, 0xE765) /* type = VRR_C VECTOR SUM ACROSS DOUBLEWORD */     \
   V(vpk, VPK, 0xE794)     /* type = VRR_C VECTOR PACK */                      \
+  V(vmrl, VMRL, 0xE760)   /* type = VRR_C VECTOR MERGE LOW */                 \
+  V(vmrh, VMRH, 0xE761)   /* type = VRR_C VECTOR MERGE HIGH */                \
   V(vpks, VPKS, 0xE797)   /* type = VRR_B VECTOR PACK SATURATE */             \
   V(vpkls, VPKLS, 0xE795) /* type = VRR_B VECTOR PACK LOGICAL SATURATE */     \
   V(vupll, VUPLL, 0xE7D4) /* type = VRR_A VECTOR UNPACK LOGICAL LOW */        \
@@ -3397,6 +3399,53 @@ EVALUATE(VSUMG) {
 }
 #undef CASE
+#define VECTOR_MERGE(type, is_low_side)                                      \
+  constexpr size_t index_limit = (kSimd128Size / sizeof(type)) / 2;          \
+  for (size_t i = 0, source_index = is_low_side ? i + index_limit : i;       \
+       i < index_limit; i++, source_index++) {                               \
+    set_simd_register_by_lane<type>(                                         \
+        r1, 2 * i, get_simd_register_by_lane<type>(r2, source_index));       \
+    set_simd_register_by_lane<type>(                                         \
+        r1, (2 * i) + 1, get_simd_register_by_lane<type>(r3, source_index)); \
+  }
+#define CASE(i, type, is_low_side)  \
+  case i: {                         \
+    VECTOR_MERGE(type, is_low_side) \
+  } break;
+EVALUATE(VMRL) {
+  DCHECK_OPCODE(VMRL);
+  DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
+  USE(m6);
+  USE(m5);
+  switch (m4) {
+    CASE(0, int8_t, true);
+    CASE(1, int16_t, true);
+    CASE(2, int32_t, true);
+    CASE(3, int64_t, true);
+    default:
+      UNREACHABLE();
+  }
+  return length;
+}
+EVALUATE(VMRH) {
+  DCHECK_OPCODE(VMRH);
+  DECODE_VRR_C_INSTRUCTION(r1, r2, r3, m6, m5, m4);
+  USE(m6);
+  USE(m5);
+  switch (m4) {
+    CASE(0, int8_t, false);
+    CASE(1, int16_t, false);
+    CASE(2, int32_t, false);
+    CASE(3, int64_t, false);
+    default:
+      UNREACHABLE();
+  }
+  return length;
+}
+#undef CASE
+#undef VECTOR_MERGE
 template <class S, class D>
 void VectorPack(Simulator* sim, int dst, int src1, int src2, bool saturate,
                 const D& max = 0, const D& min = 0) {
...
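
For reference, a standalone sketch of the merge semantics the simulator
now models, written as a template instead of the VECTOR_MERGE macro
(VectorMerge and the raw-array signature are illustrative, and this
sketch assumes dst does not alias either source):

#include <cstddef>

constexpr size_t kSimd128Size = 16;  // bytes per 128-bit vector register

// Merge high interleaves the lower-indexed halves of the two sources
// (a0, b0, a1, b1, ...); merge low does the same with the upper-indexed
// halves.
template <typename T>
void VectorMerge(T* dst, const T* src_a, const T* src_b, bool is_low_side) {
  constexpr size_t index_limit = (kSimd128Size / sizeof(T)) / 2;
  size_t source_index = is_low_side ? index_limit : 0;
  for (size_t i = 0; i < index_limit; i++, source_index++) {
    dst[2 * i] = src_a[source_index];      // even result lanes from src_a
    dst[2 * i + 1] = src_b[source_index];  // odd result lanes from src_b
  }
}

For int32_t lanes, merging {a0, a1, a2, a3} with {b0, b1, b2, b3} yields
{a0, b0, a1, b1} on the high side and {a2, b2, a3, b3} on the low side.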