Commit b67dd429, authored by Milad Fa, committed by V8 LUCI CQ

PPC [simd]: optimize I64x2Mul on Power10

This CL also optimizes the Power9 code path by using
mtvsrdd.

Change-Id: Ibd6b227111adc0c262c621be6ce4068d3de2e659
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3060493
Reviewed-by: Junliang Yan <junyan@redhat.com>
Reviewed-by: Milad Fa <mfarazma@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/master@{#76008}
parent d63ca69c
......@@ -2313,6 +2313,8 @@ using Instr = uint32_t;
V(vmulosw, VMULOSW, 0x10000188) \
/* Vector Multiply Odd Unsigned Word */ \
V(vmulouw, VMULOUW, 0x10000088) \
/* Vector Multiply Low Doubleword */ \
V(vmulld, VMULLD, 0x100001C9) \
/* Vector Sum across Quarter Signed Halfword Saturate */ \
V(vsum4shs, VSUM4SHS, 0x10000648) \
/* Vector Pack Unsigned Word Unsigned Saturate */ \
......
......@@ -2372,26 +2372,29 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
constexpr int lane_width_in_bytes = 8;
Simd128Register src0 = i.InputSimd128Register(0);
Simd128Register src1 = i.InputSimd128Register(1);
Simd128Register tempFPReg1 = i.ToSimd128Register(instr->TempAt(0));
Simd128Register tempFPReg0 = i.ToSimd128Register(instr->TempAt(0));
Register tempReg1 = i.ToRegister(instr->TempAt(2));
Register scratch_0 = ip;
Register scratch_1 = r0;
Simd128Register dst = i.OutputSimd128Register();
for (int i = 0; i < 2; i++) {
if (i > 0) {
__ vextractd(kScratchSimd128Reg, src0,
Operand(1 * lane_width_in_bytes));
__ vextractd(tempFPReg1, src1, Operand(1 * lane_width_in_bytes));
src0 = kScratchSimd128Reg;
src1 = tempFPReg1;
}
__ mfvsrd(r0, src0);
__ mfvsrd(ip, src1);
__ mulld(r0, r0, ip);
if (i <= 0) {
__ mtvsrd(dst, r0);
} else {
__ mtvsrd(kScratchSimd128Reg, r0);
__ vinsertd(dst, kScratchSimd128Reg,
Operand(1 * lane_width_in_bytes));
if (CpuFeatures::IsSupported(PPC_10_PLUS)) {
__ vmulld(dst, src0, src1);
} else {
for (int i = 0; i < 2; i++) {
if (i > 0) {
__ vextractd(kScratchSimd128Reg, src0,
Operand(1 * lane_width_in_bytes));
__ vextractd(tempFPReg0, src1, Operand(1 * lane_width_in_bytes));
src0 = kScratchSimd128Reg;
src1 = tempFPReg0;
}
__ mfvsrd(scratch_0, src0);
__ mfvsrd(scratch_1, src1);
__ mulld(scratch_0, scratch_0, scratch_1);
scratch_0 = r0;
scratch_1 = tempReg1;
}
__ mtvsrdd(dst, ip, r0);
}
break;
}
......
......@@ -2395,14 +2395,14 @@ SIMD_VISIT_EXTRACT_LANE(I8x16, S)
SIMD_TYPES(SIMD_VISIT_REPLACE_LANE)
#undef SIMD_VISIT_REPLACE_LANE
#define SIMD_VISIT_BINOP(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
PPCOperandGenerator g(this); \
InstructionOperand temps[] = {g.TempSimd128Register(), \
g.TempSimd128Register()}; \
Emit(kPPC_##Opcode, g.DefineAsRegister(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \
#define SIMD_VISIT_BINOP(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
PPCOperandGenerator g(this); \
InstructionOperand temps[] = {g.TempSimd128Register(), \
g.TempSimd128Register(), g.TempRegister()}; \
Emit(kPPC_##Opcode, g.DefineAsRegister(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \
}
SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
......
......@@ -4252,6 +4252,10 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
VECTOR_ARITHMETIC_OP(int64_t, -)
break;
}
case VMULLD: {
// Simulator handling for vmulld (Vector Multiply Low Doubleword): dispatches
// to the shared arithmetic macro with int64_t as the lane type and `*` as the
// operator, mirroring the neighbouring add/subtract cases.
// NOTE(review): VECTOR_ARITHMETIC_OP is defined outside this view; presumably
// it applies the operator to each lane of the two source vectors — confirm,
// and check how it treats signed overflow for int64_t multiplication.
VECTOR_ARITHMETIC_OP(int64_t, *)
break;
}
case VADDUWM: {
VECTOR_ARITHMETIC_OP(int32_t, +)
break;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment