Commit 7961ab1b authored by Milad Fa, committed by Commit Bot

PPC [simd]: optimize vector load/store on power 9

Using the added lxvx and stxvx instructions, we can load and
store vector register values in a single instruction.
MRR encoding does not have a 16 byte alignment requirement.

Change-Id: I9c1d80fd867a0e79d3390e4a05e08cdf2a2e4835
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2845734
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/master@{#74130}
parent e866b7ab
......@@ -1821,6 +1821,12 @@ void Assembler::lxvd(const Simd128Register rt, const MemOperand& src) {
TX);
}
// Emit lxvx (Load VSX Vector Indexed): loads 16 bytes from ra+rb into rt.
// MRR encoding — no 16-byte alignment requirement on the address.
void Assembler::lxvx(const Simd128Register rt, const MemOperand& src) {
  // Trailing 1 is the TX bit, set the same way as in lxvd above
  // (presumably selecting the VSR range used for Simd128 registers —
  // confirm against the Power ISA encoding tables).
  emit(LXVX | (rt.code() * B21) | (src.ra().code() * B16) |
       (src.rb().code() * B11) | 1);
}
void Assembler::lxsdx(const Simd128Register rt, const MemOperand& src) {
int TX = 1;
emit(LXSDX | rt.code() * B21 | src.ra().code() * B16 | src.rb().code() * B11 |
......@@ -1875,6 +1881,12 @@ void Assembler::stxvd(const Simd128Register rt, const MemOperand& dst) {
SX);
}
// Emit stxvx (Store VSX Vector Indexed): stores the 16 bytes of rt to ra+rb.
// MRR encoding — no 16-byte alignment requirement on the address.
void Assembler::stxvx(const Simd128Register rt, const MemOperand& dst) {
  // Trailing 1 is the SX bit, set the same way as in stxvd above
  // (presumably selecting the VSR range used for Simd128 registers —
  // confirm against the Power ISA encoding tables).
  emit(STXVX | (rt.code() * B21) | (dst.ra().code() * B16) |
       (dst.rb().code() * B11) | 1);
}
void Assembler::xxspltib(const Simd128Register rt, const Operand& imm) {
int TX = 1;
CHECK(is_uint8(imm.immediate()));
......
......@@ -1022,6 +1022,7 @@ class Assembler : public AssemblerBase {
void mtvsrd(const Simd128Register rt, const Register ra);
void mtvsrdd(const Simd128Register rt, const Register ra, const Register rb);
void lxvd(const Simd128Register rt, const MemOperand& src);
void lxvx(const Simd128Register rt, const MemOperand& src);
void lxsdx(const Simd128Register rt, const MemOperand& src);
void lxsibzx(const Simd128Register rt, const MemOperand& src);
void lxsihzx(const Simd128Register rt, const MemOperand& src);
......@@ -1031,6 +1032,7 @@ class Assembler : public AssemblerBase {
void stxsihx(const Simd128Register rs, const MemOperand& src);
void stxsiwx(const Simd128Register rs, const MemOperand& src);
void stxvd(const Simd128Register rt, const MemOperand& src);
void stxvx(const Simd128Register rt, const MemOperand& src);
void xxspltib(const Simd128Register rt, const Operand& imm);
void xxbrq(const Simd128Register rt, const Simd128Register rb);
......
......@@ -1983,6 +1983,8 @@ using Instr = uint32_t;
V(lxsspx, LXSSPX, 0x7C000418) \
/* Load VSR Vector Doubleword*2 Indexed */ \
V(lxvd, LXVD, 0x7C000698) \
/* Load VSX Vector Indexed */ \
V(lxvx, LXVX, 0x7C000218) \
/* Load VSR Vector Doubleword & Splat Indexed */ \
V(lxvdsx, LXVDSX, 0x7C000298) \
/* Load VSR Vector Word*4 Indexed */ \
......@@ -2011,6 +2013,8 @@ using Instr = uint32_t;
V(stxsspx, STXSSPX, 0x7C000518) \
/* Store VSR Vector Doubleword*2 Indexed */ \
V(stxvd, STXVD, 0x7C000798) \
/* Store VSX Vector Indexed */ \
V(stxvx, STXVX, 0x7C000318) \
/* Store VSR Vector Word*4 Indexed */ \
V(stxvw, STXVW, 0x7C000718)
......
......@@ -443,7 +443,7 @@ void TurboAssembler::MultiPushV128(RegList dregs, Register location) {
Simd128Register dreg = Simd128Register::from_code(i);
stack_offset -= kSimd128Size;
li(ip, Operand(stack_offset));
StoreSimd128(dreg, MemOperand(location, ip), r0, kScratchSimd128Reg);
StoreSimd128(dreg, MemOperand(location, ip));
}
}
}
......@@ -468,7 +468,7 @@ void TurboAssembler::MultiPopV128(RegList dregs, Register location) {
if ((dregs & (1 << i)) != 0) {
Simd128Register dreg = Simd128Register::from_code(i);
li(ip, Operand(stack_offset));
LoadSimd128(dreg, MemOperand(location, ip), r0, kScratchSimd128Reg);
LoadSimd128(dreg, MemOperand(location, ip));
stack_offset += kSimd128Size;
}
}
......@@ -2993,22 +2993,8 @@ void TurboAssembler::LoadSingleU(DoubleRegister dst, const MemOperand& mem,
}
}
void TurboAssembler::LoadSimd128(Simd128Register dst, const MemOperand& mem,
Register ScratchReg,
Simd128Register ScratchDoubleReg) {
// lvx needs the stack to be 16 byte aligned.
// We first use lxvd/stxvd to copy the content on an aligned address. lxvd
// itself reverses the lanes so it cannot be used as is.
lxvd(ScratchDoubleReg, mem);
mr(ScratchReg, sp);
ClearRightImm(
sp, sp,
Operand(base::bits::WhichPowerOfTwo(16))); // equivalent to &= -16
addi(sp, sp, Operand(-16));
stxvd(ScratchDoubleReg, MemOperand(r0, sp));
// Load it with correct lane ordering.
lvx(dst, MemOperand(r0, sp));
mr(sp, ScratchReg);
// Load a 128-bit SIMD value from memory in a single instruction.
// lxvx (Power9, MRR encoding) has no 16-byte alignment requirement, so the
// previous lvx-based sequence (stack realignment + scratch registers) is
// no longer needed.
void TurboAssembler::LoadSimd128(Simd128Register dst, const MemOperand& mem) {
  lxvx(dst, mem);
}
void TurboAssembler::StoreDouble(DoubleRegister src, const MemOperand& mem,
......@@ -3063,21 +3049,8 @@ void TurboAssembler::StoreSingleU(DoubleRegister src, const MemOperand& mem,
}
}
void TurboAssembler::StoreSimd128(Simd128Register src, const MemOperand& mem,
Register ScratchReg,
Simd128Register ScratchDoubleReg) {
// stvx needs the stack to be 16 byte aligned.
// We use lxvd/stxvd to store the content on an aligned address. stxvd
// itself reverses the lanes so it cannot be used as is.
mr(ScratchReg, sp);
ClearRightImm(
sp, sp,
Operand(base::bits::WhichPowerOfTwo(16))); // equivalent to &= -16
addi(sp, sp, Operand(-16));
stvx(src, MemOperand(r0, sp));
lxvd(ScratchDoubleReg, MemOperand(r0, sp));
mr(sp, ScratchReg);
stxvd(ScratchDoubleReg, mem);
// Store a 128-bit SIMD value to memory in a single instruction.
// stxvx (Power9, MRR encoding) has no 16-byte alignment requirement, so the
// previous stvx-based sequence (stack realignment + scratch registers) is
// no longer needed.
void TurboAssembler::StoreSimd128(Simd128Register src, const MemOperand& mem) {
  stxvx(src, mem);
}
Register GetRegisterThatIsNotOneOf(Register reg1, Register reg2, Register reg3,
......@@ -3219,13 +3192,13 @@ void TurboAssembler::SwapSimd128(Simd128Register src, MemOperand dst,
DCHECK(src != scratch);
// push v0, to be used as scratch
addi(sp, sp, Operand(-kSimd128Size));
StoreSimd128(v0, MemOperand(r0, sp), r0, scratch);
StoreSimd128(v0, MemOperand(r0, sp));
mov(ip, Operand(dst.offset()));
LoadSimd128(v0, MemOperand(dst.ra(), ip), r0, scratch);
StoreSimd128(src, MemOperand(dst.ra(), ip), r0, scratch);
LoadSimd128(v0, MemOperand(dst.ra(), ip));
StoreSimd128(src, MemOperand(dst.ra(), ip));
vor(src, v0, v0);
// restore v0
LoadSimd128(v0, MemOperand(r0, sp), ip, scratch);
LoadSimd128(v0, MemOperand(r0, sp));
addi(sp, sp, Operand(kSimd128Size));
}
......@@ -3233,23 +3206,23 @@ void TurboAssembler::SwapSimd128(MemOperand src, MemOperand dst,
Simd128Register scratch) {
// push v0 and v1, to be used as scratch
addi(sp, sp, Operand(2 * -kSimd128Size));
StoreSimd128(v0, MemOperand(r0, sp), ip, scratch);
StoreSimd128(v0, MemOperand(r0, sp));
li(ip, Operand(kSimd128Size));
StoreSimd128(v1, MemOperand(ip, sp), r0, scratch);
StoreSimd128(v1, MemOperand(ip, sp));
mov(ip, Operand(src.offset()));
LoadSimd128(v0, MemOperand(src.ra(), ip), r0, scratch);
LoadSimd128(v0, MemOperand(src.ra(), ip));
mov(ip, Operand(dst.offset()));
LoadSimd128(v1, MemOperand(dst.ra(), ip), r0, scratch);
LoadSimd128(v1, MemOperand(dst.ra(), ip));
StoreSimd128(v0, MemOperand(dst.ra(), ip), r0, scratch);
StoreSimd128(v0, MemOperand(dst.ra(), ip));
mov(ip, Operand(src.offset()));
StoreSimd128(v1, MemOperand(src.ra(), ip), r0, scratch);
StoreSimd128(v1, MemOperand(src.ra(), ip));
// restore v0 and v1
LoadSimd128(v0, MemOperand(r0, sp), ip, scratch);
LoadSimd128(v0, MemOperand(r0, sp));
li(ip, Operand(kSimd128Size));
LoadSimd128(v1, MemOperand(ip, sp), r0, scratch);
LoadSimd128(v1, MemOperand(ip, sp));
addi(sp, sp, Operand(2 * kSimd128Size));
}
......
......@@ -161,8 +161,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void LoadFloat32(DoubleRegister dst, const MemOperand& mem,
Register scratch = no_reg);
void LoadDoubleLiteral(DoubleRegister result, Double value, Register scratch);
void LoadSimd128(Simd128Register dst, const MemOperand& mem,
Register ScratchReg, Simd128Register ScratchDoubleReg);
void LoadSimd128(Simd128Register dst, const MemOperand& mem);
// load a literal signed int value <value> to GPR <dst>
void LoadIntLiteral(Register dst, int value);
......@@ -185,8 +184,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Register scratch = no_reg);
void StoreSingleU(DoubleRegister src, const MemOperand& mem,
Register scratch = no_reg);
void StoreSimd128(Simd128Register src, const MemOperand& mem,
Register ScratchReg, Simd128Register ScratchDoubleReg);
void StoreSimd128(Simd128Register src, const MemOperand& mem);
void Cmpi(Register src1, const Operand& src2, Register scratch,
CRegister cr = cr7);
......
......@@ -1222,8 +1222,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} else {
DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
__ mov(ip, Operand(offset));
__ LoadSimd128(i.OutputSimd128Register(), MemOperand(fp, ip), r0,
kScratchSimd128Reg);
__ LoadSimd128(i.OutputSimd128Register(), MemOperand(fp, ip));
}
} else {
__ LoadP(i.OutputRegister(), MemOperand(fp, offset), r0);
......@@ -1748,8 +1747,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
case MachineRepresentation::kSimd128:
__ addi(sp, sp, Operand(-kSimd128Size));
__ StoreSimd128(i.InputSimd128Register(1), MemOperand(r0, sp), r0,
kScratchSimd128Reg);
__ StoreSimd128(i.InputSimd128Register(1), MemOperand(r0, sp));
break;
default:
__ StorePU(i.InputRegister(1), MemOperand(sp, -kSystemPointerSize),
......@@ -1791,8 +1789,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} else {
DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
__ mov(ip, Operand(slot * kSystemPointerSize));
__ StoreSimd128(i.InputSimd128Register(0), MemOperand(ip, sp), r0,
kScratchSimd128Reg);
__ StoreSimd128(i.InputSimd128Register(0), MemOperand(ip, sp));
}
} else {
__ StoreP(i.InputRegister(0), MemOperand(sp, slot * kSystemPointerSize),
......@@ -2057,9 +2054,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
AddressingMode mode = kMode_None;
MemOperand operand = i.MemoryOperand(&mode);
bool is_atomic = i.InputInt32(2);
// lvx only supports MRR.
DCHECK_EQ(mode, kMode_MRR);
__ LoadSimd128(result, operand, r0, kScratchSimd128Reg);
__ LoadSimd128(result, operand);
if (is_atomic) __ lwsync();
DCHECK_EQ(LeaveRC, i.OutputRCBit());
break;
......@@ -2095,9 +2091,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Simd128Register value = i.InputSimd128Register(index);
bool is_atomic = i.InputInt32(3);
if (is_atomic) __ lwsync();
// stvx only supports MRR.
DCHECK_EQ(mode, kMode_MRR);
__ StoreSimd128(value, operand, r0, kScratchSimd128Reg);
__ StoreSimd128(value, operand);
if (is_atomic) __ sync();
DCHECK_EQ(LeaveRC, i.OutputRCBit());
break;
......@@ -4443,8 +4438,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
DCHECK(destination->IsSimd128StackSlot());
MemOperand dst = g.ToMemOperand(destination);
__ mov(ip, Operand(dst.offset()));
__ StoreSimd128(g.ToSimd128Register(source), MemOperand(dst.ra(), ip),
r0, kScratchSimd128Reg);
__ StoreSimd128(g.ToSimd128Register(source), MemOperand(dst.ra(), ip));
}
} else {
DoubleRegister src = g.ToDoubleRegister(source);
......@@ -4475,7 +4469,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
MemOperand src = g.ToMemOperand(source);
__ mov(ip, Operand(src.offset()));
__ LoadSimd128(g.ToSimd128Register(destination),
MemOperand(src.ra(), ip), r0, kScratchSimd128Reg);
MemOperand(src.ra(), ip));
}
} else {
LocationOperand* op = LocationOperand::cast(source);
......@@ -4490,15 +4484,15 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
// push v0, to be used as scratch
__ addi(sp, sp, Operand(-kSimd128Size));
__ StoreSimd128(v0, MemOperand(r0, sp), r0, kScratchSimd128Reg);
__ StoreSimd128(v0, MemOperand(r0, sp));
MemOperand src = g.ToMemOperand(source);
MemOperand dst = g.ToMemOperand(destination);
__ mov(ip, Operand(src.offset()));
__ LoadSimd128(v0, MemOperand(src.ra(), ip), r0, kScratchSimd128Reg);
__ LoadSimd128(v0, MemOperand(src.ra(), ip));
__ mov(ip, Operand(dst.offset()));
__ StoreSimd128(v0, MemOperand(dst.ra(), ip), r0, kScratchSimd128Reg);
__ StoreSimd128(v0, MemOperand(dst.ra(), ip));
// restore v0
__ LoadSimd128(v0, MemOperand(r0, sp), ip, kScratchSimd128Reg);
__ LoadSimd128(v0, MemOperand(r0, sp));
__ addi(sp, sp, Operand(kSimd128Size));
}
}
......
......@@ -638,6 +638,10 @@ void Decoder::DecodeExt2(Instruction* instr) {
Format(instr, "lxvd 'Xt, 'ra, 'rb");
return;
}
case LXVX: {
Format(instr, "lxvx 'Xt, 'ra, 'rb");
return;
}
case LXSDX: {
Format(instr, "lxsdx 'Xt, 'ra, 'rb");
return;
......@@ -658,6 +662,10 @@ void Decoder::DecodeExt2(Instruction* instr) {
Format(instr, "stxvd 'Xs, 'ra, 'rb");
return;
}
case STXVX: {
Format(instr, "stxvx 'Xs, 'ra, 'rb");
return;
}
case STXSDX: {
Format(instr, "stxsdx 'Xs, 'ra, 'rb");
return;
......
......@@ -3926,6 +3926,14 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
xt, 1, ReadDW(ra_val + rb_val + kSystemPointerSize));
break;
}
case LXVX: {
DECODE_VX_INSTRUCTION(vrt, ra, rb, T)
GET_ADDRESS(ra, rb, ra_val, rb_val)
intptr_t addr = ra_val + rb_val;
simdr_t* ptr = reinterpret_cast<simdr_t*>(addr);
set_simd_register(vrt, *ptr);
break;
}
case STXVD: {
DECODE_VX_INSTRUCTION(xs, ra, rb, S)
GET_ADDRESS(ra, rb, ra_val, rb_val)
......@@ -3934,6 +3942,15 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
get_simd_register_by_lane<int64_t>(xs, 1));
break;
}
case STXVX: {
DECODE_VX_INSTRUCTION(vrs, ra, rb, S)
GET_ADDRESS(ra, rb, ra_val, rb_val)
intptr_t addr = ra_val + rb_val;
__int128 vrs_val =
*(reinterpret_cast<__int128*>(get_simd_register(vrs).int8));
WriteQW(addr, vrs_val);
break;
}
case LXSIBZX: {
DECODE_VX_INSTRUCTION(xt, ra, rb, T)
GET_ADDRESS(ra, rb, ra_val, rb_val)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment