Commit db97c402, authored by jyan, committed by Commit bot

S390: Optimize For Mul in TurboFan codegen

R=joransiu@ca.ibm.com, bjaideep@ca.ibm.com, michael_dawson@ca.ibm.com, mbrandy@us.ibm.com
BUG=

Review-Url: https://codereview.chromium.org/2265073003
Cr-Commit-Position: refs/heads/master@{#38801}
parent da5d713d
......@@ -27,6 +27,16 @@ class S390OperandConverter final : public InstructionOperandConverter {
size_t OutputCount() { return instr_->OutputCount(); }
// Reports whether the input operand at |index| has the kWord64 machine
// representation. The operand is cast to a LocationOperand to read it.
bool Is64BitOperand(int index) {
  auto* location = LocationOperand::cast(instr_->InputAt(index));
  return MachineRepresentation::kWord64 == location->representation();
}
// Reports whether the input operand at |index| has the kWord32 machine
// representation. The operand is cast to a LocationOperand to read it.
bool Is32BitOperand(int index) {
  auto* location = LocationOperand::cast(instr_->InputAt(index));
  return MachineRepresentation::kWord32 == location->representation();
}
bool CompareLogical() const {
switch (instr_->flags_condition()) {
case kUnsignedLessThan:
......@@ -104,12 +114,25 @@ class S390OperandConverter final : public InstructionOperandConverter {
FrameOffset offset = frame_access_state()->GetFrameOffset(slot);
return MemOperand(offset.from_stack_pointer() ? sp : fp, offset.offset());
}
// Returns the MemOperand for the input at |index|: the operand is cast to an
// AllocatedOperand and its index is translated via SlotToMemOperand().
MemOperand InputStackSlot(size_t index) {
  return SlotToMemOperand(
      AllocatedOperand::cast(instr_->InputAt(index))->index());
}
};
// True when input |index| of |instr| reports IsRegister().
static inline bool HasRegisterInput(Instruction* instr, int index) {
  InstructionOperand* operand = instr->InputAt(index);
  return operand->IsRegister();
}
// True when input |index| of |instr| reports IsImmediate().
static inline bool HasImmediateInput(Instruction* instr, size_t index) {
  InstructionOperand* operand = instr->InputAt(index);
  return operand->IsImmediate();
}
// True when input |index| of |instr| reports IsStackSlot().
static inline bool HasStackSlotInput(Instruction* instr, size_t index) {
  InstructionOperand* operand = instr->InputAt(index);
  return operand->IsStackSlot();
}
namespace {
class OutOfLineLoadNAN32 final : public OutOfLineCode {
......@@ -287,9 +310,11 @@ Condition FlagsConditionToCondition(FlagsCondition condition, ArchOpcode op) {
if (HasRegisterInput(instr, 1)) { \
__ asm_instr(i.OutputRegister(), i.InputRegister(0), \
i.InputRegister(1)); \
} else { \
} else if (HasImmediateInput(instr, 1)) { \
__ asm_instr(i.OutputRegister(), i.InputRegister(0), \
i.InputImmediate(1)); \
} else { \
UNIMPLEMENTED(); \
} \
} while (0)
......@@ -1223,14 +1248,54 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
case kS390_Mul32:
#if V8_TARGET_ARCH_S390X
case kS390_Mul64:
if (HasRegisterInput(instr, 1)) {
__ Mul32(i.InputRegister(0), i.InputRegister(1));
} else if (HasImmediateInput(instr, 1)) {
__ Mul32(i.InputRegister(0), i.InputImmediate(1));
} else if (HasStackSlotInput(instr, 1)) {
#ifdef V8_TARGET_ARCH_S390X
// Avoid endian-issue here:
// stg r1, 0(fp)
// ...
// msy r2, 0(fp) <-- This will read the upper 32 bits
__ lg(kScratchReg, i.InputStackSlot(1));
__ Mul32(i.InputRegister(0), kScratchReg);
#else
__ Mul32(i.InputRegister(0), i.InputStackSlot(1));
#endif
__ Mul(i.OutputRegister(), i.InputRegister(0), i.InputRegister(1));
} else {
UNIMPLEMENTED();
}
break;
case kS390_Mul64:
if (HasRegisterInput(instr, 1)) {
__ Mul64(i.InputRegister(0), i.InputRegister(1));
} else if (HasImmediateInput(instr, 1)) {
__ Mul64(i.InputRegister(0), i.InputImmediate(1));
} else if (HasStackSlotInput(instr, 1)) {
__ Mul64(i.InputRegister(0), i.InputStackSlot(1));
} else {
UNIMPLEMENTED();
}
break;
case kS390_MulHigh32:
__ LoadRR(r1, i.InputRegister(0));
__ mr_z(r0, i.InputRegister(1));
if (HasRegisterInput(instr, 1)) {
__ mr_z(r0, i.InputRegister(1));
} else if (HasStackSlotInput(instr, 1)) {
#ifdef V8_TARGET_ARCH_S390X
// Avoid endian-issue here:
// stg r1, 0(fp)
// ...
// mfy r2, 0(fp) <-- This will read the upper 32 bits
__ lg(kScratchReg, i.InputStackSlot(1));
__ mr_z(r0, kScratchReg);
#else
__ mfy(r0, i.InputStackSlot(1));
#endif
} else {
UNIMPLEMENTED();
}
__ LoadW(i.OutputRegister(), r0);
break;
case kS390_Mul32WithHigh32:
......@@ -1241,7 +1306,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
case kS390_MulHighU32:
__ LoadRR(r1, i.InputRegister(0));
__ mlr(r0, i.InputRegister(1));
if (HasRegisterInput(instr, 1)) {
__ mlr(r0, i.InputRegister(1));
} else if (HasStackSlotInput(instr, 1)) {
#ifdef V8_TARGET_ARCH_S390X
// Avoid endian-issue here:
// stg r1, 0(fp)
// ...
// ml r2, 0(fp) <-- This will read the upper 32 bits
__ lg(kScratchReg, i.InputStackSlot(1));
__ mlr(r0, kScratchReg);
#else
__ ml(r0, i.InputStackSlot(1));
#endif
} else {
UNIMPLEMENTED();
}
__ LoadlW(i.OutputRegister(), r0);
break;
case kS390_MulFloat:
......
......@@ -35,6 +35,16 @@ class S390OperandGenerator final : public OperandGenerator {
return UseRegister(node);
}
// Returns the constant carried by |node|, widened to int64_t.
// Only kInt32Constant and kInt64Constant nodes are handled; any other opcode
// reaches UNIMPLEMENTED().
int64_t GetImmediate(Node* node) {
  switch (node->opcode()) {
    case IrOpcode::kInt32Constant:
      return OpParameter<int32_t>(node);
    case IrOpcode::kInt64Constant:
      return OpParameter<int64_t>(node);
    default:
      UNIMPLEMENTED();
  }
  return 0L;
}
bool CanBeImmediate(Node* node, ImmediateMode mode) {
int64_t value;
if (node->opcode() == IrOpcode::kInt32Constant)
......@@ -132,6 +142,18 @@ class S390OperandGenerator final : public OperandGenerator {
return kMode_MRR;
}
}
// A node is considered a better left operand when the selector does not
// report it as live.
bool CanBeBetterLeftOperand(Node* node) const {
  const bool is_live = selector()->IsLive(node);
  return !is_live;
}
// Looks up the machine representation that the instruction sequence records
// for the virtual register assigned to |node|.
MachineRepresentation GetRepresentation(Node* node) {
  const auto vreg = selector()->GetVirtualRegister(node);
  return sequence()->GetRepresentation(vreg);
}
// True when GetRepresentation(|node|) is kWord64.
bool Is64BitOperand(Node* node) {
  const MachineRepresentation rep = GetRepresentation(node);
  return rep == MachineRepresentation::kWord64;
}
};
namespace {
......@@ -182,13 +204,36 @@ void VisitBinop(InstructionSelector* selector, Node* node,
FlagsContinuation* cont) {
S390OperandGenerator g(selector);
Matcher m(node);
Node* left = m.left().node();
Node* right = m.right().node();
InstructionOperand inputs[4];
size_t input_count = 0;
InstructionOperand outputs[2];
size_t output_count = 0;
inputs[input_count++] = g.UseRegister(m.left().node());
inputs[input_count++] = g.UseOperand(m.right().node(), operand_mode);
// TODO(turbofan): match complex addressing modes.
if (left == right) {
// If both inputs refer to the same operand, enforce allocating a register
// for both of them to ensure that we don't end up generating code like
// this:
//
// mov rax, [rbp-0x10]
// add rax, [rbp-0x10]
// jo label
InstructionOperand const input = g.UseRegister(left);
inputs[input_count++] = input;
inputs[input_count++] = input;
} else if (g.CanBeImmediate(right, operand_mode)) {
inputs[input_count++] = g.UseRegister(left);
inputs[input_count++] = g.UseImmediate(right);
} else {
if (node->op()->HasProperty(Operator::kCommutative) &&
g.CanBeBetterLeftOperand(right)) {
std::swap(left, right);
}
inputs[input_count++] = g.UseRegister(left);
inputs[input_count++] = g.UseRegister(right);
}
if (cont->IsBranch()) {
inputs[input_count++] = g.Label(cont->true_block());
......@@ -1002,28 +1047,89 @@ void EmitInt32MulWithOverflow(InstructionSelector* selector, Node* node,
VisitCompare(selector, kS390_Cmp32, high32_operand, temp_operand, cont);
}
// Shared selection helper for multiply nodes.
//
// Emits |opcode| with the output defined same-as-first-input. If the right
// operand can be encoded as a kInt32Imm immediate it is used as such;
// otherwise the operands are swapped when the right one is the better left
// operand, and the second input is passed with the unconstrained Use() policy.
void VisitMul(InstructionSelector* selector, Node* node, ArchOpcode opcode) {
  S390OperandGenerator g(selector);
  Int32BinopMatcher m(node);
  Node* lhs = m.left().node();
  Node* rhs = m.right().node();

  const bool rhs_is_immediate = g.CanBeImmediate(rhs, kInt32Imm);
  if (!rhs_is_immediate && g.CanBeBetterLeftOperand(rhs)) {
    std::swap(lhs, rhs);
  }

  selector->Emit(opcode, g.DefineSameAsFirst(node), g.UseRegister(lhs),
                 rhs_is_immediate ? g.UseImmediate(rhs) : g.Use(rhs));
}
} // namespace
// Selects code for Int32MulWithOverflow.
//
// If the overflow projection (index 1) exists, the multiply is emitted through
// EmitInt32MulWithOverflow with a kNotEqual set-continuation on it. Without
// that projection no overflow check is emitted and a plain kS390_Mul32 is
// selected via VisitMul.
void InstructionSelector::VisitInt32MulWithOverflow(Node* node) {
  Node* const ovf = NodeProperties::FindProjection(node, 1);
  if (ovf == nullptr) {
    VisitMul(this, node, kS390_Mul32);
    return;
  }
  FlagsContinuation cont = FlagsContinuation::ForSet(kNotEqual, ovf);
  EmitInt32MulWithOverflow(this, node, &cont);
}
// Selects code for a 32-bit integer multiply.
//
// When the right operand is encodable as a kInt32Imm immediate and is a power
// of two, the multiply is emitted as kS390_ShiftLeft32 by log2(immediate).
// Every other case is delegated to VisitMul with kS390_Mul32. Exactly one
// instruction is emitted for |node|.
void InstructionSelector::VisitInt32Mul(Node* node) {
  S390OperandGenerator g(this);
  Int32BinopMatcher m(node);
  Node* left = m.left().node();
  Node* right = m.right().node();
  if (g.CanBeImmediate(right, kInt32Imm)) {
    // GetImmediate() returns int64_t while the 32-bit bit helpers operate on
    // the 32-bit pattern, so narrow explicitly and read the constant once.
    const uint32_t imm = static_cast<uint32_t>(g.GetImmediate(right));
    if (base::bits::IsPowerOfTwo32(imm)) {
      // For a power of two, the index of the single set bit is the shift count.
      const int power = 31 - base::bits::CountLeadingZeros32(imm);
      Emit(kS390_ShiftLeft32, g.DefineSameAsFirst(node), g.UseRegister(left),
           g.UseImmediate(power));
      return;
    }
  }
  VisitMul(this, node, kS390_Mul32);
}
#if V8_TARGET_ARCH_S390X
// Selects code for a 64-bit integer multiply.
//
// When the right operand is encodable as a kInt32Imm immediate and is a power
// of two, the multiply is emitted as kS390_ShiftLeft64 by log2(immediate).
// Every other case is delegated to VisitMul with kS390_Mul64. Exactly one
// instruction is emitted for |node|.
void InstructionSelector::VisitInt64Mul(Node* node) {
  S390OperandGenerator g(this);
  Int64BinopMatcher m(node);
  Node* left = m.left().node();
  Node* right = m.right().node();
  if (g.CanBeImmediate(right, kInt32Imm)) {
    const uint64_t imm = static_cast<uint64_t>(g.GetImmediate(right));
    if (base::bits::IsPowerOfTwo64(imm)) {
      // The leading-zero count is taken over 64 bits, so the index of the
      // single set bit is 63 - clz. Using 31 here would give a negative shift
      // amount for every immediate that fits kInt32Imm.
      const int power = 63 - base::bits::CountLeadingZeros64(imm);
      Emit(kS390_ShiftLeft64, g.DefineSameAsFirst(node), g.UseRegister(left),
           g.UseImmediate(power));
      return;
    }
  }
  VisitMul(this, node, kS390_Mul64);
}
#endif
// Selects code for Int32MulHigh as a single kS390_MulHigh32 instruction.
//
// The output is defined in a register and the first input is forced into a
// register. The second input uses the unconstrained Use() policy. The operands
// are swapped first when the right one is the better left operand.
void InstructionSelector::VisitInt32MulHigh(Node* node) {
  S390OperandGenerator g(this);
  Int32BinopMatcher m(node);
  Node* left = m.left().node();
  Node* right = m.right().node();
  if (g.CanBeBetterLeftOperand(right)) {
    std::swap(left, right);
  }
  Emit(kS390_MulHigh32, g.DefineAsRegister(node), g.UseRegister(left),
       g.Use(right));
}
// Selects code for Uint32MulHigh as a single kS390_MulHighU32 instruction.
//
// The output is defined in a register and the first input is forced into a
// register. The second input uses the unconstrained Use() policy. The operands
// are swapped first when the right one is the better left operand.
void InstructionSelector::VisitUint32MulHigh(Node* node) {
  S390OperandGenerator g(this);
  Int32BinopMatcher m(node);
  Node* left = m.left().node();
  Node* right = m.right().node();
  if (g.CanBeBetterLeftOperand(right)) {
    std::swap(left, right);
  }
  Emit(kS390_MulHighU32, g.DefineAsRegister(node), g.UseRegister(left),
       g.Use(right));
}
void InstructionSelector::VisitInt32Div(Node* node) {
......@@ -1721,15 +1827,6 @@ void InstructionSelector::VisitUint64LessThanOrEqual(Node* node) {
}
#endif
// Selects code for Int32MulWithOverflow, always through
// EmitInt32MulWithOverflow: with a kNotEqual set-continuation when the
// overflow projection (index 1) exists, otherwise with a default-constructed
// continuation.
// NOTE(review): a function with this same name is also defined earlier in this
// listing; this copy looks like the pre-change version - confirm which one is
// current.
void InstructionSelector::VisitInt32MulWithOverflow(Node* node) {
  Node* const ovf = NodeProperties::FindProjection(node, 1);
  FlagsContinuation cont = (ovf != nullptr)
                               ? FlagsContinuation::ForSet(kNotEqual, ovf)
                               : FlagsContinuation();
  EmitInt32MulWithOverflow(this, node, &cont);
}
void InstructionSelector::VisitFloat32Equal(Node* node) {
FlagsContinuation cont = FlagsContinuation::ForSet(kEqual, node);
VisitFloat32Compare(this, node, &cont);
......
......@@ -2098,9 +2098,15 @@ void Assembler::slgrk(Register r1, Register r2, Register r3) {
// ----------------------------
// Multiply Register-Storage (64<32)
// Emits M in RX form with |r1| and the index, base and offset of |opnd|.
// |r1| must have an even register code (checked in debug builds).
void Assembler::m(Register r1, const MemOperand& opnd) {
  DCHECK((r1.code() & 1) == 0);
  rx_form(M, r1, opnd.rx(), opnd.rb(), opnd.offset());
}
// Emits MFY in RXY form with |r1| and the index, base and offset of |opnd|.
// |r1| must have an even register code (checked in debug builds).
void Assembler::mfy(Register r1, const MemOperand& opnd) {
  DCHECK((r1.code() & 1) == 0);
  rxy_form(MFY, r1, opnd.rx(), opnd.rb(), opnd.offset());
}
// Multiply Register (64<32)
void Assembler::mr_z(Register r1, Register r2) {
DCHECK(r1.code() % 2 == 0);
......
......@@ -1055,6 +1055,7 @@ class Assembler : public AssemblerBase {
// 32-bit Multiply Instructions
void m(Register r1, const MemOperand& opnd);
void mfy(Register r1, const MemOperand& opnd);
void mr_z(Register r1, Register r2);
void ml(Register r1, const MemOperand& opnd);
void mlr(Register r1, Register r2);
......
......@@ -907,6 +907,9 @@ bool Decoder::DecodeFourByte(Instruction* instr) {
case LDGR:
Format(instr, "ldgr\t'f5,'r6");
break;
case MS:
Format(instr, "ms\t'r1,'d1('r2d,'r3)");
break;
case STE:
Format(instr, "ste\t'f1,'d1('r2d,'r3)");
break;
......@@ -1358,6 +1361,12 @@ bool Decoder::DecodeSixByte(Instruction* instr) {
case LEY:
Format(instr, "ley\t'f1,'d2('r2d,'r3)");
break;
case MSG:
Format(instr, "msg\t'r1,'d2('r2d,'r3)");
break;
case MSY:
Format(instr, "msy\t'r1,'d2('r2d,'r3)");
break;
case STEY:
Format(instr, "stey\t'f1,'d2('r2d,'r3)");
break;
......
......@@ -3697,6 +3697,36 @@ void MacroAssembler::mov(Register dst, const Operand& src) {
#endif
}
// 32-bit multiply with a memory operand. Picks the encoding from the operand's
// offset: ms when it satisfies is_uint12, msy when it satisfies is_int20.
// Any other offset is UNIMPLEMENTED().
void MacroAssembler::Mul32(Register dst, const MemOperand& src1) {
  const auto offset = src1.offset();
  if (is_uint12(offset)) {
    ms(dst, src1);
    return;
  }
  if (is_int20(offset)) {
    msy(dst, src1);
    return;
  }
  UNIMPLEMENTED();
}
// 32-bit multiply with a register operand; emits msr with |dst| and |src1|.
void MacroAssembler::Mul32(Register dst, Register src1) {
  msr(dst, src1);
}
// 32-bit multiply with an immediate operand; emits msfi with |dst| and |src1|.
void MacroAssembler::Mul32(Register dst, const Operand& src1) {
  msfi(dst, src1);
}
// 64-bit multiply with a memory operand. Emits msg when the operand's offset
// satisfies is_int20; any other offset is UNIMPLEMENTED().
void MacroAssembler::Mul64(Register dst, const MemOperand& src1) {
  if (!is_int20(src1.offset())) {
    UNIMPLEMENTED();
  } else {
    msg(dst, src1);
  }
}
// 64-bit multiply with a register operand; emits msgr with |dst| and |src1|.
void MacroAssembler::Mul64(Register dst, Register src1) {
  msgr(dst, src1);
}
// 64-bit multiply with an immediate operand; emits msgfi with |dst| and
// |src1|.
void MacroAssembler::Mul64(Register dst, const Operand& src1) {
  msgfi(dst, src1);
}
void MacroAssembler::Mul(Register dst, Register src1, Register src2) {
if (dst.is(src2)) {
MulP(dst, src1);
......
......@@ -301,6 +301,12 @@ class MacroAssembler : public Assembler {
void MulP(Register dst, Register src);
void MulP(Register dst, const MemOperand& opnd);
void Mul(Register dst, Register src1, Register src2);
void Mul32(Register dst, const MemOperand& src1);
void Mul32(Register dst, Register src1);
void Mul32(Register dst, const Operand& src1);
void Mul64(Register dst, const MemOperand& src1);
void Mul64(Register dst, Register src1);
void Mul64(Register dst, const Operand& src1);
// Divide
void DivP(Register dividend, Register divider);
......
......@@ -6584,7 +6584,6 @@ EVALUATE(MR) {
int32_t low_bits = product & 0x00000000FFFFFFFF;
set_low_register(r1, high_bits);
set_low_register(r1 + 1, low_bits);
set_low_register(r1, r1_val);
return length;
}
......@@ -6940,9 +6939,22 @@ EVALUATE(S) {
}
// Simulates M (RX form).
//
// Reads a 32-bit word from the address base + index + displacement (a register
// number of 0 contributes 0), multiplies it as a signed value with the low
// 32 bits of register r1 + 1, and stores the 64-bit product split across the
// pair: high 32 bits into r1, low 32 bits into r1 + 1. r1 must be even.
EVALUATE(M) {
  DCHECK_OPCODE(M);
  DECODE_RX_A_INSTRUCTION(x2, b2, r1, d2_val);
  int64_t b2_val = (b2 == 0) ? 0 : get_register(b2);
  int64_t x2_val = (x2 == 0) ? 0 : get_register(x2);
  intptr_t addr = b2_val + x2_val + d2_val;
  DCHECK(r1 % 2 == 0);
  int32_t mem_val = ReadW(addr, instr);
  int32_t r1_val = get_low_register<int32_t>(r1 + 1);
  // Widen both factors first so the multiplication is done in 64 bits.
  int64_t product =
      static_cast<int64_t>(r1_val) * static_cast<int64_t>(mem_val);
  int32_t high_bits = static_cast<int32_t>(product >> 32);
  int32_t low_bits = static_cast<int32_t>(product & 0x00000000FFFFFFFF);
  set_low_register(r1, high_bits);
  set_low_register(r1 + 1, low_bits);
  return length;
}
EVALUATE(D) {
......@@ -11156,9 +11168,21 @@ EVALUATE(SY) {
}
// Simulates MFY (RXY form).
//
// Reads a 32-bit word from the address base + index + displacement (a register
// number of 0 contributes 0), multiplies it as a signed value with the low
// 32 bits of register r1 + 1, and stores the 64-bit product split across the
// pair: high 32 bits into r1, low 32 bits into r1 + 1. r1 must be even.
EVALUATE(MFY) {
  DCHECK_OPCODE(MFY);
  DECODE_RXY_A_INSTRUCTION(r1, x2, b2, d2);
  int64_t x2_val = (x2 == 0) ? 0 : get_register(x2);
  int64_t b2_val = (b2 == 0) ? 0 : get_register(b2);
  DCHECK(r1 % 2 == 0);
  int32_t mem_val = ReadW(b2_val + x2_val + d2, instr);
  int32_t r1_val = get_low_register<int32_t>(r1 + 1);
  // Widen both factors first so the multiplication is done in 64 bits.
  int64_t product =
      static_cast<int64_t>(r1_val) * static_cast<int64_t>(mem_val);
  int32_t high_bits = static_cast<int32_t>(product >> 32);
  int32_t low_bits = static_cast<int32_t>(product & 0x00000000FFFFFFFF);
  set_low_register(r1, high_bits);
  set_low_register(r1 + 1, low_bits);
  return length;
}
EVALUATE(ALY) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment