Commit 37b461a9 authored by Miran.Karic's avatar Miran.Karic Committed by Commit Bot

MIPS64: Add optimizations to li and Dsubu macro.

Here we optimize Dsubu: instead of loading imm and subtracting, we
load -imm and perform an addition when loading -imm takes fewer
instructions than loading imm. Similarly, li is optimized by loading -imm
and negating it with dsubu, or by loading ~imm and inverting the bits with
nor, when one of these loads takes at least two instructions fewer than
loading imm, saving at least one instruction. Tests are adjusted to cover
these optimizations.

BUG=
TEST=cctest/test-assembler-mips/li_macro
     cctest/test-assembler-mips/Dsubu

Review-Url: https://codereview.chromium.org/2909913002
Cr-Commit-Position: refs/heads/master@{#46001}
parent 79fe6e3e
...@@ -658,37 +658,40 @@ void MacroAssembler::Subu(Register rd, Register rs, const Operand& rt) { ...@@ -658,37 +658,40 @@ void MacroAssembler::Subu(Register rd, Register rs, const Operand& rt) {
addiu(rd, rs, addiu(rd, rs,
static_cast<int32_t>( static_cast<int32_t>(
-rt.imm64_)); // No subiu instr, use addiu(x, y, -imm). -rt.imm64_)); // No subiu instr, use addiu(x, y, -imm).
} else if (-rt.imm64_ >> 16 == 0 && !MustUseReg(rt.rmode_)) { } else {
// Use load -imm and addu when loading -imm generates one instruction.
DCHECK(!rs.is(at)); DCHECK(!rs.is(at));
if (-rt.imm64_ >> 16 == 0 && !MustUseReg(rt.rmode_)) {
// Use load -imm and addu when loading -imm generates one instruction.
li(at, -rt.imm64_); li(at, -rt.imm64_);
addu(rd, rs, at); addu(rd, rs, at);
} else { } else {
// li handles the relocation. // li handles the relocation.
DCHECK(!rs.is(at));
li(at, rt); li(at, rt);
subu(rd, rs, at); subu(rd, rs, at);
} }
} }
}
} }
void MacroAssembler::Dsubu(Register rd, Register rs, const Operand& rt) { void MacroAssembler::Dsubu(Register rd, Register rs, const Operand& rt) {
if (rt.is_reg()) { if (rt.is_reg()) {
dsubu(rd, rs, rt.rm()); dsubu(rd, rs, rt.rm());
} else { } else if (is_int16(-rt.imm64_) && !MustUseReg(rt.rmode_)) {
if (is_int16(-rt.imm64_) && !MustUseReg(rt.rmode_)) {
daddiu(rd, rs, daddiu(rd, rs,
static_cast<int32_t>( static_cast<int32_t>(
-rt.imm64_)); // No dsubiu instr, use daddiu(x, y, -imm). -rt.imm64_)); // No dsubiu instr, use daddiu(x, y, -imm).
} else if (-rt.imm64_ >> 16 == 0 && !MustUseReg(rt.rmode_)) { } else {
// Use load -imm and daddu when loading -imm generates one instruction.
DCHECK(!rs.is(at)); DCHECK(!rs.is(at));
li(at, -rt.imm64_); int li_count = InstrCountForLi64Bit(rt.imm64_);
daddu(rd, rs, at); int li_neg_count = InstrCountForLi64Bit(-rt.imm64_);
if (li_neg_count < li_count && !MustUseReg(rt.rmode_)) {
// Use load -imm and daddu when loading -imm takes fewer instructions than loading imm.
DCHECK(rt.imm64_ != std::numeric_limits<int32_t>::min());
li(at, Operand(-rt.imm64_));
Daddu(rd, rs, at);
} else { } else {
// li handles the relocation. // li handles the relocation.
DCHECK(!rs.is(at));
li(at, rt); li(at, rt);
dsubu(rd, rs, at); dsubu(rd, rs, at);
} }
...@@ -1710,6 +1713,15 @@ void MacroAssembler::li(Register dst, Handle<Object> value, LiFlags mode) { ...@@ -1710,6 +1713,15 @@ void MacroAssembler::li(Register dst, Handle<Object> value, LiFlags mode) {
li(dst, Operand(value), mode); li(dst, Operand(value), mode);
} }
static inline int InstrCountForLiLower32Bit(int64_t value) {
if (!is_int16(static_cast<int32_t>(value)) && (value & kUpper16MaskOf64) &&
(value & kImm16Mask)) {
return 2;
} else {
return 1;
}
}
void MacroAssembler::LiLower32BitHelper(Register rd, Operand j) { void MacroAssembler::LiLower32BitHelper(Register rd, Operand j) {
if (is_int16(static_cast<int32_t>(j.imm64_))) { if (is_int16(static_cast<int32_t>(j.imm64_))) {
daddiu(rd, zero_reg, (j.imm64_ & kImm16Mask)); daddiu(rd, zero_reg, (j.imm64_ & kImm16Mask));
...@@ -1734,16 +1746,108 @@ static inline int InstrCountForLoadReplicatedConst32(int64_t value) { ...@@ -1734,16 +1746,108 @@ static inline int InstrCountForLoadReplicatedConst32(int64_t value) {
return INT_MAX; return INT_MAX;
} }
// InstrCountForLi64Bit: returns the number of instructions counted for loading
// the 64-bit immediate |value|. Within this change it is called by li and
// Dsubu to compare the cost of loading imm against loading -imm (and, in li,
// ~imm).
// NOTE(review): the conditions below appear to follow the same branch order as
// li_optimized; presumably the two must be kept in sync - confirm.
void MacroAssembler::li(Register rd, Operand j, LiFlags mode) { int MacroAssembler::InstrCountForLi64Bit(int64_t value) {
if (is_int32(value)) {
// Values that fit in 32 bits are costed by the lower-32-bit helper.
return InstrCountForLiLower32Bit(value);
} else {
// Bit 31 of the value; it is added to the upper half in the checks below.
// NOTE(review): presumably this compensates for sign extension of the low
// word when the upper half is patched in - confirm against li_optimized.
int bit31 = value >> 31 & 0x1;
// The next six patterns are only accepted on kMips64r6; each one is counted
// as two instructions.
if ((value & kUpper16MaskOf64) == 0 && is_int16(value >> 32) &&
kArchVariant == kMips64r6) {
return 2;
} else if ((value & (kHigher16MaskOf64 | kUpper16MaskOf64)) == 0 &&
kArchVariant == kMips64r6) {
return 2;
} else if ((value & kImm16Mask) == 0 && is_int16((value >> 32) + bit31) &&
kArchVariant == kMips64r6) {
return 2;
} else if ((value & kImm16Mask) == 0 &&
((value >> 31) & 0x1ffff) == ((0x20000 - bit31) & 0x1ffff) &&
kArchVariant == kMips64r6) {
return 2;
} else if (is_int16(static_cast<int32_t>(value)) &&
is_int16((value >> 32) + bit31) && kArchVariant == kMips64r6) {
return 2;
} else if (is_int16(static_cast<int32_t>(value)) &&
((value >> 31) & 0x1ffff) == ((0x20000 - bit31) & 0x1ffff) &&
kArchVariant == kMips64r6) {
return 2;
// value + 1 is a power of two: counted as two instructions, with no
// architecture-variant check.
// NOTE(review): value + 1 is signed arithmetic and would overflow if value
// were INT64_MAX - confirm this input is handled as intended.
} else if (base::bits::IsPowerOfTwo64(value + 1)) {
return 2;
} else {
// tmp is value with its trailing zero bits shifted out.
int shift_cnt = base::bits::CountTrailingZeros64(value);
// Count reported by InstrCountForLoadReplicatedConst32 for this value.
int rep32_count = InstrCountForLoadReplicatedConst32(value);
int64_t tmp = value >> shift_cnt;
if (is_uint16(tmp)) {
return 2;
} else if (is_int16(tmp)) {
return 2;
} else if (rep32_count < 3) {
return 2;
} else if (is_int32(tmp)) {
return 3;
} else {
// Repeat the shift test, this time ignoring the low 16 bits of value when
// counting trailing zeros.
shift_cnt = 16 + base::bits::CountTrailingZeros64(value >> 16);
tmp = value >> shift_cnt;
if (is_uint16(tmp)) {
return 3;
} else if (is_int16(tmp)) {
return 3;
} else if (rep32_count < 4) {
return 3;
} else if (kArchVariant == kMips64r6) {
// r6 fallback: start from the lower-32-bit count and add one instruction
// for each of the two bit31-adjusted upper 16-bit chunks that is non-zero.
int64_t imm = value;
int count = InstrCountForLiLower32Bit(imm);
imm = (imm >> 32) + bit31;
if (imm & kImm16Mask) {
count++;
}
imm = (imm >> 16) + (imm >> 15 & 0x1);
if (imm & kImm16Mask) {
count++;
}
return count;
} else {
if (is_int48(value)) {
// 48-bit values: lower-32-bit count of (value >> 16), plus one, plus one
// more when the low 16 bits of value are non-zero.
int64_t k = value >> 16;
int count = InstrCountForLiLower32Bit(k) + 1;
if (value & kImm16Mask) {
count++;
}
return count;
} else {
// Full-width values: lower-32-bit count of the upper word, plus three
// when bits 16..31 of value are non-zero (otherwise plus one), plus one
// more when the low 16 bits are non-zero.
int64_t k = value >> 32;
int count = InstrCountForLiLower32Bit(k);
if ((value >> 16) & kImm16Mask) {
count += 3;
if (value & kImm16Mask) {
count++;
}
} else {
count++;
if (value & kImm16Mask) {
count++;
}
}
return count;
}
}
}
}
}
// Every path above returns, so control is not expected to reach this point.
UNREACHABLE();
return INT_MAX;
}
void MacroAssembler::li_optimized(Register rd, Operand j, LiFlags mode) {
DCHECK(!j.is_reg()); DCHECK(!j.is_reg());
DCHECK(!MustUseReg(j.rmode_));
DCHECK(mode == OPTIMIZE_SIZE);
BlockTrampolinePoolScope block_trampoline_pool(this); BlockTrampolinePoolScope block_trampoline_pool(this);
if (!MustUseReg(j.rmode_) && mode == OPTIMIZE_SIZE) {
// Normal load of an immediate value which does not need Relocation Info. // Normal load of an immediate value which does not need Relocation Info.
if (is_int32(j.imm64_)) { if (is_int32(j.imm64_)) {
LiLower32BitHelper(rd, j); LiLower32BitHelper(rd, j);
} else { } else {
int bit31 = j.imm64_ >> 31 & 0x1; int bit31 = j.imm64_ >> 31 & 0x1;
int rep32_count = InstrCountForLoadReplicatedConst32(j.imm64_);
if ((j.imm64_ & kUpper16MaskOf64) == 0 && is_int16(j.imm64_ >> 32) && if ((j.imm64_ & kUpper16MaskOf64) == 0 && is_int16(j.imm64_ >> 32) &&
kArchVariant == kMips64r6) { kArchVariant == kMips64r6) {
// 64-bit value which consists of an unsigned 16-bit value in its // 64-bit value which consists of an unsigned 16-bit value in its
...@@ -1766,8 +1870,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) { ...@@ -1766,8 +1870,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) {
lui(rd, j.imm64_ >> kLuiShift & kImm16Mask); lui(rd, j.imm64_ >> kLuiShift & kImm16Mask);
dahi(rd, ((j.imm64_ >> 32) + bit31) & kImm16Mask); dahi(rd, ((j.imm64_ >> 32) + bit31) & kImm16Mask);
} else if ((j.imm64_ & kImm16Mask) == 0 && } else if ((j.imm64_ & kImm16Mask) == 0 &&
((j.imm64_ >> 31) & 0x1ffff) == ((j.imm64_ >> 31) & 0x1ffff) == ((0x20000 - bit31) & 0x1ffff) &&
((0x20000 - bit31) & 0x1ffff) &&
kArchVariant == kMips64r6) { kArchVariant == kMips64r6) {
// 16 LSBs all set to zero. // 16 LSBs all set to zero.
// 48 MSBs hold a signed value which can't be represented by signed // 48 MSBs hold a signed value which can't be represented by signed
...@@ -1782,8 +1885,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) { ...@@ -1782,8 +1885,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) {
daddiu(rd, zero_reg, j.imm64_ & kImm16Mask); daddiu(rd, zero_reg, j.imm64_ & kImm16Mask);
dahi(rd, ((j.imm64_ >> 32) + bit31) & kImm16Mask); dahi(rd, ((j.imm64_ >> 32) + bit31) & kImm16Mask);
} else if (is_int16(static_cast<int32_t>(j.imm64_)) && } else if (is_int16(static_cast<int32_t>(j.imm64_)) &&
((j.imm64_ >> 31) & 0x1ffff) == ((j.imm64_ >> 31) & 0x1ffff) == ((0x20000 - bit31) & 0x1ffff) &&
((0x20000 - bit31) & 0x1ffff) &&
kArchVariant == kMips64r6) { kArchVariant == kMips64r6) {
// 48 LSBs contain an unsigned 16-bit number. // 48 LSBs contain an unsigned 16-bit number.
// 16 MSBs contain a signed 16-bit number. // 16 MSBs contain a signed 16-bit number.
...@@ -1801,6 +1903,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) { ...@@ -1801,6 +1903,7 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) {
} }
} else { } else {
int shift_cnt = base::bits::CountTrailingZeros64(j.imm64_); int shift_cnt = base::bits::CountTrailingZeros64(j.imm64_);
int rep32_count = InstrCountForLoadReplicatedConst32(j.imm64_);
int64_t tmp = j.imm64_ >> shift_cnt; int64_t tmp = j.imm64_ >> shift_cnt;
if (is_uint16(tmp)) { if (is_uint16(tmp)) {
// Value can be computed by loading a 16-bit unsigned value, and // Value can be computed by loading a 16-bit unsigned value, and
...@@ -1908,6 +2011,28 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) { ...@@ -1908,6 +2011,28 @@ void MacroAssembler::li(Register rd, Operand j, LiFlags mode) {
} }
} }
} }
}
void MacroAssembler::li(Register rd, Operand j, LiFlags mode) {
DCHECK(!j.is_reg());
BlockTrampolinePoolScope block_trampoline_pool(this);
if (!MustUseReg(j.rmode_) && mode == OPTIMIZE_SIZE) {
int li_count = InstrCountForLi64Bit(j.imm64_);
int li_neg_count = InstrCountForLi64Bit(-j.imm64_);
int li_not_count = InstrCountForLi64Bit(~j.imm64_);
// Loading -MIN_INT64 could cause problems, but loading MIN_INT64 takes only
// two instructions so no need to check for this.
if (li_neg_count <= li_not_count && li_neg_count < li_count - 1) {
DCHECK(j.imm64_ != std::numeric_limits<int64_t>::min());
li_optimized(rd, Operand(-j.imm64_), mode);
Dsubu(rd, zero_reg, rd);
} else if (li_neg_count > li_not_count && li_not_count < li_count - 1) {
DCHECK(j.imm64_ != std::numeric_limits<int64_t>::min());
li_optimized(rd, Operand(~j.imm64_), mode);
nor(rd, rd, rd);
} else {
li_optimized(rd, j, mode);
}
} else if (MustUseReg(j.rmode_)) { } else if (MustUseReg(j.rmode_)) {
RecordRelocInfo(j.rmode_, j.imm64_); RecordRelocInfo(j.rmode_, j.imm64_);
lui(rd, (j.imm64_ >> 32) & kImm16Mask); lui(rd, (j.imm64_ >> 32) & kImm16Mask);
......
...@@ -740,6 +740,8 @@ class MacroAssembler: public Assembler { ...@@ -740,6 +740,8 @@ class MacroAssembler: public Assembler {
// Load int32 in the rd register. // Load int32 in the rd register.
void li(Register rd, Operand j, LiFlags mode = OPTIMIZE_SIZE); void li(Register rd, Operand j, LiFlags mode = OPTIMIZE_SIZE);
void li_optimized(Register rd, Operand j, LiFlags mode = OPTIMIZE_SIZE);
static int InstrCountForLi64Bit(int64_t value);
inline void LiLower32BitHelper(Register rd, Operand j); inline void LiLower32BitHelper(Register rd, Operand j);
inline void li(Register rd, int64_t j, LiFlags mode = OPTIMIZE_SIZE) { inline void li(Register rd, int64_t j, LiFlags mode = OPTIMIZE_SIZE) {
li(rd, Operand(j), mode); li(rd, Operand(j), mode);
......
...@@ -5071,6 +5071,9 @@ uint64_t run_li_macro(uint64_t imm, LiFlags mode, int32_t num_instr = 0) { ...@@ -5071,6 +5071,9 @@ uint64_t run_li_macro(uint64_t imm, LiFlags mode, int32_t num_instr = 0) {
assm.GetCode(isolate, &desc); assm.GetCode(isolate, &desc);
Handle<Code> code = isolate->factory()->NewCode( Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>()); desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef OBJECT_PRINT
code->Print(std::cout);
#endif
F2 f = FUNCTION_CAST<F2>(code->entry()); F2 f = FUNCTION_CAST<F2>(code->entry());
uint64_t res = reinterpret_cast<uint64_t>( uint64_t res = reinterpret_cast<uint64_t>(
...@@ -5123,8 +5126,10 @@ TEST(li_macro) { ...@@ -5123,8 +5126,10 @@ TEST(li_macro) {
{0x00000001fffffffe, 4, 2}, // max_uint32 << 1 {0x00000001fffffffe, 4, 2}, // max_uint32 << 1
// r2 - lui + ori + dsll + ori // r2 - lui + ori + dsll + ori
// r6 - daddiu + dahi // r6 - daddiu + dahi
{0x0000fffffffffffe, 5, 2}, // max_uint48 - 1 {0x0000fffffffffffe, 4, 2}, // max_uint48 - 1
// r2 - ori + dsll + ori + dsll + ori // r2 - daddiu + dsll32 + ori + dsubu
// Loading imm directly would require ori + dsll + ori + dsll + ori.
// Optimized by loading -imm and using dsubu to get imm.
// r6 - daddiu + dati // r6 - daddiu + dati
{0xffffffff00000000, 2, 2}, // max_uint32 << 32 {0xffffffff00000000, 2, 2}, // max_uint32 << 32
// r2 - daddiu + dsll32 // r2 - daddiu + dsll32
...@@ -5151,6 +5156,9 @@ TEST(li_macro) { ...@@ -5151,6 +5156,9 @@ TEST(li_macro) {
{0xffff8000ffff0000, 3, 2}, {0xffff8000ffff0000, 3, 2},
// r2 - lui + ori + dsll // r2 - lui + ori + dsll
// r6 - lui + dahi // r6 - lui + dahi
{0x0000ffffffff0000, 4, 2},
// r2 - ori + dsll + ori + dsll
// r6 - lui + dati
{0x1234ffff80000000, 3, 2}, {0x1234ffff80000000, 3, 2},
// r2 - lui + ori + dsll // r2 - lui + ori + dsll
// r6 - lui + dati // r6 - lui + dati
...@@ -5160,8 +5168,10 @@ TEST(li_macro) { ...@@ -5160,8 +5168,10 @@ TEST(li_macro) {
{0xffff8000ffff8000, 2, 2}, {0xffff8000ffff8000, 2, 2},
// r2 - daddiu + dinsu // r2 - daddiu + dinsu
// r6 - daddiu + dahi // r6 - daddiu + dahi
{0xffff0000ffff8000, 5, 3}, {0xffff0000ffff8000, 4, 3},
// r2 - lui + dsll + ori + dsll + ori // r2 - ori + dsll32 + ori + dsubu
// Loading imm directly would require lui + dsll + ori + dsll + ori.
// Optimized by loading -imm and using dsubu to get imm.
// r6 - daddiu + dahi + dati // r6 - daddiu + dahi + dati
{0x8000000080000000, 2, 2}, {0x8000000080000000, 2, 2},
// lui + dinsu // lui + dinsu
...@@ -5180,11 +5190,15 @@ TEST(li_macro) { ...@@ -5180,11 +5190,15 @@ TEST(li_macro) {
{0x1ffffabcd, 4, 2}, {0x1ffffabcd, 4, 2},
// r2 - lui + ori + dsll + ori // r2 - lui + ori + dsll + ori
// r6 - daddiu + dahi // r6 - daddiu + dahi
{0xffffffffabcd, 5, 2}, {0xffffffffabcd, 4, 2},
// r2 - ori + dsll + ori + dsll + ori // r2 - daddiu + dsll32 + ori + dsubu
// Loading imm directly would require ori + dsll + ori + dsll + ori.
// Optimized by loading -imm and using dsubu to get imm.
// r6 - daddiu + dati // r6 - daddiu + dati
{0x1ffffffffabcd, 6, 2}, {0x1ffffffffabcd, 4, 2},
// r2 - lui + ori + dsll + ori + dsll + ori // r2 - daddiu + dsll32 + ori + dsubu
// Loading imm directly would require lui + ori + dsll + ori + dsll + ori.
// Optimized by loading -imm and using dsubu to get imm.
// r6 - daddiu + dati // r6 - daddiu + dati
{0xffff7fff80010000, 5, 2}, {0xffff7fff80010000, 5, 2},
// r2 - lui + ori + dsll + ori + dsll // r2 - lui + ori + dsll + ori + dsll
...@@ -5209,6 +5223,12 @@ TEST(li_macro) { ...@@ -5209,6 +5223,12 @@ TEST(li_macro) {
// r2 - lui + ori + dsll + ori + dsll + ori instruction sequence, // r2 - lui + ori + dsll + ori + dsll + ori instruction sequence,
// r6 - lui + ori + dahi + dati. // r6 - lui + ori + dahi + dati.
// Load using full instruction sequence. // Load using full instruction sequence.
{0xffff0000ffffffff, 3, 3},
// r2 - ori + dsll32 + nor
// Loading imm directly would require lui + dsll + ori + dsll + ori.
// Optimized by loading ~imm and using nor to get imm. Loading -imm would
// require one instruction more.
// r6 - daddiu + dahi + dati
}; };
size_t nr_test_cases = sizeof(tc) / sizeof(TestCase_li); size_t nr_test_cases = sizeof(tc) / sizeof(TestCase_li);
...@@ -6199,6 +6219,9 @@ uint64_t run_Subu(uint64_t imm, int32_t num_instr) { ...@@ -6199,6 +6219,9 @@ uint64_t run_Subu(uint64_t imm, int32_t num_instr) {
assm.GetCode(isolate, &desc); assm.GetCode(isolate, &desc);
Handle<Code> code = isolate->factory()->NewCode( Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>()); desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef OBJECT_PRINT
code->Print(std::cout);
#endif
F2 f = FUNCTION_CAST<F2>(code->entry()); F2 f = FUNCTION_CAST<F2>(code->entry());
uint64_t res = reinterpret_cast<uint64_t>( uint64_t res = reinterpret_cast<uint64_t>(
...@@ -6278,6 +6301,9 @@ uint64_t run_Dsubu(uint64_t imm, int32_t num_instr) { ...@@ -6278,6 +6301,9 @@ uint64_t run_Dsubu(uint64_t imm, int32_t num_instr) {
assm.GetCode(isolate, &desc); assm.GetCode(isolate, &desc);
Handle<Code> code = isolate->factory()->NewCode( Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>()); desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef OBJECT_PRINT
code->Print(std::cout);
#endif
F2 f = FUNCTION_CAST<F2>(code->entry()); F2 f = FUNCTION_CAST<F2>(code->entry());
uint64_t res = reinterpret_cast<uint64_t>( uint64_t res = reinterpret_cast<uint64_t>(
...@@ -6341,6 +6367,12 @@ TEST(Dsubu) { ...@@ -6341,6 +6367,12 @@ TEST(Dsubu) {
// r6 - ori + dati + dsubu. // r6 - ori + dati + dsubu.
// The result of 0 - min_int64 equals max_int64 + 1, which wraps around to // The result of 0 - min_int64 equals max_int64 + 1, which wraps around to
// min_int64 again. // min_int64 again.
{0xffff0000ffffffff, 0x0000ffff00000001, 4},
// The test case above generates:
// r2 - ori + dsrl32 + ori + daddu instruction sequence,
// r6 - daddiu + dahi + dati + dsubu.
// For r2 loading imm would take more instructions than loading -imm so we
// can load -imm and add with daddu.
}; };
size_t nr_test_cases = sizeof(tc) / sizeof(TestCaseDsubu); size_t nr_test_cases = sizeof(tc) / sizeof(TestCaseDsubu);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment