Commit 785fa6b4 authored by Ng Zhi An, committed by Commit Bot

[liftoff] Change FillStackSlotsWithZero to use bytes

Bug: v8:9909
Change-Id: I997ae6f19c580f08eb9ff8ee039e0dd647091616
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1947350
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65320}
parent 0db45cb1
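In short: FillStackSlotsWithZero used to take a slot index and a slot count; with this change it takes the byte offset at which the region to clear starts and its size in bytes, which lets the ports handle 4-byte tails uniformly. A minimal host-side sketch of the new contract (illustrative only, not V8 code; the frame layout is simplified to a flat byte array):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: the spill area is addressed in bytes. Zero `size` bytes of the
// region that starts `start` bytes into the area -- both arguments are
// byte-based, matching the new signature.
void FillStackSlotsWithZeroModel(std::vector<uint8_t>& spill_area,
                                 uint32_t start, uint32_t size) {
  std::memset(spill_area.data() + start, 0, size);
}
```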
@@ -1940,6 +1940,12 @@ void Assembler::emit_repmovs(int size) {
   emit(0xA5);
 }
 
+void Assembler::repstosl() {
+  EnsureSpace ensure_space(this);
+  emit(0xF3);
+  emit(0xAB);
+}
+
 void Assembler::repstosq() {
   EnsureSpace ensure_space(this);
   emit(0xF3);
...
@@ -544,6 +544,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void repmovsl() { emit_repmovs(kInt32Size); }
   void repmovsq() { emit_repmovs(kInt64Size); }
 
+  // Repeated store of doublewords (fill (E)CX doublewords at ES:[(E)DI] with EAX).
+  void repstosl();
   // Repeated store of quadwords (fill RCX quadwords at [RDI] with RAX).
   void repstosq();
...
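The new `repstosl` emits `F3 AB` (`rep stos` with a doubleword operand): it stores EAX at [RDI], advances RDI by 4, and decrements RCX until it reaches zero. A plain-C++ model of that behaviour (a sketch of the instruction semantics, not of the emitted code):

```cpp
#include <cstdint>

// Models `xor eax,eax; mov ecx,count; rep stosl`: fill `count` doublewords
// at `dst` with `value`, advancing the destination by 4 bytes each step.
void RepStoslModel(uint32_t* dst, uint32_t count, uint32_t value) {
  while (count != 0) {
    *dst++ = value;  // stosl: store EAX at [RDI], RDI += 4
    --count;         // rep prefix: RCX -= 1, repeat while RCX != 0
  }
}
```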
@@ -640,36 +640,29 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
   ldr(reg, liftoff::GetHalfStackSlot(offset, half));
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  DCHECK_EQ(0, size % 4);
+  RecordUsedSpillOffset(start + size);
 
   // We need a zero reg. Always use r0 for that, and push it before to restore
   // its value afterwards.
   push(r0);
   mov(r0, Operand(0));
 
-  if (count <= 5) {
-    // Special straight-line code for up to five slots. Generates two
-    // instructions per slot.
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
-                                        kLowWord));
-      str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
-                                        kHighWord));
+  if (size <= 36) {
+    // Special straight-line code for up to 9 words. Generates one
+    // instruction per word.
+    for (uint32_t offset = 4; offset <= size; offset += 4) {
+      str(r0, liftoff::GetHalfStackSlot(start + offset, kLowWord));
     }
   } else {
     // General case for bigger counts (9 instructions).
     // Use r1 for start address (inclusive), r2 for end address (exclusive).
     push(r1);
     push(r2);
-    sub(r1, fp,
-        Operand(liftoff::GetStackSlotOffset(
-            GetStackOffsetFromIndex(last_stack_slot))));
-    sub(r2, fp,
-        Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
-                kStackSlotSize));
+    sub(r1, fp, Operand(liftoff::GetStackSlotOffset(start + size)));
+    sub(r2, fp, Operand(liftoff::GetStackSlotOffset(start)));
 
     Label loop;
     bind(&loop);
...
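With byte offsets, the general-case bounds fall out directly: assuming `GetStackSlotOffset(o)` is essentially the distance `o` below the frame pointer, the loop clears the half-open range `[fp - (start + size), fp - start)`. A small sketch of that bound computation (hypothetical helper, not the V8 function):

```cpp
#include <cstdint>

struct FillBounds {
  uintptr_t begin;  // inclusive, lowest address (what r1 receives)
  uintptr_t end;    // exclusive, one past the last byte (what r2 receives)
};

// Assumes stack-slot offsets count bytes downwards from the frame pointer.
FillBounds ComputeFillBounds(uintptr_t fp, uint32_t start, uint32_t size) {
  return {fp - (start + size), fp - start};
}
```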
@@ -398,39 +398,52 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
   UNREACHABLE();
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  DCHECK_EQ(0, size % 4);
+  RecordUsedSpillOffset(start + size);
 
-  int max_stp_offset =
-      -liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index + count - 1));
-  if (count <= 12 && IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
+  int max_stp_offset = -liftoff::GetStackSlotOffset(start + size);
+  if (size <= 12 * kStackSlotSize &&
+      IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
     // Special straight-line code for up to 12 slots. Generates one
-    // instruction per two slots (<= 6 instructions total).
-    for (; count > 1; count -= 2) {
-      STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
-      stp(xzr, xzr,
-          liftoff::GetStackSlot(GetStackOffsetFromIndex(index + count - 1)));
-    }
-    DCHECK(count == 0 || count == 1);
-    if (count) {
-      str(xzr, liftoff::GetStackSlot(GetStackOffsetFromIndex(index)));
+    // instruction per two slots (<= 7 instructions total).
+    STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
+    uint32_t remainder = size;
+    for (; remainder >= 2 * kStackSlotSize; remainder -= 2 * kStackSlotSize) {
+      stp(xzr, xzr, liftoff::GetStackSlot(start + remainder));
+    }
+
+    DCHECK_GE(12, remainder);
+    switch (remainder) {
+      case 12:
+        str(xzr, liftoff::GetStackSlot(start + remainder));
+        str(wzr, liftoff::GetStackSlot(start + remainder - 8));
+        break;
+      case 8:
+        str(xzr, liftoff::GetStackSlot(start + remainder));
+        break;
+      case 4:
+        str(wzr, liftoff::GetStackSlot(start + remainder));
+        break;
+      case 0:
+        break;
+      default:
+        UNREACHABLE();
     }
   } else {
     // General case for bigger counts (5-8 instructions).
     UseScratchRegisterScope temps(this);
     Register address_reg = temps.AcquireX();
     // This {Sub} might use another temp register if the offset is too large.
-    Sub(address_reg, fp,
-        liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(last_stack_slot)));
+    Sub(address_reg, fp, liftoff::GetStackSlotOffset(start + size));
     Register count_reg = temps.AcquireX();
-    Mov(count_reg, count);
+    Mov(count_reg, size / 4);
 
     Label loop;
     bind(&loop);
     sub(count_reg, count_reg, 1);
-    str(xzr, MemOperand(address_reg, kSystemPointerSize, PostIndex));
+    str(wzr, MemOperand(address_reg, kSystemPointerSize / 2, PostIndex));
     cbnz(count_reg, &loop);
   }
 }
...
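The arm64 straight-line path now works on the byte count directly: `stp xzr, xzr` clears 16 bytes at a time while at least two slots remain, and the switch mops up a 12-, 8- or 4-byte tail. A host-side model of that strategy (a sketch assuming an 8-byte `kStackSlotSize`, not the generated code):

```cpp
#include <cstdint>
#include <cstring>

// Clear `size` bytes in 16-byte pairs, then an 8-byte and/or 4-byte tail,
// mirroring the stp / str xzr / str wzr sequence above. `base` is the
// lowest address of the region.
void ZeroWithPairsModel(uint8_t* base, uint32_t size) {
  uint32_t remainder = size;
  for (; remainder >= 16; remainder -= 16) {
    std::memset(base + (size - remainder), 0, 16);  // models stp xzr, xzr
  }
  if (remainder >= 8) {
    std::memset(base + (size - remainder), 0, 8);   // models str xzr
    remainder -= 8;
  }
  if (remainder == 4) {
    std::memset(base + (size - remainder), 0, 4);   // models str wzr
  }
}
```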
@@ -503,21 +503,16 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
   mov(reg, liftoff::GetHalfStackSlot(offset, half));
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  DCHECK_EQ(0, size % 4);
+  RecordUsedSpillOffset(start + size);
 
-  if (count <= 2) {
-    // Special straight-line code for up to two slots (6-9 bytes per word:
-    // C7 <1-4 bytes operand> <4 bytes imm>, makes 12-18 bytes per slot).
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
-                                    kLowWord),
-          Immediate(0));
-      mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
-                                    kHighWord),
-          Immediate(0));
+  if (size <= 12) {
+    // Special straight-line code for up to three words (6-9 bytes per word:
+    // C7 <1-4 bytes operand> <4 bytes imm>, makes 18-27 bytes total).
+    for (uint32_t offset = 4; offset <= size; offset += 4) {
+      mov(liftoff::GetHalfStackSlot(start + offset, kLowWord), Immediate(0));
     }
   } else {
     // General case for bigger counts.
@@ -527,10 +522,10 @@ void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
     push(eax);
     push(ecx);
     push(edi);
-    lea(edi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
+    lea(edi, liftoff::GetStackSlot(start + size));
     xor_(eax, eax);
-    // Number of words is number of slots times two.
-    mov(ecx, Immediate(count * 2));
+    // Size is in bytes, convert to doublewords (4-bytes).
+    mov(ecx, Immediate(size / 4));
     rep_stos();
     pop(edi);
     pop(ecx);
...
@@ -286,12 +286,14 @@ class LiftoffAssembler : public TurboAssembler {
   LiftoffRegister PopToRegister(LiftoffRegList pinned = {});
 
   uint32_t NextSpillOffset(ValueType type) {
+    return TopSpillOffset() + SlotSizeForType(type);
+  }
+
+  uint32_t TopSpillOffset() {
     if (cache_state_.stack_state.empty()) {
-      return SlotSizeForType(type);
+      return 0;
     }
-    VarState last = cache_state_.stack_state.back();
-    uint32_t offset = last.offset() + SlotSizeForType(type);
-    return offset;
+    return cache_state_.stack_state.back().offset();
   }
 
   void PushRegister(ValueType type, LiftoffRegister reg) {
...
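TopSpillOffset is split out so the compiler can query the current extent of the spill area directly; NextSpillOffset then becomes "top plus the size of the value being pushed". A simplified model of the two helpers (a sketch with a plain vector standing in for the cache state):

```cpp
#include <cstdint>
#include <vector>

struct SpillStateModel {
  std::vector<uint32_t> value_offsets;  // byte offset of each stack value

  uint32_t TopSpillOffset() const {
    return value_offsets.empty() ? 0 : value_offsets.back();
  }
  uint32_t NextSpillOffset(uint32_t slot_size) const {
    // The next value is placed just past the current top of the spill area.
    return TopSpillOffset() + slot_size;
  }
};
```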
@@ -478,16 +478,18 @@ class LiftoffCompiler {
     for (uint32_t param_idx = 0; param_idx < num_params; ++param_idx) {
       input_idx += ProcessParameter(__ local_type(param_idx), input_idx);
     }
+    uint32_t params_size = __ TopSpillOffset();
     DCHECK_EQ(input_idx, descriptor_->InputCount());
 
     // Initialize locals beyond parameters.
     if (SpillLocalsInitially(decoder, num_params)) {
-      __ FillStackSlotsWithZero(num_params, __ num_locals() - num_params);
       for (uint32_t param_idx = num_params; param_idx < __ num_locals();
            ++param_idx) {
         ValueType type = decoder->GetLocalType(param_idx);
         __ PushStack(type);
       }
+      uint32_t spill_size = __ TopSpillOffset() - params_size;
+      __ FillStackSlotsWithZero(params_size, spill_size);
     } else {
       for (uint32_t param_idx = num_params; param_idx < __ num_locals();
            ++param_idx) {
...
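Worked example with hypothetical sizes: if the parameters end at byte offset 8 (params_size = 8) and pushing the remaining locals raises TopSpillOffset() to 32, then spill_size = 32 - 8 = 24 and the emitted call is FillStackSlotsWithZero(8, 24), zeroing exactly the bytes occupied by the freshly pushed locals.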
@@ -616,28 +616,23 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
   lw(reg, liftoff::GetHalfStackSlot(offset, half));
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  DCHECK_EQ(0, size % 4);
+  RecordUsedSpillOffset(start + size);
 
-  if (count <= 12) {
-    // Special straight-line code for up to 12 slots. Generates one
-    // instruction per slot (<=12 instructions total).
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      Sw(zero_reg,
-         liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
+  if (size <= 48) {
+    // Special straight-line code for up to 12 words. Generates one
+    // instruction per word (<=12 instructions total).
+    for (uint32_t offset = 4; offset <= size; offset += 4) {
+      Sw(zero_reg, liftoff::GetStackSlot(start + offset));
     }
   } else {
     // General case for bigger counts (12 instructions).
     // Use a0 for start address (inclusive), a1 for end address (exclusive).
     Push(a1, a0);
-    Addu(a0, fp,
-         Operand(-liftoff::GetStackSlotOffset(
-             GetStackOffsetFromIndex(last_stack_slot))));
-    Addu(a1, fp,
-         Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
-                 kStackSlotSize));
+    Addu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + size)));
+    Addu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
 
     Label loop;
     bind(&loop);
...
@@ -524,28 +524,27 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
   UNREACHABLE();
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  RecordUsedSpillOffset(start + size);
 
-  if (count <= 12) {
+  if (size <= 12 * kStackSlotSize) {
     // Special straight-line code for up to 12 slots. Generates one
     // instruction per slot (<= 12 instructions total).
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      Sd(zero_reg,
-         liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
+    uint32_t remainder = size;
+    for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
+      Sd(zero_reg, liftoff::GetStackSlot(start + remainder));
+    }
+    DCHECK(remainder == 4 || remainder == 0);
+    if (remainder) {
+      Sw(zero_reg, liftoff::GetStackSlot(start + remainder));
     }
   } else {
     // General case for bigger counts (12 instructions).
     // Use a0 for start address (inclusive), a1 for end address (exclusive).
     Push(a1, a0);
-    Daddu(a0, fp,
-          Operand(-liftoff::GetStackSlotOffset(
-              GetStackOffsetFromIndex(last_stack_slot))));
-    Daddu(a1, fp,
-          Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
-                  kStackSlotSize));
+    Daddu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + size)));
+    Daddu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
 
     Label loop;
     bind(&loop);
...
@@ -145,36 +145,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
   bailout(kUnsupportedArchitecture, "FillI64Half");
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(last_stack_slot);
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  RecordUsedSpillOffset(start + size);
 
   // We need a zero reg. Always use r0 for that, and push it before to restore
   // its value afterwards.
   push(r0);
   mov(r0, Operand(0));
 
-  if (count <= 5) {
+  if (size <= 5 * kStackSlotSize) {
     // Special straight-line code for up to five slots. Generates two
     // instructions per slot.
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      StoreP(r0, liftoff::GetHalfStackSlot(
-                     GetStackOffsetFromIndex(index + offset), kLowWord));
-      StoreP(r0, liftoff::GetHalfStackSlot(
-                     GetStackOffsetFromIndex(index + offset), kHighWord));
+    uint32_t remainder = size;
+    for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
+      StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
+      StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kHighWord));
+    }
+    DCHECK(remainder == 4 || remainder == 0);
+    if (remainder) {
+      StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
     }
   } else {
     // General case for bigger counts (9 instructions).
     // Use r3 for start address (inclusive), r4 for end address (exclusive).
     push(r3);
     push(r4);
-    SubP(r3, fp,
-         Operand(liftoff::GetStackSlotOffset(
-             GetStackOffsetFromIndex(last_stack_slot))));
-    SubP(r4, fp,
-         Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
-                 kStackSlotSize));
+    SubP(r3, fp, Operand(liftoff::GetStackSlotOffset(start + size)));
+    SubP(r4, fp, Operand(liftoff::GetStackSlotOffset(start)));
 
     Label loop;
     bind(&loop);
...
@@ -445,31 +445,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
   UNREACHABLE();
 }
 
-void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
-  DCHECK_LT(0, count);
-  uint32_t last_stack_slot = index + count - 1;
-  RecordUsedSpillOffset(
-      LiftoffAssembler::GetStackOffsetFromIndex(last_stack_slot));
+void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
+  DCHECK_LT(0, size);
+  RecordUsedSpillOffset(start + size);
 
-  if (count <= 3) {
+  if (size <= 3 * kStackSlotSize) {
     // Special straight-line code for up to three slots
-    // (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>).
-    for (uint32_t offset = 0; offset < count; ++offset) {
-      movq(liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)),
-           Immediate(0));
+    // (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>),
+    // and a movl (6-9 bytes) when size % 8 != 0.
+    uint32_t remainder = size;
+    for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
+      movq(liftoff::GetStackSlot(start + remainder), Immediate(0));
+    }
+    DCHECK(remainder == 4 || remainder == 0);
+    if (remainder) {
+      movl(liftoff::GetStackSlot(start + remainder), Immediate(0));
     }
   } else {
     // General case for bigger counts.
-    // This sequence takes 20-23 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
-    // for mov, 3 for repstosq, 3 for pops).
-    // From intel manual: repstosq fills RCX quadwords at [RDI] with RAX.
+    // This sequence takes 19-22 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
+    // for mov, 2 for repstosl, 3 for pops).
     pushq(rax);
     pushq(rcx);
     pushq(rdi);
-    leaq(rdi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
+    leaq(rdi, liftoff::GetStackSlot(start + size));
     xorl(rax, rax);
-    movl(rcx, Immediate(count));
-    repstosq();
+    // Convert size (bytes) to doublewords (4-bytes).
+    movl(rcx, Immediate(size / 4));
+    repstosl();
     popq(rdi);
     popq(rcx);
     popq(rax);
...
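For reference: switching from repstosq to repstosl makes RCX count 4-byte doublewords instead of 8-byte quadwords, so loading rcx with size / 4 clears exactly size bytes, and the sequence also copes with spill sizes that are not a multiple of 8.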
@@ -182,6 +182,7 @@ TEST(DisasmX64) {
   __ decq(rdx);
   __ cdq();
 
+  __ repstosl();
   __ repstosq();
 
   __ nop();
...