Commit 785fa6b4, authored by Ng Zhi An and committed by Commit Bot

[liftoff] Change FillStackSlotsWithZero to use bytes

Bug: v8:9909
Change-Id: I997ae6f19c580f08eb9ff8ee039e0dd647091616
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1947350
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65320}
parent 0db45cb1
......@@ -1940,6 +1940,12 @@ void Assembler::emit_repmovs(int size) {
emit(0xA5);
}
// Emits the doubleword form of `rep stos` as exactly two bytes, F3 AB.
void Assembler::repstosl() {
EnsureSpace ensure_space(this);
emit(0xF3);  // REP prefix.
emit(0xAB);  // STOS opcode.
}
void Assembler::repstosq() {
EnsureSpace ensure_space(this);
emit(0xF3);
......
......@@ -544,6 +544,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void repmovsl() { emit_repmovs(kInt32Size); }
void repmovsq() { emit_repmovs(kInt64Size); }
// Repeated store of doublewords (fill (E)CX doublewords at ES:[(E)DI] with
// EAX). The count register holds a number of 4-byte elements, not bytes.
void repstosl();
// Repeated store of quadwords (fill RCX quadwords at [RDI] with RAX).
void repstosq();
......
......@@ -640,36 +640,29 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
ldr(reg, liftoff::GetHalfStackSlot(offset, half));
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
DCHECK_EQ(0, size % 4);
RecordUsedSpillOffset(start + size);
// We need a zero reg. Always use r0 for that, and push it before to restore
// its value afterwards.
push(r0);
mov(r0, Operand(0));
if (count <= 5) {
// Special straight-line code for up to five slots. Generates two
// instructions per slot.
for (uint32_t offset = 0; offset < count; ++offset) {
str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
kLowWord));
str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
kHighWord));
if (size <= 36) {
// Special straight-line code for up to 9 words. Generates one
// instruction per word.
for (uint32_t offset = 4; offset <= size; offset += 4) {
str(r0, liftoff::GetHalfStackSlot(start + offset, kLowWord));
}
} else {
// General case for bigger counts (9 instructions).
// Use r1 for start address (inclusive), r2 for end address (exclusive).
push(r1);
push(r2);
sub(r1, fp,
Operand(liftoff::GetStackSlotOffset(
GetStackOffsetFromIndex(last_stack_slot))));
sub(r2, fp,
Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
kStackSlotSize));
sub(r1, fp, Operand(liftoff::GetStackSlotOffset(start + size)));
sub(r2, fp, Operand(liftoff::GetStackSlotOffset(start)));
Label loop;
bind(&loop);
......
......@@ -398,39 +398,52 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
UNREACHABLE();
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
int max_stp_offset =
-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index + count - 1));
if (count <= 12 && IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
DCHECK_EQ(0, size % 4);
RecordUsedSpillOffset(start + size);
int max_stp_offset = -liftoff::GetStackSlotOffset(start + size);
if (size <= 12 * kStackSlotSize &&
IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
// Special straight-line code for up to 12 slots. Generates one
// instruction per two slots (<= 6 instructions total).
for (; count > 1; count -= 2) {
// instruction per two slots (<= 7 instructions total).
STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
stp(xzr, xzr,
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + count - 1)));
uint32_t remainder = size;
for (; remainder >= 2 * kStackSlotSize; remainder -= 2 * kStackSlotSize) {
stp(xzr, xzr, liftoff::GetStackSlot(start + remainder));
}
DCHECK(count == 0 || count == 1);
if (count) {
str(xzr, liftoff::GetStackSlot(GetStackOffsetFromIndex(index)));
DCHECK_GE(12, remainder);
// Zero the tail (0, 4, 8 or 12 bytes) that the 16-byte stp loop left over.
// A 4-byte tail must be written with a 32-bit store (str with wzr); strh
// would only write 16 bits and leave half of the word uninitialized.
switch (remainder) {
  case 12:
    // 8 bytes at the lowest address, then the remaining 4 bytes above them.
    str(xzr, liftoff::GetStackSlot(start + remainder));
    str(wzr, liftoff::GetStackSlot(start + remainder - 8));
    break;
  case 8:
    str(xzr, liftoff::GetStackSlot(start + remainder));
    break;
  case 4:
    str(wzr, liftoff::GetStackSlot(start + remainder));
    break;
  case 0:
    break;
  default:
    // size is a multiple of 4, so no other remainder is possible.
    UNREACHABLE();
}
} else {
// General case for bigger counts (5-8 instructions).
UseScratchRegisterScope temps(this);
Register address_reg = temps.AcquireX();
// This {Sub} might use another temp register if the offset is too large.
Sub(address_reg, fp,
liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(last_stack_slot)));
Sub(address_reg, fp, liftoff::GetStackSlotOffset(start + size));
Register count_reg = temps.AcquireX();
Mov(count_reg, count);
Mov(count_reg, size / 4);
Label loop;
bind(&loop);
sub(count_reg, count_reg, 1);
str(xzr, MemOperand(address_reg, kSystemPointerSize, PostIndex));
// count_reg holds size / 4, so each iteration must zero exactly one 32-bit
// word: store wzr and post-increment the address by 4 bytes.
str(wzr, MemOperand(address_reg, kSystemPointerSize / 2, PostIndex));
cbnz(count_reg, &loop);
}
}
......
......@@ -503,21 +503,16 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
mov(reg, liftoff::GetHalfStackSlot(offset, half));
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
if (count <= 2) {
// Special straight-line code for up to two slots (6-9 bytes per word:
// C7 <1-4 bytes operand> <4 bytes imm>, makes 12-18 bytes per slot).
for (uint32_t offset = 0; offset < count; ++offset) {
mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
kLowWord),
Immediate(0));
mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
kHighWord),
Immediate(0));
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
DCHECK_EQ(0, size % 4);
RecordUsedSpillOffset(start + size);
if (size <= 12) {
// Special straight-line code for up to three words (6-9 bytes per word:
// C7 <1-4 bytes operand> <4 bytes imm>, makes 18-27 bytes total).
for (uint32_t offset = 4; offset <= size; offset += 4) {
mov(liftoff::GetHalfStackSlot(start + offset, kLowWord), Immediate(0));
}
} else {
// General case for bigger counts.
......@@ -527,10 +522,10 @@ void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
push(eax);
push(ecx);
push(edi);
lea(edi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
lea(edi, liftoff::GetStackSlot(start + size));
xor_(eax, eax);
// Number of words is number of slots times two.
mov(ecx, Immediate(count * 2));
// Size is in bytes, convert to doublewords (4-bytes).
mov(ecx, Immediate(size / 4));
rep_stos();
pop(edi);
pop(ecx);
......
......@@ -286,12 +286,14 @@ class LiftoffAssembler : public TurboAssembler {
LiftoffRegister PopToRegister(LiftoffRegList pinned = {});
// Returns the current top spill offset advanced by the slot size of {type},
// i.e. TopSpillOffset() + SlotSizeForType(type).
uint32_t NextSpillOffset(ValueType type) {
return TopSpillOffset() + SlotSizeForType(type);
}
uint32_t TopSpillOffset() {
if (cache_state_.stack_state.empty()) {
return SlotSizeForType(type);
return 0;
}
VarState last = cache_state_.stack_state.back();
uint32_t offset = last.offset() + SlotSizeForType(type);
return offset;
return cache_state_.stack_state.back().offset();
}
void PushRegister(ValueType type, LiftoffRegister reg) {
......
......@@ -478,16 +478,18 @@ class LiftoffCompiler {
for (uint32_t param_idx = 0; param_idx < num_params; ++param_idx) {
input_idx += ProcessParameter(__ local_type(param_idx), input_idx);
}
uint32_t params_size = __ TopSpillOffset();
DCHECK_EQ(input_idx, descriptor_->InputCount());
// Initialize locals beyond parameters.
if (SpillLocalsInitially(decoder, num_params)) {
__ FillStackSlotsWithZero(num_params, __ num_locals() - num_params);
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
ValueType type = decoder->GetLocalType(param_idx);
__ PushStack(type);
}
// FillStackSlotsWithZero takes (start offset, size in bytes). Pass only the
// bytes occupied by the locals pushed above, not the absolute top offset,
// otherwise the fill overruns the locals by params_size bytes.
uint32_t spill_size = __ TopSpillOffset() - params_size;
__ FillStackSlotsWithZero(params_size, spill_size);
} else {
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
......
......@@ -616,28 +616,23 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
lw(reg, liftoff::GetHalfStackSlot(offset, half));
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
if (count <= 12) {
// Special straight-line code for up to 12 slots. Generates one
// instruction per slot (<=12 instructions total).
for (uint32_t offset = 0; offset < count; ++offset) {
Sw(zero_reg,
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
DCHECK_EQ(0, size % 4);
RecordUsedSpillOffset(start + size);
if (size <= 48) {
// Special straight-line code for up to 12 words. Generates one
// instruction per word (<=12 instructions total).
for (uint32_t offset = 4; offset <= size; offset += 4) {
Sw(zero_reg, liftoff::GetStackSlot(start + offset));
}
} else {
// General case for bigger counts (12 instructions).
// Use a0 for start address (inclusive), a1 for end address (exclusive).
Push(a1, a0);
Addu(a0, fp,
Operand(-liftoff::GetStackSlotOffset(
GetStackOffsetFromIndex(last_stack_slot))));
Addu(a1, fp,
Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
kStackSlotSize));
Addu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + size)));
Addu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
Label loop;
bind(&loop);
......
......@@ -524,28 +524,27 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
UNREACHABLE();
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
RecordUsedSpillOffset(start + size);
if (count <= 12) {
if (size <= 12 * kStackSlotSize) {
// Special straight-line code for up to 12 slots. Generates one
// instruction per slot (<= 12 instructions total).
for (uint32_t offset = 0; offset < count; ++offset) {
Sd(zero_reg,
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
uint32_t remainder = size;
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
Sd(zero_reg, liftoff::GetStackSlot(start + remainder));
}
DCHECK(remainder == 4 || remainder == 0);
if (remainder) {
Sw(zero_reg, liftoff::GetStackSlot(start + remainder));
}
} else {
// General case for bigger counts (12 instructions).
// Use a0 for start address (inclusive), a1 for end address (exclusive).
Push(a1, a0);
Daddu(a0, fp,
Operand(-liftoff::GetStackSlotOffset(
GetStackOffsetFromIndex(last_stack_slot))));
Daddu(a1, fp,
Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
kStackSlotSize));
// The fill region spans spill offsets start .. start + size. The slot at
// start + size has the lowest address, so it is the loop's start address
// (a0); the address for {start} is the exclusive end (a1).
Daddu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + size)));
Daddu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
Label loop;
bind(&loop);
......
......@@ -145,36 +145,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
bailout(kUnsupportedArchitecture, "FillI64Half");
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(last_stack_slot);
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
RecordUsedSpillOffset(start + size);
// We need a zero reg. Always use r0 for that, and push it before to restore
// its value afterwards.
push(r0);
mov(r0, Operand(0));
if (count <= 5) {
if (size <= 5 * kStackSlotSize) {
// Special straight-line code for up to five slots. Generates two
// instructions per slot.
for (uint32_t offset = 0; offset < count; ++offset) {
StoreP(r0, liftoff::GetHalfStackSlot(
GetStackOffsetFromIndex(index + offset), kLowWord));
StoreP(r0, liftoff::GetHalfStackSlot(
GetStackOffsetFromIndex(index + offset), kHighWord));
uint32_t remainder = size;
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kHighWord));
}
DCHECK(remainder == 4 || remainder == 0);
if (remainder) {
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
}
} else {
// General case for bigger counts (9 instructions).
// Use r3 for start address (inclusive), r4 for end address (exclusive).
push(r3);
push(r4);
SubP(r3, fp,
Operand(liftoff::GetStackSlotOffset(
GetStackOffsetFromIndex(last_stack_slot))));
SubP(r4, fp,
Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
kStackSlotSize));
// The fill region spans spill offsets start .. start + size. The slot at
// start + size has the lowest address, so it is the loop's start address
// (r3); the address for {start} is the exclusive end (r4).
SubP(r3, fp, Operand(liftoff::GetStackSlotOffset(start + size)));
SubP(r4, fp, Operand(liftoff::GetStackSlotOffset(start)));
Label loop;
bind(&loop);
......
......@@ -445,31 +445,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
UNREACHABLE();
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillOffset(
LiftoffAssembler::GetStackOffsetFromIndex(last_stack_slot));
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
DCHECK_LT(0, size);
RecordUsedSpillOffset(start + size);
if (count <= 3) {
if (size <= 3 * kStackSlotSize) {
// Special straight-line code for up to three slots
// (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>).
for (uint32_t offset = 0; offset < count; ++offset) {
movq(liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)),
Immediate(0));
// (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>),
// plus one movl (6-9 bytes) when size % 8 != 0.
uint32_t remainder = size;
// Zero whole 8-byte slots, walking from the far end of the region back
// towards {start}.
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
movq(liftoff::GetStackSlot(start + remainder), Immediate(0));
}
DCHECK(remainder == 4 || remainder == 0);
// A trailing 4 bytes is zeroed with a 32-bit store.
if (remainder) {
movl(liftoff::GetStackSlot(start + remainder), Immediate(0));
}
} else {
// General case for bigger counts.
// This sequence takes 20-23 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
// for mov, 3 for repstosq, 3 for pops).
// From intel manual: repstosq fills RCX quadwords at [RDI] with RAX.
// This sequence takes 19-22 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
// for mov, 2 for repstosl, 3 for pops).
pushq(rax);
pushq(rcx);
pushq(rdi);
leaq(rdi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
leaq(rdi, liftoff::GetStackSlot(start + size));
xorl(rax, rax);
movl(rcx, Immediate(count));
repstosq();
// Convert size (bytes) to doublewords (4-bytes).
movl(rcx, Immediate(size / 4));
repstosl();
popq(rdi);
popq(rcx);
popq(rax);
......
......@@ -182,6 +182,7 @@ TEST(DisasmX64) {
__ decq(rdx);
__ cdq();
__ repstosl();
__ repstosq();
__ nop();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment