Commit a8cdda99 authored by Clemens Backes, committed by Commit Bot

[Liftoff] Improve initialization for many locals

WebAssembly locals are specified to be zero on function entry. Liftoff
implements this by storing the constant 0 in the virtual stack for
integer types, and by using a single floating point register, initialized
to zero, for all floating point types.
For large numbers of locals this leads to problems (manifesting as huge
blocks of generated code) once we hit a merge point: all those constants
(for ints) and all duplicate register uses (for floats) need to be fixed
up, either by using separate registers for the locals or by spilling to
the stack if no more registers are available. All this spilling generates
a lot of code, and can even happen multiple times within a function.
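
For reference, the old initialization had roughly the following shape (a
condensed sketch of the lines removed from liftoff-compiler.cc further
down in this diff, not the verbatim code):

  for (uint32_t i = num_params; i < __ num_locals(); ++i) {
    ValueType type = decoder->GetLocalType(i);
    if (type == kWasmI32 || type == kWasmI64) {
      // Integer locals become constant-0 entries in the virtual stack.
      __ cache_state()->stack_state.emplace_back(type, uint32_t{0});
    } else {
      // All f32/f64 locals share one FP register holding the zero pattern.
      if (zero_double_reg.is_gp()) {  // Not allocated yet.
        zero_double_reg = __ GetUnusedRegister(kFpReg);
        __ LoadConstant(zero_double_reg, WasmValue(0.));
      }
      __ PushRegister(type, zero_double_reg);
    }
  }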

This CL optimizes for such cases by spilling all locals to the stack
initially. All merges within the function body then become much smaller.
The spilled values rarely have to be loaded anyway, because the initial
zero value is usually overwritten before the first use.
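
A condensed sketch of the new path (the full version is the
SpillLocalsInitially heuristic and the rewritten initialization loop in
liftoff-compiler.cc below):

  if (SpillLocalsInitially(decoder, num_params)) {
    // Many locals, or locals of a non-integer type: zero the whole slot
    // range once and record plain stack entries for the locals.
    __ FillStackSlotsWithZero(num_params, __ num_locals() - num_params);
    for (uint32_t i = num_params; i < __ num_locals(); ++i) {
      __ cache_state()->stack_state.emplace_back(decoder->GetLocalType(i));
    }
  } else {
    // Few integer locals: keep representing them as the constant 0.
    for (uint32_t i = num_params; i < __ num_locals(); ++i) {
      __ cache_state()->stack_state.emplace_back(decoder->GetLocalType(i),
                                                 int32_t{0});
    }
  }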

To keep the code for initializing large numbers of locals on the stack
small, this CL also introduces the platform-specific
{FillStackSlotsWithZero} method, which switches to a loop for larger
local counts.
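
The per-architecture implementations below differ mainly in the
straight-line threshold (2 slots on ia32, 3 on x64, 5 on arm, up to 20 on
arm64) and in how the fill loop is emitted. As an example, the x64
version has roughly this shape (a sketch, omitting the
RecordUsedSpillSlot bookkeeping):

  void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
    if (count <= 3) {
      // Straight-line code: one movq of the immediate 0 per slot.
      for (uint32_t offset = 0; offset < count; ++offset) {
        movq(liftoff::GetStackSlot(index + offset), Immediate(0));
      }
    } else {
      // Loop via rep: repstosq fills RCX quadwords at [RDI] with RAX.
      pushq(rax);
      pushq(rcx);
      pushq(rdi);
      leaq(rdi, liftoff::GetStackSlot(index + count - 1));
      xorl(rax, rax);
      movl(rcx, Immediate(count));
      repstosq();
      popq(rdi);
      popq(rcx);
      popq(rax);
    }
  }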

This often saves dozens of kilobytes for very large functions, and gives
an overall code size reduction of 4-5 percent for large modules.

R=jkummerow@chromium.org

Bug: v8:9830
Change-Id: I23fa4145847827420f09e043a11e0e7b606e94cc
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1856004
Commit-Queue: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64282}
parent 12d84b69
......@@ -513,6 +513,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movzx_w(Register dst, Operand src);
void movq(XMMRegister dst, Operand src);
// Conditional moves
void cmov(Condition cc, Register dst, Register src) {
cmov(cc, dst, Operand(src));
......
......@@ -1971,6 +1971,13 @@ void Assembler::emit_repmovs(int size) {
emit(0xA5);
}
void Assembler::repstosq() {
EnsureSpace ensure_space(this);
emit(0xF3);
emit_rex_64();
emit(0xAB);
}
void Assembler::mull(Register src) {
EnsureSpace ensure_space(this);
emit_optional_rex_32(src);
......
......@@ -538,12 +538,14 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movsxlq(Register dst, Operand src);
// Repeated moves.
void repmovsb();
void repmovsw();
void repmovsl() { emit_repmovs(kInt32Size); }
void repmovsq() { emit_repmovs(kInt64Size); }
// Repeated store of quadwords (fill RCX quadwords at [RDI] with RAX).
void repstosq();
// Instruction to load from an immediate 64-bit pointer into RAX.
void load_rax(Address value, RelocInfo::Mode rmode);
void load_rax(ExternalReference ext);
......
......@@ -91,9 +91,9 @@ static const ByteMnemonic zero_operands_instr[] = {
{0x61, UNSET_OP_ORDER, "popad"}, {0x9C, UNSET_OP_ORDER, "pushfd"},
{0x9D, UNSET_OP_ORDER, "popfd"}, {0x9E, UNSET_OP_ORDER, "sahf"},
{0x99, UNSET_OP_ORDER, "cdq"}, {0x9B, UNSET_OP_ORDER, "fwait"},
{0xA4, UNSET_OP_ORDER, "movs"}, {0xA5, UNSET_OP_ORDER, "movs"},
{0xA6, UNSET_OP_ORDER, "cmps"}, {0xA7, UNSET_OP_ORDER, "cmps"},
{-1, UNSET_OP_ORDER, ""}};
{0xAB, UNSET_OP_ORDER, "stos"}, {0xA4, UNSET_OP_ORDER, "movs"},
{0xA5, UNSET_OP_ORDER, "movs"}, {0xA6, UNSET_OP_ORDER, "cmps"},
{0xA7, UNSET_OP_ORDER, "cmps"}, {-1, UNSET_OP_ORDER, ""}};
static const ByteMnemonic call_jump_instr[] = {{0xE8, UNSET_OP_ORDER, "call"},
{0xE9, UNSET_OP_ORDER, "jmp"},
......@@ -2434,13 +2434,13 @@ int DisassemblerX64::InstructionDecode(v8::internal::Vector<char> out_buffer,
byte_size_operand_ = idesc.byte_size_operation;
switch (idesc.type) {
case ZERO_OPERANDS_INSTR:
if (current >= 0xA4 && current <= 0xA7) {
if ((current >= 0xA4 && current <= 0xA7) ||
(current >= 0xAA && current <= 0xAD)) {
// String move or compare operations.
if (group_1_prefix_ == REP_PREFIX) {
// REP.
AppendToBuffer("rep ");
}
if (rex_w()) AppendToBuffer("REX.W ");
AppendToBuffer("%s%c", idesc.mnem, operand_size_code());
} else {
AppendToBuffer("%s%c", idesc.mnem, operand_size_code());
......
......@@ -46,10 +46,12 @@ constexpr int32_t kConstantStackSpace = kSystemPointerSize;
// Three instructions are required to sub a large constant, movw + movt + sub.
constexpr int32_t kPatchInstructionsRequired = 3;
inline int GetStackSlotOffset(uint32_t index) {
return kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
}
inline MemOperand GetStackSlot(uint32_t index) {
int32_t offset =
kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
return MemOperand(fp, -offset);
return MemOperand(fp, -GetStackSlotOffset(index));
}
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
......@@ -635,6 +637,44 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t index,
ldr(reg, liftoff::GetHalfStackSlot(index, half));
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillSlot(last_stack_slot);
// We need a zero reg. Always use r0 for that, and push it before to restore
// its value afterwards.
push(r0);
mov(r0, Operand(0));
if (count <= 5) {
// Special straight-line code for up to five slots. Generates two
// instructions per slot.
for (uint32_t offset = 0; offset < count; ++offset) {
str(r0, liftoff::GetHalfStackSlot(index + offset, kLowWord));
str(r0, liftoff::GetHalfStackSlot(index + offset, kHighWord));
}
} else {
// General case for bigger counts (9 instructions).
// Use r1 for start address (inclusive), r2 for end address (exclusive).
push(r1);
push(r2);
sub(r1, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
sub(r2, fp, Operand(liftoff::GetStackSlotOffset(index) + kStackSlotSize));
Label loop;
bind(&loop);
str(r0, MemOperand(r1, /* offset */ kSystemPointerSize, PostIndex));
cmp(r1, r2);
b(&loop, ne);
pop(r2);
pop(r1);
}
pop(r0);
}
#define I32_BINOP(name, instruction) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \
......
......@@ -43,10 +43,12 @@ constexpr int32_t kInstanceOffset = 2 * kSystemPointerSize;
constexpr int32_t kFirstStackSlotOffset = kInstanceOffset + kSystemPointerSize;
constexpr int32_t kConstantStackSpace = 0;
inline int GetStackSlotOffset(uint32_t index) {
return kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
}
inline MemOperand GetStackSlot(uint32_t index) {
int32_t offset =
kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
return MemOperand(fp, -offset);
return MemOperand(fp, -GetStackSlotOffset(index));
}
inline MemOperand GetInstanceOperand() {
......@@ -398,6 +400,38 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
UNREACHABLE();
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillSlot(last_stack_slot);
int max_stp_offset = -liftoff::GetStackSlotOffset(index + count - 1);
if (count <= 20 && IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
// Special straight-line code for up to 20 slots. Generates one
// instruction per two slots (<= 10 instructions total).
for (; count > 1; count -= 2) {
STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
stp(xzr, xzr, liftoff::GetStackSlot(index + count - 1));
}
DCHECK(count == 0 || count == 1);
if (count) str(xzr, liftoff::GetStackSlot(index));
} else {
// General case for bigger counts (7 instructions).
// Use x0 for start address (inclusive), x1 for end address (exclusive).
Push(x1, x0);
Sub(x0, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
Sub(x1, fp, Operand(liftoff::GetStackSlotOffset(index) + kStackSlotSize));
Label loop;
bind(&loop);
str(xzr, MemOperand(x0, /* offset */ kSystemPointerSize, PostIndex));
cmp(x0, x1);
b(&loop, ne);
Pop(x0, x1);
}
}
#define I32_BINOP(name, instruction) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \
......
......@@ -511,6 +511,37 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t index,
mov(reg, liftoff::GetHalfStackSlot(index, half));
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillSlot(last_stack_slot);
if (count <= 2) {
// Special straight-line code for up to two slots (6-9 bytes per word:
// C7 <1-4 bytes operand> <4 bytes imm>, makes 12-18 bytes per slot).
for (uint32_t offset = 0; offset < count; ++offset) {
mov(liftoff::GetHalfStackSlot(index + offset, kLowWord), Immediate(0));
mov(liftoff::GetHalfStackSlot(index + offset, kHighWord), Immediate(0));
}
} else {
// General case for bigger counts.
// This sequence takes 18-21 bytes (3 for pushes, 3-6 for lea, 2 for xor, 5
// for mov, 2 for rep_stos, 3 for pops).
// Note: rep_stos fills ECX doublewords at [EDI] with EAX.
push(eax);
push(ecx);
push(edi);
lea(edi, liftoff::GetStackSlot(last_stack_slot));
xor_(eax, eax);
// Number of words is number of slots times two.
mov(ecx, Immediate(count * 2));
rep_stos();
pop(edi);
pop(ecx);
pop(eax);
}
}
void LiftoffAssembler::emit_i32_add(Register dst, Register lhs, Register rhs) {
if (lhs != dst) {
lea(dst, Operand(lhs, rhs, times_1, 0));
......
......@@ -386,6 +386,7 @@ class LiftoffAssembler : public TurboAssembler {
// Only used on 32-bit systems: Fill a register from a "half stack slot", i.e.
// 4 bytes on the stack holding half of a 64-bit value.
inline void FillI64Half(Register, uint32_t index, RegPairHalf);
inline void FillStackSlotsWithZero(uint32_t index, uint32_t count);
// i32 binops.
inline void emit_i32_add(Register dst, Register lhs, Register rhs);
......
......@@ -341,6 +341,24 @@ class LiftoffCompiler {
__ bind(ool.continuation.get());
}
bool SpillLocalsInitially(FullDecoder* decoder, uint32_t num_params) {
int actual_locals = __ num_locals() - num_params;
DCHECK_LE(0, actual_locals);
constexpr int kNumCacheRegisters = NumRegs(kLiftoffAssemblerGpCacheRegs);
// If we have many locals, we put them on the stack initially. This avoids
// having to spill them on merge points. Use of these initial values should
// be rare anyway.
if (actual_locals > kNumCacheRegisters / 2) return true;
// If there are locals which are not i32 or i64, we also spill all locals,
// because other types cannot be initialized to constants.
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
ValueType type = decoder->GetLocalType(param_idx);
if (type != kWasmI32 && type != kWasmI64) return true;
}
return false;
}
void StartFunctionBody(FullDecoder* decoder, Control* block) {
for (uint32_t i = 0; i < __ num_locals(); ++i) {
if (!CheckSupportedType(decoder, kSupportedTypes, __ local_type(i),
......@@ -373,6 +391,7 @@ class LiftoffCompiler {
// LiftoffAssembler methods.
if (DidAssemblerBailout(decoder)) return;
// Process parameters.
__ SpillInstance(instance_reg);
// Input 0 is the code target, 1 is the instance. First parameter at 2.
uint32_t input_idx = kInstanceParameterIndex + 1;
......@@ -380,32 +399,20 @@ class LiftoffCompiler {
input_idx += ProcessParameter(__ local_type(param_idx), input_idx);
}
DCHECK_EQ(input_idx, descriptor_->InputCount());
// Set to a gp register, to mark this uninitialized.
LiftoffRegister zero_double_reg = kGpCacheRegList.GetFirstRegSet();
DCHECK(zero_double_reg.is_gp());
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
ValueType type = decoder->GetLocalType(param_idx);
switch (type) {
case kWasmI32:
__ cache_state()->stack_state.emplace_back(kWasmI32, uint32_t{0});
break;
case kWasmI64:
__ cache_state()->stack_state.emplace_back(kWasmI64, uint32_t{0});
break;
case kWasmF32:
case kWasmF64:
if (zero_double_reg.is_gp()) {
// Note: This might spill one of the registers used to hold
// parameters.
zero_double_reg = __ GetUnusedRegister(kFpReg);
// Zero is represented by the bit pattern 0 for both f32 and f64.
__ LoadConstant(zero_double_reg, WasmValue(0.));
}
__ PushRegister(type, zero_double_reg);
break;
default:
UNIMPLEMENTED();
// Initialize locals beyond parameters.
if (SpillLocalsInitially(decoder, num_params)) {
__ FillStackSlotsWithZero(num_params, __ num_locals() - num_params);
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
ValueType type = decoder->GetLocalType(param_idx);
__ cache_state()->stack_state.emplace_back(type);
}
} else {
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
++param_idx) {
ValueType type = decoder->GetLocalType(param_idx);
__ cache_state()->stack_state.emplace_back(type, int32_t{0});
}
}
......
......@@ -452,6 +452,35 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
UNREACHABLE();
}
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
DCHECK_LT(0, count);
uint32_t last_stack_slot = index + count - 1;
RecordUsedSpillSlot(last_stack_slot);
if (count <= 3) {
// Special straight-line code for up to three slots
// (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>).
for (uint32_t offset = 0; offset < count; ++offset) {
movq(liftoff::GetStackSlot(index + offset), Immediate(0));
}
} else {
// General case for bigger counts.
// This sequence takes 20-23 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
// for mov, 3 for repstosq, 3 for pops).
// From intel manual: repstosq fills RCX quadwords at [RDI] with RAX.
pushq(rax);
pushq(rcx);
pushq(rdi);
leaq(rdi, liftoff::GetStackSlot(last_stack_slot));
xorl(rax, rax);
movl(rcx, Immediate(count));
repstosq();
popq(rdi);
popq(rcx);
popq(rax);
}
}
void LiftoffAssembler::emit_i32_add(Register dst, Register lhs, Register rhs) {
if (lhs != dst) {
leal(dst, Operand(lhs, rhs, times_1, 0));
......
......@@ -182,6 +182,8 @@ TEST(DisasmX64) {
__ decq(rdx);
__ cdq();
__ repstosq();
__ nop();
__ idivq(rdx);
__ mull(rdx);
......