Commit 76bc9a86, authored by Mu Tao, committed by Commit Bot

[mips][Liftoff] Improve initialization for many locals

Port a8cdda99

Original Commit Message:

    WebAssembly locals are specified to be zero on function entry. Liftoff
    implements this by just storing the constant 0 in the virtual stack for
    integer types, and using one floating point register initialized to
    zero for all floating point types.
    For big counts of locals this leads to problems (manifesting as huge
    blocks of code being generated) once we hit a merge point: All those
    constants (for int) and all duplicate register uses (for floats) need to
    be fixed up, by using separate registers for the locals or spilling to
    the stack if no more registers are available. All this spilling
    generates a lot of code, and can even happen multiple times within a
    function.

    This CL optimizes for such cases by spilling all locals to the stack
    initially. All merges within the function body get much smaller then.
    The spilled values rarely have to be loaded anyway, because the initial
    zero value is usually overwritten before the first use.

    To optimize the code size for initializing big numbers of locals on the
    stack, this CL also introduces the platform-specific
    {FillStackSlotsWithZero} method which uses a loop for bigger local
    counts.

    This often saves dozens of kilobytes for very big functions, and shows
    an overall code size reduction of 4-5 percent for big modules.

R=xwafish@gmail.com

Change-Id: Id65b6d36beadcba0d3f3726bb6559bb316cb212e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1862830
Auto-Submit: Mu Tao <pamilty@gmail.com>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Mu Tao <pamilty@gmail.com>
Cr-Commit-Position: refs/heads/master@{#64315}
parent c7c47c68
......@@ -13,6 +13,28 @@ namespace wasm {
namespace liftoff {
//  half
//  slot        Frame
//  -----+--------------------+---------------------------
//  n+3  |   parameter n      |
//  ...  |       ...          |
//   4   |   parameter 1      | or parameter 2
//   3   |   parameter 0      | or parameter 1
//   2   |  (result address)  | or parameter 0
//  -----+--------------------+---------------------------
//   1   | return addr (ra)   |
//   0   | previous frame (fp)|
//  -----+--------------------+  <-- frame ptr (fp)
//  -1   | 0xa: WASM_COMPILED |
//  -2   |     instance       |
//  -----+--------------------+---------------------------
//  -3   |    slot 0 (high)   |   ^
//  -4   |    slot 0 (low)    |   |
//  -5   |    slot 1 (high)   | Frame slots
//  -6   |    slot 1 (low)    |   |
//       |                    |   v
//  -----+--------------------+  <-- stack ptr (sp)
//
#if defined(V8_TARGET_BIG_ENDIAN)
constexpr int32_t kLowWordOffset = 4;
constexpr int32_t kHighWordOffset = 0;
......@@ -27,9 +49,12 @@ constexpr int32_t kConstantStackSpace = 8;
// Distance in bytes from fp down to the first Liftoff stack slot: the constant
// part of the frame plus one slot. Stack slot operands are formed by
// subtracting this value (plus a per-slot multiple of kStackSlotSize) from fp.
constexpr int32_t kFirstStackSlotOffset =
kConstantStackSpace + LiftoffAssembler::kStackSlotSize;
// Returns how many bytes below fp the stack slot {index} starts. Slot 0 sits
// at kFirstStackSlotOffset; every further slot adds one kStackSlotSize.
inline int GetStackSlotOffset(uint32_t index) {
  const auto distance_from_first_slot =
      index * LiftoffAssembler::kStackSlotSize;
  return kFirstStackSlotOffset + distance_from_first_slot;
}
// Returns the fp-relative memory operand that addresses stack slot {index}.
// Slots are laid out below the frame pointer, so the byte offset computed by
// GetStackSlotOffset is negated.
inline MemOperand GetStackSlot(uint32_t index) {
  return MemOperand(fp, -GetStackSlotOffset(index));
}
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
......@@ -583,6 +608,35 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t index,
lw(reg, liftoff::GetHalfStackSlot(index, half));
}
// Emits code that writes zero to {count} consecutive stack slots, starting
// with slot {index}. {count} must be non-zero. The highest touched slot is
// reported through RecordUsedSpillSlot.
//
// Small counts are expanded into straight-line stores; larger counts use a
// loop whose code size does not depend on {count}.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  // Slot {i} starts at fp - GetStackSlotOffset(i) and extends kStackSlotSize
  // bytes upwards, so the slots [index, last_stack_slot] cover the fp-relative
  // byte range [start_offset, end_offset).
  const int32_t start_offset = -liftoff::GetStackSlotOffset(last_stack_slot);
  const int32_t end_offset = -liftoff::GetStackSlotOffset(index) +
                             static_cast<int32_t>(kStackSlotSize);

  if (count <= 12) {
    // Special straight-line code for up to 12 slots. {Sw} writes a single
    // word, while the frame layout of this file gives every slot a high and a
    // low half, so every word of the range is stored individually.
    for (int32_t offset = start_offset; offset < end_offset;
         offset += kSystemPointerSize) {
      Sw(zero_reg, MemOperand(fp, offset));
    }
  } else {
    // General case for bigger counts.
    // Use a0 for start address (inclusive), a1 for end address (exclusive).
    // Both registers are saved and restored around the loop.
    Push(a1, a0);
    Addu(a0, fp, Operand(start_offset));
    Addu(a1, fp, Operand(end_offset));

    Label loop;
    bind(&loop);
    // Store at the current address itself; a0 is inclusive, so the first
    // iteration must clear the lowest word of the last slot.
    Sw(zero_reg, MemOperand(a0, 0));
    addiu(a0, a0, kSystemPointerSize);
    BranchShort(&loop, ne, a0, Operand(a1));

    Pop(a1, a0);
  }
}
// Emits the multiplication for an i32 operation by forwarding {dst}, {lhs} and
// {rhs} unchanged to TurboAssembler::Mul.
// NOTE(review): presumably computes dst = lhs * rhs on 32-bit values; confirm
// against the TurboAssembler::Mul definition.
void LiftoffAssembler::emit_i32_mul(Register dst, Register lhs, Register rhs) {
TurboAssembler::Mul(dst, lhs, rhs);
}
......
......@@ -13,15 +13,44 @@ namespace wasm {
namespace liftoff {
// Liftoff Frames.
//
//  slot      Frame
//       +--------------------+---------------------------
//  n+4  | optional padding slot to keep the stack 16 byte aligned.
//  n+3  |   parameter n      |
//  ...  |       ...          |
//   4   |   parameter 1      | or parameter 2
//   3   |   parameter 0      | or parameter 1
//   2   |  (result address)  | or parameter 0
//  -----+--------------------+---------------------------
//   1   | return addr (ra)   |
//   0   | previous frame (fp)|
//  -----+--------------------+  <-- frame ptr (fp)
//  -1   | 0xa: WASM_COMPILED |
//  -2   |     instance       |
//  -----+--------------------+---------------------------
//  -3   |     slot 0         |   ^
//  -4   |     slot 1         |   |
//       |                    | Frame slots
//       |                    |   |
//       |                    |   v
//       | optional padding slot to keep the stack 16 byte aligned.
//  -----+--------------------+  <-- stack ptr (sp)
//
// fp-8 holds the stack marker, fp-16 is the instance parameter, first stack
// slot is located at fp-24.
// Bytes directly below fp that are occupied by the fixed part of the frame
// (the stack marker and the instance parameter).
constexpr int32_t kConstantStackSpace = 16;
// Distance in bytes from fp down to the first Liftoff stack slot: the constant
// part of the frame plus one slot.
constexpr int32_t kFirstStackSlotOffset =
kConstantStackSpace + LiftoffAssembler::kStackSlotSize;
// Returns how many bytes below fp the stack slot {index} starts. Slot 0 sits
// at kFirstStackSlotOffset; every further slot adds one kStackSlotSize.
inline int GetStackSlotOffset(uint32_t index) {
  const auto distance_from_first_slot =
      index * LiftoffAssembler::kStackSlotSize;
  return kFirstStackSlotOffset + distance_from_first_slot;
}
// Returns the fp-relative memory operand that addresses stack slot {index}.
// Slots are laid out below the frame pointer, so the byte offset computed by
// GetStackSlotOffset is negated.
inline MemOperand GetStackSlot(uint32_t index) {
  return MemOperand(fp, -GetStackSlotOffset(index));
}
// Returns the memory operand of the instance parameter, which the frame layout
// places 16 bytes below fp.
inline MemOperand GetInstanceOperand() {
  constexpr int kInstanceOffsetFromFp = -16;
  return MemOperand(fp, kInstanceOffsetFromFp);
}
......@@ -498,6 +527,35 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
UNREACHABLE();
}
// Emits code that writes zero to {count} consecutive stack slots, starting
// with slot {index}. {count} must be non-zero. The highest touched slot is
// reported through RecordUsedSpillSlot.
//
// Small counts are expanded into straight-line stores; larger counts use a
// loop whose code size does not depend on {count}.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  if (count <= 12) {
    // Special straight-line code for up to 12 slots. Generates one
    // instruction per slot (<= 12 instructions total).
    for (uint32_t offset = 0; offset < count; ++offset) {
      Sd(zero_reg, liftoff::GetStackSlot(index + offset));
    }
  } else {
    // General case for bigger counts.
    // Slot {i} starts at fp - GetStackSlotOffset(i) and extends kStackSlotSize
    // bytes upwards, so the slots [index, last_stack_slot] cover the
    // fp-relative byte range [start_offset, end_offset).
    const int32_t start_offset = -liftoff::GetStackSlotOffset(last_stack_slot);
    const int32_t end_offset = -liftoff::GetStackSlotOffset(index) +
                               static_cast<int32_t>(kStackSlotSize);

    // Use a0 for start address (inclusive), a1 for end address (exclusive).
    // Both registers are saved and restored around the loop.
    Push(a1, a0);
    Daddu(a0, fp, Operand(start_offset));
    Daddu(a1, fp, Operand(end_offset));

    Label loop;
    bind(&loop);
    // Store at the current address itself; a0 is inclusive, so the first
    // iteration must clear the last slot and the final one the slot {index}.
    Sd(zero_reg, MemOperand(a0, 0));
    daddiu(a0, a0, kSystemPointerSize);
    BranchShort(&loop, ne, a0, Operand(a1));

    Pop(a1, a0);
  }
}
// Emits the multiplication for an i32 operation by forwarding {dst}, {lhs} and
// {rhs} unchanged to TurboAssembler::Mul.
// NOTE(review): presumably computes dst = lhs * rhs on 32-bit values; confirm
// against the TurboAssembler::Mul definition.
void LiftoffAssembler::emit_i32_mul(Register dst, Register lhs, Register rhs) {
TurboAssembler::Mul(dst, lhs, rhs);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment