Commit c314cf74 authored by Milad Farazmand, committed by Commit Bot

PPC/s390: [Liftoff] Improve initialization for many locals

Port a8cdda99

Original Commit Message:

    WebAssembly locals are specified to be zero on function entry. Liftoff
    implements this by just storing the constant 0 in the virtual stack for
    integer types, and using one floating point register initialized to
    zero for all floating point types.
    For big counts of locals this leads to problems (manifesting as huge
    blocks of code being generated) once we hit a merge point: All those
    constants (for int) and all duplicate register uses (for floats) need to
    be fixed up, by using separate registers for the locals or spilling to
    the stack if no more registers are available. All this spilling
    generates a lot of code, and can even happen multiple times within a
    function.

    This CL optimizes for such cases by spilling all locals to the stack
    initially. All merges within the function body get much smaller then.
    The spilled values rarely have to be loaded anyway, because the initial
    zero value is usually overwritten before the first use.

    To optimize the code size for initializing big numbers of locals on the
    stack, this CL also introduces the platform-specific
    {FillStackSlotsWithZero} method which uses a loop for bigger local
    counts.

    This often saves dozens of kilobytes for very big functions, and shows
    an overall code size reduction of 4-5 percent for big modules.

R=clemensb@chromium.org, joransiu@ca.ibm.com, jyan@ca.ibm.com, michael_dawson@ca.ibm.com
BUG=
LOG=N

Change-Id: I2459080a1f6acfdd212e9a93a868d028980c5554
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1863370
Reviewed-by: Junliang Yan <jyan@ca.ibm.com>
Reviewed-by: Milad Farazmand <miladfar@ca.ibm.com>
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#64301}
parent e359c49b
...@@ -12,6 +12,49 @@ namespace v8 { ...@@ -12,6 +12,49 @@ namespace v8 {
namespace internal { namespace internal {
namespace wasm { namespace wasm {
namespace liftoff {

// Frame layout, listed in half-slot units relative to the frame pointer:
//
//  half
//  slot         Frame
//  -----+--------------------+---------------------------
//  n+3  |   parameter n      |
//  ...  |       ...          |
//   4   |   parameter 1      | or parameter 2
//   3   |   parameter 0      | or parameter 1
//   2   |  (result address)  | or parameter 0
//  -----+--------------------+---------------------------
//   1   | return addr (lr)   |
//   0   | previous frame (fp)|
//  -----+--------------------+  <-- frame ptr (fp)
//  -1   | 0xa: WASM_COMPILED |
//  -2   |     instance       |
//  -----+--------------------+---------------------------
//  -3   |    slot 0 (high)   |   ^
//  -4   |    slot 0 (low)    |   |
//  -5   |    slot 1 (high)   | Frame slots
//  -6   |    slot 1 (low)    |   |
//       |                    |   v
//  -----+--------------------+  <-- stack ptr (sp)
//
// The offsets below are positive values; they are negated when turned into an
// fp-relative operand (see GetHalfStackSlot).

constexpr int32_t kInstanceOffset = 2 * kSystemPointerSize;
constexpr int32_t kFirstStackSlotOffset =
    kInstanceOffset + 2 * kSystemPointerSize;

// Returns the (positive) offset of stack slot {index}: slot 0 sits at
// kFirstStackSlotOffset and every further slot adds one kStackSlotSize.
inline int GetStackSlotOffset(uint32_t index) {
  return kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
}

// Returns the fp-relative operand for one half of stack slot {index}. The low
// word is at the slot's own offset; the high word is half a slot closer to
// fp.
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
  int32_t distance_below_fp = GetStackSlotOffset(index);
  if (half != kLowWord) {
    distance_below_fp -= LiftoffAssembler::kStackSlotSize / 2;
  }
  return MemOperand(fp, -distance_below_fp);
}

}  // namespace liftoff
int LiftoffAssembler::PrepareStackFrame() { int LiftoffAssembler::PrepareStackFrame() {
bailout(kUnsupportedArchitecture, "PrepareStackFrame"); bailout(kUnsupportedArchitecture, "PrepareStackFrame");
return 0; return 0;
...@@ -108,6 +151,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) { ...@@ -108,6 +151,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
bailout(kUnsupportedArchitecture, "FillI64Half"); bailout(kUnsupportedArchitecture, "FillI64Half");
} }
// Emits code that writes zero to the {count} consecutive stack slots starting
// at slot {index}. {count} must be greater than zero. The highest slot index
// touched is reported via RecordUsedSpillSlot. Every register used as scratch
// is pushed before use and popped afterwards.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  // We need a zero reg. Always use r0 for that, and push it before to restore
  // its value afterwards.
  push(r0);
  mov(r0, Operand(0));

  if (count <= 5) {
    // Special straight-line code for up to five slots. Generates two
    // instructions per slot.
    // NOTE(review): the loop below advances by kSystemPointerSize per StoreP.
    // If kStackSlotSize == kSystemPointerSize on this target, the high-word
    // store here reaches past the end of the slot - confirm for 64-bit builds.
    for (uint32_t offset = 0; offset < count; ++offset) {
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kLowWord));
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kHighWord));
    }
  } else {
    // General case for bigger counts: a fixed-size store loop.
    // Use r4 for start address (inclusive), r5 for end address (exclusive).
    push(r4);
    push(r5);

    // Slots grow towards lower addresses, so the last slot is the lowest
    // address of the range and is where the cursor starts.
    subi(r4, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
    // Slot {index} covers [fp - offset, fp - offset + kStackSlotSize), hence
    // the exclusive end is one slot size *above* its base address.
    subi(r5, fp, Operand(liftoff::GetStackSlotOffset(index) - kStackSlotSize));

    Label loop;
    bind(&loop);
    // Store zero through the cursor (r4) and advance the cursor; r0 only ever
    // holds the zero value and must not be used as the address.
    StoreP(r0, MemOperand(r4));
    addi(r4, r4, Operand(kSystemPointerSize));
    cmp(r4, r5);
    bne(&loop);

    // Restore in reverse push order.
    pop(r5);
    pop(r4);
  }

  pop(r0);
}
#define UNIMPLEMENTED_I32_BINOP(name) \ #define UNIMPLEMENTED_I32_BINOP(name) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \ void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \ Register rhs) { \
......
...@@ -12,6 +12,48 @@ namespace v8 { ...@@ -12,6 +12,48 @@ namespace v8 {
namespace internal { namespace internal {
namespace wasm { namespace wasm {
namespace liftoff {

// Frame layout, listed in half-slot units relative to the frame pointer:
//
//  half
//  slot         Frame
//  -----+--------------------+---------------------------
//  n+3  |   parameter n      |
//  ...  |       ...          |
//   4   |   parameter 1      | or parameter 2
//   3   |   parameter 0      | or parameter 1
//   2   |  (result address)  | or parameter 0
//  -----+--------------------+---------------------------
//   1   | return addr (lr)   |
//   0   | previous frame (fp)|
//  -----+--------------------+  <-- frame ptr (fp)
//  -1   | 0xa: WASM_COMPILED |
//  -2   |     instance       |
//  -----+--------------------+---------------------------
//  -3   |    slot 0 (high)   |   ^
//  -4   |    slot 0 (low)    |   |
//  -5   |    slot 1 (high)   | Frame slots
//  -6   |    slot 1 (low)    |   |
//       |                    |   v
//  -----+--------------------+  <-- stack ptr (sp)
//
// The offsets below are positive values; they are negated when turned into an
// fp-relative operand (see GetHalfStackSlot).

constexpr int32_t kInstanceOffset = 2 * kSystemPointerSize;
constexpr int32_t kFirstStackSlotOffset =
    kInstanceOffset + 2 * kSystemPointerSize;

// Returns the (positive) offset of stack slot {index}: slot 0 sits at
// kFirstStackSlotOffset and every further slot adds one kStackSlotSize.
inline int GetStackSlotOffset(uint32_t index) {
  return kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize;
}

// Returns the fp-relative operand for one half of stack slot {index}. The low
// word is at the slot's own offset; the high word is half a slot closer to
// fp.
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
  int32_t distance_below_fp = GetStackSlotOffset(index);
  if (half != kLowWord) {
    distance_below_fp -= LiftoffAssembler::kStackSlotSize / 2;
  }
  return MemOperand(fp, -distance_below_fp);
}

}  // namespace liftoff
int LiftoffAssembler::PrepareStackFrame() { int LiftoffAssembler::PrepareStackFrame() {
bailout(kUnsupportedArchitecture, "PrepareStackFrame"); bailout(kUnsupportedArchitecture, "PrepareStackFrame");
return 0; return 0;
...@@ -108,6 +150,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) { ...@@ -108,6 +150,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
bailout(kUnsupportedArchitecture, "FillI64Half"); bailout(kUnsupportedArchitecture, "FillI64Half");
} }
// Emits code that writes zero to the {count} consecutive stack slots starting
// at slot {index}. {count} must be greater than zero. The highest slot index
// touched is reported via RecordUsedSpillSlot. Every register used as scratch
// is pushed before use and popped afterwards.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  // We need a zero reg. Always use r0 for that, and push it before to restore
  // its value afterwards.
  push(r0);
  mov(r0, Operand(0));

  if (count <= 5) {
    // Special straight-line code for up to five slots. Generates two
    // instructions per slot.
    // NOTE(review): the loop below advances by kSystemPointerSize per StoreP.
    // If kStackSlotSize == kSystemPointerSize on this target, the high-word
    // store here reaches past the end of the slot - confirm for 64-bit builds.
    for (uint32_t offset = 0; offset < count; ++offset) {
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kLowWord));
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kHighWord));
    }
  } else {
    // General case for bigger counts: a fixed-size store loop.
    // Use r3 for start address (inclusive), r4 for end address (exclusive).
    push(r3);
    push(r4);

    // Slots grow towards lower addresses, so the last slot is the lowest
    // address of the range and is where the cursor starts.
    SubP(r3, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
    // Slot {index} covers [fp - offset, fp - offset + kStackSlotSize), hence
    // the exclusive end is one slot size *above* its base address.
    SubP(r4, fp, Operand(liftoff::GetStackSlotOffset(index) - kStackSlotSize));

    Label loop;
    bind(&loop);
    // Store zero through the cursor (r3) and advance the cursor; r0 only ever
    // holds the zero value and must not be used as the address.
    StoreP(r0, MemOperand(r3));
    la(r3, MemOperand(r3, kSystemPointerSize));
    CmpLogicalP(r3, r4);
    bne(&loop);

    // Restore in reverse push order.
    pop(r4);
    pop(r3);
  }

  pop(r0);
}
#define UNIMPLEMENTED_I32_BINOP(name) \ #define UNIMPLEMENTED_I32_BINOP(name) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \ void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \ Register rhs) { \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment