Commit fd93f338 authored by Andreas Haas, committed by V8 LUCI CQ

[wasm][liftoff] Cache the memory start register

WebAssembly functions often contain sequences of memory accesses, and
each of these accesses needs the start address of the memory in a
register. With this CL, the register holding the memory start address
is cached, so only the first memory access has to load the memory
start address into a register; subsequent memory accesses can simply
reuse it.

In initial measurements with the epic benchmark, this reduces the size
of the generated Liftoff code by a bit more than 5%.
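
The caching pattern in a nutshell (a minimal, self-contained C++
sketch for illustration only; CodeGen, AllocateGpRegister and
LoadMemStartFromInstance are hypothetical stand-ins, not the actual
Liftoff API; see LiftoffCompiler::GetMemoryStart in the diff below
for the real implementation):

#include <cassert>
#include <cstdio>

// Hypothetical stand-ins for Liftoff concepts.
using Register = int;
constexpr Register no_reg = -1;

struct CodeGen {
  Register cached_mem_start = no_reg;  // mirrors CacheState::cached_mem_start
  int mem_start_loads_emitted = 0;

  Register AllocateGpRegister() { return 7; }  // placeholder allocator

  // Before this CL: every memory access emitted this load.
  Register LoadMemStartFromInstance() {
    ++mem_start_loads_emitted;  // "load MemoryStart from the instance object"
    return AllocateGpRegister();
  }

  // After this CL: only the first access loads the memory start;
  // later accesses reuse the cached register.
  Register GetMemoryStart() {
    if (cached_mem_start == no_reg) {
      cached_mem_start = LoadMemStartFromInstance();
    }
    return cached_mem_start;
  }
};

int main() {
  CodeGen gen;
  // Three back-to-back memory accesses, as in typical Wasm functions.
  for (int i = 0; i < 3; ++i) {
    Register mem = gen.GetMemoryStart();
    assert(mem != no_reg);
  }
  // Only one load of the memory start was emitted, not three.
  std::printf("mem start loads: %d\n", gen.mem_start_loads_emitted);
  return 0;
}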

R=clemensb@chromium.org

Bug: v8:11862
Change-Id: Ic33e7e3c00a4209570821269c728187affbeadcf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2947403
Commit-Queue: Andreas Haas <ahaas@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#75113}
parent 64828e21
@@ -15,6 +15,7 @@
#include "src/utils/ostreams.h"
#include "src/wasm/baseline/liftoff-register.h"
#include "src/wasm/function-body-decoder-impl.h"
#include "src/wasm/object-access.h"
#include "src/wasm/wasm-linkage.h"
#include "src/wasm/wasm-opcodes.h"
@@ -446,6 +447,10 @@ void LiftoffAssembler::CacheState::InitMerge(const CacheState& source,
SetInstanceCacheRegister(source.cached_instance);
}
+if (source.cached_mem_start != no_reg) {
+SetMemStartCacheRegister(source.cached_mem_start);
+}
uint32_t stack_base = stack_depth + num_locals;
uint32_t target_height = stack_base + arity;
uint32_t discarded = source.stack_height() - target_height;
@@ -709,10 +714,13 @@ void LiftoffAssembler::MergeFullStackWith(CacheState& target,
}
// Full stack merging is only done for forward jumps, so we can just clear the
-// instance cache register at the target in case of mismatch.
+// cache registers at the target in case of mismatch.
if (source.cached_instance != target.cached_instance) {
target.ClearCachedInstanceRegister();
}
+if (source.cached_mem_start != target.cached_mem_start) {
+target.ClearCachedMemStartRegister();
+}
}
void LiftoffAssembler::MergeStackWith(CacheState& target, uint32_t arity,
@@ -754,6 +762,34 @@ void LiftoffAssembler::MergeStackWith(CacheState& target, uint32_t arity,
}
}
}
+if (cache_state_.cached_mem_start != target.cached_mem_start &&
+target.cached_mem_start != no_reg) {
+if (jump_direction == kForwardJump) {
+// On forward jumps, reset the cached memory start in the target state.
+target.ClearCachedMemStartRegister();
+} else {
+// On backward jumps, we already generated code assuming that the
+// memory start is available in that register. Thus move it there.
+if (cache_state_.cached_mem_start == no_reg) {
+// {target.cached_instance} already got restored above, so we can use it
+// if it exists.
+Register instance = target.cached_instance;
+if (instance == no_reg) {
+// We don't have the instance available yet. Store it into the target
+// mem_start, so that we can load the mem_start from there.
+instance = target.cached_mem_start;
+LoadInstanceFromFrame(instance);
+}
+LoadFromInstance(
+target.cached_mem_start, instance,
+ObjectAccess::ToTagged(WasmInstanceObject::kMemoryStartOffset),
+sizeof(size_t));
+} else {
+Move(target.cached_mem_start, cache_state_.cached_mem_start,
+kPointerKind);
+}
+}
+}
}
void LiftoffAssembler::Spill(VarState* slot) {
@@ -784,7 +820,7 @@ void LiftoffAssembler::SpillAllRegisters() {
Spill(slot.offset(), slot.reg(), slot.kind());
slot.MakeStack();
}
-cache_state_.ClearCachedInstanceRegister();
+cache_state_.ClearAllCacheRegisters();
cache_state_.reset_used_registers();
}
@@ -793,9 +829,21 @@ void LiftoffAssembler::ClearRegister(
LiftoffRegList pinned) {
if (reg == cache_state()->cached_instance) {
cache_state()->ClearCachedInstanceRegister();
+// We can return immediately. The instance is only used to load information
+// at the beginning of an instruction when values don't have to be in
+// specific registers yet. Therefore the instance should never be one of the
+// {possible_uses}.
+for (Register* use : possible_uses) {
+USE(use);
+DCHECK_NE(reg, *use);
+}
+return;
-}
-if (cache_state()->is_used(LiftoffRegister(reg))) {
+} else if (reg == cache_state()->cached_mem_start) {
+cache_state()->ClearCachedMemStartRegister();
+// The memory start may be among the {possible_uses}, e.g. for an atomic
+// compare exchange. Therefore it is necessary to iterate over the
+// {possible_uses} below, and we cannot return early.
+} else if (cache_state()->is_used(LiftoffRegister(reg))) {
SpillRegister(LiftoffRegister(reg));
}
Register replacement = no_reg;
@@ -891,7 +939,7 @@ void LiftoffAssembler::PrepareCall(const ValueKindSig* sig,
constexpr size_t kInputShift = 1;
// Spill all cache slots which are not being used as parameters.
-cache_state_.ClearCachedInstanceRegister();
+cache_state_.ClearAllCacheRegisters();
for (VarState* it = cache_state_.stack_state.end() - 1 - num_params;
it >= cache_state_.stack_state.begin() &&
!cache_state_.used_registers.is_empty();
@@ -1125,13 +1173,15 @@ bool LiftoffAssembler::ValidateCacheState() const {
}
used_regs.set(reg);
}
-if (cache_state_.cached_instance != no_reg) {
-DCHECK(!used_regs.has(cache_state_.cached_instance));
-int liftoff_code =
-LiftoffRegister{cache_state_.cached_instance}.liftoff_code();
-used_regs.set(cache_state_.cached_instance);
-DCHECK_EQ(0, register_use_count[liftoff_code]);
-register_use_count[liftoff_code] = 1;
+for (Register cache_reg :
+{cache_state_.cached_instance, cache_state_.cached_mem_start}) {
+if (cache_reg != no_reg) {
+DCHECK(!used_regs.has(cache_reg));
+int liftoff_code = LiftoffRegister{cache_reg}.liftoff_code();
+used_regs.set(cache_reg);
+DCHECK_EQ(0, register_use_count[liftoff_code]);
+register_use_count[liftoff_code] = 1;
+}
+}
bool valid = memcmp(register_use_count, cache_state_.register_use_count,
sizeof(register_use_count)) == 0 &&
@@ -200,6 +200,7 @@ class LiftoffAssembler : public TurboAssembler {
uint32_t register_use_count[kAfterMaxLiftoffRegCode] = {0};
LiftoffRegList last_spilled_regs;
Register cached_instance = no_reg;
+Register cached_mem_start = no_reg;
bool has_unused_register(RegClass rc, LiftoffRegList pinned = {}) const {
if (kNeedI64RegPair && rc == kGpRegPair) {
@@ -250,31 +251,47 @@
// Volatile registers are registers which are used for caching values that
// can easily be reloaded. Those are returned first if we run out of free
// registers.
// Note: This interface is a bit more generic than currently needed, in
// anticipation of more "volatile registers" being added later.
bool has_volatile_register(LiftoffRegList candidates) {
-return cached_instance != no_reg && candidates.has(cached_instance);
+return (cached_instance != no_reg && candidates.has(cached_instance)) ||
+(cached_mem_start != no_reg && candidates.has(cached_mem_start));
}
LiftoffRegister take_volatile_register(LiftoffRegList candidates) {
-DCHECK(candidates.has(cached_instance));
-LiftoffRegister ret{cached_instance};
+DCHECK(has_volatile_register(candidates));
+Register reg = no_reg;
+if (cached_instance != no_reg && candidates.has(cached_instance)) {
+reg = cached_instance;
+cached_instance = no_reg;
+} else {
+DCHECK(candidates.has(cached_mem_start));
+reg = cached_mem_start;
+cached_mem_start = no_reg;
+}
+LiftoffRegister ret{reg};
DCHECK_EQ(1, register_use_count[ret.liftoff_code()]);
register_use_count[ret.liftoff_code()] = 0;
used_registers.clear(ret);
-cached_instance = no_reg;
return ret;
}
-void SetInstanceCacheRegister(Register reg) {
-DCHECK_EQ(no_reg, cached_instance);
-cached_instance = reg;
+void SetCacheRegister(Register* cache, Register reg) {
+DCHECK_EQ(no_reg, *cache);
+*cache = reg;
int liftoff_code = LiftoffRegister{reg}.liftoff_code();
DCHECK_EQ(0, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 1;
used_registers.set(reg);
}
+void SetInstanceCacheRegister(Register reg) {
+SetCacheRegister(&cached_instance, reg);
+}
+void SetMemStartCacheRegister(Register reg) {
+SetCacheRegister(&cached_mem_start, reg);
+}
Register TrySetCachedInstanceRegister(LiftoffRegList pinned) {
DCHECK_EQ(no_reg, cached_instance);
LiftoffRegList available_regs =
@@ -290,13 +307,24 @@ class LiftoffAssembler : public TurboAssembler {
return new_cache_reg;
}
-void ClearCachedInstanceRegister() {
-if (cached_instance == no_reg) return;
-int liftoff_code = LiftoffRegister{cached_instance}.liftoff_code();
+void ClearCacheRegister(Register* cache) {
+if (*cache == no_reg) return;
+int liftoff_code = LiftoffRegister{*cache}.liftoff_code();
DCHECK_EQ(1, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 0;
-used_registers.clear(cached_instance);
-cached_instance = no_reg;
+used_registers.clear(*cache);
+*cache = no_reg;
}
+void ClearCachedInstanceRegister() { ClearCacheRegister(&cached_instance); }
+void ClearCachedMemStartRegister() {
+ClearCacheRegister(&cached_mem_start);
+}
+void ClearAllCacheRegisters() {
+ClearCacheRegister(&cached_instance);
+ClearCacheRegister(&cached_mem_start);
+}
void inc_used(LiftoffRegister reg) {
@@ -551,6 +579,8 @@ class LiftoffAssembler : public TurboAssembler {
if (cache_state_.is_free(r)) continue;
if (r.is_gp() && cache_state_.cached_instance == r.gp()) {
cache_state_.ClearCachedInstanceRegister();
+} else if (r.is_gp() && cache_state_.cached_mem_start == r.gp()) {
+cache_state_.ClearCachedMemStartRegister();
} else {
SpillRegister(r);
}
@@ -2819,6 +2819,17 @@ class LiftoffCompiler {
return true;
}
+Register GetMemoryStart(LiftoffRegList pinned) {
+Register memory_start = __ cache_state()->cached_mem_start;
+if (memory_start == no_reg) {
+memory_start = __ GetUnusedRegister(kGpReg, pinned).gp();
+LOAD_INSTANCE_FIELD(memory_start, MemoryStart, kSystemPointerSize,
+pinned);
+__ cache_state()->SetMemStartCacheRegister(memory_start);
+}
+return memory_start;
+}
void LoadMem(FullDecoder* decoder, LoadType type,
const MemoryAccessImmediate<validate>& imm,
const Value& index_val, Value* result) {
@@ -2837,8 +2848,7 @@
__ cache_state()->stack_state.pop_back();
DEBUG_CODE_COMMENT("load from memory (constant offset)");
LiftoffRegList pinned;
-Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
+Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
__ Load(value, mem, no_reg, offset, type, pinned, nullptr, true,
i64_offset);
@@ -2855,8 +2865,7 @@
// Load the memory start address only now to reduce register pressure
// (important on ia32).
-Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
+Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
uint32_t protected_load_pc = 0;
@@ -2899,8 +2908,7 @@
LiftoffRegList pinned = LiftoffRegList::ForRegs(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("load with transformation");
-Register addr = __ GetUnusedRegister(kGpReg, pinned).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = GetMemoryStart(pinned);
LiftoffRegister value = __ GetUnusedRegister(reg_class_for(kS128), {});
uint32_t protected_load_pc = 0;
__ LoadTransform(value, addr, index, offset, type, transform,
@@ -2940,8 +2948,7 @@
pinned.set(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("load lane");
-Register addr = __ GetUnusedRegister(kGpReg, pinned).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = GetMemoryStart(pinned);
LiftoffRegister result = __ GetUnusedRegister(reg_class_for(kS128), {});
uint32_t protected_load_pc = 0;
@@ -2976,8 +2983,7 @@
if (IndexStaticallyInBounds(index_slot, type.size(), &offset)) {
__ cache_state()->stack_state.pop_back();
DEBUG_CODE_COMMENT("store to memory (constant offset)");
-Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
+Register mem = pinned.set(GetMemoryStart(pinned));
__ Store(mem, no_reg, offset, value, type, pinned, nullptr, true);
} else {
LiftoffRegister full_index = __ PopToRegister(pinned);
@@ -2991,8 +2997,7 @@
uint32_t protected_store_pc = 0;
// Load the memory start address only now to reduce register pressure
// (important on ia32).
-Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
+Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegList outer_pinned;
if (V8_UNLIKELY(FLAG_trace_wasm_memory)) outer_pinned.set(index);
__ Store(mem, index, offset, value, type, outer_pinned,
@@ -3024,8 +3029,7 @@
pinned.set(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("store lane to memory");
-Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = pinned.set(GetMemoryStart(pinned));
uint32_t protected_store_pc = 0;
__ StoreLane(addr, index, offset, value, type, lane, &protected_store_pc);
if (env_->use_trap_handler) {
@@ -4274,8 +4278,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("atomic store to memory");
-Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = pinned.set(GetMemoryStart(pinned));
LiftoffRegList outer_pinned;
if (V8_UNLIKELY(FLAG_trace_wasm_memory)) outer_pinned.set(index);
__ AtomicStore(addr, index, offset, value, type, outer_pinned);
@@ -4298,8 +4301,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("atomic load from memory");
-Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = pinned.set(GetMemoryStart(pinned));
RegClass rc = reg_class_for(kind);
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
__ AtomicLoad(value, addr, index, offset, type, pinned);
@@ -4346,8 +4348,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
-Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = pinned.set(GetMemoryStart(pinned));
(asm_.*emit_fn)(addr, index, offset, value, result, type);
__ PushRegister(result_kind, result);
@@ -4403,8 +4404,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
-Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
-LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
+Register addr = pinned.set(GetMemoryStart(pinned));
LiftoffRegister result =
pinned.set(__ GetUnusedRegister(reg_class_for(result_kind), pinned));