Commit ddaa1f0a authored by Peter Marshall, committed by Commit Bot

Reland "[cpu-profiler] Fix stack iterability for fast C calls with no exit frame"

This is a reland of d5f4a33e

Original change's description:
> [cpu-profiler] Fix stack iterability for fast C calls with no exit frame
>
> Before fast C calls, store the current FP and PC on the isolate. When
> iterating frames in SafeStackFrameIterator, check if these fields are
> set and start iterating at the calling frame's FP instead of the current
> FP, which will be in C++ code. We need to do this because c_entry_fp is
> not set on the Isolate for Fast-C-Calls because we don't build an exit
> frame.
>
> This change makes stack samples that occur within 'Fast-C-Calls'
> iterable, meaning we can properly attribute ticks within the JS caller.
>
> Fast-C-Calls can't call back into JS code, so we can only ever have one
> such call on the stack at a time, allowing us to store the FP on the
> isolate rather than the stack.
>
> TBR=v8-mips-ports@googlegroups.com
>
> Bug: v8:8464, v8:7202
> Change-Id: I7bf39eba779dad34754d5759d741c421b362a406
> Reviewed-on: https://chromium-review.googlesource.com/c/1340241
> Commit-Queue: Peter Marshall <petermarshall@chromium.org>
> Reviewed-by: Jakob Gruber <jgruber@chromium.org>
> Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
> Reviewed-by: Alexei Filippov <alph@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#57896}

TBR=v8-mips-ports@googlegroups.com
TBR=jgruber@chromium.org

Bug: v8:8464, v8:7202
Change-Id: I5f37ded4ea572e8e9890ba186aa3d74a0dfc1274
Reviewed-on: https://chromium-review.googlesource.com/c/1354042
Reviewed-by: Peter Marshall <petermarshall@chromium.org>
Commit-Queue: Peter Marshall <petermarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#57912}
parent 781789c0
......@@ -2344,10 +2344,37 @@ void TurboAssembler::CallCFunctionHelper(Register function,
}
#endif
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Register scratch = r4;
Push(scratch);
Move(scratch, ExternalReference::fast_c_call_caller_pc_address(isolate()));
str(pc, MemOperand(scratch));
Move(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
str(fp, MemOperand(scratch));
Pop(scratch);
}
// Just call directly. The function called cannot cause a GC, or
// allow preemption, so the return address in the link register
// stays correct.
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
Register scratch1 = r4;
Register scratch2 = r5;
Push(scratch1);
Push(scratch2);
Move(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
mov(scratch2, Operand::Zero());
str(scratch2, MemOperand(scratch1));
Pop(scratch2);
Pop(scratch1);
}
int stack_passed_arguments = CalculateStackPassedWords(
num_reg_arguments, num_double_arguments);
if (ActivationFrameAlignment() > kPointerSize) {
......
......@@ -1827,10 +1827,38 @@ void TurboAssembler::CallCFunction(Register function, int num_of_reg_args,
DCHECK_LE(num_of_double_args + num_of_reg_args, 2);
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Register scratch1 = x4;
Register scratch2 = x5;
Push(scratch1, scratch2);
Label get_pc;
Bind(&get_pc);
Adr(scratch2, &get_pc);
Mov(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
Str(scratch2, MemOperand(scratch1));
Mov(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Str(fp, MemOperand(scratch1));
Pop(scratch2, scratch1);
}
// Call directly. The function called cannot cause a GC, or allow preemption,
// so the return address in the link register stays correct.
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
Register scratch = x4;
Push(scratch, xzr);
Mov(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Str(xzr, MemOperand(scratch));
Pop(xzr, scratch);
}
if (num_of_reg_args > kRegisterPassedArguments) {
// Drop the register passed arguments.
int claim_slots = RoundUp(num_of_reg_args - kRegisterPassedArguments, 2);
......
......@@ -832,6 +832,18 @@ ExternalReference ExternalReference::wasm_thread_in_wasm_flag_address_address(
&isolate->thread_local_top()->thread_in_wasm_flag_address_));
}
// Builds an external reference to the IsolateData slot that holds the frame
// pointer saved for a fast C call, so that generated code can store to it.
ExternalReference ExternalReference::fast_c_call_caller_fp_address(
    Isolate* isolate) {
  auto* caller_fp_slot =
      isolate->isolate_data()->fast_c_call_caller_fp_address();
  return ExternalReference(caller_fp_slot);
}
// Builds an external reference to the IsolateData slot that holds the program
// counter saved for a fast C call, so that generated code can store to it.
ExternalReference ExternalReference::fast_c_call_caller_pc_address(
    Isolate* isolate) {
  auto* caller_pc_slot =
      isolate->isolate_data()->fast_c_call_caller_pc_address();
  return ExternalReference(caller_pc_slot);
}
ExternalReference ExternalReference::fixed_typed_array_base_data_offset() {
return ExternalReference(reinterpret_cast<void*>(
FixedTypedArrayBase::kDataOffset - kHeapObjectTag));
......
......@@ -72,6 +72,10 @@ class StatsCounter;
V(debug_restart_fp_address, "Debug::restart_fp_address()") \
V(wasm_thread_in_wasm_flag_address_address, \
"&Isolate::thread_in_wasm_flag_address") \
V(fast_c_call_caller_fp_address, \
"IsolateData::fast_c_call_caller_fp_address") \
V(fast_c_call_caller_pc_address, \
"IsolateData::fast_c_call_caller_pc_address") \
EXTERNAL_REFERENCE_LIST_NON_INTERPRETED_REGEXP(V)
#define EXTERNAL_REFERENCE_LIST(V) \
......
......@@ -226,7 +226,24 @@ SafeStackFrameIterator::SafeStackFrameIterator(
StackFrame::Type type;
ThreadLocalTop* top = isolate->thread_local_top();
bool advance_frame = true;
if (IsValidTop(top)) {
Address fast_c_fp = isolate->isolate_data()->fast_c_call_caller_fp();
// 'Fast C calls' are a special type of C call where we call directly from JS
// to C without an exit frame in between. The CEntryStub is responsible for
// setting Isolate::c_entry_fp, meaning that it won't be set for fast C calls.
// To keep the stack iterable, we store the FP and PC of the caller of the
// fast C call on the isolate. This is guaranteed to be the topmost JS frame,
// because fast C calls cannot call back into JS. We start iterating the stack
// from this topmost JS frame.
if (fast_c_fp) {
DCHECK_NE(kNullAddress, isolate->isolate_data()->fast_c_call_caller_pc());
type = StackFrame::Type::OPTIMIZED;
top_frame_type_ = type;
state.fp = fast_c_fp;
state.sp = sp;
state.pc_address = isolate->isolate_data()->fast_c_call_caller_pc_address();
advance_frame = false;
} else if (IsValidTop(top)) {
type = ExitFrame::GetStateForFramePointer(Isolate::c_entry_fp(top), &state);
top_frame_type_ = type;
} else if (IsValidStackAddress(fp)) {
......
......@@ -1830,7 +1830,39 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
CheckStackAlignment();
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
// Get the current PC via call, pop. This gets the return address pushed to
// the stack by call.
Label get_pc;
call(&get_pc);
bind(&get_pc);
// Find two caller-saved scratch registers.
Register scratch1 = eax;
Register scratch2 = ecx;
if (function == eax) scratch1 = edx;
if (function == ecx) scratch2 = edx;
pop(scratch1);
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_pc_address(isolate()),
scratch2),
scratch1);
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate()),
scratch2),
ebp);
}
call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
mov(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate()), edx),
Immediate(0));
}
if (base::OS::ActivationFrameAlignment() != 0) {
mov(esp, Operand(esp, num_arguments * kPointerSize));
} else {
......
......@@ -71,6 +71,12 @@ class IsolateData final {
return kVirtualCallTargetRegisterOffset - kIsolateRootBias;
}
// The FP and PC that are saved right before TurboAssembler::CallCFunction.
// The *_address() accessors return pointers to the backing fields; these are
// wrapped in ExternalReferences so that generated code can store to the slots
// directly.
Address* fast_c_call_caller_fp_address() { return &fast_c_call_caller_fp_; }
Address* fast_c_call_caller_pc_address() { return &fast_c_call_caller_pc_; }
// Current values of the saved caller FP and PC. Generated code zeroes the FP
// slot again once the C call returns but leaves the PC slot untouched, so only
// a non-zero FP indicates that a fast C call is in progress.
Address fast_c_call_caller_fp() { return fast_c_call_caller_fp_; }
Address fast_c_call_caller_pc() { return fast_c_call_caller_pc_; }
// Returns true if this address points to data stored in this instance.
// If it's the case then the value can be accessed indirectly through the
// root register.
......@@ -100,6 +106,8 @@ class IsolateData final {
V(kExternalReferenceTableOffset, ExternalReferenceTable::SizeInBytes()) \
V(kBuiltinsTableOffset, Builtins::builtin_count* kPointerSize) \
V(kVirtualCallTargetRegisterOffset, kPointerSize) \
V(kFastCCallCallerFPOffset, kPointerSize) \
V(kFastCCallCallerPCOffset, kPointerSize) \
/* This padding aligns IsolateData size by 8 bytes. */ \
V(kPaddingOffset, \
8 + RoundUp<8>(static_cast<int>(kPaddingOffset)) - kPaddingOffset) \
......@@ -138,6 +146,13 @@ class IsolateData final {
// ia32 (otherwise the arguments adaptor call runs out of registers).
void* virtual_call_target_register_ = nullptr;
// Stores the state of the caller for TurboAssembler::CallCFunction so that
// the sampling CPU profiler can iterate the stack during such calls. These
// are stored on IsolateData so that they can be stored to with only one move
// instruction in compiled code.
Address fast_c_call_caller_fp_ = kNullAddress;
Address fast_c_call_caller_pc_ = kNullAddress;
// Ensure the size is 8-byte aligned in order to make alignment of the field
// following the IsolateData field predictable. This solves the issue with
// C++ compilers for 32-bit platforms which are not consistent at aligning
......@@ -177,6 +192,10 @@ void IsolateData::AssertPredictableLayout() {
kExternalMemoryLlimitOffset);
STATIC_ASSERT(offsetof(IsolateData, external_memory_at_last_mark_compact_) ==
kExternalMemoryAtLastMarkCompactOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_fp_) ==
kFastCCallCallerFPOffset);
STATIC_ASSERT(offsetof(IsolateData, fast_c_call_caller_pc_) ==
kFastCCallCallerPCOffset);
STATIC_ASSERT(sizeof(IsolateData) == IsolateData::kSize);
}
......
......@@ -5399,7 +5399,38 @@ void TurboAssembler::CallCFunctionHelper(Register function_base,
function_offset = 0;
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
UseScratchRegisterScope temps(this);
Register scratch1 = temps.Acquire();
// 't' registers are caller-saved so this is safe as a scratch register.
Register scratch2 = t5;
DCHECK(!AreAliased(scratch1, scratch2, function_base));
Label get_pc;
mov(scratch1, ra);
Call(&get_pc);
bind(&get_pc);
mov(scratch2, ra);
mov(ra, scratch1);
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
sw(scratch2, MemOperand(scratch1));
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
sw(fp, MemOperand(scratch1));
}
Call(function_base, function_offset);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
UseScratchRegisterScope temps(this);
Register scratch = temps.Acquire();
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
sw(zero_reg, MemOperand(scratch));
}
}
int stack_passed_arguments = CalculateStackPassedWords(
......
......@@ -5761,7 +5761,38 @@ void TurboAssembler::CallCFunctionHelper(Register function,
function = t9;
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
UseScratchRegisterScope temps(this);
Register scratch1 = temps.Acquire();
// 't' registers are caller-saved so this is safe as a scratch register.
Register scratch2 = t2;
DCHECK(!AreAliased(scratch1, scratch2, function));
Label get_pc;
mov(scratch1, ra);
Call(&get_pc);
bind(&get_pc);
mov(scratch2, ra);
mov(ra, scratch1);
li(scratch1, ExternalReference::fast_c_call_caller_pc_address(isolate()));
Sd(scratch2, MemOperand(scratch1));
li(scratch1, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Sd(fp, MemOperand(scratch1));
}
Call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
UseScratchRegisterScope temps(this);
Register scratch = temps.Acquire();
li(scratch, ExternalReference::fast_c_call_caller_fp_address(isolate()));
Sd(zero_reg, MemOperand(scratch));
}
}
int stack_passed_arguments = CalculateStackPassedWords(
......
......@@ -2652,7 +2652,30 @@ void TurboAssembler::CallCFunction(Register function, int num_arguments) {
CheckStackAlignment();
}
// Save the frame pointer and PC so that the stack layout remains iterable,
// even without an ExitFrame which normally exists between JS and C frames.
if (isolate() != nullptr) {
Label get_pc;
DCHECK(!AreAliased(kScratchRegister, function));
leaq(kScratchRegister, Operand(&get_pc, 0));
bind(&get_pc);
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_pc_address(isolate())),
kScratchRegister);
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate())),
rbp);
}
call(function);
if (isolate() != nullptr) {
// We don't unset the PC; the FP is the source of truth.
movp(ExternalReferenceAsOperand(
ExternalReference::fast_c_call_caller_fp_address(isolate())),
Immediate(0));
}
DCHECK_NE(base::OS::ActivationFrameAlignment(), 0);
DCHECK_GE(num_arguments, 0);
int argument_slots_on_stack =
......
......@@ -93,9 +93,6 @@
'test-cpu-profiler/TracingCpuProfiler': [SKIP],
'test-sampler/LibSamplerCollectSample': [SKIP],
# BUG(7202). The test is flaky.
'test-cpu-profiler/NativeFrameStackTrace': [SKIP],
# BUG(7054)
'test-cpu-profiler/StaticCollectSampleAPI': [SKIP],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment