Commit e5630ea9 authored by jgruber, committed by Commit Bot

[builtins,x64] pc-relative builtin-to-builtin calls

This addresses one of the major remaining slowdowns with embedded
builtins on x64.

When generating code for a call to a builtin callee from a builtin
caller, we'd look up the Code target object from the builtins constant
list, calculate the address of its first instruction, and jump to it.
Note that for embedded builtin callees, the Code object is itself only
a trampoline to the off-heap code and thus an additional indirection.
An example of the call sequence in pseudo-asm:

// Load from the constants list.
mov reg, [kRootPointer, kBuiltinsConstantListOffset]
mov reg, [reg, offset_of_the_code_constant]
// Calculate first instruction and call it.
add reg, Code::kHeaderOffset
call reg
// The trampoline forwards to the off-heap area.
mov kOffHeapTrampolineRegister, <off-heap instruction_start>
jmp kOffHeapTrampolineRegister

This CL changes calls to embedded builtin targets to use pc-relative
addressing. This reduces the above instruction sequence to:

call <pc-relative offset to target instruction_start>

Embedded-to-embedded calls jump directly to the embedded instruction
stream, bypassing the trampoline. Heap-to-embedded calls (and all
calls to heap-builtins) use pc-relative addressing targeting the
on-heap Code object.
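
In the same pseudo-asm notation, the two flavors look roughly as
follows (a sketch, not the literal generated code):

// Embedded-to-embedded: patched at blob creation time to point
// directly at the off-heap instruction start, skipping the trampoline.
call <pc-relative offset to off-heap instruction_start>

// Heap-to-embedded: targets the on-heap Code object, whose trampoline
// then forwards to the off-heap area as before.
call <pc-relative offset to on-heap instruction_start>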

Other relevant platforms (arm, arm64, mips, mips64) do not use pc-relative
calls. For these, we'll need a different solution, e.g. a table of
embedded builtin addresses reachable from the root pointer, similar to
the external reference table.
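
Such a table could reduce these calls to something like the following
(a hypothetical sketch in the pseudo-asm used above; the table name and
layout are assumptions, not part of this CL):

// Load the callee's embedded entry point from a table reachable from
// the root pointer, then make an indirect call.
mov reg, [kRootPointer, kBuiltinEntryTableOffset + builtin_index * kPointerSize]
call reg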

Bug: v8:6666
Change-Id: Ic0317d454e2da37d74eaecebcdfcbc0d5f5041ad
Reviewed-on: https://chromium-review.googlesource.com/1068732
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53349}
parent 2f2bf246
@@ -43,6 +43,7 @@
 #include "src/ostreams.h"
 #include "src/simulator.h"  // For flushing instruction cache.
 #include "src/snapshot/serializer-common.h"
+#include "src/snapshot/snapshot.h"
 
 namespace v8 {
 namespace internal {
@@ -461,6 +462,16 @@ RelocIterator::RelocIterator(const CodeReference code_reference, int mode_mask)
           code_reference.relocation_end(),
           code_reference.relocation_start(), mode_mask) {}
 
+#ifdef V8_EMBEDDED_BUILTINS
+RelocIterator::RelocIterator(EmbeddedData* embedded_data, Code* code,
+                             int mode_mask)
+    : RelocIterator(
+          code, embedded_data->InstructionStartOfBuiltin(code->builtin_index()),
+          code->constant_pool(),
+          code->relocation_start() + code->relocation_size(),
+          code->relocation_start(), mode_mask) {}
+#endif  // V8_EMBEDDED_BUILTINS
+
 RelocIterator::RelocIterator(const CodeDesc& desc, int mode_mask)
     : RelocIterator(nullptr, reinterpret_cast<Address>(desc.buffer), 0,
                     desc.buffer + desc.buffer_size,
......
@@ -60,6 +60,7 @@ class ApiFunction;
 namespace internal {
 
 // Forward declarations.
+class EmbeddedData;
 class InstructionStream;
 class Isolate;
 class SCTableReference;
@@ -679,6 +680,10 @@ class RelocIterator: public Malloced {
   // Relocation information with mode k is included in the
   // iteration iff bit k of mode_mask is set.
   explicit RelocIterator(Code* code, int mode_mask = -1);
+#ifdef V8_EMBEDDED_BUILTINS
+  explicit RelocIterator(EmbeddedData* embedded_data, Code* code,
+                         int mode_mask);
+#endif  // V8_EMBEDDED_BUILTINS
   explicit RelocIterator(const CodeDesc& desc, int mode_mask = -1);
   explicit RelocIterator(const CodeReference code_reference,
                          int mode_mask = -1);
......
@@ -302,19 +302,30 @@ bool Builtins::IsLazy(int index) {
 // static
 bool Builtins::IsIsolateIndependent(int index) {
   DCHECK(IsBuiltinId(index));
-  // TODO(jgruber): There's currently two blockers for moving
-  // InterpreterEntryTrampoline into the binary:
-  // 1. InterpreterEnterBytecode calculates a pointer into the middle of
-  //    InterpreterEntryTrampoline (see interpreter_entry_return_pc_offset).
-  //    When the builtin is embedded, the pointer would need to be calculated
-  //    at an offset from the embedded instruction stream (instead of the
-  //    trampoline code object).
-  // 2. We create distinct copies of the trampoline to make it possible to
-  //    attribute ticks in the interpreter to individual JS functions.
-  //    See https://crrev.com/c/959081 and InstallBytecodeArray. When the
-  //    trampoline is embedded, we need to ensure that CopyCode creates a copy
-  //    of the builtin itself (and not just the trampoline).
-  return index != kInterpreterEntryTrampoline;
+  switch (index) {
+    // TODO(jgruber): There's currently two blockers for moving
+    // InterpreterEntryTrampoline into the binary:
+    // 1. InterpreterEnterBytecode calculates a pointer into the middle of
+    //    InterpreterEntryTrampoline (see interpreter_entry_return_pc_offset).
+    //    When the builtin is embedded, the pointer would need to be calculated
+    //    at an offset from the embedded instruction stream (instead of the
+    //    trampoline code object).
+    // 2. We create distinct copies of the trampoline to make it possible to
+    //    attribute ticks in the interpreter to individual JS functions.
+    //    See https://crrev.com/c/959081 and InstallBytecodeArray. When the
+    //    trampoline is embedded, we need to ensure that CopyCode creates a copy
+    //    of the builtin itself (and not just the trampoline).
+    case kInterpreterEntryTrampoline:
+      return false;
+    // TODO(jgruber): WasmCompileLazy is copied off the heap during module
+    // compilation, which breaks pc-relative calls. It can be marked
+    // isolate-independent once copies are no longer generated for wasm.
+    case kWasmCompileLazy:
+      return false;
+    default:
+      return true;
+  }
+  UNREACHABLE();
 }
 
 #ifdef V8_EMBEDDED_BUILTINS
......
@@ -33,6 +33,10 @@ uint32_t BuiltinsConstantsTableBuilder::AddObject(Handle<Object> object) {
   DCHECK_EQ(isolate_->heap()->empty_fixed_array(),
             isolate_->heap()->builtins_constants_table());
 
+  // Must be on the main thread.
+  DCHECK(ThreadId::Current().Equals(isolate_->thread_id()));
+
+  // Must be serializing.
   DCHECK(isolate_->serializer_enabled());
 #endif
......
@@ -2599,7 +2599,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
       if (IsMaterializableFromRoot(src_object, &index)) {
         __ LoadRoot(dst, index);
       } else {
-        __ Mov(dst, src_object);
+        __ Move(dst, src_object);
       }
     } else {
       __ Mov(dst, g.ToImmediate(source));
......
@@ -260,31 +260,6 @@ void CodeAssembler::GenerateCheckMaybeObjectIsObject(Node* node,
 #endif
 
 #ifdef V8_EMBEDDED_BUILTINS
-TNode<HeapObject> CodeAssembler::LookupConstant(Handle<HeapObject> object) {
-  DCHECK(isolate()->ShouldLoadConstantsFromRootList());
-
-  // Ensure the given object is in the builtins constants table and fetch its
-  // index.
-  BuiltinsConstantsTableBuilder* builder =
-      isolate()->builtins_constants_table_builder();
-  uint32_t index = builder->AddObject(object);
-
-  // The builtins constants table is loaded through the root register on all
-  // supported platforms. This is checked by the
-  // VerifyBuiltinsIsolateIndependence cctest, which disallows embedded objects
-  // in isolate-independent builtins.
-  DCHECK(isolate()->heap()->RootCanBeTreatedAsConstant(
-      Heap::kBuiltinsConstantsTableRootIndex));
-  TNode<FixedArray> builtins_constants_table = UncheckedCast<FixedArray>(
-      LoadRoot(Heap::kBuiltinsConstantsTableRootIndex));
-
-  // Generate the lookup.
-  const int32_t header_size = FixedArray::kHeaderSize - kHeapObjectTag;
-  TNode<IntPtrT> offset = IntPtrConstant(header_size + kPointerSize * index);
-  return UncheckedCast<HeapObject>(
-      Load(MachineType::AnyTagged(), builtins_constants_table, offset));
-}
-
 // External references are stored in the external reference table.
 TNode<ExternalReference> CodeAssembler::LookupExternalReference(
     ExternalReference reference) {
@@ -349,16 +324,6 @@ TNode<Smi> CodeAssembler::SmiConstant(int value) {
 
 TNode<HeapObject> CodeAssembler::UntypedHeapConstant(
     Handle<HeapObject> object) {
-#ifdef V8_EMBEDDED_BUILTINS
-  // Root constants are simply loaded from the root list, while non-root
-  // constants must be looked up from the builtins constants table.
-  if (isolate()->ShouldLoadConstantsFromRootList()) {
-    Heap::RootListIndex root_index;
-    if (!isolate()->heap()->IsRootHandle(object, &root_index)) {
-      return LookupConstant(object);
-    }
-  }
-#endif  // V8_EMBEDDED_BUILTINS
   return UncheckedCast<HeapObject>(raw_assembler()->HeapConstant(object));
 }
......
@@ -691,7 +691,6 @@ class V8_EXPORT_PRIVATE CodeAssembler {
 #endif
 
 #ifdef V8_EMBEDDED_BUILTINS
-  TNode<HeapObject> LookupConstant(Handle<HeapObject> object);
   TNode<ExternalReference> LookupExternalReference(ExternalReference reference);
 #endif
......
@@ -736,7 +736,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       if (HasImmediateInput(instr, 0)) {
         Handle<Code> code = i.InputCode(0);
-        __ jmp(code, RelocInfo::CODE_TARGET);
+        __ Jump(code, RelocInfo::CODE_TARGET);
       } else {
         Register reg = i.InputRegister(0);
         __ addp(reg, Immediate(Code::kHeaderSize - kHeapObjectTag));
......
@@ -14250,7 +14250,7 @@ const char* AbstractCode::Kind2String(Kind kind) {
 }
 
 #ifdef V8_EMBEDDED_BUILTINS
-bool Code::IsProcessIndependent() {
+bool Code::IsProcessIndependent(Isolate* isolate) {
   constexpr int all_real_modes_mask =
       (1 << (RelocInfo::LAST_REAL_RELOC_MODE + 1)) - 1;
   constexpr int mode_mask =
@@ -14273,8 +14273,22 @@ bool Code::IsProcessIndependent() {
                       RelocInfo::ModeMask(RelocInfo::RUNTIME_ENTRY) |
                       RelocInfo::ModeMask(RelocInfo::EXTERNAL_REFERENCE)));
 
-  RelocIterator it(this, mode_mask);
-  return it.done();
+  bool is_process_independent = true;
+  for (RelocIterator it(this, mode_mask); !it.done(); it.next()) {
+    if (RelocInfo::IsCodeTarget(it.rinfo()->rmode())) {
+      // Off-heap code targets are later rewritten as pc-relative jumps to the
+      // off-heap instruction stream and are thus process-independent.
+      Address target_address = it.rinfo()->target_address();
+      if (InstructionStream::PcIsOffHeap(isolate, target_address)) continue;
+
+      Code* target = Code::GetCodeFromTargetAddress(target_address);
+      CHECK(target->IsCode());
+      if (Builtins::IsEmbeddedBuiltin(target)) continue;
+    }
+    is_process_independent = false;
+  }
+
+  return is_process_independent;
 }
 #endif
......
@@ -344,7 +344,7 @@ class Code : public HeapObject {
 #endif  // DEBUG
 
 #ifdef V8_EMBEDDED_BUILTINS
-  bool IsProcessIndependent();
+  bool IsProcessIndependent(Isolate* isolate);
 #endif
 
   inline bool CanContainWeakObjects();
......
@@ -7,6 +7,7 @@
 #include "src/snapshot/snapshot.h"
 
 #include "src/api.h"
+#include "src/assembler-inl.h"
 #include "src/base/platform/platform.h"
 #include "src/callable.h"
 #include "src/interface-descriptors.h"
@@ -325,6 +326,42 @@ bool BuiltinAliasesOffHeapTrampolineRegister(Isolate* isolate, Code* code) {
   return false;
 }
 
+void FinalizeEmbeddedCodeTargets(Isolate* isolate, EmbeddedData* blob) {
+  static const int kRelocMask = RelocInfo::ModeMask(RelocInfo::CODE_TARGET);
+
+  for (int i = 0; i < Builtins::builtin_count; i++) {
+    if (!Builtins::IsIsolateIndependent(i)) continue;
+
+    Code* code = isolate->builtins()->builtin(i);
+    RelocIterator on_heap_it(code, kRelocMask);
+    RelocIterator off_heap_it(blob, code, kRelocMask);
+
+#ifdef V8_TARGET_ARCH_X64
+    while (!on_heap_it.done()) {
+      DCHECK(!off_heap_it.done());
+
+      RelocInfo* rinfo = on_heap_it.rinfo();
+      DCHECK(RelocInfo::IsCodeTarget(rinfo->rmode()));
+      Code* target = Code::GetCodeFromTargetAddress(rinfo->target_address());
+      CHECK(Builtins::IsEmbeddedBuiltin(target));
+
+      off_heap_it.rinfo()->set_target_address(
+          blob->InstructionStartOfBuiltin(target->builtin_index()));
+
+      on_heap_it.next();
+      off_heap_it.next();
+    }
+    DCHECK(off_heap_it.done());
+#else
+    // Architectures other than x64 do not use pc-relative calls and thus must
+    // not contain embedded code targets. Instead, we use an indirection through
+    // the root register.
+    CHECK(on_heap_it.done());
+    CHECK(off_heap_it.done());
+#endif  // V8_TARGET_ARCH_X64
+  }
+}
+
 }  // namespace
 
 // static
@@ -345,7 +382,7 @@ EmbeddedData EmbeddedData::FromIsolate(Isolate* isolate) {
     // Sanity-check that the given builtin is isolate-independent and does not
     // use the trampoline register in its calling convention.
-    if (!code->IsProcessIndependent()) {
+    if (!code->IsProcessIndependent(isolate)) {
       saw_unsafe_builtin = true;
      fprintf(stderr, "%s is not isolate-independent.\n", Builtins::name(i));
     }
@@ -399,6 +436,9 @@ EmbeddedData EmbeddedData::FromIsolate(Isolate* isolate) {
   EmbeddedData d(blob, blob_size);
 
+  // Fix up call targets that point to other embedded builtins.
+  FinalizeEmbeddedCodeTargets(isolate, &d);
+
   // Hash the blob and store the result.
   STATIC_ASSERT(HashSize() == kSizetSize);
   const size_t hash = d.CreateHash();
......
@@ -1527,13 +1527,12 @@ void MacroAssembler::PopQuad(Operand dst) {
   }
 }
 
-void MacroAssembler::Jump(ExternalReference ext) {
+void TurboAssembler::Jump(ExternalReference ext) {
   LoadAddress(kScratchRegister, ext);
   jmp(kScratchRegister);
 }
 
-void MacroAssembler::Jump(Operand op) {
+void TurboAssembler::Jump(Operand op) {
   if (kPointerSize == kInt64Size) {
     jmp(op);
   } else {
@@ -1542,17 +1541,22 @@ void MacroAssembler::Jump(Operand op) {
   }
 }
 
-void MacroAssembler::Jump(Address destination, RelocInfo::Mode rmode) {
+void TurboAssembler::Jump(Address destination, RelocInfo::Mode rmode) {
   Move(kScratchRegister, destination, rmode);
   jmp(kScratchRegister);
 }
 
-void MacroAssembler::Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
-                          Condition cc) {
+void TurboAssembler::Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
+                          Condition cc) {
   // TODO(X64): Inline this
 #ifdef V8_EMBEDDED_BUILTINS
-  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList()) {
+  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList() &&
+      !Builtins::IsEmbeddedBuiltin(*code_object)) {
+    // Calls to embedded targets are initially generated as standard
+    // pc-relative calls below. When creating the embedded blob, call offsets
+    // are patched up to point directly to the off-heap instruction start.
+    // Note: It is safe to dereference code_object above since code generation
+    // for builtins and code stubs happens on the main thread.
     Label skip;
     if (cc != always) {
       if (cc == never) return;
@@ -1608,7 +1612,13 @@ void TurboAssembler::Call(Address destination, RelocInfo::Mode rmode) {
 
 void TurboAssembler::Call(Handle<Code> code_object, RelocInfo::Mode rmode) {
 #ifdef V8_EMBEDDED_BUILTINS
-  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList()) {
+  if (root_array_available_ && isolate()->ShouldLoadConstantsFromRootList() &&
+      !Builtins::IsEmbeddedBuiltin(*code_object)) {
+    // Calls to embedded targets are initially generated as standard
+    // pc-relative calls below. When creating the embedded blob, call offsets
+    // are patched up to point directly to the off-heap instruction start.
+    // Note: It is safe to dereference code_object above since code generation
+    // for builtins and code stubs happens on the main thread.
     LookupConstant(kScratchRegister, code_object);
     leap(kScratchRegister, FieldOperand(kScratchRegister, Code::kHeaderSize));
     call(kScratchRegister);
......
@@ -391,6 +391,12 @@ class TurboAssembler : public Assembler {
   void RetpolineCall(Register reg);
   void RetpolineCall(Address destination, RelocInfo::Mode rmode);
 
+  void Jump(Address destination, RelocInfo::Mode rmode);
+  void Jump(ExternalReference ext);
+  void Jump(Operand op);
+  void Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
+            Condition cc = always);
+
   void RetpolineJump(Register reg);
 
   void CallForDeoptimization(Address target, RelocInfo::Mode rmode) {
@@ -760,14 +766,6 @@ class MacroAssembler : public TurboAssembler {
   void Negps(XMMRegister dst);
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
-
-  // Control Flow
-  void Jump(Address destination, RelocInfo::Mode rmode);
-  void Jump(ExternalReference ext);
-  void Jump(Operand op);
-  void Jump(Handle<Code> code_object, RelocInfo::Mode rmode,
-            Condition cc = always);
 
   // Generates a trampoline to jump to the off-heap instruction stream.
   void JumpToInstructionStream(Address entry);
......