Commit 3ce690ee authored by Jakob Gruber, committed by V8 LUCI CQ

[osr] Basic support for concurrent OSR

This CL adds basic support for concurrent OSR behind the new
--concurrent-osr flag, disabled by default.

When enabled:
1) the first OSR request starts a concurrent OSR compile job.
2) on completion, the code object is inserted into the OSR cache.
3) the next OSR request picks up the cached code (assuming the request
   came from the same JumpLoop bytecode).

We add a new OSR tiering state on the feedback vector to track
whether an OSR compile is currently in progress.

One fundamental issue remains: step 3) above is not guaranteed to
hit the same JumpLoop, and a mismatch means the OSR'd code cannot
be installed. This will be addressed in a followup by targeting
specific bytecode offsets for the install request.
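
As an illustration of the flow above, here is a minimal, self-contained C++
sketch (all types and helpers are hypothetical stand-ins, not V8's actual
classes or API): an OSR request either installs code cached by an earlier
request, observes the in-progress marker and keeps running unoptimized code,
or spawns the concurrent compile job whose result is placed into the cache.

  // Hypothetical sketch of the request / marker / cache flow; not V8 code.
  #include <chrono>
  #include <cstdio>
  #include <future>
  #include <optional>
  #include <string>

  enum class OsrState { kNone, kInProgress };

  struct OsrCacheEntry {
    int osr_offset;    // offset of the JumpLoop that requested OSR
    std::string code;  // stands in for the compiled code object
  };

  struct Function {
    OsrState osr_state = OsrState::kNone;    // marker on the feedback vector
    std::optional<OsrCacheEntry> osr_cache;  // OSR cache
    std::future<OsrCacheEntry> pending_job;  // concurrent compile job
  };

  // Stands in for the background Turbofan compile.
  static OsrCacheEntry CompileForOsr(int osr_offset) {
    return {osr_offset, "code@" + std::to_string(osr_offset)};
  }

  // Called from the JumpLoop at `osr_offset`; returns code once available.
  std::optional<std::string> OnOsrRequest(Function& f, int osr_offset) {
    // 2) A finished concurrent job populates the OSR cache.
    if (f.osr_state == OsrState::kInProgress && f.pending_job.valid() &&
        f.pending_job.wait_for(std::chrono::seconds(0)) ==
            std::future_status::ready) {
      f.osr_cache = f.pending_job.get();
      f.osr_state = OsrState::kNone;
    }
    // 3) A later request picks up cached code, but only for the same offset.
    if (f.osr_cache && f.osr_cache->osr_offset == osr_offset) {
      return f.osr_cache->code;
    }
    // A job is still running: keep executing unoptimized code for now.
    if (f.osr_state == OsrState::kInProgress) return std::nullopt;
    // 1) First request: mark in-progress and start the concurrent job.
    f.osr_state = OsrState::kInProgress;
    f.pending_job = std::async(std::launch::async, CompileForOsr, osr_offset);
    return std::nullopt;
  }

  int main() {
    Function f;
    const int kLoopOffset = 42;  // the requesting JumpLoop's bytecode offset
    while (true) {  // simulates the hot loop repeatedly triggering OSR
      if (auto code = OnOsrRequest(f, kLoopOffset)) {
        std::printf("installing %s\n", code->c_str());
        break;
      }
    }
    return 0;
  }

The offset equality check in step 3) is exactly where the JumpLoop-mismatch
caveat described above bites: if the later request comes from a different
loop, the cached code is not installed.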

This change is based on fanchen.kong@intel.com's earlier
change crrev.com/c/3369361, thank you!

Bug: v8:12161
Change-Id: Ib162906dd4b6ba056f62870aea2990f1369df235
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3548820
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Commit-Queue: Jakob Linke <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#79685}
parent dbff2a67
@@ -30,26 +30,20 @@ namespace internal {
 // Forward declarations.
 class AlignedCachedData;
-class AstRawString;
 class BackgroundCompileTask;
 class IsCompiledScope;
-class JavaScriptFrame;
 class OptimizedCompilationInfo;
-class OptimizedCompilationJob;
 class ParseInfo;
-class Parser;
 class RuntimeCallStats;
 class TimedHistogram;
 class TurbofanCompilationJob;
 class UnoptimizedCompilationInfo;
 class UnoptimizedCompilationJob;
+class UnoptimizedFrame;
 class WorkerThreadRuntimeCallStats;
 struct ScriptDetails;
 struct ScriptStreamingData;
-using UnoptimizedCompilationJobList =
-    std::forward_list<std::unique_ptr<UnoptimizedCompilationJob>>;
 
 // The V8 compiler API.
 //
 // This is the central hub for dispatching to the various compilers within V8.
@@ -97,6 +91,13 @@ class V8_EXPORT_PRIVATE Compiler : public AllStatic {
   static void CompileOptimized(Isolate* isolate, Handle<JSFunction> function,
                                ConcurrencyMode mode, CodeKind code_kind);
 
+  // Generate and return optimized code for OSR. The empty handle is returned
+  // either on failure, or after spawning a concurrent OSR task (in which case
+  // a future OSR request will pick up the resulting code object).
+  V8_WARN_UNUSED_RESULT static MaybeHandle<CodeT> CompileOptimizedOSR(
+      Isolate* isolate, Handle<JSFunction> function, BytecodeOffset osr_offset,
+      UnoptimizedFrame* frame, ConcurrencyMode mode);
+
   V8_WARN_UNUSED_RESULT static MaybeHandle<SharedFunctionInfo>
   CompileForLiveEdit(ParseInfo* parse_info, Handle<Script> script,
                      Isolate* isolate);
@@ -112,6 +113,10 @@ class V8_EXPORT_PRIVATE Compiler : public AllStatic {
                                             Isolate* isolate,
                                             ClearExceptionFlag flag);
 
+  // Dispose a job without finalization.
+  static void DisposeTurbofanCompilationJob(TurbofanCompilationJob* job,
+                                            bool restore_function_code);
+
   // Finalize and install Turbofan code from a previously run job.
   static bool FinalizeTurbofanCompilationJob(TurbofanCompilationJob* job,
                                              Isolate* isolate);
@@ -223,20 +228,6 @@ class V8_EXPORT_PRIVATE Compiler : public AllStatic {
   static Handle<SharedFunctionInfo> GetSharedFunctionInfo(FunctionLiteral* node,
                                                           Handle<Script> script,
                                                           IsolateT* isolate);
-
-  // ===========================================================================
-  // The following family of methods provides support for OSR. Code generated
-  // for entry via OSR might not be suitable for normal entry, hence will be
-  // returned directly to the caller.
-  //
-  // Please note this interface is the only part dealing with {Code} objects
-  // directly. Other methods are agnostic to {Code} and can use an interpreter
-  // instead of generating JIT code for a function at all.
-
-  // Generate and return optimized code for OSR, or empty handle on failure.
-  V8_WARN_UNUSED_RESULT static MaybeHandle<CodeT> GetOptimizedCodeForOSR(
-      Isolate* isolate, Handle<JSFunction> function, BytecodeOffset osr_offset,
-      JavaScriptFrame* osr_frame);
 };
 
 // A base class for compilation jobs intended to run concurrent to the main
......
@@ -9,35 +9,20 @@
 #include "src/codegen/optimized-compilation-info.h"
 #include "src/execution/isolate.h"
 #include "src/execution/local-isolate.h"
+#include "src/handles/handles-inl.h"
 #include "src/heap/local-heap.h"
 #include "src/heap/parked-scope.h"
 #include "src/init/v8.h"
 #include "src/logging/counters.h"
 #include "src/logging/log.h"
 #include "src/logging/runtime-call-stats-scope.h"
-#include "src/objects/objects-inl.h"
+#include "src/objects/js-function.h"
 #include "src/tasks/cancelable-task.h"
 #include "src/tracing/trace-event.h"
 
 namespace v8 {
 namespace internal {
 
-namespace {
-
-void DisposeCompilationJob(TurbofanCompilationJob* job,
-                           bool restore_function_code) {
-  if (restore_function_code) {
-    Handle<JSFunction> function = job->compilation_info()->closure();
-    function->set_code(function->shared().GetCode(), kReleaseStore);
-    if (IsInProgress(function->tiering_state())) {
-      function->reset_tiering_state();
-    }
-  }
-  delete job;
-}
-
-}  // namespace
-
 class OptimizingCompileDispatcher::CompileTask : public CancelableTask {
  public:
   explicit CompileTask(Isolate* isolate,
@@ -129,26 +114,27 @@ void OptimizingCompileDispatcher::CompileNext(TurbofanCompilationJob* job,
 void OptimizingCompileDispatcher::FlushOutputQueue(bool restore_function_code) {
   for (;;) {
-    TurbofanCompilationJob* job = nullptr;
+    std::unique_ptr<TurbofanCompilationJob> job;
     {
       base::MutexGuard access_output_queue_(&output_queue_mutex_);
       if (output_queue_.empty()) return;
-      job = output_queue_.front();
+      job.reset(output_queue_.front());
       output_queue_.pop();
     }
 
-    DisposeCompilationJob(job, restore_function_code);
+    Compiler::DisposeTurbofanCompilationJob(job.get(), restore_function_code);
   }
 }
 
 void OptimizingCompileDispatcher::FlushInputQueue() {
   base::MutexGuard access_input_queue_(&input_queue_mutex_);
   while (input_queue_length_ > 0) {
-    TurbofanCompilationJob* job = input_queue_[InputQueueIndex(0)];
+    std::unique_ptr<TurbofanCompilationJob> job(
+        input_queue_[InputQueueIndex(0)]);
     DCHECK_NOT_NULL(job);
     input_queue_shift_ = InputQueueIndex(1);
     input_queue_length_--;
-    DisposeCompilationJob(job, true);
+    Compiler::DisposeTurbofanCompilationJob(job.get(), true);
   }
 }
@@ -196,25 +182,29 @@ void OptimizingCompileDispatcher::InstallOptimizedFunctions() {
   HandleScope handle_scope(isolate_);
 
   for (;;) {
-    TurbofanCompilationJob* job = nullptr;
+    std::unique_ptr<TurbofanCompilationJob> job;
     {
      base::MutexGuard access_output_queue_(&output_queue_mutex_);
       if (output_queue_.empty()) return;
-      job = output_queue_.front();
+      job.reset(output_queue_.front());
       output_queue_.pop();
     }
     OptimizedCompilationInfo* info = job->compilation_info();
     Handle<JSFunction> function(*info->closure(), isolate_);
-    if (function->HasAvailableCodeKind(info->code_kind())) {
+
+    // If another racing task has already finished compiling and installing the
+    // requested code kind on the function, throw out the current job.
+    if (!info->is_osr() && function->HasAvailableCodeKind(info->code_kind())) {
       if (FLAG_trace_concurrent_recompilation) {
         PrintF(" ** Aborting compilation for ");
         function->ShortPrint();
         PrintF(" as it has already been optimized.\n");
       }
-      DisposeCompilationJob(job, false);
-    } else {
-      Compiler::FinalizeTurbofanCompilationJob(job, isolate_);
+      Compiler::DisposeTurbofanCompilationJob(job.get(), false);
+      return;
     }
+
+    Compiler::FinalizeTurbofanCompilationJob(job.get(), isolate_);
   }
 }
......
@@ -275,7 +275,10 @@ void TieringManager::MaybeOptimizeFrame(JSFunction function,
                                         UnoptimizedFrame* frame,
                                         CodeKind code_kind) {
   const TieringState tiering_state = function.feedback_vector().tiering_state();
-  if (V8_UNLIKELY(IsInProgress(tiering_state))) {
+  const TieringState osr_tiering_state =
+      function.feedback_vector().osr_tiering_state();
+  if (V8_UNLIKELY(IsInProgress(tiering_state)) ||
+      V8_UNLIKELY(IsInProgress(osr_tiering_state))) {
     // Note: This effectively disables OSR for the function while it is being
     // compiled.
     TraceInOptimizationQueue(function);
......
@@ -882,6 +882,7 @@ DEFINE_BOOL(trace_turbo_inlining, false, "trace TurboFan inlining")
 DEFINE_BOOL(turbo_inline_array_builtins, true,
             "inline array builtins in TurboFan code")
 DEFINE_BOOL(use_osr, true, "use on-stack replacement")
+DEFINE_BOOL(concurrent_osr, false, "enable concurrent OSR")
 DEFINE_BOOL(trace_osr, false, "trace on-stack replacement")
 DEFINE_BOOL(analyze_environment_liveness, true,
             "analyze liveness of environment slots and zap dead values")
......
@@ -104,7 +104,6 @@ inline constexpr bool CodeKindIsStoredInOptimizedCodeCache(CodeKind kind) {
 }
 
 inline CodeKind CodeKindForTopTier() { return CodeKind::TURBOFAN; }
-inline CodeKind CodeKindForOSR() { return CodeKind::TURBOFAN; }
 
 // The dedicated CodeKindFlag enum represents all code kinds in a format
 // suitable for bit sets.
......
@@ -26,6 +26,7 @@ extern class BytecodeArray extends FixedArrayBase {
   // into other fields.
   osr_urgency: int8;
   bytecode_age: int8;
+  todo_use_me: int16;  // Placeholder for osr bytecode offset bits.
 }
 
 extern class CodeDataContainer extends HeapObject;
@@ -427,9 +427,23 @@ void FeedbackVector::set_tiering_state(TieringState state) {
 void FeedbackVector::reset_flags() {
   set_flags(TieringStateBits::encode(TieringState::kNone) |
+            OsrTieringStateBit::encode(TieringState::kNone) |
             MaybeHasOptimizedCodeBit::encode(false));
 }
 
+TieringState FeedbackVector::osr_tiering_state() {
+  return OsrTieringStateBit::decode(flags());
+}
+
+void FeedbackVector::set_osr_tiering_state(TieringState marker) {
+  DCHECK(marker == TieringState::kNone || marker == TieringState::kInProgress);
+  STATIC_ASSERT(TieringState::kNone <= OsrTieringStateBit::kMax);
+  STATIC_ASSERT(TieringState::kInProgress <= OsrTieringStateBit::kMax);
+  int32_t state = flags();
+  state = OsrTieringStateBit::update(state, marker);
+  set_flags(state);
+}
+
 void FeedbackVector::EvictOptimizedCodeMarkedForDeoptimization(
     SharedFunctionInfo shared, const char* reason) {
   MaybeObject slot = maybe_optimized_code(kAcquireLoad);
......
@@ -234,11 +234,13 @@ class FeedbackVector
                                                  const char* reason);
   void ClearOptimizedCode();
 
-  inline bool has_tiering_state() const;
   inline TieringState tiering_state() const;
   void set_tiering_state(TieringState state);
   void reset_tiering_state();
 
+  TieringState osr_tiering_state();
+  void set_osr_tiering_state(TieringState marker);
+
   void reset_flags();
 
   // Conversion from a slot to an integer index to the underlying array.
......
@@ -10,7 +10,9 @@ bitfield struct FeedbackVectorFlags extends uint32 {
   // because they flag may lag behind the actual state of the world (it will be
   // updated in time).
   maybe_has_optimized_code: bool: 1 bit;
-  all_your_bits_are_belong_to_jgruber: uint32: 28 bit;
+  // Just one bit, since only {kNone,kInProgress} are relevant for OSR.
+  osr_tiering_state: TieringState: 1 bit;
+  all_your_bits_are_belong_to_jgruber: uint32: 27 bit;
 }
 
 @generateBodyDescriptor
......
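The "just one bit" comment above relies on kNone and kInProgress being the
only OSR-relevant states, and on their values fitting in a single bit (that is
what the STATIC_ASSERTs against OsrTieringStateBit::kMax in the
feedback-vector changes above check). Here is a small stand-alone sketch of
that packing, with a made-up bit position and helper names; V8's real code
uses its BitField machinery and the Torque-generated flags layout:

  // Hypothetical one-bit field packed into a 32-bit flags word; not V8 code.
  #include <cassert>
  #include <cstdint>

  enum class TieringState : uint32_t { kNone = 0, kInProgress = 1 };

  struct OsrTieringStateBitSketch {
    static constexpr int kShift = 3;  // assumed position inside the flags word
    static constexpr uint32_t kMask = 1u << kShift;
    static TieringState decode(uint32_t flags) {
      return static_cast<TieringState>((flags & kMask) >> kShift);
    }
    static uint32_t update(uint32_t flags, TieringState s) {
      return (flags & ~kMask) | (static_cast<uint32_t>(s) << kShift);
    }
  };

  int main() {
    uint32_t flags = 0;
    flags = OsrTieringStateBitSketch::update(flags, TieringState::kInProgress);
    assert(OsrTieringStateBitSketch::decode(flags) == TieringState::kInProgress);
    flags = OsrTieringStateBitSketch::update(flags, TieringState::kNone);
    assert(OsrTieringStateBitSketch::decode(flags) == TieringState::kNone);
    return 0;
  }

This also mirrors why set_osr_tiering_state above asserts that only kNone and
kInProgress are stored: any other state would not round-trip through one bit.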
@@ -109,12 +109,20 @@ TieringState JSFunction::tiering_state() const {
 void JSFunction::set_tiering_state(TieringState state) {
   DCHECK(has_feedback_vector());
-  DCHECK(ChecksTieringState());
-  DCHECK(!ActiveTierIsTurbofan());
+  DCHECK(IsNone(state) || ChecksTieringState());
   feedback_vector().set_tiering_state(state);
 }
 
+TieringState JSFunction::osr_tiering_state() {
+  DCHECK(has_feedback_vector());
+  return feedback_vector().osr_tiering_state();
+}
+
+void JSFunction::set_osr_tiering_state(TieringState marker) {
+  DCHECK(has_feedback_vector());
+  feedback_vector().set_osr_tiering_state(marker);
+}
+
 bool JSFunction::has_feedback_vector() const {
   return shared().is_compiled() &&
          raw_feedback_cell().value().IsFeedbackVector();
......
@@ -180,6 +180,9 @@ class JSFunction : public TorqueGeneratedJSFunction<
   void MarkForOptimization(Isolate* isolate, CodeKind target_kind,
                            ConcurrencyMode mode);
 
+  inline TieringState osr_tiering_state();
+  inline void set_osr_tiering_state(TieringState marker);
+
   // Sets the interrupt budget based on whether the function has a feedback
   // vector and any optimized code.
   void SetInterruptBudget(Isolate* isolate);
......
@@ -225,64 +225,6 @@ RUNTIME_FUNCTION(Runtime_VerifyType) {
   return *obj;
 }
 
-namespace {
-
-bool IsSuitableForOnStackReplacement(Isolate* isolate,
-                                     Handle<JSFunction> function) {
-  // Don't OSR during serialization.
-  if (isolate->serializer_enabled()) return false;
-  // Keep track of whether we've succeeded in optimizing.
-  if (function->shared().optimization_disabled()) return false;
-  // TODO(chromium:1031479): Currently, OSR triggering mechanism is tied to the
-  // bytecode array. So, it might be possible to mark closure in one native
-  // context and optimize a closure from a different native context. So check if
-  // there is a feedback vector before OSRing. We don't expect this to happen
-  // often.
-  if (!function->has_feedback_vector()) return false;
-  // If we are trying to do OSR when there are already optimized
-  // activations of the function, it means (a) the function is directly or
-  // indirectly recursive and (b) an optimized invocation has been
-  // deoptimized so that we are currently in an unoptimized activation.
-  // Check for optimized activations of this function.
-  for (JavaScriptFrameIterator it(isolate); !it.done(); it.Advance()) {
-    JavaScriptFrame* frame = it.frame();
-    if (frame->is_optimized() && frame->function() == *function) return false;
-  }
-  return true;
-}
-
-BytecodeOffset DetermineEntryAndDisarmOSRForUnoptimized(
-    JavaScriptFrame* js_frame) {
-  UnoptimizedFrame* frame = reinterpret_cast<UnoptimizedFrame*>(js_frame);
-
-  // Note that the bytecode array active on the stack might be different from
-  // the one installed on the function (e.g. patched by debugger). This however
-  // is fine because we guarantee the layout to be in sync, hence any
-  // BytecodeOffset representing the entry point will be valid for any copy of
-  // the bytecode.
-  Handle<BytecodeArray> bytecode(frame->GetBytecodeArray(), frame->isolate());
-
-  DCHECK_IMPLIES(frame->is_interpreted(),
-                 frame->LookupCode().is_interpreter_trampoline_builtin());
-  DCHECK_IMPLIES(frame->is_baseline(),
-                 frame->LookupCode().kind() == CodeKind::BASELINE);
-  DCHECK(frame->is_unoptimized());
-  DCHECK(frame->function().shared().HasBytecodeArray());
-
-  // Disarm all back edges.
-  bytecode->reset_osr_urgency();
-
-  // Return a BytecodeOffset representing the bytecode offset of the back
-  // branch.
-  return BytecodeOffset(frame->GetBytecodeOffset());
-}
-
-}  // namespace
-
 RUNTIME_FUNCTION(Runtime_CompileForOnStackReplacement) {
   HandleScope handle_scope(isolate);
   DCHECK_EQ(0, args.length());
@@ -290,37 +232,33 @@ RUNTIME_FUNCTION(Runtime_CompileForOnStackReplacement) {
   // Determine the frame that triggered the OSR request.
   JavaScriptFrameIterator it(isolate);
-  JavaScriptFrame* frame = it.frame();
-  DCHECK(frame->is_unoptimized());
+  UnoptimizedFrame* frame = UnoptimizedFrame::cast(it.frame());
 
-  // Determine the entry point for which this OSR request has been fired and
-  // also disarm all back edges in the calling code to stop new requests.
-  BytecodeOffset osr_offset = DetermineEntryAndDisarmOSRForUnoptimized(frame);
+  DCHECK_IMPLIES(frame->is_interpreted(),
+                 frame->LookupCode().is_interpreter_trampoline_builtin());
+  DCHECK_IMPLIES(frame->is_baseline(),
+                 frame->LookupCode().kind() == CodeKind::BASELINE);
+  DCHECK(frame->function().shared().HasBytecodeArray());
+
+  // Determine the entry point for which this OSR request has been fired.
+  BytecodeOffset osr_offset = BytecodeOffset(frame->GetBytecodeOffset());
   DCHECK(!osr_offset.IsNone());
 
-  MaybeHandle<CodeT> maybe_result;
+  // TODO(v8:12161): If cache exists with different offset: kSynchronous.
+  ConcurrencyMode mode =
+      isolate->concurrent_recompilation_enabled() && FLAG_concurrent_osr
+          ? ConcurrencyMode::kConcurrent
+          : ConcurrencyMode::kSynchronous;
+
   Handle<JSFunction> function(frame->function(), isolate);
-  if (IsSuitableForOnStackReplacement(isolate, function)) {
-    if (FLAG_trace_osr) {
-      CodeTracer::Scope scope(isolate->GetCodeTracer());
-      PrintF(scope.file(), "[OSR - Compiling: ");
-      function->PrintName(scope.file());
-      PrintF(scope.file(), " at OSR bytecode offset %d]\n", osr_offset.ToInt());
-    }
-    maybe_result =
-        Compiler::GetOptimizedCodeForOSR(isolate, function, osr_offset, frame);
-  }
+  MaybeHandle<CodeT> maybe_result =
+      Compiler::CompileOptimizedOSR(isolate, function, osr_offset, frame, mode);
 
   Handle<CodeT> result;
   if (!maybe_result.ToHandle(&result)) {
     // No OSR'd code available.
-    if (FLAG_trace_osr) {
-      CodeTracer::Scope scope(isolate->GetCodeTracer());
-      PrintF(scope.file(), "[OSR - Failed: ");
-      function->PrintName(scope.file());
-      PrintF(scope.file(), " at OSR bytecode offset %d]\n", osr_offset.ToInt());
-    }
+    // TODO(v8:12161): Distinguish between actual failure and scheduling a
+    // concurrent job.
     if (!function->HasAttachedOptimizedCode()) {
       function->set_code(function->shared().GetCode(), kReleaseStore);
     }
@@ -329,7 +267,7 @@ RUNTIME_FUNCTION(Runtime_CompileForOnStackReplacement) {
   }
 
   DCHECK(!result.is_null());
-  DCHECK(result->is_turbofanned());
+  DCHECK(result->is_turbofanned());  // TODO(v8:7700): Support Maglev.
   DCHECK(CodeKindIsOptimizedJSFunction(result->kind()));
 
   DeoptimizationData data =
@@ -346,7 +284,11 @@ RUNTIME_FUNCTION(Runtime_CompileForOnStackReplacement) {
   }
 
   if (function->feedback_vector().invocation_count() <= 1 &&
-      function->tiering_state() != TieringState::kNone) {
+      !IsNone(function->tiering_state()) && V8_LIKELY(!FLAG_always_opt)) {
+    // Note: Why consider FLAG_always_opt? Because it makes invocation_count
+    // unreliable at low counts: the first entry may already be optimized, and
+    // thus won't increment invocation_count.
+    //
     // With lazy feedback allocation we may not have feedback for the
     // initial part of the function that was executed before we allocated a
     // feedback vector. Reset any tiering states for such functions.
......
@@ -521,6 +521,47 @@ RUNTIME_FUNCTION(Runtime_PrepareFunctionForOptimization) {
   return ReadOnlyRoots(isolate).undefined_value();
 }
 
+namespace {
+
+void FinalizeOptimization(Isolate* isolate) {
+  DCHECK(isolate->concurrent_recompilation_enabled());
+  isolate->optimizing_compile_dispatcher()->AwaitCompileTasks();
+  isolate->optimizing_compile_dispatcher()->InstallOptimizedFunctions();
+  isolate->optimizing_compile_dispatcher()->set_finalize(true);
+}
+
+BytecodeOffset OffsetOfNextJumpLoop(Isolate* isolate, UnoptimizedFrame* frame) {
+  Handle<BytecodeArray> bytecode_array(frame->GetBytecodeArray(), isolate);
+  const int current_offset = frame->GetBytecodeOffset();
+
+  interpreter::BytecodeArrayIterator it(bytecode_array, current_offset);
+
+  // First, look for a loop that contains the current bytecode offset.
+  for (; !it.done(); it.Advance()) {
+    if (it.current_bytecode() != interpreter::Bytecode::kJumpLoop) {
+      continue;
+    }
+    if (!base::IsInRange(current_offset, it.GetJumpTargetOffset(),
+                         it.current_offset())) {
+      continue;
+    }
+
+    return BytecodeOffset(it.current_offset());
+  }
+
+  // Fall back to any loop after the current offset.
+  it.SetOffset(current_offset);
+  for (; !it.done(); it.Advance()) {
+    if (it.current_bytecode() == interpreter::Bytecode::kJumpLoop) {
+      return BytecodeOffset(it.current_offset());
+    }
+  }
+
+  return BytecodeOffset::None();
+}
+
+}  // namespace
+
 RUNTIME_FUNCTION(Runtime_OptimizeOsr) {
   HandleScope handle_scope(isolate);
   DCHECK(args.length() == 0 || args.length() == 1);
@@ -540,7 +581,9 @@ RUNTIME_FUNCTION(Runtime_OptimizeOsr) {
   if (!it.done()) function = handle(it.frame()->function(), isolate);
   if (function.is_null()) return CrashUnlessFuzzing(isolate);
 
-  if (!FLAG_opt) return ReadOnlyRoots(isolate).undefined_value();
+  if (V8_UNLIKELY(!FLAG_opt) || V8_UNLIKELY(!FLAG_use_osr)) {
+    return ReadOnlyRoots(isolate).undefined_value();
+  }
 
   if (!function->shared().allows_lazy_compilation()) {
     return CrashUnlessFuzzing(isolate);
@@ -567,6 +610,11 @@ RUNTIME_FUNCTION(Runtime_OptimizeOsr) {
     return ReadOnlyRoots(isolate).undefined_value();
   }
 
+  if (!it.frame()->is_unoptimized()) {
+    // Nothing to be done.
+    return ReadOnlyRoots(isolate).undefined_value();
+  }
+
   // Ensure that the function is marked for non-concurrent optimization, so that
   // subsequent runs don't also optimize.
   if (FLAG_trace_osr) {
@@ -581,8 +629,40 @@ RUNTIME_FUNCTION(Runtime_OptimizeOsr) {
   function->MarkForOptimization(isolate, CodeKind::TURBOFAN,
                                 ConcurrencyMode::kSynchronous);
-  if (it.frame()->is_unoptimized()) {
-    isolate->tiering_manager()->RequestOsrAtNextOpportunity(*function);
+  isolate->tiering_manager()->RequestOsrAtNextOpportunity(*function);
+
+  // If concurrent OSR is enabled, the testing workflow is a bit tricky. We
+  // must guarantee that the next JumpLoop installs the finished OSR'd code
+  // object, but we still want to exercise concurrent code paths. To do so,
+  // we attempt to find the next JumpLoop, start an OSR job for it now, and
+  // immediately force finalization.
+  // If this succeeds and we correctly match up the next JumpLoop, once we
+  // reach the JumpLoop we'll hit the OSR cache and install the generated code.
+  // If not (e.g. because we enter a nested loop first), the next JumpLoop will
+  // see the cached OSR code with a mismatched offset, and trigger
+  // non-concurrent OSR compilation and installation.
+  if (isolate->concurrent_recompilation_enabled() && FLAG_concurrent_osr) {
+    const BytecodeOffset osr_offset =
+        OffsetOfNextJumpLoop(isolate, UnoptimizedFrame::cast(it.frame()));
+    if (osr_offset.IsNone()) {
+      // The loop may have been elided by bytecode generation (e.g. for
+      // patterns such as `do { ... } while (false);`.
+      return ReadOnlyRoots(isolate).undefined_value();
+    }
+
+    // Finalize first to ensure all pending tasks are done (since we can't
+    // queue more than one OSR job for each function).
+    FinalizeOptimization(isolate);
+
+    // Queue the job.
+    auto unused_result = Compiler::CompileOptimizedOSR(
+        isolate, function, osr_offset, UnoptimizedFrame::cast(it.frame()),
+        ConcurrencyMode::kConcurrent);
+    USE(unused_result);
+
+    // Finalize again to finish the queued job. The next call into
+    // CompileForOnStackReplacement will pick up the cached Code object.
+    FinalizeOptimization(isolate);
   }
 
   return ReadOnlyRoots(isolate).undefined_value();
@@ -746,9 +826,7 @@ RUNTIME_FUNCTION(Runtime_WaitForBackgroundOptimization) {
 RUNTIME_FUNCTION(Runtime_FinalizeOptimization) {
   DCHECK_EQ(0, args.length());
   if (isolate->concurrent_recompilation_enabled()) {
-    isolate->optimizing_compile_dispatcher()->AwaitCompileTasks();
-    isolate->optimizing_compile_dispatcher()->InstallOptimizedFunctions();
-    isolate->optimizing_compile_dispatcher()->set_finalize(true);
+    FinalizeOptimization(isolate);
   }
   return ReadOnlyRoots(isolate).undefined_value();
 }
......
@@ -490,10 +490,10 @@ V8_EXPORT_PRIVATE std::ostream& operator<<(std::ostream& os, FeedbackSlot);
 class BytecodeOffset {
  public:
-  explicit BytecodeOffset(int id) : id_(id) {}
+  explicit constexpr BytecodeOffset(int id) : id_(id) {}
   int ToInt() const { return id_; }
 
-  static BytecodeOffset None() { return BytecodeOffset(kNoneId); }
+  static constexpr BytecodeOffset None() { return BytecodeOffset(kNoneId); }
 
   // Special bailout id support for deopting into the {JSConstructStub} stub.
   // The following hard-coded deoptimization points are supported by the stub:
@@ -506,7 +506,7 @@ class BytecodeOffset {
            id_ == ConstructStubInvoke().ToInt();
   }
 
-  bool IsNone() const { return id_ == kNoneId; }
+  constexpr bool IsNone() const { return id_ == kNoneId; }
   bool operator==(const BytecodeOffset& other) const {
     return id_ == other.id_;
   }
......
 // Copyright 2016 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
-// Flags: --allow-natives-syntax --opt
+//
+// Flags: --allow-natives-syntax --opt --no-use-osr
+//
+// Why not OSR? Because it may inline the `store` function into OSR'd code
+// below before it has a chance to be optimized, making
+// `assertOptimized(store)` fail.
 
 function load(o) {
   return o.x;
......