Commit 40ebe845, authored by Manos Koukoutos, committed by Commit Bot

[wasm][turbofan] Implement loop unrolling for wasm

Design doc: https://docs.google.com/document/d/1AsUCqslMUB6fLdnGq0ZoPk2kn50jIJAWAL77lKXXP5g/

Currently, wasm loop unrolling is disabled by default. We intend to
further investigate its compilation time cost and running time benefits
before enabling it.

Additional changes:
- Introduce LoopFinder::FindUnnestedLoopFromHeader() as a lightweight
  loop analysis.
- Move EliminateLoopExit into LoopPeeling and expose it.
- Introduce the loop_infos_ field into WasmGraphBuildingInterface and fill
  it up in Loop().
- Break after encountering the first loop in BuildNestedLoopExits.
- Introduce struct WasmLoopInfo. A WasmLoopInfo vector is instantiated
  in ExecuteTurbofanWasmCompilation, passed to BuildGraphForWasmFunction
  to be filled up by WasmGraphBuildingInterface, and then passed to
  GenerateCodeForWasmFunction to be used in WasmLoopUnrollingPhase.
- Introduce WasmLoopUnrollingPhase and insert it into the wasm
  compilation pipeline.
- Fix an issue where exception values were not wrapped in
  WasmGraphBuildingInterface.
- Update --wasm-loop-unrolling flag description.

Bug: v8:11298
Change-Id: I4b57cf2ea8520931f60769f843ffd57b3ca6399b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2697349
Commit-Queue: Manos Koukoutos <manoskouk@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Georg Neis <neis@chromium.org>
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73009}
parent 80635217
...@@ -2322,6 +2322,8 @@ v8_compiler_sources = [ ...@@ -2322,6 +2322,8 @@ v8_compiler_sources = [
"src/compiler/loop-analysis.h", "src/compiler/loop-analysis.h",
"src/compiler/loop-peeling.cc", "src/compiler/loop-peeling.cc",
"src/compiler/loop-peeling.h", "src/compiler/loop-peeling.h",
"src/compiler/loop-unrolling.cc",
"src/compiler/loop-unrolling.h",
"src/compiler/loop-variable-optimizer.cc", "src/compiler/loop-variable-optimizer.cc",
"src/compiler/loop-variable-optimizer.h", "src/compiler/loop-variable-optimizer.h",
"src/compiler/machine-graph-verifier.cc", "src/compiler/machine-graph-verifier.cc",
......
...@@ -39,7 +39,6 @@ struct TempLoopInfo { ...@@ -39,7 +39,6 @@ struct TempLoopInfo {
LoopTree::Loop* loop; LoopTree::Loop* loop;
}; };
// Encapsulation of the loop finding algorithm. // Encapsulation of the loop finding algorithm.
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// Conceptually, the contents of a loop are those nodes that are "between" the // Conceptually, the contents of a loop are those nodes that are "between" the
...@@ -54,6 +53,8 @@ struct TempLoopInfo { ...@@ -54,6 +53,8 @@ struct TempLoopInfo {
// 1 bit per loop per node per direction are required during the marking phase. // 1 bit per loop per node per direction are required during the marking phase.
// To handle nested loops correctly, the algorithm must filter some reachability // To handle nested loops correctly, the algorithm must filter some reachability
// marks on edges into/out-of the loop header nodes. // marks on edges into/out-of the loop header nodes.
// Note: this algorithm assumes there are no unreachable loop header nodes
// (including loop phis).
class LoopFinderImpl { class LoopFinderImpl {
public: public:
LoopFinderImpl(Graph* graph, LoopTree* loop_tree, TickCounter* tick_counter, LoopFinderImpl(Graph* graph, LoopTree* loop_tree, TickCounter* tick_counter,
...@@ -542,6 +543,53 @@ LoopTree* LoopFinder::BuildLoopTree(Graph* graph, TickCounter* tick_counter, ...@@ -542,6 +543,53 @@ LoopTree* LoopFinder::BuildLoopTree(Graph* graph, TickCounter* tick_counter,
return loop_tree; return loop_tree;
} }
// Collects the nodes of the loop headed by {loop_header} by following use
// edges starting at the header. The returned set is allocated in {zone} and
// contains the header itself.
//
// Traversal rules:
//  - Terminate nodes are skipped entirely (neither they nor their uses are
//    added).
//  - From a LoopExit, only LoopExitEffect/LoopExitValue uses are followed.
//  - LoopExitEffect/LoopExitValue nodes are added, but their uses are not
//    followed.
//  - For every other node, all uses are followed.
ZoneUnorderedSet<Node*>* LoopFinder::FindUnnestedLoopFromHeader(
    Node* loop_header, Zone* zone) {
  DCHECK_EQ(IrOpcode::kLoop, loop_header->opcode());
  auto* visited = zone->New<ZoneUnorderedSet<Node*>>(zone);
  std::vector<Node*> queue;
  queue.push_back(loop_header);
  while (!queue.empty()) {
    Node* node = queue.back();
    queue.pop_back();
    // Terminate is not part of the loop, and neither are its uses.
    if (node->opcode() == IrOpcode::kTerminate) {
      DCHECK_EQ(node->InputAt(1), loop_header);
      continue;
    }
    // A node is enqueued once for every already-processed input that reaches
    // it before its first pop. Only the first pop needs to scan its uses;
    // later pops would find nothing new, so skip them.
    if (!visited->insert(node).second) continue;
    switch (node->opcode()) {
      case IrOpcode::kLoopExit:
        DCHECK_EQ(node->InputAt(1), loop_header);
        // LoopExitValue/Effect uses are inside the loop. The rest are not.
        for (Node* use : node->uses()) {
          if (use->opcode() == IrOpcode::kLoopExitEffect ||
              use->opcode() == IrOpcode::kLoopExitValue) {
            if (visited->count(use) == 0) queue.push_back(use);
          }
        }
        break;
      case IrOpcode::kLoopExitEffect:
      case IrOpcode::kLoopExitValue:
        DCHECK_EQ(NodeProperties::GetControlInput(node)->InputAt(1),
                  loop_header);
        // All uses are outside the loop, do nothing.
        break;
      default:
        for (Node* use : node->uses()) {
          if (visited->count(use) == 0) queue.push_back(use);
        }
        break;
    }
  }
  return visited;
}
bool LoopFinder::HasMarkedExits(LoopTree* loop_tree, bool LoopFinder::HasMarkedExits(LoopTree* loop_tree,
const LoopTree::Loop* loop) { const LoopTree::Loop* loop) {
// Look for returns and if projections that are outside the loop but whose // Look for returns and if projections that are outside the loop but whose
......
...@@ -178,6 +178,15 @@ class V8_EXPORT_PRIVATE LoopFinder { ...@@ -178,6 +178,15 @@ class V8_EXPORT_PRIVATE LoopFinder {
Zone* temp_zone); Zone* temp_zone);
static bool HasMarkedExits(LoopTree* loop_tree_, const LoopTree::Loop* loop); static bool HasMarkedExits(LoopTree* loop_tree_, const LoopTree::Loop* loop);
// Find all nodes of a loop given its header node. This is much more
// restricted than BuildLoopTree.
// The result is a set allocated in {zone}. It contains {loop_header} and the
// loop's LoopExit, LoopExitEffect and LoopExitValue nodes; Terminate nodes
// are not included.
// Assumptions:
// 1) All loop exits of the loop are marked with LoopExit, LoopExitEffect,
// and LoopExitValue nodes.
// 2) There are no nested loops within this loop.
static ZoneUnorderedSet<Node*>* FindUnnestedLoopFromHeader(Node* loop_header,
Zone* zone);
}; };
// Copies a range of nodes any number of times. // Copies a range of nodes any number of times.
......
...@@ -236,9 +236,7 @@ void LoopPeeler::PeelInnerLoops(LoopTree::Loop* loop) { ...@@ -236,9 +236,7 @@ void LoopPeeler::PeelInnerLoops(LoopTree::Loop* loop) {
Peel(loop); Peel(loop);
} }
namespace { void LoopPeeler::EliminateLoopExit(Node* node) {
void EliminateLoopExit(Node* node) {
DCHECK_EQ(IrOpcode::kLoopExit, node->opcode()); DCHECK_EQ(IrOpcode::kLoopExit, node->opcode());
// The exit markers take the loop exit as input. We iterate over uses // The exit markers take the loop exit as input. We iterate over uses
// and remove all the markers from the graph. // and remove all the markers from the graph.
...@@ -260,8 +258,6 @@ void EliminateLoopExit(Node* node) { ...@@ -260,8 +258,6 @@ void EliminateLoopExit(Node* node) {
node->Kill(); node->Kill();
} }
} // namespace
void LoopPeeler::PeelInnerLoopsOfTree() { void LoopPeeler::PeelInnerLoopsOfTree() {
for (LoopTree::Loop* loop : loop_tree_->outer_loops()) { for (LoopTree::Loop* loop : loop_tree_->outer_loops()) {
PeelInnerLoops(loop); PeelInnerLoops(loop);
......
...@@ -50,6 +50,7 @@ class V8_EXPORT_PRIVATE LoopPeeler { ...@@ -50,6 +50,7 @@ class V8_EXPORT_PRIVATE LoopPeeler {
void PeelInnerLoopsOfTree(); void PeelInnerLoopsOfTree();
static void EliminateLoopExits(Graph* graph, Zone* tmp_zone); static void EliminateLoopExits(Graph* graph, Zone* tmp_zone);
static void EliminateLoopExit(Node* loop);
static const size_t kMaxPeeledNodes = 1000; static const size_t kMaxPeeledNodes = 1000;
private: private:
......
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/compiler/loop-unrolling.h"
#include "src/base/small-vector.h"
#include "src/codegen/tick-counter.h"
#include "src/compiler/common-operator.h"
#include "src/compiler/loop-analysis.h"
#include "src/compiler/loop-peeling.h"
namespace v8 {
namespace internal {
namespace compiler {
// Decides how many additional copies of a loop body should be created.
// The node budget grows linearly with the nesting depth and is divided by the
// loop size, so small and deeply nested loops are unrolled the most. The
// result is capped at a fixed maximum and is 0 when the loop is too large.
// {size} must be non-zero.
// TODO(manoskouk): Investigate how this can be improved.
V8_INLINE uint32_t unrolling_count_heuristic(uint32_t size, uint32_t depth) {
  // Budget (in nodes) granted per nesting level, starting at depth 0.
  static constexpr uint32_t kNodeBudgetPerLevel = 50;
  // Upper bound on the returned count.
  static constexpr uint32_t kUnrollingCountCap = 7;
  const uint32_t budget = (depth + 1) * kNodeBudgetPerLevel;
  const uint32_t uncapped_count = budget / size;
  return uncapped_count < kUnrollingCountCap ? uncapped_count
                                             : kUnrollingCountCap;
}
// Unrolls the loop headed by {loop_node} in place.
//   loop_node: the loop header; must have opcode IrOpcode::kLoop.
//   loop: the set of nodes making up the loop (including its LoopExit,
//     LoopExitEffect and LoopExitValue nodes).
//   depth: nesting depth of the loop, fed into the unrolling heuristic.
//   graph, common: used to create the new nodes and operators.
//   tmp_zone: zone for temporary allocations made during unrolling.
//   source_positions, node_origins: forwarded to the node copier.
// Returns without touching the graph if the header has no back edge or if the
// heuristic yields an unrolling count of 0.
void UnrollLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, uint32_t depth,
Graph* graph, CommonOperatorBuilder* common, Zone* tmp_zone,
SourcePositionTable* source_positions,
NodeOriginTable* node_origins) {
DCHECK_EQ(loop_node->opcode(), IrOpcode::kLoop);
// No back-jump to the loop header means this is not really a loop.
if (loop_node->InputCount() < 2) return;
// Number of additional copies of the loop body to create. A count of 0 means
// the heuristic decided against unrolling this loop.
uint32_t unrolling_count =
unrolling_count_heuristic(static_cast<uint32_t>(loop->size()), depth);
if (unrolling_count == 0) return;
// The original body plus all of its copies.
uint32_t iteration_count = unrolling_count + 1;
uint32_t copied_size = static_cast<uint32_t>(loop->size()) * iteration_count;
NodeVector copies(tmp_zone);
NodeCopier copier(graph, copied_size, &copies, unrolling_count);
{
// NodeRange is built from a pair of pointers, so the set is flattened into a
// contiguous vector first.
std::vector<Node*> loop_nodes(loop->begin(), loop->end());
copier.CopyNodes(
graph, tmp_zone, graph->NewNode(common->Dead()),
NodeRange(loop_nodes.data(), loop_nodes.data() + loop_nodes.size()),
source_positions, node_origins);
}
// COPY(node, n) looks up, through the copier, the copy of {node} with index
// {n} (0 <= n < unrolling_count). FOREACH_COPY_INDEX(i) iterates {i} over all
// copy indices.
#define COPY(node, n) copier.map(node, n)
#define FOREACH_COPY_INDEX(i) for (uint32_t i = 0; i < unrolling_count; i++)
for (Node* node : *loop) {
switch (node->opcode()) {
case IrOpcode::kStackPointerGreaterThan: {
/*** Step 1: Remove stack checks from all but the first iteration of the
loop. ***/
for (Edge edge : node->use_edges()) {
if (edge.from()->opcode() == IrOpcode::kBranch) {
// Input 0 of every copied branch is replaced with the constant 1, so
// only the branch of the original iteration still uses the stack check.
FOREACH_COPY_INDEX(i) {
COPY(edge.from(), i)
->ReplaceInput(0, graph->NewNode(common->Int32Constant(1)));
}
} else if (edge.from()->opcode() == IrOpcode::kEffectPhi) {
// We now need to remove stack check and the related function call
// from the effect chain.
// The effect chain looks like this (* stand for irrelevant nodes):
//
// replacing effect (effect before stack check)
// * * | *
// | | | |
// ( Load )
// * * | *
// | | | |
// ( Load )
// | |
// stack check
// | * | *
// | | | |
// | (call)
// | | *
// | | |
// stack check effect (that we need to replace)
Node* stack_check_effect = edge.from();
DCHECK_EQ(edge.index(), 0);
DCHECK_EQ(stack_check_effect->InputAt(1)->opcode(),
IrOpcode::kCall);
DCHECK_EQ(stack_check_effect->InputAt(1)->InputAt(1), node);
DCHECK_EQ(node->InputAt(1)->opcode(), IrOpcode::kLoad);
DCHECK_EQ(node->InputAt(1)->InputAt(2)->opcode(), IrOpcode::kLoad);
Node* replacing_effect = node->InputAt(1)->InputAt(2)->InputAt(2);
// In every copy, users of the copied effect phi are redirected to the copy
// of the effect that precedes the stack check.
FOREACH_COPY_INDEX(i) {
COPY(stack_check_effect, i)
->ReplaceUses(COPY(replacing_effect, i));
}
}
}
break;
}
case IrOpcode::kLoopExit: {
/*** Step 2: Create merges for loop exits. ***/
if (node->InputAt(1) == loop_node) {
// Create a merge node from all iteration exits.
// Slot 0 holds the original exit, slot i (i >= 1) holds copy i - 1.
Node** merge_inputs = tmp_zone->NewArray<Node*>(iteration_count);
merge_inputs[0] = node;
for (uint32_t i = 1; i < iteration_count; i++) {
merge_inputs[i] = COPY(node, i - 1);
}
Node* merge_node = graph->NewNode(common->Merge(iteration_count),
iteration_count, merge_inputs);
// Replace all uses of the loop exit with the merge node.
for (Edge use_edge : node->use_edges()) {
Node* use = use_edge.from();
if (loop->count(use) == 1) {
// Uses within the loop will be LoopExitEffects and
// LoopExitValues. We need to create a phi from all loop
// iterations. Its merge will be the merge node for LoopExits.
const Operator* phi_operator;
if (use->opcode() == IrOpcode::kLoopExitEffect) {
phi_operator = common->EffectPhi(iteration_count);
} else {
DCHECK(use->opcode() == IrOpcode::kLoopExitValue);
phi_operator = common->Phi(
LoopExitValueRepresentationOf(use->op()), iteration_count);
}
// One input per iteration, followed by the merge node as the last input.
Node** phi_inputs =
tmp_zone->NewArray<Node*>(iteration_count + 1);
phi_inputs[0] = use;
for (uint32_t i = 1; i < iteration_count; i++) {
phi_inputs[i] = COPY(use, i - 1);
}
phi_inputs[iteration_count] = merge_node;
Node* phi =
graph->NewNode(phi_operator, iteration_count + 1, phi_inputs);
use->ReplaceUses(phi);
// Repair phi which we just broke.
phi->ReplaceInput(0, use);
} else if (use != merge_node) {
// For uses outside the loop, simply redirect them to the merge.
use->ReplaceInput(use_edge.index(), merge_node);
}
}
}
break;
}
default:
break;
}
}
/*** Step 3: Rewire the iterations of the loop. Each iteration should flow
into the next one, and the last should flow into the first. ***/
// 3a) Rewire control.
// We start at index=1 assuming that index=0 is the (non-recursive) loop
// entry.
for (int input_index = 1; input_index < loop_node->InputCount();
input_index++) {
// Saved up front because the shifting below overwrites this input of the
// last copy.
Node* last_iteration_input =
COPY(loop_node, unrolling_count - 1)->InputAt(input_index);
for (uint32_t copy_index = unrolling_count - 1; copy_index > 0;
copy_index--) {
COPY(loop_node, copy_index)
->ReplaceInput(input_index,
COPY(loop_node, copy_index - 1)->InputAt(input_index));
}
COPY(loop_node, 0)
->ReplaceInput(input_index, loop_node->InputAt(input_index));
loop_node->ReplaceInput(input_index, last_iteration_input);
}
// The loop of each following iteration will become a merge. We need to remove
// its non-recursive input.
FOREACH_COPY_INDEX(i) {
COPY(loop_node, i)->RemoveInput(0);
NodeProperties::ChangeOp(COPY(loop_node, i),
common->Merge(loop_node->InputCount() - 1));
}
// 3b) Rewire phis and loop exits.
for (Node* use : loop_node->uses()) {
if (NodeProperties::IsPhi(use)) {
// Number of value inputs for a Phi; number of effect inputs otherwise.
int count = use->opcode() == IrOpcode::kPhi
? use->op()->ValueInputCount()
: use->op()->EffectInputCount();
// Phis depending on the loop header should take their input from the
// previous iteration instead.
for (int input_index = 1; input_index < count; input_index++) {
Node* last_iteration_input =
COPY(use, unrolling_count - 1)->InputAt(input_index);
for (uint32_t copy_index = unrolling_count - 1; copy_index > 0;
copy_index--) {
COPY(use, copy_index)
->ReplaceInput(input_index,
COPY(use, copy_index - 1)->InputAt(input_index));
}
COPY(use, 0)->ReplaceInput(input_index, use->InputAt(input_index));
use->ReplaceInput(input_index, last_iteration_input);
}
// Phis in each following iteration should not depend on the
// (non-recursive) entry to the loop. Remove their first input.
FOREACH_COPY_INDEX(i) {
COPY(use, i)->RemoveInput(0);
NodeProperties::ChangeOp(
COPY(use, i), common->ResizeMergeOrPhi(use->op(), count - 1));
}
}
// Loop exits should point to the loop header.
if (use->opcode() == IrOpcode::kLoopExit) {
FOREACH_COPY_INDEX(i) { COPY(use, i)->ReplaceInput(1, loop_node); }
}
}
}
#undef COPY
#undef FOREACH_COPY_INDEX
} // namespace compiler
} // namespace internal
} // namespace v8
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_COMPILER_LOOP_UNROLLING_H_
#define V8_COMPILER_LOOP_UNROLLING_H_
// Loop unrolling is an optimization that copies the body of a loop and creates
// a fresh loop, whose iteration corresponds to 2 or more iterations of the
// initial loop. For a high-level description of the algorithm see
// docs.google.com/document/d/1AsUCqslMUB6fLdnGq0ZoPk2kn50jIJAWAL77lKXXP5g/
#include "src/compiler/common-operator.h"
#include "src/compiler/loop-analysis.h"
namespace v8 {
namespace internal {
namespace compiler {
// Unrolls the loop headed by {loop_node} (a node with opcode
// IrOpcode::kLoop). {loop} is the set of nodes belonging to the loop and
// {depth} is the loop's nesting depth, which is used to decide how many times
// the body is copied. New nodes are created in {graph} using {common};
// {tmp_zone} is used for temporary allocations.
void UnrollLoop(Node* loop_node, ZoneUnorderedSet<Node*>* loop, uint32_t depth,
Graph* graph, CommonOperatorBuilder* common, Zone* tmp_zone,
SourcePositionTable* source_positions,
NodeOriginTable* node_origins);
} // namespace compiler
} // namespace internal
} // namespace v8
#endif // V8_COMPILER_LOOP_UNROLLING_H_
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include "src/compiler/load-elimination.h" #include "src/compiler/load-elimination.h"
#include "src/compiler/loop-analysis.h" #include "src/compiler/loop-analysis.h"
#include "src/compiler/loop-peeling.h" #include "src/compiler/loop-peeling.h"
#include "src/compiler/loop-unrolling.h"
#include "src/compiler/loop-variable-optimizer.h" #include "src/compiler/loop-variable-optimizer.h"
#include "src/compiler/machine-graph-verifier.h" #include "src/compiler/machine-graph-verifier.h"
#include "src/compiler/machine-operator-reducer.h" #include "src/compiler/machine-operator-reducer.h"
...@@ -1776,6 +1777,37 @@ struct LoopPeelingPhase { ...@@ -1776,6 +1777,37 @@ struct LoopPeelingPhase {
} }
}; };
struct WasmLoopUnrollingPhase {
DECL_PIPELINE_PHASE_CONSTANTS(WasmLoopUnrolling)
void Run(PipelineData* data, Zone* temp_zone,
std::vector<compiler::WasmLoopInfo>* loop_infos) {
for (WasmLoopInfo& loop_info : *loop_infos) {
if (loop_info.is_innermost) {
ZoneUnorderedSet<Node*>* loop =
LoopFinder::FindUnnestedLoopFromHeader(loop_info.header, temp_zone);
UnrollLoop(loop_info.header, loop, loop_info.nesting_depth,
data->graph(), data->common(), temp_zone,
data->source_positions(), data->node_origins());
}
}
for (WasmLoopInfo& loop_info : *loop_infos) {
std::unordered_set<Node*> loop_exits;
// We collect exits into a set first because we are not allowed to mutate
// them while iterating uses().
for (Node* use : loop_info.header->uses()) {
if (use->opcode() == IrOpcode::kLoopExit) {
loop_exits.insert(use);
}
}
for (Node* use : loop_exits) {
LoopPeeler::EliminateLoopExit(use);
}
}
}
};
struct LoopExitEliminationPhase { struct LoopExitEliminationPhase {
DECL_PIPELINE_PHASE_CONSTANTS(LoopExitElimination) DECL_PIPELINE_PHASE_CONSTANTS(LoopExitElimination)
...@@ -3209,7 +3241,7 @@ void Pipeline::GenerateCodeForWasmFunction( ...@@ -3209,7 +3241,7 @@ void Pipeline::GenerateCodeForWasmFunction(
MachineGraph* mcgraph, CallDescriptor* call_descriptor, MachineGraph* mcgraph, CallDescriptor* call_descriptor,
SourcePositionTable* source_positions, NodeOriginTable* node_origins, SourcePositionTable* source_positions, NodeOriginTable* node_origins,
wasm::FunctionBody function_body, const wasm::WasmModule* module, wasm::FunctionBody function_body, const wasm::WasmModule* module,
int function_index) { int function_index, std::vector<compiler::WasmLoopInfo>* loop_info) {
ZoneStats zone_stats(wasm_engine->allocator()); ZoneStats zone_stats(wasm_engine->allocator());
std::unique_ptr<PipelineStatistics> pipeline_statistics( std::unique_ptr<PipelineStatistics> pipeline_statistics(
CreatePipelineStatistics(wasm_engine, function_body, module, info, CreatePipelineStatistics(wasm_engine, function_body, module, info,
...@@ -3236,8 +3268,8 @@ void Pipeline::GenerateCodeForWasmFunction( ...@@ -3236,8 +3268,8 @@ void Pipeline::GenerateCodeForWasmFunction(
pipeline.RunPrintAndVerify("V8.WasmMachineCode", true); pipeline.RunPrintAndVerify("V8.WasmMachineCode", true);
if (FLAG_wasm_loop_unrolling) { if (FLAG_wasm_loop_unrolling) {
pipeline.Run<LoopExitEliminationPhase>(); pipeline.Run<WasmLoopUnrollingPhase>(loop_info);
pipeline.RunPrintAndVerify("V8.LoopExitEliminationPhase", true); pipeline.RunPrintAndVerify("V8.WasmLoopUnrolling", true);
} }
data.BeginPhaseKind("V8.WasmOptimization"); data.BeginPhaseKind("V8.WasmOptimization");
......
...@@ -41,6 +41,7 @@ class MachineGraph; ...@@ -41,6 +41,7 @@ class MachineGraph;
class NodeOriginTable; class NodeOriginTable;
class Schedule; class Schedule;
class SourcePositionTable; class SourcePositionTable;
struct WasmLoopInfo;
class Pipeline : public AllStatic { class Pipeline : public AllStatic {
public: public:
...@@ -57,7 +58,7 @@ class Pipeline : public AllStatic { ...@@ -57,7 +58,7 @@ class Pipeline : public AllStatic {
MachineGraph* mcgraph, CallDescriptor* call_descriptor, MachineGraph* mcgraph, CallDescriptor* call_descriptor,
SourcePositionTable* source_positions, NodeOriginTable* node_origins, SourcePositionTable* source_positions, NodeOriginTable* node_origins,
wasm::FunctionBody function_body, const wasm::WasmModule* module, wasm::FunctionBody function_body, const wasm::WasmModule* module,
int function_index); int function_index, std::vector<compiler::WasmLoopInfo>* loop_infos);
// Run the pipeline on a machine graph and generate code. // Run the pipeline on a machine graph and generate code.
static wasm::WasmCompilationResult GenerateCodeForWasmNativeStub( static wasm::WasmCompilationResult GenerateCodeForWasmNativeStub(
......
...@@ -7829,14 +7829,15 @@ bool BuildGraphForWasmFunction(AccountingAllocator* allocator, ...@@ -7829,14 +7829,15 @@ bool BuildGraphForWasmFunction(AccountingAllocator* allocator,
const wasm::FunctionBody& func_body, const wasm::FunctionBody& func_body,
int func_index, wasm::WasmFeatures* detected, int func_index, wasm::WasmFeatures* detected,
MachineGraph* mcgraph, MachineGraph* mcgraph,
std::vector<compiler::WasmLoopInfo>* loop_infos,
NodeOriginTable* node_origins, NodeOriginTable* node_origins,
SourcePositionTable* source_positions) { SourcePositionTable* source_positions) {
// Create a TF graph during decoding. // Create a TF graph during decoding.
WasmGraphBuilder builder(env, mcgraph->zone(), mcgraph, func_body.sig, WasmGraphBuilder builder(env, mcgraph->zone(), mcgraph, func_body.sig,
source_positions); source_positions);
wasm::VoidResult graph_construction_result = wasm::VoidResult graph_construction_result = wasm::BuildTFGraph(
wasm::BuildTFGraph(allocator, env->enabled_features, env->module, allocator, env->enabled_features, env->module, &builder, detected,
&builder, detected, func_body, node_origins); func_body, loop_infos, node_origins);
if (graph_construction_result.failed()) { if (graph_construction_result.failed()) {
if (FLAG_trace_wasm_compiler) { if (FLAG_trace_wasm_compiler) {
StdoutStream{} << "Compilation failed: " StdoutStream{} << "Compilation failed: "
...@@ -7943,9 +7944,12 @@ wasm::WasmCompilationResult ExecuteTurbofanWasmCompilation( ...@@ -7943,9 +7944,12 @@ wasm::WasmCompilationResult ExecuteTurbofanWasmCompilation(
: nullptr; : nullptr;
SourcePositionTable* source_positions = SourcePositionTable* source_positions =
mcgraph->zone()->New<SourcePositionTable>(mcgraph->graph()); mcgraph->zone()->New<SourcePositionTable>(mcgraph->graph());
std::vector<WasmLoopInfo> loop_infos;
if (!BuildGraphForWasmFunction(wasm_engine->allocator(), env, func_body, if (!BuildGraphForWasmFunction(wasm_engine->allocator(), env, func_body,
func_index, detected, mcgraph, node_origins, func_index, detected, mcgraph, &loop_infos,
source_positions)) { node_origins, source_positions)) {
return wasm::WasmCompilationResult{}; return wasm::WasmCompilationResult{};
} }
...@@ -7966,7 +7970,7 @@ wasm::WasmCompilationResult ExecuteTurbofanWasmCompilation( ...@@ -7966,7 +7970,7 @@ wasm::WasmCompilationResult ExecuteTurbofanWasmCompilation(
Pipeline::GenerateCodeForWasmFunction( Pipeline::GenerateCodeForWasmFunction(
&info, wasm_engine, mcgraph, call_descriptor, source_positions, &info, wasm_engine, mcgraph, call_descriptor, source_positions,
node_origins, func_body, env->module, func_index); node_origins, func_body, env->module, func_index, &loop_infos);
if (counters) { if (counters) {
counters->wasm_compile_function_peak_memory_bytes()->AddSample( counters->wasm_compile_function_peak_memory_bytes()->AddSample(
......
...@@ -177,6 +177,17 @@ struct WasmInstanceCacheNodes { ...@@ -177,6 +177,17 @@ struct WasmInstanceCacheNodes {
Node* mem_mask; Node* mem_mask;
}; };
struct WasmLoopInfo {
Node* header;
uint32_t nesting_depth;
bool is_innermost;
WasmLoopInfo(Node* header, uint32_t nesting_depth, bool is_innermost)
: header(header),
nesting_depth(nesting_depth),
is_innermost(is_innermost) {}
};
// Abstracts details of building TurboFan graph nodes for wasm to separate // Abstracts details of building TurboFan graph nodes for wasm to separate
// the wasm decoder from the internal details of TurboFan. // the wasm decoder from the internal details of TurboFan.
class WasmGraphBuilder { class WasmGraphBuilder {
......
...@@ -934,8 +934,7 @@ DEFINE_BOOL(wasm_math_intrinsics, true, ...@@ -934,8 +934,7 @@ DEFINE_BOOL(wasm_math_intrinsics, true,
"intrinsify some Math imports into wasm") "intrinsify some Math imports into wasm")
DEFINE_BOOL(wasm_loop_unrolling, false, DEFINE_BOOL(wasm_loop_unrolling, false,
"generate and then remove loop exits in wasm turbofan code " "enable loop unrolling for wasm functions (experimental)")
"(placeholder for future loop unrolling feature)")
DEFINE_BOOL(wasm_trap_handler, true, DEFINE_BOOL(wasm_trap_handler, true,
"use signal handlers to catch out of bounds memory access in wasm" "use signal handlers to catch out of bounds memory access in wasm"
" (currently Linux x86_64 only)") " (currently Linux x86_64 only)")
......
...@@ -931,6 +931,7 @@ class RuntimeCallTimer final { ...@@ -931,6 +931,7 @@ class RuntimeCallTimer final {
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LocateSpillSlots) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LocateSpillSlots) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LoopExitElimination) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LoopExitElimination) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LoopPeeling) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, LoopPeeling) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, WasmLoopUnrolling) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MachineOperatorOptimization) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MachineOperatorOptimization) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MeetRegisterConstraints) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MeetRegisterConstraints) \
ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MemoryOptimization) \ ADD_THREAD_SPECIFIC_COUNTER(V, Optimize, MemoryOptimization) \
......
...@@ -178,6 +178,26 @@ class WasmGraphBuildingInterface { ...@@ -178,6 +178,26 @@ class WasmGraphBuildingInterface {
ssa_env_->state = SsaEnv::kMerged; ssa_env_->state = SsaEnv::kMerged;
TFNode* loop_node = builder_->Loop(control()); TFNode* loop_node = builder_->Loop(control());
if (FLAG_wasm_loop_unrolling) {
uint32_t nesting_depth = 0;
for (uint32_t depth = 1; depth < decoder->control_depth(); depth++) {
if (decoder->control_at(depth)->is_loop()) {
nesting_depth++;
}
}
// If this loop is nested, the parent loop's is_innermost field needs to
// be false. If the last loop in loop_infos_ has less depth, it has to be
// the parent loop. If it does not, it means another loop has been found
// within the parent loop, and that loop will have set the parent's
// is_innermost to false, so we do not need to do anything.
if (nesting_depth > 0 &&
loop_infos_.back().nesting_depth < nesting_depth) {
loop_infos_.back().is_innermost = false;
}
loop_infos_.emplace_back(loop_node, nesting_depth, true);
}
builder_->SetControl(loop_node); builder_->SetControl(loop_node);
decoder->control_at(0)->loop_node = loop_node; decoder->control_at(0)->loop_node = loop_node;
...@@ -732,7 +752,8 @@ class WasmGraphBuildingInterface { ...@@ -732,7 +752,8 @@ class WasmGraphBuildingInterface {
TryInfo* target_try = decoder->control_at(depth)->try_info; TryInfo* target_try = decoder->control_at(depth)->try_info;
if (FLAG_wasm_loop_unrolling) { if (FLAG_wasm_loop_unrolling) {
StackValueVector stack_values; StackValueVector stack_values;
BuildNestedLoopExits(decoder, depth, true, stack_values); BuildNestedLoopExits(decoder, depth, true, stack_values,
&block->try_info->exception);
} }
Goto(decoder, target_try->catch_env); Goto(decoder, target_try->catch_env);
...@@ -1055,10 +1076,14 @@ class WasmGraphBuildingInterface { ...@@ -1055,10 +1076,14 @@ class WasmGraphBuildingInterface {
to->node = from.node; to->node = from.node;
} }
// Returns a copy of the loop records tracked for loop unrolling.
std::vector<compiler::WasmLoopInfo> loop_infos() { return loop_infos_; }
private: private:
SsaEnv* ssa_env_ = nullptr; SsaEnv* ssa_env_ = nullptr;
compiler::WasmGraphBuilder* builder_; compiler::WasmGraphBuilder* builder_;
uint32_t current_catch_ = kNullCatch; uint32_t current_catch_ = kNullCatch;
// Tracks loop data for loop unrolling.
std::vector<compiler::WasmLoopInfo> loop_infos_;
TFNode* effect() { return builder_->effect(); } TFNode* effect() { return builder_->effect(); }
...@@ -1143,7 +1168,7 @@ class WasmGraphBuildingInterface { ...@@ -1143,7 +1168,7 @@ class WasmGraphBuildingInterface {
if (FLAG_wasm_loop_unrolling) { if (FLAG_wasm_loop_unrolling) {
StackValueVector values; StackValueVector values;
BuildNestedLoopExits(decoder, control_depth_of_current_catch(decoder), BuildNestedLoopExits(decoder, control_depth_of_current_catch(decoder),
true, values); true, values, &if_exception);
} }
Goto(decoder, try_info->catch_env); Goto(decoder, try_info->catch_env);
if (try_info->exception == nullptr) { if (try_info->exception == nullptr) {
...@@ -1421,16 +1446,28 @@ class WasmGraphBuildingInterface { ...@@ -1421,16 +1446,28 @@ class WasmGraphBuildingInterface {
void BuildNestedLoopExits(FullDecoder* decoder, uint32_t depth_limit, void BuildNestedLoopExits(FullDecoder* decoder, uint32_t depth_limit,
bool wrap_exit_values, bool wrap_exit_values,
StackValueVector& stack_values) { StackValueVector& stack_values,
TFNode** exception_value = nullptr) {
DCHECK(FLAG_wasm_loop_unrolling); DCHECK(FLAG_wasm_loop_unrolling);
Control* control = nullptr;
// We are only interested in exits from the innermost loop.
for (uint32_t i = 0; i < depth_limit; i++) { for (uint32_t i = 0; i < depth_limit; i++) {
Control* control = decoder->control_at(i); Control* c = decoder->control_at(i);
if (!control->is_loop()) continue; if (c->is_loop()) {
control = c;
break;
}
}
if (control != nullptr) {
BuildLoopExits(decoder, control); BuildLoopExits(decoder, control);
for (Value& value : stack_values) { for (Value& value : stack_values) {
value.node = builder_->LoopExitValue( value.node = builder_->LoopExitValue(
value.node, value.type.machine_representation()); value.node, value.type.machine_representation());
} }
if (exception_value != nullptr) {
*exception_value = builder_->LoopExitValue(
*exception_value, MachineRepresentation::kWord32);
}
if (wrap_exit_values) { if (wrap_exit_values) {
WrapLocalsAtLoopExit(decoder, control); WrapLocalsAtLoopExit(decoder, control);
} }
...@@ -1459,6 +1496,7 @@ DecodeResult BuildTFGraph(AccountingAllocator* allocator, ...@@ -1459,6 +1496,7 @@ DecodeResult BuildTFGraph(AccountingAllocator* allocator,
const WasmFeatures& enabled, const WasmModule* module, const WasmFeatures& enabled, const WasmModule* module,
compiler::WasmGraphBuilder* builder, compiler::WasmGraphBuilder* builder,
WasmFeatures* detected, const FunctionBody& body, WasmFeatures* detected, const FunctionBody& body,
std::vector<compiler::WasmLoopInfo>* loop_infos,
compiler::NodeOriginTable* node_origins) { compiler::NodeOriginTable* node_origins) {
Zone zone(allocator, ZONE_NAME); Zone zone(allocator, ZONE_NAME);
WasmFullDecoder<Decoder::kFullValidation, WasmGraphBuildingInterface> decoder( WasmFullDecoder<Decoder::kFullValidation, WasmGraphBuildingInterface> decoder(
...@@ -1470,6 +1508,9 @@ DecodeResult BuildTFGraph(AccountingAllocator* allocator, ...@@ -1470,6 +1508,9 @@ DecodeResult BuildTFGraph(AccountingAllocator* allocator,
if (node_origins) { if (node_origins) {
builder->RemoveBytecodePositionDecorator(); builder->RemoveBytecodePositionDecorator();
} }
if (FLAG_wasm_loop_unrolling) {
*loop_infos = decoder.interface().loop_infos();
}
return decoder.toResult(nullptr); return decoder.toResult(nullptr);
} }
......
...@@ -15,6 +15,7 @@ namespace internal { ...@@ -15,6 +15,7 @@ namespace internal {
namespace compiler { // external declarations from compiler. namespace compiler { // external declarations from compiler.
class NodeOriginTable; class NodeOriginTable;
class WasmGraphBuilder; class WasmGraphBuilder;
struct WasmLoopInfo;
} // namespace compiler } // namespace compiler
namespace wasm { namespace wasm {
...@@ -27,6 +28,7 @@ V8_EXPORT_PRIVATE DecodeResult ...@@ -27,6 +28,7 @@ V8_EXPORT_PRIVATE DecodeResult
BuildTFGraph(AccountingAllocator* allocator, const WasmFeatures& enabled, BuildTFGraph(AccountingAllocator* allocator, const WasmFeatures& enabled,
const WasmModule* module, compiler::WasmGraphBuilder* builder, const WasmModule* module, compiler::WasmGraphBuilder* builder,
WasmFeatures* detected, const FunctionBody& body, WasmFeatures* detected, const FunctionBody& body,
std::vector<compiler::WasmLoopInfo>* loop_infos,
compiler::NodeOriginTable* node_origins); compiler::NodeOriginTable* node_origins);
} // namespace wasm } // namespace wasm
......
...@@ -359,16 +359,18 @@ void TestBuildingGraphWithBuilder(compiler::WasmGraphBuilder* builder, ...@@ -359,16 +359,18 @@ void TestBuildingGraphWithBuilder(compiler::WasmGraphBuilder* builder,
const byte* start, const byte* end) { const byte* start, const byte* end) {
WasmFeatures unused_detected_features; WasmFeatures unused_detected_features;
FunctionBody body(sig, 0, start, end); FunctionBody body(sig, 0, start, end);
std::vector<compiler::WasmLoopInfo> loops;
DecodeResult result = DecodeResult result =
BuildTFGraph(zone->allocator(), WasmFeatures::All(), nullptr, builder, BuildTFGraph(zone->allocator(), WasmFeatures::All(), nullptr, builder,
&unused_detected_features, body, nullptr); &unused_detected_features, body, &loops, nullptr);
if (result.failed()) { if (result.failed()) {
#ifdef DEBUG #ifdef DEBUG
if (!FLAG_trace_wasm_decoder) { if (!FLAG_trace_wasm_decoder) {
// Retry the compilation with the tracing flag on, to help in debugging. // Retry the compilation with the tracing flag on, to help in debugging.
FLAG_trace_wasm_decoder = true; FLAG_trace_wasm_decoder = true;
result = BuildTFGraph(zone->allocator(), WasmFeatures::All(), nullptr, result =
builder, &unused_detected_features, body, nullptr); BuildTFGraph(zone->allocator(), WasmFeatures::All(), nullptr, builder,
&unused_detected_features, body, &loops, nullptr);
} }
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment