Commit ba7d9e5f authored by Darius Mercadier, committed by V8 LUCI CQ

[turboshaft] port value numbering optimization

Bug: v8:12783
Change-Id: I5b7acf2445b0f898158448dde206a0cecdab6a80
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3764345
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Darius Mercadier <dmercadier@chromium.org>
Cr-Commit-Position: refs/heads/main@{#82097}
parent 6fbe1bf2
......@@ -2870,6 +2870,7 @@ filegroup(
"src/compiler/turboshaft/recreate-schedule.cc",
"src/compiler/turboshaft/recreate-schedule.h",
"src/compiler/turboshaft/sidetable.h",
"src/compiler/turboshaft/value-numbering-assembler.h",
"src/compiler/type-cache.cc",
"src/compiler/type-cache.h",
"src/compiler/type-narrowing-reducer.cc",
......
......@@ -2985,6 +2985,7 @@ v8_header_set("v8_internal_headers") {
"src/compiler/turboshaft/optimization-phase.h",
"src/compiler/turboshaft/recreate-schedule.h",
"src/compiler/turboshaft/sidetable.h",
"src/compiler/turboshaft/value-numbering-assembler.h",
"src/compiler/type-cache.h",
"src/compiler/type-narrowing-reducer.h",
"src/compiler/typed-optimization.h",
......
......@@ -84,6 +84,7 @@
#include "src/compiler/turboshaft/graph.h"
#include "src/compiler/turboshaft/optimization-phase.h"
#include "src/compiler/turboshaft/recreate-schedule.h"
#include "src/compiler/turboshaft/value-numbering-assembler.h"
#include "src/compiler/type-narrowing-reducer.h"
#include "src/compiler/typed-optimization.h"
#include "src/compiler/typer.h"
......@@ -2045,9 +2046,10 @@ struct OptimizeTurboshaftPhase {
DECL_PIPELINE_PHASE_CONSTANTS(OptimizeTurboshaft)
void Run(PipelineData* data, Zone* temp_zone) {
turboshaft::OptimizationPhase<
turboshaft::LivenessAnalyzer,
turboshaft::Assembler>::Run(&data->turboshaft_graph(), temp_zone);
turboshaft::OptimizationPhase<turboshaft::LivenessAnalyzer,
turboshaft::ValueNumberingAssembler>::
Run(&data->turboshaft_graph(), temp_zone,
turboshaft::VisitOrder::kDominator);
}
};
......
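The hunk above swaps the plain turboshaft::Assembler for the new turboshaft::ValueNumberingAssembler and switches the traversal to VisitOrder::kDominator. OptimizationPhase composes an analyzer and an assembler at compile time; the short sketch below illustrates that composition pattern and how a derived assembler can intercept every emission. All names in it (PhaseSketch, PlainAssembler, DedupingAssembler, NoAnalyzer) are made up for illustration and are not V8 classes.

#include <iostream>

// Illustrative stand-ins only; these are not turboshaft's real classes.
struct PlainAssembler {
  int Emit(int value) {
    std::cout << "emit " << value << "\n";
    return value;
  }
};

// Layered on top of the plain assembler, the way ValueNumberingAssembler
// derives from Assembler and post-processes every emitted operation.
struct DedupingAssembler : PlainAssembler {
  int last_value = -1, last_result = -1;
  int Emit(int value) {
    if (value == last_value) return last_result;  // reuse the previous result
    last_value = value;
    return last_result = PlainAssembler::Emit(value);
  }
};

enum class VisitOrder { kAsEmitted, kDominator };

// The phase is composed at compile time from an analyzer and an assembler,
// in the same spirit as the pipeline hunk above instantiates
// OptimizationPhase.
template <class Analyzer, class Assembler>
struct PhaseSketch {
  static void Run(VisitOrder order) {
    Assembler assembler;
    assembler.Emit(1);
    assembler.Emit(1);  // skipped by DedupingAssembler
    assembler.Emit(2);
    (void)order;  // a real phase would pick its traversal based on this
  }
};

struct NoAnalyzer {};  // placeholder analyzer

int main() {
  PhaseSketch<NoAnalyzer, DedupingAssembler>::Run(VisitOrder::kDominator);
}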
......@@ -208,6 +208,9 @@ class Assembler
public:
Block* NewBlock(Block::Kind kind) { return graph_.NewBlock(kind); }
void EnterBlock(const Block& block) { USE(block); }
void ExitBlock(const Block& block) { USE(block); }
V8_INLINE bool Bind(Block* block) {
if (!graph().Add(block)) return false;
DCHECK_NULL(current_block_);
......
......@@ -231,6 +231,8 @@ class RandomAccessStackDominatorNode
// Returns the lowest common dominator of {this} and {other}.
Derived* GetCommonDominator(RandomAccessStackDominatorNode<Derived>* other);
int Depth() const { return len_; }
private:
friend class Graph;
friend class DominatorForwardTreeNode<Derived>;
......
......@@ -5,6 +5,7 @@
#ifndef V8_COMPILER_TURBOSHAFT_OPERATIONS_H_
#define V8_COMPILER_TURBOSHAFT_OPERATIONS_H_
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
......@@ -200,6 +201,10 @@ struct OpProperties {
!(can_read || can_write || can_abort || is_block_terminator);
const bool is_required_when_unused =
can_write || can_abort || is_block_terminator;
// Nodes that neither read nor write and that aren't block terminators can be
// eliminated via value numbering.
const bool can_be_eliminated =
!(can_read || can_write || is_block_terminator);
constexpr OpProperties(bool can_read, bool can_write, bool can_abort,
bool is_block_terminator)
......@@ -1031,10 +1036,20 @@ struct ConstantOp : FixedArityOperationT<0, ConstantOp> {
case Kind::kTaggedIndex:
return storage.integral == other.storage.integral;
case Kind::kFloat32:
return storage.float32 == other.storage.float32;
// Using a bit_cast to uint32_t in order to return false when comparing +0
// and -0; the explicit isnan checks ensure that two NaN constants still
// compare equal.
return base::bit_cast<uint32_t>(storage.float32) ==
base::bit_cast<uint32_t>(other.storage.float32) ||
(std::isnan(storage.float32) &&
std::isnan(other.storage.float32));
case Kind::kFloat64:
case Kind::kNumber:
return storage.float64 == other.storage.float64;
// Using a bit_cast to uint64_t in order to return false when comparing +0
// and -0; the explicit isnan checks ensure that two NaN constants still
// compare equal.
return base::bit_cast<uint64_t>(storage.float64) ==
base::bit_cast<uint64_t>(other.storage.float64) ||
(std::isnan(storage.float64) &&
std::isnan(other.storage.float64));
case Kind::kExternal:
return storage.external.address() == other.storage.external.address();
case Kind::kHeapObject:
......
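For context on the bit_cast comments above: with an ordinary floating-point ==, +0.0 and -0.0 compare equal even though they are distinct constants, and NaN never compares equal to itself, so two identical NaN constants would wrongly be treated as different. A small standalone check, written with std::memcpy instead of V8's base::bit_cast, shows both effects:

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Portable stand-in for base::bit_cast, which the V8 code above uses.
static uint64_t BitsOf(double value) {
  uint64_t bits;
  std::memcpy(&bits, &value, sizeof bits);
  return bits;
}

int main() {
  std::cout << std::boolalpha;
  double pz = +0.0, nz = -0.0;
  // A plain floating-point comparison cannot tell the two zero constants
  // apart, but their bit patterns differ in the sign bit.
  std::cout << (pz == nz) << "\n";                  // true
  std::cout << (BitsOf(pz) == BitsOf(nz)) << "\n";  // false
  // NaN never compares equal to itself, hence the explicit isnan checks in
  // ConstantOp's equality above: two NaN constants should still be merged.
  double nan = std::nan("");
  std::cout << (nan == nan) << "\n";                          // false
  std::cout << (std::isnan(nan) && std::isnan(nan)) << "\n";  // true
}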
......@@ -95,15 +95,16 @@ struct LivenessAnalyzer : AnalyzerBase {
}
};
enum class VisitOrder { kAsEmitted, kDominator };
template <class Analyzer, class Assembler>
class OptimizationPhase {
private:
struct Impl;
public:
enum class VisitOrder { kNatural, kDominator };
static void Run(Graph* input, Zone* phase_zone,
VisitOrder visit_order = VisitOrder::kNatural) {
VisitOrder visit_order = VisitOrder::kAsEmitted) {
Impl phase{*input, phase_zone, visit_order};
if (FLAG_turboshaft_trace_reduction) {
phase.template Run<true>();
......@@ -111,8 +112,9 @@ class OptimizationPhase {
phase.template Run<false>();
}
}
static void RunWithoutTracing(Graph* input, Zone* phase_zone,
VisitOrder visit_order = VisitOrder::kNatural) {
static void RunWithoutTracing(
Graph* input, Zone* phase_zone,
VisitOrder visit_order = VisitOrder::kAsEmitted) {
Impl phase{*input, phase_zone, visit_order};
phase.template Run<false>();
}
......@@ -146,14 +148,14 @@ struct OptimizationPhase<Analyzer, Assembler>::Impl {
if (visit_order == VisitOrder::kDominator) {
RunDominatorOrder<trace_reduction>();
} else {
RunNaturalOrder<trace_reduction>();
RunAsEmittedOrder<trace_reduction>();
}
input_graph.SwapWithCompanion();
}
template <bool trace_reduction>
void RunNaturalOrder() {
void RunAsEmittedOrder() {
for (const Block& input_block : input_graph.blocks()) {
VisitBlock<trace_reduction>(input_block);
}
......@@ -179,12 +181,14 @@ struct OptimizationPhase<Analyzer, Assembler>::Impl {
template <bool trace_reduction>
void VisitBlock(const Block& input_block) {
assembler.EnterBlock(input_block);
current_input_block = &input_block;
if constexpr (trace_reduction) {
std::cout << PrintAsBlockHeader{input_block} << "\n";
}
if (!assembler.Bind(MapToNewGraph(input_block.index()))) {
if constexpr (trace_reduction) TraceBlockUnreachable();
assembler.ExitBlock(input_block);
return;
}
assembler.current_block()->SetDeferred(input_block.IsDeferred());
......@@ -226,6 +230,7 @@ struct OptimizationPhase<Analyzer, Assembler>::Impl {
}
op_mapping[index.id()] = new_index;
}
assembler.ExitBlock(input_block);
if constexpr (trace_reduction) TraceBlockFinished();
}
......@@ -306,7 +311,7 @@ struct OptimizationPhase<Analyzer, Assembler>::Impl {
// need to skip phi inputs that belong to control predecessors that have no
// equivalent in the new graph.
// When iterating the graph in kNatural order (ie, going through all of
// When iterating the graph in kAsEmitted order (ie, going through all of
// the blocks in linear order), we assume that the order of control
// predecessors did not change. In kDominator order, the order of control
// predecessor might or might not change.
......
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_COMPILER_TURBOSHAFT_VALUE_NUMBERING_ASSEMBLER_H_
#define V8_COMPILER_TURBOSHAFT_VALUE_NUMBERING_ASSEMBLER_H_
#include "src/base/logging.h"
#include "src/base/vector.h"
#include "src/compiler/turboshaft/assembler.h"
#include "src/compiler/turboshaft/graph.h"
#include "src/compiler/turboshaft/operations.h"
#include "src/utils/utils.h"
#include "src/zone/zone-containers.h"
namespace v8 {
namespace internal {
namespace compiler {
namespace turboshaft {
// Value numbering removes redundant nodes from the graph. A simple example
// could be:
//
// x = a + b
// y = a + b
// z = x * y
//
// Is simplified to
//
// x = a + b
// z = x * x
//
// It works by storing previously seen nodes in a hashmap, and when visiting a
// new node, we check to see if it's already in the hashmap. If yes, then we
// return the old node. If not, then we keep the new one (and add it into the
// hashmap). A high-level pseudo-code would be:
//
// def VisitOp(op):
// if op in hashmap:
// return hashmap.get(op)
// else:
// hashmap.add(op)
// return op
//
// We implemented our own hashmap (to have more control, it should become
// clearer why by the end of this explanation). When there is a collision, we
// look at the next index (and the next one if there is yet another collision,
// etc). While not the fastest approach, it has the advantage of not requiring
// any dynamic memory allocation (besides the initial table, and the resizing).
//
// For the approach described above (the pseudocode and the paragraph before it)
// to be correct, a node should only be replaced by a node defined in blocks
// that dominate the current block. Thus, this assembler should only be used
// with OptimizationPhases that iterate the graph in VisitOrder::kDominator
// order. Then, when going down the dominator tree, we add nodes to the hashmap,
// and when going back up the dominator tree, we remove nodes from the hashmap.
//
// In order to efficiently remove all the nodes of a given block from the
// hashmap, we maintain a linked-list of hashmap entries per block (this way, we
// don't have to iterate the whole hashmap). Note that, in practice, we think in
// terms of "depth" rather than "block", and we thus have one linked-list per
// depth of the dominator tree. The heads of those linked lists are stored in
// the vector {depths_heads_}. The linked lists are then implemented in-place in
// the hashtable entries, thanks to the `depth_neighboring_entry` field of the
// `Entry` structure.
// To remove all of the entries from a given linked list, we iterate the entries
// in the linked list, setting all of their `hash` fields to 0 (a hash of 0 is
// reserved to mark empty entries, which is why ComputeHash never returns 0).
class ValueNumberingAssembler : public Assembler {
// ValueNumberingAssembler inherits directly from Assembler because it
// overwrites the last operation in case of a cache hit, which assumes that
// the base assembler emits everything exactly as given without applying any
// optimizations.
using Base = Assembler;
public:
ValueNumberingAssembler(Graph* graph, Zone* phase_zone)
: Assembler(graph, phase_zone), depths_heads_(phase_zone) {
table_ = phase_zone->NewVector<Entry>(
base::bits::RoundUpToPowerOfTwo(
std::max<size_t>(128, graph->op_id_capacity() / 2)),
Entry());
entry_count_ = 0;
mask_ = table_.size() - 1;
current_depth_ = -1;
}
#define EMIT_OP(Name) \
template <class... Args> \
OpIndex Name(Args... args) { \
OpIndex next_index = graph().next_operation_index(); \
USE(next_index); \
OpIndex result = Base::Name(args...); \
DCHECK_EQ(next_index, result); \
return AddOrFind<Name##Op>(result); \
}
TURBOSHAFT_OPERATION_LIST(EMIT_OP)
#undef EMIT_OP
void EnterBlock(const Block& block) {
int new_depth = block.Depth();
// Remember that this assembler should only be used for OptimizationPhases
// that visit the graph in VisitOrder::kDominator order. We can't properly
// check that here, but we do two checks, which should be enough to ensure
// that we are actually visiting the graph in dominator order:
// - There should be only one block at depth 0 (the root).
// - There should be no "jumps" downward in the dominator tree ({new_depth}
// cannot be lower than {current_depth}+1).
DCHECK_IMPLIES(current_depth_ == 0, new_depth != 0);
DCHECK_LE(new_depth, current_depth_ + 1);
if (new_depth <= current_depth_) {
while (current_depth_ >= new_depth) {
ClearCurrentDepthEntries();
--current_depth_;
}
}
current_depth_ = new_depth;
depths_heads_.push_back(nullptr);
}
private:
// TODO(dmercadier): Once the mapping from Operations to Blocks has been added
// to turboshaft, remove the `block` field from the `Entry` structure.
struct Entry {
OpIndex value;
BlockIndex block;
size_t hash = 0;
Entry* depth_neighboring_entry = nullptr;
};
template <class Op>
OpIndex AddOrFind(OpIndex op_idx) {
if constexpr (!Op::properties.can_be_eliminated ||
std::is_same<Op, PendingLoopPhiOp>::value) {
return op_idx;
}
RehashIfNeeded();
const Op& op = graph().Get(op_idx).Cast<Op>();
constexpr bool same_block_only = std::is_same<Op, PhiOp>::value;
size_t hash = ComputeHash<same_block_only>(op);
size_t start_index = hash & mask_;
for (size_t i = start_index;; i = NextEntryIndex(i)) {
Entry& entry = table_[i];
if (entry.hash == 0) {
// We didn't find {op} in {table_}. Inserting it and returning.
table_[i] =
Entry{op_idx, current_block()->index(), hash, depths_heads_.back()};
depths_heads_.back() = &table_[i];
++entry_count_;
return op_idx;
}
if (entry.hash == hash) {
const Operation& entry_op = graph().Get(entry.value);
if (entry_op.Is<Op>() &&
(!same_block_only || entry.block == current_block()->index()) &&
entry_op.Cast<Op>() == op) {
graph().RemoveLast();
return entry.value;
}
}
// Making sure that we don't have an infinite loop.
DCHECK_NE(start_index, NextEntryIndex(i));
}
}
// Remove all of the Entries of the current depth.
void ClearCurrentDepthEntries() {
for (Entry* entry = depths_heads_.back(); entry != nullptr;) {
entry->hash = 0;
Entry* next_entry = entry->depth_neighboring_entry;
entry->depth_neighboring_entry = nullptr;
entry = next_entry;
--entry_count_;
}
depths_heads_.pop_back();
}
// If the table is too full, double its size and re-insert the old entries.
void RehashIfNeeded() {
if (V8_LIKELY(table_.size() - (table_.size() / 4) > entry_count_)) return;
base::Vector<Entry> new_table = table_ =
phase_zone()->NewVector<Entry>(table_.size() * 2, Entry());
size_t mask = mask_ = table_.size() - 1;
for (size_t depth_idx = 0; depth_idx < depths_heads_.size(); depth_idx++) {
// It's important to fill the new hash table by inserting entries in
// increasing depth order, in order to avoid holes when later calling
// ClearCurrentDepthEntries. Consider for instance:
//
// ---+------+------+------+----
// | a1 | a2 | a3 |
// ---+------+------+------+----
//
// Where a1, a2 and a3 have the same hash. By construction, we know that
// depth(a1) <= depth(a2) <= depth(a3). If, when re-hashing, we were to
// insert them in another order, say:
//
// ---+------+------+------+----
// | a3 | a1 | a2 |
// ---+------+------+------+----
//
// Then, when we later call ClearCurrentDepthEntries to remove entries from
// a3's depth, we'll get this:
//
// ---+------+------+------+----
// | null | a1 | a2 |
// ---+------+------+------+----
//
// And, when looking up a1 in the table, we'd find a "null" where we
// expect it, and assume that it's not present. If, instead, we always
// preserve the increasing depth order, then when removing a3, we'd get:
//
// ---+------+------+------+----
// | a1 | a2 | null |
// ---+------+------+------+----
//
// Where we can still find a1 and a2.
Entry* entry = depths_heads_[depth_idx];
depths_heads_[depth_idx] = nullptr;
while (entry != nullptr) {
for (size_t i = entry->hash & mask;; i = NextEntryIndex(i)) {
if (new_table[i].hash == 0) {
new_table[i] = *entry;
Entry* next_entry = entry->depth_neighboring_entry;
new_table[i].depth_neighboring_entry = depths_heads_[depth_idx];
depths_heads_[depth_idx] = &new_table[i];
entry = next_entry;
break;
}
}
}
}
}
template <bool same_block_only, class Op>
size_t ComputeHash(const Op& op) {
size_t hash = op.hash_value();
if (same_block_only) {
hash = base::hash_combine(current_block()->index(), hash);
}
if (V8_UNLIKELY(hash == 0)) return 1;
return hash;
}
size_t NextEntryIndex(size_t index) { return (index + 1) & mask_; }
Entry* NextEntry(Entry* entry) {
return V8_LIKELY(entry + 1 < table_.end()) ? entry + 1 : &table_[0];
}
Entry* PrevEntry(Entry* entry) {
return V8_LIKELY(entry > table_.begin()) ? entry - 1 : table_.end() - 1;
}
int current_depth_;
base::Vector<Entry> table_;
size_t mask_;
size_t entry_count_;
ZoneVector<Entry*> depths_heads_;
};
} // namespace turboshaft
} // namespace compiler
} // namespace internal
} // namespace v8
#endif // V8_COMPILER_TURBOSHAFT_VALUE_NUMBERING_ASSEMBLER_H_
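The file-level comment in value-numbering-assembler.h above describes the algorithm in pseudocode. The following self-contained sketch renders the same idea in plain C++, using an ordinary std::unordered_map scoped by dominator depth instead of V8's custom open-addressing table; the Op, OpHash and ValueNumberer types are made up for illustration and are not part of turboshaft. It only shows the add-or-find logic and the per-depth cleanup when the traversal moves back up the dominator tree.

#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

// Toy operation: a pure binary op identified by opcode and input ids.
struct Op {
  std::string opcode;
  int lhs, rhs;
  bool operator==(const Op& other) const {
    return std::tie(opcode, lhs, rhs) ==
           std::tie(other.opcode, other.lhs, other.rhs);
  }
};

struct OpHash {
  size_t operator()(const Op& op) const {
    size_t h = std::hash<std::string>()(op.opcode);
    h ^= std::hash<int>()(op.lhs) + 0x9e3779b9 + (h << 6) + (h >> 2);
    h ^= std::hash<int>()(op.rhs) + 0x9e3779b9 + (h << 6) + (h >> 2);
    return h;
  }
};

// Value numbering scoped by dominator-tree depth: entries added at a depth
// are dropped again when the traversal moves back up past that depth.
class ValueNumberer {
 public:
  void EnterBlock(int depth) {
    while (static_cast<int>(scopes_.size()) > depth) LeaveDepth();
    scopes_.push_back({});
  }

  // Returns the id of an equivalent earlier op if one is visible, otherwise
  // records the new op under `id` and returns `id`.
  int AddOrFind(const Op& op, int id) {
    auto it = table_.find(op);
    if (it != table_.end()) return it->second;
    table_.emplace(op, id);
    scopes_.back().push_back(op);  // remember for cleanup at this depth
    return id;
  }

 private:
  void LeaveDepth() {
    for (const Op& op : scopes_.back()) table_.erase(op);
    scopes_.pop_back();
  }

  std::unordered_map<Op, int, OpHash> table_;
  std::vector<std::vector<Op>> scopes_;  // one list of keys per depth
};

int main() {
  ValueNumberer vn;
  vn.EnterBlock(0);  // root block, depth 0
  std::cout << vn.AddOrFind({"add", 1, 2}, 10) << "\n";  // 10 (new)
  vn.EnterBlock(1);  // block dominated by the root
  std::cout << vn.AddOrFind({"add", 1, 2}, 11) << "\n";  // 10 (reused)
  // A sibling at depth 1: everything recorded at depth >= 1 is cleared, but
  // the root's entries stay visible.
  vn.EnterBlock(1);
  std::cout << vn.AddOrFind({"mul", 1, 2}, 12) << "\n";  // 12 (new)
  std::cout << vn.AddOrFind({"add", 1, 2}, 13) << "\n";  // 10 (reused)
}

The real implementation avoids per-lookup allocation by probing a flat table and reuses the table slots themselves as the per-depth linked lists, as the comments in the header above explain.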