759 lines
18 KiB
C++
759 lines
18 KiB
C++
#pragma once
|
|
|
|
#include "cuda_memory.hpp"
|
|
#include "cuda_stream.hpp"
|
|
|
|
#include "../utility/object_pool.hpp"
|
|
#include "../utility/traits.hpp"
|
|
#include "../utility/passive_vector.hpp"
|
|
|
|
namespace tf {
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaGraph_t routines
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/**
|
|
@brief gets the memcpy node parameter of a copy task
|
|
*/
|
|
template <typename T,
|
|
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
|
|
>
|
|
cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
|
|
|
|
using U = std::decay_t<T>;
|
|
|
|
cudaMemcpy3DParms p;
|
|
|
|
p.srcArray = nullptr;
|
|
p.srcPos = ::make_cudaPos(0, 0, 0);
|
|
p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
|
|
p.dstArray = nullptr;
|
|
p.dstPos = ::make_cudaPos(0, 0, 0);
|
|
p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
|
|
p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
|
|
p.kind = cudaMemcpyDefault;
|
|
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief gets the memcpy node parameter of a memcpy task (untyped)
|
|
*/
|
|
inline cudaMemcpy3DParms cuda_get_memcpy_parms(
|
|
void* tgt, const void* src, size_t bytes
|
|
) {
|
|
|
|
// Parameters in cudaPitchedPtr
|
|
// d - Pointer to allocated memory
|
|
// p - Pitch of allocated memory in bytes
|
|
// xsz - Logical width of allocation in elements
|
|
// ysz - Logical height of allocation in elements
|
|
cudaMemcpy3DParms p;
|
|
p.srcArray = nullptr;
|
|
p.srcPos = ::make_cudaPos(0, 0, 0);
|
|
p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
|
|
p.dstArray = nullptr;
|
|
p.dstPos = ::make_cudaPos(0, 0, 0);
|
|
p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
|
|
p.extent = ::make_cudaExtent(bytes, 1, 1);
|
|
p.kind = cudaMemcpyDefault;
|
|
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief gets the memset node parameter of a memcpy task (untyped)
|
|
*/
|
|
inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
|
|
|
|
cudaMemsetParams p;
|
|
p.dst = dst;
|
|
p.value = ch;
|
|
p.pitch = 0;
|
|
//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
|
|
//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
|
|
p.elementSize = 1; // either 1, 2, or 4
|
|
p.width = count;
|
|
p.height = 1;
|
|
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief gets the memset node parameter of a fill task (typed)
|
|
*/
|
|
template <typename T, std::enable_if_t<
|
|
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
|
|
>
|
|
cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
|
|
|
|
cudaMemsetParams p;
|
|
p.dst = dst;
|
|
|
|
// perform bit-wise copy
|
|
p.value = 0; // crucial
|
|
static_assert(sizeof(T) <= sizeof(p.value), "internal error");
|
|
std::memcpy(&p.value, &value, sizeof(T));
|
|
|
|
p.pitch = 0;
|
|
p.elementSize = sizeof(T); // either 1, 2, or 4
|
|
p.width = count;
|
|
p.height = 1;
|
|
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief gets the memset node parameter of a zero task (typed)
|
|
*/
|
|
template <typename T, std::enable_if_t<
|
|
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
|
|
>
|
|
cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
|
|
|
|
cudaMemsetParams p;
|
|
p.dst = dst;
|
|
p.value = 0;
|
|
p.pitch = 0;
|
|
p.elementSize = sizeof(T); // either 1, 2, or 4
|
|
p.width = count;
|
|
p.height = 1;
|
|
|
|
return p;
|
|
}
|
|
|
|
/**
|
|
@brief queries the number of root nodes in a native CUDA graph
|
|
*/
|
|
inline size_t cuda_get_graph_num_root_nodes(cudaGraph_t graph) {
|
|
size_t num_nodes;
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
|
|
"failed to get native graph root nodes"
|
|
);
|
|
return num_nodes;
|
|
}
|
|
|
|
/**
|
|
@brief queries the number of nodes in a native CUDA graph
|
|
*/
|
|
inline size_t cuda_get_graph_num_nodes(cudaGraph_t graph) {
|
|
size_t num_nodes;
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetNodes(graph, nullptr, &num_nodes),
|
|
"failed to get native graph nodes"
|
|
);
|
|
return num_nodes;
|
|
}
|
|
|
|
/**
|
|
@brief queries the number of edges in a native CUDA graph
|
|
*/
|
|
inline size_t cuda_get_graph_num_edges(cudaGraph_t graph) {
|
|
size_t num_edges;
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges),
|
|
"failed to get native graph edges"
|
|
);
|
|
return num_edges;
|
|
}
|
|
|
|
/**
|
|
@brief acquires the nodes in a native CUDA graph
|
|
*/
|
|
inline std::vector<cudaGraphNode_t> cuda_get_graph_nodes(cudaGraph_t graph) {
|
|
size_t num_nodes = cuda_get_graph_num_nodes(graph);
|
|
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
|
|
"failed to get native graph nodes"
|
|
);
|
|
return nodes;
|
|
}
|
|
|
|
/**
|
|
@brief acquires the root nodes in a native CUDA graph
|
|
*/
|
|
inline std::vector<cudaGraphNode_t> cuda_get_graph_root_nodes(cudaGraph_t graph) {
|
|
size_t num_nodes = cuda_get_graph_num_root_nodes(graph);
|
|
std::vector<cudaGraphNode_t> nodes(num_nodes);
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
|
|
"failed to get native graph nodes"
|
|
);
|
|
return nodes;
|
|
}
|
|
|
|
/**
|
|
@brief acquires the edges in a native CUDA graph
|
|
*/
|
|
inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
|
|
cuda_get_graph_edges(cudaGraph_t graph) {
|
|
size_t num_edges = cuda_get_graph_num_edges(graph);
|
|
std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
|
|
TF_CHECK_CUDA(
|
|
cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges),
|
|
"failed to get native graph edges"
|
|
);
|
|
std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
|
|
for(size_t i=0; i<num_edges; i++) {
|
|
edges[i] = std::make_pair(froms[i], tos[i]);
|
|
}
|
|
return edges;
|
|
}
|
|
|
|
/**
|
|
@brief queries the type of a native CUDA graph node
|
|
|
|
valid type values are:
|
|
+ cudaGraphNodeTypeKernel = 0x00
|
|
+ cudaGraphNodeTypeMemcpy = 0x01
|
|
+ cudaGraphNodeTypeMemset = 0x02
|
|
+ cudaGraphNodeTypeHost = 0x03
|
|
+ cudaGraphNodeTypeGraph = 0x04
|
|
+ cudaGraphNodeTypeEmpty = 0x05
|
|
+ cudaGraphNodeTypeWaitEvent = 0x06
|
|
+ cudaGraphNodeTypeEventRecord = 0x07
|
|
*/
|
|
inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
|
|
cudaGraphNodeType type;
|
|
TF_CHECK_CUDA(
|
|
cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
|
|
);
|
|
return type;
|
|
}
|
|
|
|
/**
|
|
@brief convert the type of a native CUDA graph node to a readable string
|
|
*/
|
|
inline const char* cuda_graph_node_type_to_string(cudaGraphNodeType type) {
|
|
switch(type) {
|
|
case cudaGraphNodeTypeKernel : return "kernel";
|
|
case cudaGraphNodeTypeMemcpy : return "memcpy";
|
|
case cudaGraphNodeTypeMemset : return "memset";
|
|
case cudaGraphNodeTypeHost : return "host";
|
|
case cudaGraphNodeTypeGraph : return "graph";
|
|
case cudaGraphNodeTypeEmpty : return "empty";
|
|
case cudaGraphNodeTypeWaitEvent : return "event_wait";
|
|
case cudaGraphNodeTypeEventRecord : return "event_record";
|
|
default : return "undefined";
|
|
}
|
|
}
|
|
|
|
/**
|
|
@brief dumps a native CUDA graph and all associated child graphs to a DOT format
|
|
|
|
@tparam T output stream target
|
|
@param os target output stream
|
|
@param graph native CUDA graph
|
|
*/
|
|
template <typename T>
|
|
void cuda_dump_graph(T& os, cudaGraph_t graph) {
|
|
|
|
os << "digraph cudaGraph {\n";
|
|
|
|
std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
|
|
stack.push(std::make_tuple(graph, nullptr, 1));
|
|
|
|
int pl = 0;
|
|
|
|
while(stack.empty() == false) {
|
|
|
|
auto [graph, parent, l] = stack.top();
|
|
stack.pop();
|
|
|
|
for(int i=0; i<pl-l+1; i++) {
|
|
os << "}\n";
|
|
}
|
|
|
|
os << "subgraph cluster_p" << graph << " {\n"
|
|
<< "label=\"cudaGraph-L" << l << "\";\n"
|
|
<< "color=\"purple\";\n";
|
|
|
|
auto nodes = cuda_get_graph_nodes(graph);
|
|
auto edges = cuda_get_graph_edges(graph);
|
|
|
|
for(auto& [from, to] : edges) {
|
|
os << 'p' << from << " -> " << 'p' << to << ";\n";
|
|
}
|
|
|
|
for(auto& node : nodes) {
|
|
auto type = cuda_get_graph_node_type(node);
|
|
if(type == cudaGraphNodeTypeGraph) {
|
|
|
|
cudaGraph_t graph;
|
|
TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &graph), "");
|
|
stack.push(std::make_tuple(graph, node, l+1));
|
|
|
|
os << 'p' << node << "["
|
|
<< "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
|
|
<< "label=\"cudaGraph-L" << l+1
|
|
<< "\"];\n";
|
|
}
|
|
else {
|
|
os << 'p' << node << "[label=\""
|
|
<< cuda_graph_node_type_to_string(type)
|
|
<< "\"];\n";
|
|
}
|
|
}
|
|
|
|
// precede to parent
|
|
if(parent != nullptr) {
|
|
std::unordered_set<cudaGraphNode_t> successors;
|
|
for(const auto& p : edges) {
|
|
successors.insert(p.first);
|
|
}
|
|
for(auto node : nodes) {
|
|
if(successors.find(node) == successors.end()) {
|
|
os << 'p' << node << " -> " << 'p' << parent << ";\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
// set the previous level
|
|
pl = l;
|
|
}
|
|
|
|
for(int i=0; i<=pl; i++) {
|
|
os << "}\n";
|
|
}
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaGraph class
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// class: cudaGraph
|
|
class cudaGraph : public CustomGraphBase {
|
|
|
|
friend class cudaNode;
|
|
friend class cudaTask;
|
|
friend class cudaFlowCapturerBase;
|
|
friend class cudaFlowCapturer;
|
|
friend class cudaFlow;
|
|
|
|
friend class Taskflow;
|
|
friend class Executor;
|
|
|
|
public:
|
|
|
|
cudaGraph() = default;
|
|
~cudaGraph();
|
|
|
|
cudaGraph(const cudaGraph&) = delete;
|
|
cudaGraph(cudaGraph&&);
|
|
|
|
cudaGraph& operator = (const cudaGraph&) = delete;
|
|
cudaGraph& operator = (cudaGraph&&);
|
|
|
|
template <typename... ArgsT>
|
|
cudaNode* emplace_back(ArgsT&&...);
|
|
|
|
void clear();
|
|
|
|
bool empty() const;
|
|
|
|
void dump(std::ostream&, const void*, const std::string&) const override final;
|
|
|
|
private:
|
|
|
|
cudaGraph_t _native_handle {nullptr};
|
|
|
|
// TODO: nvcc complains deleter of unique_ptr
|
|
//std::vector<std::unique_ptr<cudaNode>> _nodes;
|
|
std::vector<cudaNode*> _nodes;
|
|
std::vector<cudaNode*> _toposort();
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaNode class
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// class: cudaNode
|
|
// each create_native_node is wrapped in a function to call at runtime
|
|
// in order to work with gpu context
|
|
class cudaNode {
|
|
|
|
friend class cudaGraph;
|
|
friend class cudaTask;
|
|
friend class cudaFlow;
|
|
friend class cudaFlowCapturer;
|
|
friend class cudaFlowCapturerBase;
|
|
|
|
friend class Taskflow;
|
|
friend class Executor;
|
|
|
|
// Empty handle
|
|
struct Empty {
|
|
};
|
|
|
|
// Host handle
|
|
struct Host {
|
|
|
|
template <typename C>
|
|
Host(C&&);
|
|
|
|
std::function<void()> func;
|
|
|
|
static void callback(void*);
|
|
};
|
|
|
|
// Memset handle
|
|
struct Memset {
|
|
};
|
|
|
|
// Memcpy handle
|
|
struct Memcpy {
|
|
};
|
|
|
|
// Kernel handle
|
|
struct Kernel {
|
|
|
|
template <typename F>
|
|
Kernel(F&& f);
|
|
|
|
void* func {nullptr};
|
|
};
|
|
|
|
// Subflow handle
|
|
struct Subflow {
|
|
cudaGraph graph;
|
|
};
|
|
|
|
// Capture
|
|
struct Capture {
|
|
|
|
template <typename C>
|
|
Capture(C&&);
|
|
|
|
std::function<void(cudaStream_t)> work;
|
|
};
|
|
|
|
using handle_t = std::variant<
|
|
Empty,
|
|
Host,
|
|
Memset,
|
|
Memcpy,
|
|
Kernel,
|
|
Subflow,
|
|
Capture
|
|
>;
|
|
|
|
constexpr static auto STATE_VISITED = 0x1;
|
|
|
|
public:
|
|
|
|
// variant index
|
|
constexpr static auto CUDA_EMPTY_TASK = get_index_v<Empty, handle_t>;
|
|
constexpr static auto CUDA_HOST_TASK = get_index_v<Host, handle_t>;
|
|
constexpr static auto CUDA_MEMSET_TASK = get_index_v<Memset, handle_t>;
|
|
constexpr static auto CUDA_MEMCPY_TASK = get_index_v<Memcpy, handle_t>;
|
|
constexpr static auto CUDA_KERNEL_TASK = get_index_v<Kernel, handle_t>;
|
|
constexpr static auto CUDA_SUBFLOW_TASK = get_index_v<Subflow, handle_t>;
|
|
constexpr static auto CUDA_CAPTURE_TASK = get_index_v<Capture, handle_t>;
|
|
|
|
cudaNode() = delete;
|
|
|
|
template <typename... ArgsT>
|
|
cudaNode(cudaGraph&, ArgsT&&...);
|
|
|
|
private:
|
|
|
|
int _state;
|
|
|
|
cudaGraph& _graph;
|
|
|
|
std::string _name;
|
|
|
|
handle_t _handle;
|
|
|
|
cudaGraphNode_t _native_handle {nullptr};
|
|
|
|
std::vector<cudaNode*> _successors;
|
|
|
|
void _precede(cudaNode*);
|
|
void _set_state(int);
|
|
void _unset_state(int);
|
|
void _clear_state();
|
|
bool _has_state(int) const;
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaNode definitions
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Host handle constructor
|
|
template <typename C>
|
|
cudaNode::Host::Host(C&& c) : func {std::forward<C>(c)} {
|
|
}
|
|
|
|
// Host callback
|
|
inline void cudaNode::Host::callback(void* data) {
|
|
static_cast<Host*>(data)->func();
|
|
};
|
|
|
|
// Kernel handle constructor
|
|
template <typename F>
|
|
cudaNode::Kernel::Kernel(F&& f) :
|
|
func {std::forward<F>(f)} {
|
|
}
|
|
|
|
// Capture handle constructor
|
|
template <typename C>
|
|
cudaNode::Capture::Capture(C&& work) :
|
|
work {std::forward<C>(work)} {
|
|
}
|
|
|
|
// Constructor
|
|
template <typename... ArgsT>
|
|
cudaNode::cudaNode(cudaGraph& graph, ArgsT&&... args) :
|
|
_graph {graph},
|
|
_handle {std::forward<ArgsT>(args)...} {
|
|
}
|
|
|
|
// Procedure: _precede
|
|
inline void cudaNode::_precede(cudaNode* v) {
|
|
|
|
_successors.push_back(v);
|
|
|
|
// capture node doesn't have the native graph yet
|
|
if(_handle.index() != CUDA_CAPTURE_TASK) {
|
|
TF_CHECK_CUDA(
|
|
::cudaGraphAddDependencies(
|
|
_graph._native_handle, &_native_handle, &v->_native_handle, 1
|
|
),
|
|
"failed to add a preceding link ", this, "->", v
|
|
);
|
|
}
|
|
}
|
|
|
|
// Procedure: _set_state
|
|
inline void cudaNode::_set_state(int flag) {
|
|
_state |= flag;
|
|
}
|
|
|
|
// Procedure: _unset_state
|
|
inline void cudaNode::_unset_state(int flag) {
|
|
_state &= ~flag;
|
|
}
|
|
|
|
// Procedure: _clear_state
|
|
inline void cudaNode::_clear_state() {
|
|
_state = 0;
|
|
}
|
|
|
|
// Function: _has_state
|
|
inline bool cudaNode::_has_state(int flag) const {
|
|
return _state & flag;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// cudaGraph definitions
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Destructor
|
|
inline cudaGraph::~cudaGraph() {
|
|
clear();
|
|
assert(_native_handle == nullptr);
|
|
}
|
|
|
|
// Move constructor
|
|
inline cudaGraph::cudaGraph(cudaGraph&& g) :
|
|
_native_handle {g._native_handle},
|
|
_nodes {std::move(g._nodes)} {
|
|
|
|
g._native_handle = nullptr;
|
|
|
|
assert(g._nodes.empty());
|
|
}
|
|
|
|
// Move assignment
|
|
inline cudaGraph& cudaGraph::operator = (cudaGraph&& rhs) {
|
|
|
|
clear();
|
|
|
|
// lhs
|
|
_native_handle = rhs._native_handle;
|
|
_nodes = std::move(rhs._nodes);
|
|
|
|
assert(rhs._nodes.empty());
|
|
|
|
// rhs
|
|
rhs._native_handle = nullptr;
|
|
|
|
return *this;
|
|
}
|
|
|
|
// Function: empty
|
|
inline bool cudaGraph::empty() const {
|
|
return _nodes.empty();
|
|
}
|
|
|
|
// Procedure: clear
|
|
inline void cudaGraph::clear() {
|
|
for(auto n : _nodes) {
|
|
delete n;
|
|
}
|
|
_nodes.clear();
|
|
}
|
|
|
|
// Function: emplace_back
|
|
template <typename... ArgsT>
|
|
cudaNode* cudaGraph::emplace_back(ArgsT&&... args) {
|
|
//auto node = std::make_unique<cudaNode>(std::forward<ArgsT>(args)...);
|
|
//_nodes.emplace_back(std::move(node));
|
|
//return _nodes.back().get();
|
|
// TODO: object pool
|
|
|
|
auto node = new cudaNode(std::forward<ArgsT>(args)...);
|
|
_nodes.push_back(node);
|
|
return node;
|
|
}
|
|
|
|
// Procedure: _toposort
|
|
// topological sort iteratively
|
|
inline std::vector<cudaNode*> cudaGraph::_toposort() {
|
|
|
|
std::stack<cudaNode*> dfs;
|
|
std::vector<cudaNode*> res;
|
|
|
|
for(auto node : _nodes) {
|
|
node->_unset_state(cudaNode::STATE_VISITED);
|
|
}
|
|
|
|
for(auto node : _nodes) {
|
|
if(!node->_has_state(cudaNode::STATE_VISITED)) {
|
|
dfs.push(node);
|
|
}
|
|
|
|
while(!dfs.empty()) {
|
|
auto u = dfs.top();
|
|
dfs.pop();
|
|
|
|
if(u->_has_state(cudaNode::STATE_VISITED)){
|
|
res.push_back(u);
|
|
continue;
|
|
}
|
|
|
|
u->_set_state(cudaNode::STATE_VISITED);
|
|
dfs.push(u);
|
|
|
|
for(auto s : u->_successors) {
|
|
if(!(s->_has_state(cudaNode::STATE_VISITED))) {
|
|
dfs.push(s);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::reverse(res.begin(), res.end());
|
|
|
|
return res;
|
|
}
|
|
|
|
// Procedure: dump the graph to a DOT format
|
|
inline void cudaGraph::dump(
|
|
std::ostream& os, const void* root, const std::string& root_name
|
|
) const {
|
|
|
|
// recursive dump with stack
|
|
std::stack<std::tuple<const cudaGraph*, const cudaNode*, int>> stack;
|
|
stack.push(std::make_tuple(this, nullptr, 1));
|
|
|
|
int pl = 0;
|
|
|
|
while(!stack.empty()) {
|
|
|
|
auto [graph, parent, l] = stack.top();
|
|
stack.pop();
|
|
|
|
for(int i=0; i<pl-l+1; i++) {
|
|
os << "}\n";
|
|
}
|
|
|
|
if(parent == nullptr) {
|
|
if(root) {
|
|
os << "subgraph cluster_p" << root << " {\nlabel=\"cudaFlow: ";
|
|
if(root_name.empty()) os << 'p' << root;
|
|
else os << root_name;
|
|
os << "\";\n" << "color=\"purple\"\n";
|
|
}
|
|
else {
|
|
os << "digraph cudaFlow {\n";
|
|
}
|
|
}
|
|
else {
|
|
os << "subgraph cluster_p" << parent << " {\nlabel=\"cudaSubflow: ";
|
|
if(parent->_name.empty()) os << 'p' << parent;
|
|
else os << parent->_name;
|
|
os << "\";\n" << "color=\"purple\"\n";
|
|
}
|
|
|
|
for(auto& v : graph->_nodes) {
|
|
|
|
os << 'p' << v << "[label=\"";
|
|
if(v->_name.empty()) {
|
|
os << 'p' << v << "\"";
|
|
}
|
|
else {
|
|
os << v->_name << "\"";
|
|
}
|
|
|
|
switch(v->_handle.index()) {
|
|
case cudaNode::CUDA_KERNEL_TASK:
|
|
os << " style=\"filled\""
|
|
<< " color=\"white\" fillcolor=\"black\""
|
|
<< " fontcolor=\"white\""
|
|
<< " shape=\"box3d\"";
|
|
break;
|
|
|
|
case cudaNode::CUDA_SUBFLOW_TASK:
|
|
stack.push(std::make_tuple(
|
|
&std::get<cudaNode::Subflow>(v->_handle).graph, v, l+1)
|
|
);
|
|
os << " style=\"filled\""
|
|
<< " color=\"black\" fillcolor=\"purple\""
|
|
<< " fontcolor=\"white\""
|
|
<< " shape=\"folder\"";
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
os << "];\n";
|
|
|
|
for(const auto s : v->_successors) {
|
|
os << 'p' << v << " -> " << 'p' << s << ";\n";
|
|
}
|
|
|
|
if(v->_successors.size() == 0) {
|
|
if(parent == nullptr) {
|
|
if(root) {
|
|
os << 'p' << v << " -> p" << root << ";\n";
|
|
}
|
|
}
|
|
else {
|
|
os << 'p' << v << " -> p" << parent << ";\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
// set the previous level
|
|
pl = l;
|
|
}
|
|
|
|
for(int i=0; i<pl; i++) {
|
|
os << "}\n";
|
|
}
|
|
|
|
}
|
|
|
|
|
|
} // end of namespace tf -----------------------------------------------------
|
|
|
|
|
|
|
|
|