#pragma once #include "cuda_memory.hpp" #include "cuda_stream.hpp" #include "../utility/object_pool.hpp" #include "../utility/traits.hpp" #include "../utility/passive_vector.hpp" namespace tf { // ---------------------------------------------------------------------------- // cudaGraph_t routines // ---------------------------------------------------------------------------- /** @brief gets the memcpy node parameter of a copy task */ template , void>* = nullptr > cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) { using U = std::decay_t; cudaMemcpy3DParms p; p.srcArray = nullptr; p.srcPos = ::make_cudaPos(0, 0, 0); p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), num*sizeof(U), num, 1); p.dstArray = nullptr; p.dstPos = ::make_cudaPos(0, 0, 0); p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); p.kind = cudaMemcpyDefault; return p; } /** @brief gets the memcpy node parameter of a memcpy task (untyped) */ inline cudaMemcpy3DParms cuda_get_memcpy_parms( void* tgt, const void* src, size_t bytes ) { // Parameters in cudaPitchedPtr // d - Pointer to allocated memory // p - Pitch of allocated memory in bytes // xsz - Logical width of allocation in elements // ysz - Logical height of allocation in elements cudaMemcpy3DParms p; p.srcArray = nullptr; p.srcPos = ::make_cudaPos(0, 0, 0); p.srcPtr = ::make_cudaPitchedPtr(const_cast(src), bytes, bytes, 1); p.dstArray = nullptr; p.dstPos = ::make_cudaPos(0, 0, 0); p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1); p.extent = ::make_cudaExtent(bytes, 1, 1); p.kind = cudaMemcpyDefault; return p; } /** @brief gets the memset node parameter of a memcpy task (untyped) */ inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) { cudaMemsetParams p; p.dst = dst; p.value = ch; p.pitch = 0; //p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1; //p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count; p.elementSize = 1; // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief gets the memset node parameter of a fill task (typed) */ template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr > cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) { cudaMemsetParams p; p.dst = dst; // perform bit-wise copy p.value = 0; // crucial static_assert(sizeof(T) <= sizeof(p.value), "internal error"); std::memcpy(&p.value, &value, sizeof(T)); p.pitch = 0; p.elementSize = sizeof(T); // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief gets the memset node parameter of a zero task (typed) */ template && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr > cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) { cudaMemsetParams p; p.dst = dst; p.value = 0; p.pitch = 0; p.elementSize = sizeof(T); // either 1, 2, or 4 p.width = count; p.height = 1; return p; } /** @brief queries the number of root nodes in a native CUDA graph */ inline size_t cuda_get_graph_num_root_nodes(cudaGraph_t graph) { size_t num_nodes; TF_CHECK_CUDA( cudaGraphGetRootNodes(graph, nullptr, &num_nodes), "failed to get native graph root nodes" ); return num_nodes; } /** @brief queries the number of nodes in a native CUDA graph */ inline size_t cuda_get_graph_num_nodes(cudaGraph_t graph) { size_t num_nodes; TF_CHECK_CUDA( cudaGraphGetNodes(graph, nullptr, &num_nodes), "failed to get native graph nodes" ); return num_nodes; } /** @brief queries the number of edges in a native CUDA graph */ inline size_t cuda_get_graph_num_edges(cudaGraph_t graph) { size_t num_edges; TF_CHECK_CUDA( cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges), "failed to get native graph edges" ); return num_edges; } /** @brief acquires the nodes in a native CUDA graph */ inline std::vector cuda_get_graph_nodes(cudaGraph_t graph) { size_t num_nodes = cuda_get_graph_num_nodes(graph); std::vector nodes(num_nodes); TF_CHECK_CUDA( cudaGraphGetNodes(graph, nodes.data(), &num_nodes), "failed to get native graph nodes" ); return nodes; } /** @brief acquires the root nodes in a native CUDA graph */ inline std::vector cuda_get_graph_root_nodes(cudaGraph_t graph) { size_t num_nodes = cuda_get_graph_num_root_nodes(graph); std::vector nodes(num_nodes); TF_CHECK_CUDA( cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes), "failed to get native graph nodes" ); return nodes; } /** @brief acquires the edges in a native CUDA graph */ inline std::vector> cuda_get_graph_edges(cudaGraph_t graph) { size_t num_edges = cuda_get_graph_num_edges(graph); std::vector froms(num_edges), tos(num_edges); TF_CHECK_CUDA( cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges), "failed to get native graph edges" ); std::vector> edges(num_edges); for(size_t i=0; i void cuda_dump_graph(T& os, cudaGraph_t graph) { os << "digraph cudaGraph {\n"; std::stack> stack; stack.push(std::make_tuple(graph, nullptr, 1)); int pl = 0; while(stack.empty() == false) { auto [graph, parent, l] = stack.top(); stack.pop(); for(int i=0; i " << 'p' << to << ";\n"; } for(auto& node : nodes) { auto type = cuda_get_graph_node_type(node); if(type == cudaGraphNodeTypeGraph) { cudaGraph_t graph; TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &graph), ""); stack.push(std::make_tuple(graph, node, l+1)); os << 'p' << node << "[" << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, " << "label=\"cudaGraph-L" << l+1 << "\"];\n"; } else { os << 'p' << node << "[label=\"" << cuda_graph_node_type_to_string(type) << "\"];\n"; } } // precede to parent if(parent != nullptr) { std::unordered_set successors; for(const auto& p : edges) { successors.insert(p.first); } for(auto node : nodes) { if(successors.find(node) == successors.end()) { os << 'p' << node << " -> " << 'p' << parent << ";\n"; } } } // set the previous level pl = l; } for(int i=0; i<=pl; i++) { os << "}\n"; } } // ---------------------------------------------------------------------------- // cudaGraph class // ---------------------------------------------------------------------------- // class: cudaGraph class cudaGraph : public CustomGraphBase { friend class cudaNode; friend class cudaTask; friend class cudaFlowCapturerBase; friend class cudaFlowCapturer; friend class cudaFlow; friend class Taskflow; friend class Executor; public: cudaGraph() = default; ~cudaGraph(); cudaGraph(const cudaGraph&) = delete; cudaGraph(cudaGraph&&); cudaGraph& operator = (const cudaGraph&) = delete; cudaGraph& operator = (cudaGraph&&); template cudaNode* emplace_back(ArgsT&&...); void clear(); bool empty() const; void dump(std::ostream&, const void*, const std::string&) const override final; private: cudaGraph_t _native_handle {nullptr}; // TODO: nvcc complains deleter of unique_ptr //std::vector> _nodes; std::vector _nodes; std::vector _toposort(); }; // ---------------------------------------------------------------------------- // cudaNode class // ---------------------------------------------------------------------------- // class: cudaNode // each create_native_node is wrapped in a function to call at runtime // in order to work with gpu context class cudaNode { friend class cudaGraph; friend class cudaTask; friend class cudaFlow; friend class cudaFlowCapturer; friend class cudaFlowCapturerBase; friend class Taskflow; friend class Executor; // Empty handle struct Empty { }; // Host handle struct Host { template Host(C&&); std::function func; static void callback(void*); }; // Memset handle struct Memset { }; // Memcpy handle struct Memcpy { }; // Kernel handle struct Kernel { template Kernel(F&& f); void* func {nullptr}; }; // Subflow handle struct Subflow { cudaGraph graph; }; // Capture struct Capture { template Capture(C&&); std::function work; }; using handle_t = std::variant< Empty, Host, Memset, Memcpy, Kernel, Subflow, Capture >; constexpr static auto STATE_VISITED = 0x1; public: // variant index constexpr static auto CUDA_EMPTY_TASK = get_index_v; constexpr static auto CUDA_HOST_TASK = get_index_v; constexpr static auto CUDA_MEMSET_TASK = get_index_v; constexpr static auto CUDA_MEMCPY_TASK = get_index_v; constexpr static auto CUDA_KERNEL_TASK = get_index_v; constexpr static auto CUDA_SUBFLOW_TASK = get_index_v; constexpr static auto CUDA_CAPTURE_TASK = get_index_v; cudaNode() = delete; template cudaNode(cudaGraph&, ArgsT&&...); private: int _state; cudaGraph& _graph; std::string _name; handle_t _handle; cudaGraphNode_t _native_handle {nullptr}; std::vector _successors; void _precede(cudaNode*); void _set_state(int); void _unset_state(int); void _clear_state(); bool _has_state(int) const; }; // ---------------------------------------------------------------------------- // cudaNode definitions // ---------------------------------------------------------------------------- // Host handle constructor template cudaNode::Host::Host(C&& c) : func {std::forward(c)} { } // Host callback inline void cudaNode::Host::callback(void* data) { static_cast(data)->func(); }; // Kernel handle constructor template cudaNode::Kernel::Kernel(F&& f) : func {std::forward(f)} { } // Capture handle constructor template cudaNode::Capture::Capture(C&& work) : work {std::forward(work)} { } // Constructor template cudaNode::cudaNode(cudaGraph& graph, ArgsT&&... args) : _graph {graph}, _handle {std::forward(args)...} { } // Procedure: _precede inline void cudaNode::_precede(cudaNode* v) { _successors.push_back(v); // capture node doesn't have the native graph yet if(_handle.index() != CUDA_CAPTURE_TASK) { TF_CHECK_CUDA( ::cudaGraphAddDependencies( _graph._native_handle, &_native_handle, &v->_native_handle, 1 ), "failed to add a preceding link ", this, "->", v ); } } // Procedure: _set_state inline void cudaNode::_set_state(int flag) { _state |= flag; } // Procedure: _unset_state inline void cudaNode::_unset_state(int flag) { _state &= ~flag; } // Procedure: _clear_state inline void cudaNode::_clear_state() { _state = 0; } // Function: _has_state inline bool cudaNode::_has_state(int flag) const { return _state & flag; } // ---------------------------------------------------------------------------- // cudaGraph definitions // ---------------------------------------------------------------------------- // Destructor inline cudaGraph::~cudaGraph() { clear(); assert(_native_handle == nullptr); } // Move constructor inline cudaGraph::cudaGraph(cudaGraph&& g) : _native_handle {g._native_handle}, _nodes {std::move(g._nodes)} { g._native_handle = nullptr; assert(g._nodes.empty()); } // Move assignment inline cudaGraph& cudaGraph::operator = (cudaGraph&& rhs) { clear(); // lhs _native_handle = rhs._native_handle; _nodes = std::move(rhs._nodes); assert(rhs._nodes.empty()); // rhs rhs._native_handle = nullptr; return *this; } // Function: empty inline bool cudaGraph::empty() const { return _nodes.empty(); } // Procedure: clear inline void cudaGraph::clear() { for(auto n : _nodes) { delete n; } _nodes.clear(); } // Function: emplace_back template cudaNode* cudaGraph::emplace_back(ArgsT&&... args) { //auto node = std::make_unique(std::forward(args)...); //_nodes.emplace_back(std::move(node)); //return _nodes.back().get(); // TODO: object pool auto node = new cudaNode(std::forward(args)...); _nodes.push_back(node); return node; } // Procedure: _toposort // topological sort iteratively inline std::vector cudaGraph::_toposort() { std::stack dfs; std::vector res; for(auto node : _nodes) { node->_unset_state(cudaNode::STATE_VISITED); } for(auto node : _nodes) { if(!node->_has_state(cudaNode::STATE_VISITED)) { dfs.push(node); } while(!dfs.empty()) { auto u = dfs.top(); dfs.pop(); if(u->_has_state(cudaNode::STATE_VISITED)){ res.push_back(u); continue; } u->_set_state(cudaNode::STATE_VISITED); dfs.push(u); for(auto s : u->_successors) { if(!(s->_has_state(cudaNode::STATE_VISITED))) { dfs.push(s); } } } } std::reverse(res.begin(), res.end()); return res; } // Procedure: dump the graph to a DOT format inline void cudaGraph::dump( std::ostream& os, const void* root, const std::string& root_name ) const { // recursive dump with stack std::stack> stack; stack.push(std::make_tuple(this, nullptr, 1)); int pl = 0; while(!stack.empty()) { auto [graph, parent, l] = stack.top(); stack.pop(); for(int i=0; i_name.empty()) os << 'p' << parent; else os << parent->_name; os << "\";\n" << "color=\"purple\"\n"; } for(auto& v : graph->_nodes) { os << 'p' << v << "[label=\""; if(v->_name.empty()) { os << 'p' << v << "\""; } else { os << v->_name << "\""; } switch(v->_handle.index()) { case cudaNode::CUDA_KERNEL_TASK: os << " style=\"filled\"" << " color=\"white\" fillcolor=\"black\"" << " fontcolor=\"white\"" << " shape=\"box3d\""; break; case cudaNode::CUDA_SUBFLOW_TASK: stack.push(std::make_tuple( &std::get(v->_handle).graph, v, l+1) ); os << " style=\"filled\"" << " color=\"black\" fillcolor=\"purple\"" << " fontcolor=\"white\"" << " shape=\"folder\""; break; default: break; } os << "];\n"; for(const auto s : v->_successors) { os << 'p' << v << " -> " << 'p' << s << ";\n"; } if(v->_successors.size() == 0) { if(parent == nullptr) { if(root) { os << 'p' << v << " -> p" << root << ";\n"; } } else { os << 'p' << v << " -> p" << parent << ";\n"; } } } // set the previous level pl = l; } for(int i=0; i