// cocos-engine-external/sources/taskflow/cuda/cuda_graph.hpp

#pragma once
#include "cuda_memory.hpp"
#include "cuda_stream.hpp"
#include "../utility/object_pool.hpp"
#include "../utility/traits.hpp"
#include "../utility/passive_vector.hpp"
namespace tf {
// ----------------------------------------------------------------------------
// cudaGraph_t routines
// ----------------------------------------------------------------------------
/**
@brief gets the memcpy node parameter of a copy task
*/
template <typename T,
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
>
cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
using U = std::decay_t<T>;
cudaMemcpy3DParms p;
p.srcArray = nullptr;
p.srcPos = ::make_cudaPos(0, 0, 0);
p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
p.dstArray = nullptr;
p.dstPos = ::make_cudaPos(0, 0, 0);
p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
p.kind = cudaMemcpyDefault;
return p;
}
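// A minimal usage sketch (illustrative, not part of the library): the returned
// parameters can be passed to cudaGraphAddMemcpyNode to create a copy node in a
// native CUDA graph. `d_out` and `d_in` are hypothetical device pointers of the
// same type; error checking is omitted for brevity.
//
//   cudaGraph_t graph;
//   cudaGraphCreate(&graph, 0);
//   cudaMemcpy3DParms parms = tf::cuda_get_copy_parms(d_out, d_in, 1024);
//   cudaGraphNode_t node;
//   cudaGraphAddMemcpyNode(&node, graph, nullptr, 0, &parms);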
/**
@brief gets the memcpy node parameter of a memcpy task (untyped)
*/
inline cudaMemcpy3DParms cuda_get_memcpy_parms(
void* tgt, const void* src, size_t bytes
) {
// Parameters in cudaPitchedPtr
// d - Pointer to allocated memory
// p - Pitch of allocated memory in bytes
// xsz - Logical width of allocation in elements
// ysz - Logical height of allocation in elements
cudaMemcpy3DParms p;
p.srcArray = nullptr;
p.srcPos = ::make_cudaPos(0, 0, 0);
p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
p.dstArray = nullptr;
p.dstPos = ::make_cudaPos(0, 0, 0);
p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
p.extent = ::make_cudaExtent(bytes, 1, 1);
p.kind = cudaMemcpyDefault;
return p;
}
/**
@brief gets the memset node parameter of a memset task (untyped)
*/
inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
cudaMemsetParams p;
p.dst = dst;
p.value = ch;
p.pitch = 0;
//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
p.elementSize = 1; // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
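// A minimal usage sketch (illustrative): the parameters drive a memset node that
// writes the low byte of `ch` over `count` bytes. `graph`, `d_buf`, and `bytes`
// are hypothetical; error checking is omitted.
//
//   cudaMemsetParams parms = tf::cuda_get_memset_parms(d_buf, 0xff, bytes);
//   cudaGraphNode_t node;
//   cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &parms);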
/**
@brief gets the memset node parameter of a fill task (typed)
*/
template <typename T, std::enable_if_t<
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
>
cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
cudaMemsetParams p;
p.dst = dst;
// perform bit-wise copy
p.value = 0; // crucial
static_assert(sizeof(T) <= sizeof(p.value), "internal error");
std::memcpy(&p.value, &value, sizeof(T));
p.pitch = 0;
p.elementSize = sizeof(T); // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
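// A minimal usage sketch (illustrative): because the fill value is copied
// bit-wise into cudaMemsetParams::value, non-integer 1/2/4-byte types such as
// float also work. `graph`, `d_data`, and `N` are hypothetical.
//
//   float* d_data = /* device buffer of N floats */;
//   cudaMemsetParams parms = tf::cuda_get_fill_parms(d_data, 1.5f, N);
//   cudaGraphNode_t node;
//   cudaGraphAddMemsetNode(&node, graph, nullptr, 0, &parms);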
/**
@brief gets the memset node parameter of a zero task (typed)
*/
template <typename T, std::enable_if_t<
is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
>
cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
cudaMemsetParams p;
p.dst = dst;
p.value = 0;
p.pitch = 0;
p.elementSize = sizeof(T); // either 1, 2, or 4
p.width = count;
p.height = 1;
return p;
}
/**
@brief queries the number of root nodes in a native CUDA graph
*/
inline size_t cuda_get_graph_num_root_nodes(cudaGraph_t graph) {
size_t num_nodes;
TF_CHECK_CUDA(
cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
"failed to get native graph root nodes"
);
return num_nodes;
}
/**
@brief queries the number of nodes in a native CUDA graph
*/
inline size_t cuda_get_graph_num_nodes(cudaGraph_t graph) {
size_t num_nodes;
TF_CHECK_CUDA(
cudaGraphGetNodes(graph, nullptr, &num_nodes),
"failed to get native graph nodes"
);
return num_nodes;
}
/**
@brief queries the number of edges in a native CUDA graph
*/
inline size_t cuda_get_graph_num_edges(cudaGraph_t graph) {
size_t num_edges;
TF_CHECK_CUDA(
cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges),
"failed to get native graph edges"
);
return num_edges;
}
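// A small sketch (illustrative) combining the three queries above to report
// graph statistics; `graph` is a hypothetical, already-constructed cudaGraph_t.
//
//   std::cout << tf::cuda_get_graph_num_nodes(graph)      << " nodes, "
//             << tf::cuda_get_graph_num_root_nodes(graph) << " roots, "
//             << tf::cuda_get_graph_num_edges(graph)      << " edges\n";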
/**
@brief acquires the nodes in a native CUDA graph
*/
inline std::vector<cudaGraphNode_t> cuda_get_graph_nodes(cudaGraph_t graph) {
size_t num_nodes = cuda_get_graph_num_nodes(graph);
std::vector<cudaGraphNode_t> nodes(num_nodes);
TF_CHECK_CUDA(
cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
"failed to get native graph nodes"
);
return nodes;
}
/**
@brief acquires the root nodes in a native CUDA graph
*/
inline std::vector<cudaGraphNode_t> cuda_get_graph_root_nodes(cudaGraph_t graph) {
size_t num_nodes = cuda_get_graph_num_root_nodes(graph);
std::vector<cudaGraphNode_t> nodes(num_nodes);
TF_CHECK_CUDA(
cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
"failed to get native graph nodes"
);
return nodes;
}
/**
@brief acquires the edges in a native CUDA graph
*/
inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
cuda_get_graph_edges(cudaGraph_t graph) {
size_t num_edges = cuda_get_graph_num_edges(graph);
std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
TF_CHECK_CUDA(
cudaGraphGetEdges(graph, froms.data(), tos.data(), &num_edges),
"failed to get native graph edges"
);
std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
for(size_t i=0; i<num_edges; i++) {
edges[i] = std::make_pair(froms[i], tos[i]);
}
return edges;
}
/**
@brief queries the type of a native CUDA graph node
valid type values are:
+ cudaGraphNodeTypeKernel = 0x00
+ cudaGraphNodeTypeMemcpy = 0x01
+ cudaGraphNodeTypeMemset = 0x02
+ cudaGraphNodeTypeHost = 0x03
+ cudaGraphNodeTypeGraph = 0x04
+ cudaGraphNodeTypeEmpty = 0x05
+ cudaGraphNodeTypeWaitEvent = 0x06
+ cudaGraphNodeTypeEventRecord = 0x07
*/
inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
cudaGraphNodeType type;
TF_CHECK_CUDA(
cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
);
return type;
}
/**
@brief converts the type of a native CUDA graph node to a readable string
*/
inline const char* cuda_graph_node_type_to_string(cudaGraphNodeType type) {
switch(type) {
case cudaGraphNodeTypeKernel : return "kernel";
case cudaGraphNodeTypeMemcpy : return "memcpy";
case cudaGraphNodeTypeMemset : return "memset";
case cudaGraphNodeTypeHost : return "host";
case cudaGraphNodeTypeGraph : return "graph";
case cudaGraphNodeTypeEmpty : return "empty";
case cudaGraphNodeTypeWaitEvent : return "event_wait";
case cudaGraphNodeTypeEventRecord : return "event_record";
default : return "undefined";
}
}
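// A small sketch (illustrative) that lists every node of a native graph with a
// readable type name, using the helpers above; `graph` is hypothetical.
//
//   for(auto node : tf::cuda_get_graph_nodes(graph)) {
//     std::cout << node << ": "
//               << tf::cuda_graph_node_type_to_string(tf::cuda_get_graph_node_type(node))
//               << '\n';
//   }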
/**
@brief dumps a native CUDA graph and all associated child graphs to a DOT format
@tparam T output stream target
@param os target output stream
@param graph native CUDA graph
*/
template <typename T>
void cuda_dump_graph(T& os, cudaGraph_t graph) {
os << "digraph cudaGraph {\n";
std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
stack.push(std::make_tuple(graph, nullptr, 1));
int pl = 0;
while(stack.empty() == false) {
auto [graph, parent, l] = stack.top();
stack.pop();
for(int i=0; i<pl-l+1; i++) {
os << "}\n";
}
os << "subgraph cluster_p" << graph << " {\n"
<< "label=\"cudaGraph-L" << l << "\";\n"
<< "color=\"purple\";\n";
auto nodes = cuda_get_graph_nodes(graph);
auto edges = cuda_get_graph_edges(graph);
for(auto& [from, to] : edges) {
os << 'p' << from << " -> " << 'p' << to << ";\n";
}
for(auto& node : nodes) {
auto type = cuda_get_graph_node_type(node);
if(type == cudaGraphNodeTypeGraph) {
cudaGraph_t graph;
TF_CHECK_CUDA(
cudaGraphChildGraphNodeGetGraph(node, &graph),
"failed to get the child graph of node ", node
);
stack.push(std::make_tuple(graph, node, l+1));
os << 'p' << node << "["
<< "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
<< "label=\"cudaGraph-L" << l+1
<< "\"];\n";
}
else {
os << 'p' << node << "[label=\""
<< cuda_graph_node_type_to_string(type)
<< "\"];\n";
}
}
// connect sink nodes (nodes without successors) to the parent child-graph node
if(parent != nullptr) {
std::unordered_set<cudaGraphNode_t> successors;
for(const auto& p : edges) {
successors.insert(p.first);
}
for(auto node : nodes) {
if(successors.find(node) == successors.end()) {
os << 'p' << node << " -> " << 'p' << parent << ";\n";
}
}
}
// set the previous level
pl = l;
}
for(int i=0; i<=pl; i++) {
os << "}\n";
}
}
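// A usage sketch (illustrative): write the DOT output to a file and render it
// with Graphviz; requires <fstream>, and `graph` is a hypothetical cudaGraph_t.
//
//   std::ofstream ofs("cuda_graph.dot");
//   tf::cuda_dump_graph(ofs, graph);
//   // then, from a shell:  dot -Tpng cuda_graph.dot -o cuda_graph.png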
// ----------------------------------------------------------------------------
// cudaGraph class
// ----------------------------------------------------------------------------
// class: cudaGraph
class cudaGraph : public CustomGraphBase {
friend class cudaNode;
friend class cudaTask;
friend class cudaFlowCapturerBase;
friend class cudaFlowCapturer;
friend class cudaFlow;
friend class Taskflow;
friend class Executor;
public:
cudaGraph() = default;
~cudaGraph();
cudaGraph(const cudaGraph&) = delete;
cudaGraph(cudaGraph&&);
cudaGraph& operator = (const cudaGraph&) = delete;
cudaGraph& operator = (cudaGraph&&);
template <typename... ArgsT>
cudaNode* emplace_back(ArgsT&&...);
void clear();
bool empty() const;
void dump(std::ostream&, const void*, const std::string&) const override final;
private:
cudaGraph_t _native_handle {nullptr};
// TODO: nvcc complains about the deleter of unique_ptr
//std::vector<std::unique_ptr<cudaNode>> _nodes;
std::vector<cudaNode*> _nodes;
std::vector<cudaNode*> _toposort();
};
// ----------------------------------------------------------------------------
// cudaNode class
// ----------------------------------------------------------------------------
// class: cudaNode
// each native-node creation is wrapped in a callable that is invoked at runtime
// so that it executes under the correct GPU (device) context
class cudaNode {
friend class cudaGraph;
friend class cudaTask;
friend class cudaFlow;
friend class cudaFlowCapturer;
friend class cudaFlowCapturerBase;
friend class Taskflow;
friend class Executor;
// Empty handle
struct Empty {
};
// Host handle
struct Host {
template <typename C>
Host(C&&);
std::function<void()> func;
static void callback(void*);
};
// Memset handle
struct Memset {
};
// Memcpy handle
struct Memcpy {
};
// Kernel handle
struct Kernel {
template <typename F>
Kernel(F&& f);
void* func {nullptr};
};
// Subflow handle
struct Subflow {
cudaGraph graph;
};
// Capture
struct Capture {
template <typename C>
Capture(C&&);
std::function<void(cudaStream_t)> work;
};
using handle_t = std::variant<
Empty,
Host,
Memset,
Memcpy,
Kernel,
Subflow,
Capture
>;
constexpr static auto STATE_VISITED = 0x1;
public:
// variant index
constexpr static auto CUDA_EMPTY_TASK = get_index_v<Empty, handle_t>;
constexpr static auto CUDA_HOST_TASK = get_index_v<Host, handle_t>;
constexpr static auto CUDA_MEMSET_TASK = get_index_v<Memset, handle_t>;
constexpr static auto CUDA_MEMCPY_TASK = get_index_v<Memcpy, handle_t>;
constexpr static auto CUDA_KERNEL_TASK = get_index_v<Kernel, handle_t>;
constexpr static auto CUDA_SUBFLOW_TASK = get_index_v<Subflow, handle_t>;
constexpr static auto CUDA_CAPTURE_TASK = get_index_v<Capture, handle_t>;
cudaNode() = delete;
template <typename... ArgsT>
cudaNode(cudaGraph&, ArgsT&&...);
private:
int _state {0};
cudaGraph& _graph;
std::string _name;
handle_t _handle;
cudaGraphNode_t _native_handle {nullptr};
std::vector<cudaNode*> _successors;
void _precede(cudaNode*);
void _set_state(int);
void _unset_state(int);
void _clear_state();
bool _has_state(int) const;
};
// ----------------------------------------------------------------------------
// cudaNode definitions
// ----------------------------------------------------------------------------
// Host handle constructor
template <typename C>
cudaNode::Host::Host(C&& c) : func {std::forward<C>(c)} {
}
// Host callback
inline void cudaNode::Host::callback(void* data) {
static_cast<Host*>(data)->func();
}
// Kernel handle constructor
template <typename F>
cudaNode::Kernel::Kernel(F&& f) :
func {std::forward<F>(f)} {
}
// Capture handle constructor
template <typename C>
cudaNode::Capture::Capture(C&& work) :
work {std::forward<C>(work)} {
}
// Constructor
template <typename... ArgsT>
cudaNode::cudaNode(cudaGraph& graph, ArgsT&&... args) :
_graph {graph},
_handle {std::forward<ArgsT>(args)...} {
}
// Procedure: _precede
inline void cudaNode::_precede(cudaNode* v) {
_successors.push_back(v);
// capture node doesn't have the native graph yet
if(_handle.index() != CUDA_CAPTURE_TASK) {
TF_CHECK_CUDA(
::cudaGraphAddDependencies(
_graph._native_handle, &_native_handle, &v->_native_handle, 1
),
"failed to add a preceding link ", this, "->", v
);
}
}
// Procedure: _set_state
inline void cudaNode::_set_state(int flag) {
_state |= flag;
}
// Procedure: _unset_state
inline void cudaNode::_unset_state(int flag) {
_state &= ~flag;
}
// Procedure: _clear_state
inline void cudaNode::_clear_state() {
_state = 0;
}
// Function: _has_state
inline bool cudaNode::_has_state(int flag) const {
return _state & flag;
}
// ----------------------------------------------------------------------------
// cudaGraph definitions
// ----------------------------------------------------------------------------
// Destructor
inline cudaGraph::~cudaGraph() {
clear();
assert(_native_handle == nullptr);
}
// Move constructor
inline cudaGraph::cudaGraph(cudaGraph&& g) :
_native_handle {g._native_handle},
_nodes {std::move(g._nodes)} {
g._native_handle = nullptr;
assert(g._nodes.empty());
}
// Move assignment
inline cudaGraph& cudaGraph::operator = (cudaGraph&& rhs) {
clear();
// lhs
_native_handle = rhs._native_handle;
_nodes = std::move(rhs._nodes);
assert(rhs._nodes.empty());
// rhs
rhs._native_handle = nullptr;
return *this;
}
// Function: empty
inline bool cudaGraph::empty() const {
return _nodes.empty();
}
// Procedure: clear
inline void cudaGraph::clear() {
for(auto n : _nodes) {
delete n;
}
_nodes.clear();
}
// Function: emplace_back
template <typename... ArgsT>
cudaNode* cudaGraph::emplace_back(ArgsT&&... args) {
//auto node = std::make_unique<cudaNode>(std::forward<ArgsT>(args)...);
//_nodes.emplace_back(std::move(node));
//return _nodes.back().get();
// TODO: object pool
auto node = new cudaNode(std::forward<ArgsT>(args)...);
_nodes.push_back(node);
return node;
}
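// A sketch (illustrative) of how a friend class such as cudaFlow may create a
// kernel node through emplace_back: the std::in_place_type_t tag selects the
// variant alternative held by the node. `my_kernel` and `another_node` are
// hypothetical; the dependency link assumes the native handles have been set.
//
//   auto node = graph.emplace_back(
//     graph, std::in_place_type_t<cudaNode::Kernel>{}, (void*)my_kernel
//   );
//   node->_precede(another_node);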
// Procedure: _toposort
// iterative DFS-based topological sort: each node is stacked once for the
// pre-visit (expand successors) and once more for the post-visit emission,
// distinguished by a bool flag, so nodes shared by several predecessors are
// emitted exactly once
inline std::vector<cudaNode*> cudaGraph::_toposort() {
std::stack<std::pair<cudaNode*, bool>> dfs;
std::vector<cudaNode*> res;
res.reserve(_nodes.size());
for(auto node : _nodes) {
node->_unset_state(cudaNode::STATE_VISITED);
}
for(auto node : _nodes) {
if(!node->_has_state(cudaNode::STATE_VISITED)) {
dfs.emplace(node, false);
}
while(!dfs.empty()) {
auto [u, postorder] = dfs.top();
dfs.pop();
// post-visit: all successors are finished, emit the node
if(postorder) {
res.push_back(u);
continue;
}
// pre-visit: skip if another path already visited this node
if(u->_has_state(cudaNode::STATE_VISITED)) {
continue;
}
u->_set_state(cudaNode::STATE_VISITED);
dfs.emplace(u, true);
for(auto s : u->_successors) {
if(!(s->_has_state(cudaNode::STATE_VISITED))) {
dfs.emplace(s, false);
}
}
}
}
std::reverse(res.begin(), res.end());
return res;
}
// Procedure: dump the graph to a DOT format
inline void cudaGraph::dump(
std::ostream& os, const void* root, const std::string& root_name
) const {
// iterative dump using an explicit stack of (graph, parent, level) tuples
std::stack<std::tuple<const cudaGraph*, const cudaNode*, int>> stack;
stack.push(std::make_tuple(this, nullptr, 1));
int pl = 0;
while(!stack.empty()) {
auto [graph, parent, l] = stack.top();
stack.pop();
for(int i=0; i<pl-l+1; i++) {
os << "}\n";
}
if(parent == nullptr) {
if(root) {
os << "subgraph cluster_p" << root << " {\nlabel=\"cudaFlow: ";
if(root_name.empty()) os << 'p' << root;
else os << root_name;
os << "\";\n" << "color=\"purple\"\n";
}
else {
os << "digraph cudaFlow {\n";
}
}
else {
os << "subgraph cluster_p" << parent << " {\nlabel=\"cudaSubflow: ";
if(parent->_name.empty()) os << 'p' << parent;
else os << parent->_name;
os << "\";\n" << "color=\"purple\"\n";
}
for(auto& v : graph->_nodes) {
os << 'p' << v << "[label=\"";
if(v->_name.empty()) {
os << 'p' << v << "\"";
}
else {
os << v->_name << "\"";
}
switch(v->_handle.index()) {
case cudaNode::CUDA_KERNEL_TASK:
os << " style=\"filled\""
<< " color=\"white\" fillcolor=\"black\""
<< " fontcolor=\"white\""
<< " shape=\"box3d\"";
break;
case cudaNode::CUDA_SUBFLOW_TASK:
stack.push(std::make_tuple(
&std::get<cudaNode::Subflow>(v->_handle).graph, v, l+1)
);
os << " style=\"filled\""
<< " color=\"black\" fillcolor=\"purple\""
<< " fontcolor=\"white\""
<< " shape=\"folder\"";
break;
default:
break;
}
os << "];\n";
for(const auto s : v->_successors) {
os << 'p' << v << " -> " << 'p' << s << ";\n";
}
if(v->_successors.size() == 0) {
if(parent == nullptr) {
if(root) {
os << 'p' << v << " -> p" << root << ";\n";
}
}
else {
os << 'p' << v << " -> p" << parent << ";\n";
}
}
}
// set the previous level
pl = l;
}
for(int i=0; i<pl; i++) {
os << "}\n";
}
}
} // end of namespace tf -----------------------------------------------------