cocos-engine-external/sources/taskflow/cuda/cuda_capturer.hpp

#pragma once

#include "cuda_task.hpp"
#include "cuda_algorithm/cuda_for_each.hpp"
#include "cuda_algorithm/cuda_transform.hpp"
#include "cuda_algorithm/cuda_reduce.hpp"

/**
@file cuda_capturer.hpp
@brief cudaFlowCapturer include file
*/

namespace tf {

/**
@brief queries the maximum threads allowed per block
*/
constexpr size_t cuda_default_max_threads_per_block() {
  return 512;
}

/**
@brief queries the default number of threads per block in an 1D vector of N elements
*/
constexpr size_t cuda_default_threads_per_block(size_t N) {
  // TODO: special case when N == 0?
  if(N <= 32) return 32;
  else {
    return std::min(cuda_default_max_threads_per_block(), next_pow2(N));
  }
}

// ----------------------------------------------------------------------------
// class definition: cudaFlowCapturerBase
// ----------------------------------------------------------------------------

/**
@class cudaFlowCapturerBase

@brief base class to construct a CUDA task graph through stream capture
*/
class cudaFlowCapturerBase {

  friend class cudaFlowCapturer;

  public:

    /**
    @brief default constructor
     */
    cudaFlowCapturerBase() = default;

    /**
    @brief default virtual destructor
     */
    virtual ~cudaFlowCapturerBase() = default;

    /**
    @brief dumps the capture graph into a DOT format through an
           output stream
    */
    void dump(std::ostream& os) const;

    /**
    @brief captures a sequential CUDA operations from the given callable

    @tparam C callable type constructible with @c std::function<void(cudaStream_t)>
    @param callable a callable to capture CUDA operations with the stream

    This methods applies a stream created by the flow to capture
    a sequence of CUDA operations defined in the callable.
    */
    template <typename C, std::enable_if_t<
      std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
    >
    cudaTask on(C&& callable);

    /**
    @brief copies data between host and device asynchronously through a stream

    @param dst destination memory address
    @param src source memory address
    @param count size in bytes to copy

    The method captures a @c cudaMemcpyAsync operation through an
    internal stream.
    */
    cudaTask memcpy(void* dst, const void* src, size_t count);

    /**
    @brief captures a copy task of typed data

    @tparam T element type (non-void)

    @param tgt pointer to the target memory block
    @param src pointer to the source memory block
    @param num number of elements to copy

    @return cudaTask handle

    A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location
    to a target location. Direction can be arbitrary among CPUs and GPUs.
    */
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask copy(T* tgt, const T* src, size_t num);

    /**
    @brief initializes or sets GPU memory to the given value byte by byte

    @param devPtr pointer to GPU mempry
    @param value value to set for each byte of the specified memory
    @param count size in bytes to set

    The method captures a @c cudaMemsetAsync operation through an
    internal stream to fill the first @c count bytes of the memory area
    pointed to by @c devPtr with the constant byte value @c value.
    */
    cudaTask memset(void* devPtr, int value, size_t count);

    /**
    @brief captures a kernel

    @tparam F kernel function type
    @tparam ArgsT kernel function parameters type

    @param g configured grid
    @param b configured block
    @param s configured shared memory
    @param f kernel function
    @param args arguments to forward to the kernel function by copy

    @return cudaTask handle
    */
    template <typename F, typename... ArgsT>
    cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);

    // ------------------------------------------------------------------------
    // generic algorithms
    // ------------------------------------------------------------------------

    /**
    @brief capturers a kernel to runs the given callable with only one thread

    @tparam C callable type

    @param callable callable to run by a single kernel thread
    */
    template <typename C>
    cudaTask single_task(C&& callable);

    /**
    @brief captures a kernel that applies a callable to each dereferenced element
           of the data array

    @tparam I iterator type
    @tparam C callable type

    @param first iterator to the beginning (inclusive)
    @param last iterator to the end (exclusive)
    @param callable a callable object to apply to the dereferenced iterator

    @return cudaTask handle

    This method is equivalent to the parallel execution of the following loop on a GPU:

    @code{.cpp}
    for(auto itr = first; itr != last; i++) {
      callable(*itr);
    }
    @endcode
    */
    template <typename I, typename C>
    cudaTask for_each(I first, I last, C&& callable);

    /**
    @brief captures a kernel that applies a callable to each index in the range
           with the step size

    @tparam I index type
    @tparam C callable type

    @param first beginning index
    @param last last index
    @param step step size
    @param callable the callable to apply to each element in the data array

    @return cudaTask handle

    This method is equivalent to the parallel execution of the following loop on a GPU:

    @code{.cpp}
    // step is positive [first, last)
    for(auto i=first; i<last; i+=step) {
      callable(i);
    }

    // step is negative [first, last)
    for(auto i=first; i>last; i+=step) {
      callable(i);
    }
    @endcode
    */
    template <typename I, typename C>
    cudaTask for_each_index(I first, I last, I step, C&& callable);

    /**
    @brief captures a kernel that applies a callable to a source range and
           stores the result in a target range

    @tparam I iterator type
    @tparam C callable type
    @tparam S source types

    @param first iterator to the beginning (inclusive)
    @param last iterator to the end (exclusive)
    @param callable the callable to apply to each element in the range
    @param srcs iterators to the source ranges

    @return cudaTask handle

    This method is equivalent to the parallel execution of the following loop on a GPU:

    @code{.cpp}
    while (first != last) {
      *first++ = callable(*src1++, *src2++, *src3++, ...);
    }
    @endcode
    */
    template <typename I, typename C, typename... S>
    cudaTask transform(I first, I last, C&& callable, S... srcs);

    /**
    @brief captures a kernel that performs parallel reduction over a range of items

    @tparam I input iterator type
    @tparam T value type
    @tparam C callable type

    @param first iterator to the beginning (inclusive)
    @param last iterator to the end (exclusive)
    @param result pointer to the result with an initialized value
    @param op binary reduction operator

    @return a tf::cudaTask handle

    This method is equivalent to the parallel execution of the following loop on a GPU:

    @code{.cpp}
    while (first != last) {
      *result = op(*result, *first++);
    }
    @endcode
    */
    template <typename I, typename T, typename C>
    cudaTask reduce(I first, I last, T* result, C&& op);

    /**
    @brief similar to tf::cudaFlowCapturerBase::reduce but does not assum
           any initial value to reduce

    This method is equivalent to the parallel execution of the following loop
    on a GPU:

    @code{.cpp}
    *result = *first++;  // no initial values partitipcate in the loop
    while (first != last) {
      *result = op(*result, *first++);
    }
    @endcode
    */
    template <typename I, typename T, typename C>
    cudaTask uninitialized_reduce(I first, I last, T* result, C&& op);

  private:

    cudaGraph* _graph {nullptr};

    cudaFlowCapturerBase(cudaGraph&);
};

// Constructor
inline cudaFlowCapturerBase::cudaFlowCapturerBase(cudaGraph& g) :
  _graph {&g} {
}

// Procedure: dump
inline void cudaFlowCapturerBase::dump(std::ostream& os) const {
  _graph->dump(os, nullptr, "");
}

// Function: capture
template <typename C, std::enable_if_t<
  std::is_invocable_r_v<void, C, cudaStream_t>, void>*
>
cudaTask cudaFlowCapturerBase::on(C&& callable) {
  auto node = _graph->emplace_back(*_graph,
    std::in_place_type_t<cudaNode::Capture>{}, std::forward<C>(callable)
  );
  return cudaTask(node);
}

// Function: memcpy
inline cudaTask cudaFlowCapturerBase::memcpy(
  void* dst, const void* src, size_t count
) {
  return on([dst, src, count] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
      "failed to capture memcpy"
    );
  });
}

template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
cudaTask cudaFlowCapturerBase::copy(T* tgt, const T* src, size_t num) {
  return on([tgt, src, num] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
      "failed to capture copy"
    );
  });
}

// Function: memset
inline cudaTask cudaFlowCapturerBase::memset(void* ptr, int v, size_t n) {
  return on([ptr, v, n] (cudaStream_t stream) mutable {
    TF_CHECK_CUDA(
      cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
    );
  });
}

// Function: kernel
template <typename F, typename... ArgsT>
cudaTask cudaFlowCapturerBase::kernel(
  dim3 g, dim3 b, size_t shm, F&& f, ArgsT&&... args
) {
  return on([g, b, shm, f, args...] (cudaStream_t stream) mutable {
    f<<<g, b, shm, stream>>>(args...);
  });
}

// Function: single_task
template <typename C>
cudaTask cudaFlowCapturerBase::single_task(C&& callable) {
  return on([c=std::forward<C>(callable)] (cudaStream_t stream) mutable {
    cuda_single_task<C><<<1, 1, 0, stream>>>(c);
  });
}

// Function: for_each
template <typename I, typename C>
cudaTask cudaFlowCapturerBase::for_each(I first, I last, C&& c) {
  return on([first, last, c=std::forward<C>(c)](cudaStream_t stream) mutable {
    // TODO: special case for N == 0?
    size_t N = std::distance(first, last);
    size_t B = cuda_default_threads_per_block(N);
    cuda_for_each<I, C><<<(N+B-1)/B, B, 0, stream>>>(first, N, c);
  });
}

// Function: for_each_index
template <typename I, typename C>
cudaTask cudaFlowCapturerBase::for_each_index(I beg, I end, I inc, C&& c) {

  if(is_range_invalid(beg, end, inc)) {
    TF_THROW("invalid range [", beg, ", ", end, ") with inc size ", inc);
  }

  return on([beg, end, inc, c=std::forward<C>(c)] (cudaStream_t stream) mutable {
    // TODO: special case when N is 0?
    size_t N = distance(beg, end, inc);
    size_t B = cuda_default_threads_per_block(N);
    cuda_for_each_index<I, C><<<(N+B-1)/B, B, 0, stream>>>(beg, inc, N, c);
  });
}

// Function: transform
template <typename I, typename C, typename... S>
cudaTask cudaFlowCapturerBase::transform(I first, I last, C&& c, S... srcs) {
  return on([first, last, c=std::forward<C>(c), srcs...]
  (cudaStream_t stream) mutable {
    // TODO: special case when N is 0?
    size_t N = std::distance(first, last);
    size_t B = cuda_default_threads_per_block(N);
    cuda_transform<I, C, S...><<<(N+B-1)/B, B, 0, stream>>>(first, N, c, srcs...);
  });
}

// Function: reduce
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturerBase::reduce(I first, I last, T* result, C&& c) {

  return on([first, last, result, c=std::forward<C>(c)]
  (cudaStream_t stream) mutable {
    //using value_t = std::decay_t<decltype(*std::declval<I>())>;

    // TODO: special case N == 0?
    size_t N = std::distance(first, last);
    size_t B = cuda_default_threads_per_block(N);

    cuda_reduce<I, T, C, false><<<1, B, B*sizeof(T), stream>>>(
      first, N, result, c
    );
  });
}

// Function: uninitialized_reduce
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturerBase::uninitialized_reduce(
  I first, I last, T* result, C&& c
) {

  return on([first, last, result, c=std::forward<C>(c)]
  (cudaStream_t stream) mutable {
    //using value_t = std::decay_t<decltype(*std::declval<I>())>;

    // TODO: special case N == 0?
    size_t N = std::distance(first, last);
    size_t B = cuda_default_threads_per_block(N);

    cuda_reduce<I, T, C, true><<<1, B, B*sizeof(T), stream>>>(
      first, N, result, c
    );
  });
}

// ----------------------------------------------------------------------------
// class definition: cudaFlowCapturer
// ----------------------------------------------------------------------------

/**
@class cudaFlowCapturer

@brief class for building a CUDA task dependency graph through stream capture

A %cudaFlowCapturer inherits all the base methods from tf::cudaFlowCapturerBase
to construct a CUDA task graph through <i>stream capturer</i>.
This class also defines a factory interface tf::cudaFlowCapturer::make_capturer
for users to create custom capturers with their lifetimes managed by the factory.

The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can
call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous
CUDA operations through the given stream.
The following example creates a CUDA graph that captures two kernel tasks,
@c task_1 and @c task_2, where @c task_1 runs before @c task_2.

@code{.cpp}
taskflow.emplace([](tf::cudaFlowCapturer& capturer){

  // capture my_kernel_1 through the given stream managed by the capturer
  auto task_1 = capturer.on([&](cudaStream_t stream){
    my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
  });

  // capture my_kernel_2 through the given stream managed by the capturer
  auto task_2 = capturer.on([&](cudaStream_t stream){
    my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
  });

  task_1.precede(task_2);
});
@endcode

Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task)
created from tf::Taskflow
and will be run by @em one worker thread in the executor.
That is, the callable that describes a %cudaFlowCapturer
will be executed sequentially.
*/
class cudaFlowCapturer : public cudaFlowCapturerBase {

  friend class cudaFlow;
  friend class Executor;

  public:

    /**
    @brief queries the emptiness of the graph
    */
    bool empty() const;

    /**
    @brief creates a custom capturer derived from tf::cudaFlowCapturerBase

    @tparam T custom capturer type
    @tparam ArgsT arguments types

    @param args arguments to forward to construct the custom capturer

    This %cudaFlowCapturer object keeps a list of custom capturers
    and manages their lifetimes. The lifetime of each custom capturer is
    the same as the capturer.
     */
    template <typename T, typename... ArgsT>
    T* make_capturer(ArgsT&&... args);

  private:

    std::vector<std::unique_ptr<cudaFlowCapturerBase>> _capturers;

    cudaFlowCapturer(cudaGraph&);

    cudaGraph_t _capture();

    cudaGraphExec_t _executable {nullptr};
};

// constructor
inline cudaFlowCapturer::cudaFlowCapturer(cudaGraph& g) :
  cudaFlowCapturerBase{g} {
}

// Function: empty
inline bool cudaFlowCapturer::empty() const {
  return _graph->empty();
}

//// Procedure: _create_executable
//inline void cudaFlowCapturer::_create_executable() {
//  assert(_executable == nullptr);
//  TF_CHECK_CUDA(
//    cudaGraphInstantiate(
//      &_executable, _graph->_native_handle, nullptr, nullptr, 0
//    ),
//    "failed to create an executable for captured graph"
//  );
//}
//
//// Procedure: _destroy_executable
//inline void cudaFlowCapturer::_destroy_executable() {
//  assert(_executable != nullptr);
//  TF_CHECK_CUDA(
//    cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
//  );
//  _executable = nullptr;
//}

// Function: make_capturer
template <typename T, typename... ArgsT>
T* cudaFlowCapturer::make_capturer(ArgsT&&... args) {

  static_assert(std::is_base_of_v<cudaFlowCapturerBase, T>);

  auto ptr = std::make_unique<T>(std::forward<ArgsT>(args)...);
  ptr->_graph = this->_graph;
  auto raw = ptr.get();
  _capturers.push_back(std::move(ptr));
  return raw;
}

// Function: _capture
inline cudaGraph_t cudaFlowCapturer::_capture() {

  // acquire per-thread stream and turn it into capture mode
  // we must use ThreadLocal mode to avoid clashing with CUDA global states
  cudaScopedPerThreadStream stream;

  TF_CHECK_CUDA(
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
    "failed to turn stream into per-thread capture mode"
  );

  // TODO: need an efficient algorithm
  auto ordered = _graph->_toposort();
  for(auto& node : ordered) {
    std::get<cudaNode::Capture>(node->_handle).work(stream);
  }

  cudaGraph_t g;

  TF_CHECK_CUDA(
    cudaStreamEndCapture(stream, &g), "failed to end capture"
  );

  //cuda_dump_graph(std::cout, g);

  return g;
}

}  // end of namespace tf -----------------------------------------------------