// cocos-engine-external/sources/taskflow/cuda/cuda_capturer.hpp
#pragma once
#include "cuda_task.hpp"
#include "cuda_algorithm/cuda_for_each.hpp"
#include "cuda_algorithm/cuda_transform.hpp"
#include "cuda_algorithm/cuda_reduce.hpp"
/**
@file cuda_capturer.hpp
@brief cudaFlowCapturer include file
*/
namespace tf {
/**
@brief queries the maximum threads allowed per block
*/
constexpr size_t cuda_default_max_threads_per_block() {
return 512;
}
/**
@brief queries the default number of threads per block in a 1D vector of N elements
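For illustration only, and assuming tf::next_pow2 rounds up to the next
power of two, typical results are:
@code{.cpp}
cuda_default_threads_per_block(20);    // 32  (minimum block size)
cuda_default_threads_per_block(100);   // 128 (next power of two of 100)
cuda_default_threads_per_block(4096);  // 512 (capped at the maximum)
@endcode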
*/
constexpr size_t cuda_default_threads_per_block(size_t N) {
// TODO: special case when N == 0?
if(N <= 32) return 32;
else {
return std::min(cuda_default_max_threads_per_block(), next_pow2(N));
}
}
// ----------------------------------------------------------------------------
// class definition: cudaFlowCapturerBase
// ----------------------------------------------------------------------------
/**
@class cudaFlowCapturerBase
@brief base class to construct a CUDA task graph through stream capture
*/
class cudaFlowCapturerBase {
friend class cudaFlowCapturer;
public:
/**
@brief default constructor
*/
cudaFlowCapturerBase() = default;
/**
@brief default virtual destructor
*/
virtual ~cudaFlowCapturerBase() = default;
/**
@brief dumps the capture graph into a DOT format through an
output stream
*/
void dump(std::ostream& os) const;
/**
@brief captures a sequence of CUDA operations from the given callable
@tparam C callable type constructible with @c std::function<void(cudaStream_t)>
@param callable a callable to capture CUDA operations with the stream
This method applies a stream created by the flow to capture
a sequence of CUDA operations defined in the callable.
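For illustration, assuming a capturer object @c capturer and a user-defined
kernel @c my_kernel (neither is defined in this header), a capture may look
like this:
@code{.cpp}
auto task = capturer.on([=](cudaStream_t stream){
  // asynchronous CUDA operations enqueued on 'stream' are captured
  my_kernel<<<grid, block, shm_size, stream>>>(my_parameters);
});
@endcode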
*/
template <typename C, std::enable_if_t<
std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr
>
cudaTask on(C&& callable);
/**
@brief copies data between host and device asynchronously through a stream
@param dst destination memory address
@param src source memory address
@param count size in bytes to copy
The method captures a @c cudaMemcpyAsync operation through an
internal stream.
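A minimal sketch, assuming hypothetical pointers @c d_data (device),
@c h_data (host), and a byte count @c bytes:
@code{.cpp}
// capture an asynchronous host-to-device copy of 'bytes' bytes
auto h2d = capturer.memcpy(d_data, h_data, bytes);
@endcode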
*/
cudaTask memcpy(void* dst, const void* src, size_t count);
/**
@brief captures a copy task of typed data
@tparam T element type (non-void)
@param tgt pointer to the target memory block
@param src pointer to the source memory block
@param num number of elements to copy
@return cudaTask handle
A copy task transfers <tt>num*sizeof(T)</tt> bytes of data from a source location
to a target location. Direction can be arbitrary among CPUs and GPUs.
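A minimal sketch, assuming hypothetical pointers @c d_vec (device) and
@c h_vec (host) to @c N integers each:
@code{.cpp}
// capture a typed copy of N integers (N*sizeof(int) bytes) to the device
auto task = capturer.copy(d_vec, h_vec, N);
@endcode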
*/
template <typename T,
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
>
cudaTask copy(T* tgt, const T* src, size_t num);
/**
@brief initializes or sets GPU memory to the given value byte by byte
@param devPtr pointer to GPU memory
@param value value to set for each byte of the specified memory
@param count size in bytes to set
The method captures a @c cudaMemsetAsync operation through an
internal stream to fill the first @c count bytes of the memory area
pointed to by @c devPtr with the constant byte value @c value.
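A minimal sketch, assuming a hypothetical device buffer @c d_vec of
@c N integers:
@code{.cpp}
// capture a task that zeroes every byte of the N-integer buffer
auto task = capturer.memset(d_vec, 0, N*sizeof(int));
@endcode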
*/
cudaTask memset(void* devPtr, int value, size_t count);
/**
@brief captures a kernel
@tparam F kernel function type
@tparam ArgsT kernel function parameter types
@param g configured grid
@param b configured block
@param s configured shared memory
@param f kernel function
@param args arguments to forward to the kernel function by copy
@return cudaTask handle
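A minimal sketch, assuming a hypothetical kernel
<tt>saxpy(int n, float a, const float* x, float* y)</tt> and device arrays
@c d_x and @c d_y of @c N floats:
@code{.cpp}
// capture a saxpy launch with 256-thread blocks covering N elements
auto task = capturer.kernel(
  dim3((N+255)/256), dim3(256), 0, saxpy, N, 2.0f, d_x, d_y
);
@endcode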
*/
template <typename F, typename... ArgsT>
cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
// ------------------------------------------------------------------------
// generic algorithms
// ------------------------------------------------------------------------
/**
@brief captures a kernel that runs the given callable with only one thread
@tparam C callable type
@param callable callable to run by a single kernel thread
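A minimal sketch, assuming extended device lambdas are enabled
(e.g., <tt>--extended-lambda</tt> with nvcc) and a hypothetical device
pointer @c d_flag:
@code{.cpp}
// capture a kernel of a single thread that sets *d_flag to 1
auto task = capturer.single_task([d_flag] __device__ () {
  *d_flag = 1;
});
@endcode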
*/
template <typename C>
cudaTask single_task(C&& callable);
/**
@brief captures a kernel that applies a callable to each dereferenced element
of the data array
@tparam I iterator type
@tparam C callable type
@param first iterator to the beginning (inclusive)
@param last iterator to the end (exclusive)
@param callable a callable object to apply to the dereferenced iterator
@return cudaTask handle
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
for(auto itr = first; itr != last; itr++) {
callable(*itr);
}
@endcode
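A minimal usage sketch, assuming a hypothetical device array @c d_vec of
@c N floats and extended device lambdas:
@code{.cpp}
// capture a kernel that sets each element of d_vec to 1.0f
auto task = capturer.for_each(d_vec, d_vec + N,
  [] __device__ (float& x) { x = 1.0f; }
);
@endcode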
*/
template <typename I, typename C>
cudaTask for_each(I first, I last, C&& callable);
/**
@brief captures a kernel that applies a callable to each index in the range
with the step size
@tparam I index type
@tparam C callable type
@param first beginning index
@param last ending index (exclusive)
@param step step size
@param callable the callable to apply to each element in the data array
@return cudaTask handle
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
callable(i);
}
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
callable(i);
}
@endcode
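A minimal usage sketch, assuming a hypothetical device array @c d_vec and
an integer count @c N (indices and step share the same index type @c I):
@code{.cpp}
// capture a kernel that assigns d_vec[i] = i for i in [0, N) with step 1
auto task = capturer.for_each_index(0, N, 1,
  [d_vec] __device__ (int i) { d_vec[i] = i; }
);
@endcode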
*/
template <typename I, typename C>
cudaTask for_each_index(I first, I last, I step, C&& callable);
/**
@brief captures a kernel that applies a callable to a source range and
stores the result in a target range
@tparam I iterator type
@tparam C callable type
@tparam S source types
@param first iterator to the beginning (inclusive)
@param last iterator to the end (exclusive)
@param callable the callable to apply to each element in the range
@param srcs iterators to the source ranges
@return cudaTask handle
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
while (first != last) {
*first++ = callable(*src1++, *src2++, *src3++, ...);
}
@endcode
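A minimal usage sketch, assuming hypothetical device arrays @c d_x, @c d_y,
and @c d_z of @c N floats:
@code{.cpp}
// capture a kernel that writes d_z[i] = d_x[i] + d_y[i] for N elements
auto task = capturer.transform(d_z, d_z + N,
  [] __device__ (float x, float y) { return x + y; },
  d_x, d_y
);
@endcode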
*/
template <typename I, typename C, typename... S>
cudaTask transform(I first, I last, C&& callable, S... srcs);
/**
@brief captures a kernel that performs parallel reduction over a range of items
@tparam I input iterator type
@tparam T value type
@tparam C callable type
@param first iterator to the beginning (inclusive)
@param last iterator to the end (exclusive)
@param result pointer to the result with an initialized value
@param op binary reduction operator
@return a tf::cudaTask handle
This method is equivalent to the parallel execution of the following loop on a GPU:
@code{.cpp}
while (first != last) {
*result = op(*result, *first++);
}
@endcode
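A minimal usage sketch, assuming a hypothetical device array @c d_vec of
@c N floats and a device-accessible @c d_result pre-initialized to zero:
@code{.cpp}
// capture a sum reduction of N floats into *d_result
auto task = capturer.reduce(d_vec, d_vec + N, d_result,
  [] __device__ (float a, float b) { return a + b; }
);
@endcode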
*/
template <typename I, typename T, typename C>
cudaTask reduce(I first, I last, T* result, C&& op);
/**
@brief similar to tf::cudaFlowCapturerBase::reduce but does not assume
any initial value to reduce
This method is equivalent to the parallel execution of the following loop
on a GPU:
@code{.cpp}
*result = *first++; // no initial value participates in the loop
while (first != last) {
*result = op(*result, *first++);
}
@endcode
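A minimal usage sketch, assuming the same hypothetical @c d_vec and
@c d_result as in tf::cudaFlowCapturerBase::reduce; here the current value
of @c *d_result is ignored:
@code{.cpp}
// capture a min-reduction seeded with the first element of d_vec
auto task = capturer.uninitialized_reduce(d_vec, d_vec + N, d_result,
  [] __device__ (float a, float b) { return a < b ? a : b; }
);
@endcode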
*/
template <typename I, typename T, typename C>
cudaTask uninitialized_reduce(I first, I last, T* result, C&& op);
private:
cudaGraph* _graph {nullptr};
cudaFlowCapturerBase(cudaGraph&);
};
// Constructor
inline cudaFlowCapturerBase::cudaFlowCapturerBase(cudaGraph& g) :
_graph {&g} {
}
// Procedure: dump
inline void cudaFlowCapturerBase::dump(std::ostream& os) const {
_graph->dump(os, nullptr, "");
}
// Function: on
template <typename C, std::enable_if_t<
std::is_invocable_r_v<void, C, cudaStream_t>, void>*
>
cudaTask cudaFlowCapturerBase::on(C&& callable) {
auto node = _graph->emplace_back(*_graph,
std::in_place_type_t<cudaNode::Capture>{}, std::forward<C>(callable)
);
return cudaTask(node);
}
// Function: memcpy
inline cudaTask cudaFlowCapturerBase::memcpy(
void* dst, const void* src, size_t count
) {
return on([dst, src, count] (cudaStream_t stream) mutable {
TF_CHECK_CUDA(
cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
"failed to capture memcpy"
);
});
}
template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
cudaTask cudaFlowCapturerBase::copy(T* tgt, const T* src, size_t num) {
return on([tgt, src, num] (cudaStream_t stream) mutable {
TF_CHECK_CUDA(
cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
"failed to capture copy"
);
});
}
// Function: memset
inline cudaTask cudaFlowCapturerBase::memset(void* ptr, int v, size_t n) {
return on([ptr, v, n] (cudaStream_t stream) mutable {
TF_CHECK_CUDA(
cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
);
});
}
// Function: kernel
template <typename F, typename... ArgsT>
cudaTask cudaFlowCapturerBase::kernel(
dim3 g, dim3 b, size_t shm, F&& f, ArgsT&&... args
) {
return on([g, b, shm, f, args...] (cudaStream_t stream) mutable {
f<<<g, b, shm, stream>>>(args...);
});
}
// Function: single_task
template <typename C>
cudaTask cudaFlowCapturerBase::single_task(C&& callable) {
return on([c=std::forward<C>(callable)] (cudaStream_t stream) mutable {
cuda_single_task<C><<<1, 1, 0, stream>>>(c);
});
}
// Function: for_each
template <typename I, typename C>
cudaTask cudaFlowCapturerBase::for_each(I first, I last, C&& c) {
return on([first, last, c=std::forward<C>(c)](cudaStream_t stream) mutable {
// TODO: special case for N == 0?
size_t N = std::distance(first, last);
size_t B = cuda_default_threads_per_block(N);
cuda_for_each<I, C><<<(N+B-1)/B, B, 0, stream>>>(first, N, c);
});
}
// Function: for_each_index
template <typename I, typename C>
cudaTask cudaFlowCapturerBase::for_each_index(I beg, I end, I inc, C&& c) {
if(is_range_invalid(beg, end, inc)) {
TF_THROW("invalid range [", beg, ", ", end, ") with inc size ", inc);
}
return on([beg, end, inc, c=std::forward<C>(c)] (cudaStream_t stream) mutable {
// TODO: special case when N is 0?
size_t N = distance(beg, end, inc);
size_t B = cuda_default_threads_per_block(N);
cuda_for_each_index<I, C><<<(N+B-1)/B, B, 0, stream>>>(beg, inc, N, c);
});
}
// Function: transform
template <typename I, typename C, typename... S>
cudaTask cudaFlowCapturerBase::transform(I first, I last, C&& c, S... srcs) {
return on([first, last, c=std::forward<C>(c), srcs...]
(cudaStream_t stream) mutable {
// TODO: special case when N is 0?
size_t N = std::distance(first, last);
size_t B = cuda_default_threads_per_block(N);
cuda_transform<I, C, S...><<<(N+B-1)/B, B, 0, stream>>>(first, N, c, srcs...);
});
}
// Function: reduce
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturerBase::reduce(I first, I last, T* result, C&& c) {
return on([first, last, result, c=std::forward<C>(c)]
(cudaStream_t stream) mutable {
//using value_t = std::decay_t<decltype(*std::declval<I>())>;
// TODO: special case N == 0?
size_t N = std::distance(first, last);
size_t B = cuda_default_threads_per_block(N);
cuda_reduce<I, T, C, false><<<1, B, B*sizeof(T), stream>>>(
first, N, result, c
);
});
}
// Function: uninitialized_reduce
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturerBase::uninitialized_reduce(
I first, I last, T* result, C&& c
) {
return on([first, last, result, c=std::forward<C>(c)]
(cudaStream_t stream) mutable {
//using value_t = std::decay_t<decltype(*std::declval<I>())>;
// TODO: special case N == 0?
size_t N = std::distance(first, last);
size_t B = cuda_default_threads_per_block(N);
cuda_reduce<I, T, C, true><<<1, B, B*sizeof(T), stream>>>(
first, N, result, c
);
});
}
// ----------------------------------------------------------------------------
// class definition: cudaFlowCapturer
// ----------------------------------------------------------------------------
/**
@class cudaFlowCapturer
@brief class for building a CUDA task dependency graph through stream capture
A %cudaFlowCapturer inherits all the base methods from tf::cudaFlowCapturerBase
to construct a CUDA task graph through <i>stream capture</i>.
This class also defines a factory interface tf::cudaFlowCapturer::make_capturer
for users to create custom capturers with their lifetimes managed by the factory.
The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can
call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous
CUDA operations through the given stream.
The following example creates a CUDA graph that captures two kernel tasks,
@c task_1 and @c task_2, where @c task_1 runs before @c task_2.
@code{.cpp}
taskflow.emplace([](tf::cudaFlowCapturer& capturer){
// capture my_kernel_1 through the given stream managed by the capturer
auto task_1 = capturer.on([&](cudaStream_t stream){
my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
});
// capture my_kernel_2 through the given stream managed by the capturer
auto task_2 = capturer.on([&](cudaStream_t stream){
my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
});
task_1.precede(task_2);
});
@endcode
Similar to tf::cudaFlow, a %cudaFlowCapturer is a task (tf::Task)
created from tf::Taskflow
and will be run by @em one worker thread in the executor.
That is, the callable that describes a %cudaFlowCapturer
will be executed sequentially.
*/
class cudaFlowCapturer : public cudaFlowCapturerBase {
friend class cudaFlow;
friend class Executor;
public:
/**
@brief queries the emptiness of the graph
*/
bool empty() const;
/**
@brief creates a custom capturer derived from tf::cudaFlowCapturerBase
@tparam T custom capturer type
@tparam ArgsT argument types
@param args arguments to forward to construct the custom capturer
This %cudaFlowCapturer object keeps a list of custom capturers
and manages their lifetimes. The lifetime of each custom capturer is
the same as that of this %cudaFlowCapturer.
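A sketch of a custom capturer, using hypothetical names @c MyCapturer,
@c d_data, and @c N for illustration:
@code{.cpp}
class MyCapturer : public tf::cudaFlowCapturerBase {
public:
  // capture a task that zeroes a device buffer of 'count' integers
  tf::cudaTask zero(int* d_data, size_t count) {
    return memset(d_data, 0, count * sizeof(int));
  }
};
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
  auto* mc = capturer.make_capturer<MyCapturer>();
  auto task = mc->zero(d_data, N);
});
@endcode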
*/
template <typename T, typename... ArgsT>
T* make_capturer(ArgsT&&... args);
private:
std::vector<std::unique_ptr<cudaFlowCapturerBase>> _capturers;
cudaFlowCapturer(cudaGraph&);
cudaGraph_t _capture();
cudaGraphExec_t _executable {nullptr};
};
// constructor
inline cudaFlowCapturer::cudaFlowCapturer(cudaGraph& g) :
cudaFlowCapturerBase{g} {
}
// Function: empty
inline bool cudaFlowCapturer::empty() const {
return _graph->empty();
}
//// Procedure: _create_executable
//inline void cudaFlowCapturer::_create_executable() {
// assert(_executable == nullptr);
// TF_CHECK_CUDA(
// cudaGraphInstantiate(
// &_executable, _graph->_native_handle, nullptr, nullptr, 0
// ),
// "failed to create an executable for captured graph"
// );
//}
//
//// Procedure: _destroy_executable
//inline void cudaFlowCapturer::_destroy_executable() {
// assert(_executable != nullptr);
// TF_CHECK_CUDA(
// cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
// );
// _executable = nullptr;
//}
// Function: make_capturer
template <typename T, typename... ArgsT>
T* cudaFlowCapturer::make_capturer(ArgsT&&... args) {
static_assert(std::is_base_of_v<cudaFlowCapturerBase, T>);
auto ptr = std::make_unique<T>(std::forward<ArgsT>(args)...);
ptr->_graph = this->_graph;
auto raw = ptr.get();
_capturers.push_back(std::move(ptr));
return raw;
}
// Function: _capture
inline cudaGraph_t cudaFlowCapturer::_capture() {
// acquire per-thread stream and turn it into capture mode
// we must use ThreadLocal mode to avoid clashing with CUDA global states
cudaScopedPerThreadStream stream;
TF_CHECK_CUDA(
cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal),
"failed to turn stream into per-thread capture mode"
);
// TODO: need an efficient algorithm
auto ordered = _graph->_toposort();
for(auto& node : ordered) {
std::get<cudaNode::Capture>(node->_handle).work(stream);
}
cudaGraph_t g;
TF_CHECK_CUDA(
cudaStreamEndCapture(stream, &g), "failed to end capture"
);
//cuda_dump_graph(std::cout, g);
return g;
}
} // end of namespace tf -----------------------------------------------------