// cocos-engine-external/sources/taskflow/cuda/cublas/cublas_flow.hpp
//
// 1361 lines
// 46 KiB
// C++
//
#pragma once
#include "cublas_handle.hpp"
/**
@file cublas_flow.hpp
@brief cublasFlowCapturer include file
*/
namespace tf {
// ----------------------------------------------------------------------------
// cublasFlowCapturer definition
// ----------------------------------------------------------------------------
/**
@class cublasFlowCapturer
@brief class to construct a cuBLAS task graph
%cublasFlowCapturer provides a higher-level interface over the @cuBLAS library
and hides concurrency details from users.
It inherits methods from tf::cudaFlowCapturerBase and must be used from
a tf::cudaFlowCapturer object.
All pointers used to %cublasFlowCapturer methods must be in GPU memory space or managed
(i.e., @c cudaMallocManaged),
including scalars, @c alpha and @c beta, input data and output data pointers.
The following example uses @c cublas<t>amax to find the smallest index of the element
of the maximum absolute magnitude in a vector.
@code{.cpp}
#include <taskflow/cublasflow.hpp>
int main() {
tf::Executor executor;
tf::Taskflow taskflow;
size_t N = 1024;
float *x = nullptr;
int *d_res;
int h_res;
std::vector<float> host(N, 0.0f);
host[512] = 100.0f; // artificially set the mid-position to the largest
cudaMalloc(&x, N*sizeof(float));
cudaMalloc(&d_res, sizeof(int));
taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
tf::cudaTask h2d = capturer.copy(x, host.data(), N);
tf::cudaTask find_max = cublas->amax(N, x, 1, d_res);
tf::cudaTask d2h = capturer.copy(&h_res, d_res, 1);
h2d.precede(find_max); // amax runs after the host-to-device copy
find_max.precede(d2h); // amax runs before the device-to-host copy
});
executor.run(taskflow).wait();
assert(h_res == 513); // cuBLAS amax reports a 1-based index
}
@endcode
Currently, %cublasFlowCapturer supports only @c float and @c double data types.
We design most tf::cublasFlowCapturer methods on top of the native,
high-performance @cuBLAS library.
You may refer to @cuBLAS for more details.
*/
class cublasFlowCapturer : public cudaFlowCapturerBase {
public:
/**
@brief constructs a cublas flow capturer
*/
cublasFlowCapturer() = default;
/**
@brief gets the native cublas handle associated with this %cublasFlowCapturer
@return a native cublas handle of type cublasHandle_t
*/
cublasHandle_t native_handle();
/**
@brief copies vector data from host to device
This method copies @c n elements from a vector @c h in host memory space
to a vector @c d in GPU memory space.
The storage spacing between consecutive elements is given by @c inch for
the source vector @c h and by @c incd for the destination vector @c d.
This method calls native @c cublasSetVectorAsync with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements
@param d target device pointer
@param incd spacing between consecutive elements in @c d
@param h source host pointer
@param inch spacing between consecutive elements in @c h
@return a tf::cudaTask handle
*/
template <typename T,
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
>
cudaTask vset(size_t n, const T* h, int inch, T* d, int incd);
/**
@brief copies vector data from device to host
This method copies @c n elements from a vector @c d in GPU memory space
to a vector @c h in host memory space.
The storage spacing between consecutive elements is given by @c inch for
the target vector @c h and by @c incd for the source vector @c d.
This method calls native @c cublasGetVectorAsync with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements
@param h target host pointer
@param inch spacing between consecutive elements in @c h
@param d source device pointer
@param incd spacing between consecutive elements in @c d
@return a tf::cudaTask handle
*/
template <typename T,
std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
>
cudaTask vget(size_t n, const T* d, int incd, T* h, int inch);
// ------------------------------------------------------------------------
// Level-1 vector-vector operations
// ------------------------------------------------------------------------
/**
@brief finds the smallest index of the element of the maximum
absolute magnitude
This method calls native @c cublas<t>amax with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vector @c x
@param x pointer to the memory address of the vector
@param incx stride between consecutive elements of @c x
@param result the resulting index (1-based indexing)
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask amax(int n, const T* x, int incx, int* result);
/**
@brief finds the smallest index of the element of the minimum
absolute magnitude
This method calls native @c cublas<t>amin with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vector @c x
@param x pointer to the memory address of the vector
@param incx stride between consecutive elements of @c x
@param result the resulting index (1-based indexing)
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask amin(int n, const T* x, int incx, int* result);
/**
@brief finds the sum of absolute values of the elements over a vector
This method calls native @c cublas<t>asum with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vector @c x
@param x pointer to the memory address of the vector
@param incx stride between consecutive elements of @c x
@param result the result
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask asum(int n, const T* x, int incx, T* result);
/**
@brief multiples a vector by a scalar and adds it to a vector
This function multiplies the vector @c x by the scalar @c alpha and
adds it to the vector @c y overwriting the latest vector with the result.
Hence, the performed operation is:
<tt>y[j] = alpha * x[k] + y[j]</tt>,
where @c j and @c k are indices of @c n elements with step sizes
@c incy and @c incx.
This method calls native @c cublas<t>axpy with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vectors @c x and @c y
@param alpha scalar used to multiplication
@param x pointer to the memory address of the vector @c x
@param incx stride between consecutive elements of @c x
@param y pointer to the memory address of the vector @c y
@param incy stride between consecutive elements of @c y
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask axpy(
int n, const T *alpha, const T *x, int incx, T *y, int incy
);
/**
@brief copies a vector to another vector
This function copies @c n elements from a vector @c x of a step size @c incx
to another vector @c y of step size @c incy,
overwriting the latter vector with the copied elements.
Hence, the performed operation is:
<tt>y[j] = x[k]</tt>,
where @c j and @c k are indices of @c n elements with step sizes
@c incy and @c incx.
This method calls native @c cublas<t>copy with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements to copy
@param x pointer to the memory address of the vector @c x
@param incx stride between consecutive elements of @c x
@param y pointer to the memory address of the vector @c y
@param incy stride between consecutive elements of @c y
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask vcopy(int n, const T* x, int incx, T* y, int incy);
/**
@brief computes the dot product of two vectors
<tt>sum += x[i] * y[i]</tt>
This method calls native @c cublas<t>dot with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements to perform the dot product
@param x pointer to the memory address of the vector @c x
@param incx stride between consecutive elements of @c x
@param y pointer to the memory address of the vector @c y
@param incy stride between consecutive elements of @c y
@param result the resulting dot product
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask dot(int n, const T* x, int incx, const T* y, int incy, T* result);
/**
@brief computes the Euclidean norm of a vector
This method calls native @c cublas<t>nrm2 with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vector @c x
@param x pointer to the memory address of the vector
@param incx stride between consecutive elements of @c x
@param result the result
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask nrm2(int n, const T* x, int incx, T* result);
/**
@brief scales a vector by a scalar
This method calls native @c cublas<t>scal with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements in vector @c x
@param scalar scalar used for multiplication
@param x pointer to the memory address of the vector
@param incx stride between consecutive elements of @c x
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask scal(int n, const T* scalar, T* x, int incx);
/**
@brief swaps elements between two vectors
This function interchanges the elements of vectors @c x and @c y.
Hence, the performed operation is:
<tt>y[j] <-> x[k]</tt>,
where @c j is the index of element in @c y with a step size @c incy and
@c k is the index of element in @c x with a step size @c incx.
This method calls native @c cublas<t>swap with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param n number of elements to swap between @c x and @c y
@param x pointer to the memory address of the vector @c x
@param incx stride between consecutive elements of @c x
@param y pointer to the memory address of the vector @c y
@param incy stride between consecutive elements of @c y
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask swap(int n, T* x, int incx, T* y, int incy);
// ------------------------------------------------------------------------
// TODO Level-2 matrix_vector operations
// ------------------------------------------------------------------------
/**
@brief performs matrix-vector multiplication
This function performs matrix-vector multiplication:
<tt>y = alpha * op(A) * x + beta * y</tt>,
where @c alpha and @c beta are scalars, @c A
is a 2D matrix stored in column-major format,
and @c x, @c y are vectors.
The input matrices are in column-major storage.
This method calls native @c cublas<t>gemv with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param trans transpose operation @c op(A)
@param m number of rows of matrix @c A
@param n number of columns of matrix @c A
@param alpha pointer to the @c alpha scalar
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@param x pointer to the address of @c x of at least
<tt>(1 + (n - 1) * abs(incx))</tt> elements if no transposition,
or <tt>(1 + (m - 1) * abs(incx))</tt> elements otherwise.
@param incx stride between consecutive elements of @c x
@param beta pointer to the @c beta scalar
@param y pointer to the address of @c y
@param incy stride between consecutive elements of @c y
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask gemv(
cublasOperation_t trans,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *x, int incx,
const T *beta,
T *y, int incy
);
/**
@brief similar to tf::cublasFlowCapturer::gemv but operates on C-styled
row-major layout
*/
template <typename T>
cudaTask c_gemv(
cublasOperation_t trans,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *x, int incx,
const T *beta,
T *y, int incy
);
/**
@brief performs symmetric matrix-vector multiplication
This function performs symmetric matrix-vector multiplication:
<tt>y = alpha * A * x + beta * y</tt>,
where @c alpha and @c beta are scalars, @c A
is a 2D symmetric matrix stored in column-major format,
and @c x, @c y are vectors
This method calls native @c cublas<t>symv with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param uplo indicates if matrix @c A lower or upper part is stored,
the other symmetric part is not referenced and is inferred
from the stored elements
@param n number of rows and columns of matrix @c A
@param alpha pointer to the @c alpha scalar
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@param x pointer to the address of @c x
@param incx stride between consecutive elements of @c x
@param beta pointer to the @c beta scalar
@param y pointer to the address of @c y
@param incy stride between consecutive elements of @c y
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask symv(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *A, int lda,
const T *x, int incx,
const T *beta,
T *y, int incy
);
/**
@brief similar to tf::cublasFlowCapturer::symv but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_symv(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *A, int lda,
const T *x, int incx,
const T *beta,
T *y, int incy
);
/**
@brief performs symmetric rank-1 update
This function performs symmetric rank-1 update:
<tt>A = alpha * x * x^T + A</tt>,
where @c alpha is a scalar, @c A
is a 2D symmetric matrix stored in column-major format,
and @c x is a vector.
The result is also symmetric and is stored on in the @c uplo part
of @c A.
This method calls native @c cublas<t>syr with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param uplo indicates if matrix @c A lower or upper part is stored,
the other symmetric part is not referenced and is inferred
from the stored elements
@param n number of rows and columns of matrix @c A
@param alpha pointer to the @c alpha scalar
@param x pointer to the address of @c x
@param incx stride between consecutive elements of @c x
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask syr(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *x, int incx,
T *A, int lda
);
/**
@brief similar to tf::cublasFlowCapturer::syr but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_syr(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *x, int incx,
T *A, int lda
);
/**
@brief performs symmetric rank-2 update
This function performs symmetric rank-2 update:
<tt>A = alpha * x * y^T + y * x^T + A</tt>,
where @c alpha is a scalar, @c A
is a 2D symmetric matrix stored in column-major format,
and @c x and @c y are vectors.
The result is also symmetric and is stored on in the @c uplo part
of @c A.
This method calls native @c cublas<t>syr2 with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param uplo indicates if matrix @c A lower or upper part is stored,
the other symmetric part is not referenced and is inferred
from the stored elements
@param n number of rows and columns of matrix @c A
@param alpha pointer to the @c alpha scalar
@param x pointer to the address of @c x
@param incx stride between consecutive elements of @c x
@param y pointer to the address of @c y
@param incy stride between consecutive elements of @c y
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask syr2(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *x, int incx,
const T *y, int incy,
T *A, int lda
);
/**
@brief similar to tf::cublasFlowCapturer::syr2 but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_syr2(
cublasFillMode_t uplo,
int n,
const T *alpha,
const T *x, int incx,
const T *y, int incy,
T *A, int lda
);
/**
@brief performs the triangular matrix-vector multiplication
This method performs the triangular matrix-vector multiplication:
<tt>x = op(A) * x</tt>,
where @c A is a triangular matrix stored in lower or upper mode
with or without the main diagonal, and @c x is a vector.
@tparam T data type
@param uplo indicates if matrix @c A lower or upper part is stored,
the other part is not referenced and is inferred from
the stored elements
@param tran transpose operation @c op(A)
@param diag indicates if the elements on the main diagonal of matrix @c A
are unity (i.e., all 1s) and of no need to be accessed
@param n number of rows and columns of matrix @c A
@param A pointer to the address of A
@param lda leading dimension of 2D array used to store matrix @c A
@param x input vector; overwritten with the result @c op(A)*x on exit
@param incx stride between consecutive elements of @c x
*/
template <typename T>
cudaTask trmv(
cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int n, const T* A, int lda,
T *x, int incx
);
/**
@brief similar to tf::cublasFlowCapturer::trmv but operates on C-styled
row-major layout
*/
template <typename T>
cudaTask c_trmv(
cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int n, const T* A, int lda,
T *x, int incx
);
/**
@brief solves the triangular linear system with a single right-hand-side
This method solves the triangular linear system with a single right-hand-side
<tt>op(A) x = b</tt>,
where @c A is a triangular matrix stored in lower or upper mode
with or without the main diagonal, and @c x and @c b are vectors.
@tparam T data type
@param uplo indicates if matrix @c A lower or upper part is stored,
the other part is not referenced and is inferred from
the stored elements
@param tran transpose operation @c op(A)
@param diag indicates if the elements on the main diagonal of matrix @c A
are unity (i.e., all 1s) and of no need to be accessed
@param n number of rows and columns of matrix @c A
@param A pointer to the address of A
@param lda leading dimension of 2D array used to store matrix @c A
@param x input of vector @c b and output of the solution on exit
@param incx stride between consecutive elements of @c x
*/
template <typename T>
cudaTask trsv(
cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int n, const T* A, int lda,
T *x, int incx
);
/**
@brief similar to tf::cublasFlowCapturer::trsv but operates on C-styled
row-major layout
*/
template <typename T>
cudaTask c_trsv(
cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int n, const T* A, int lda,
T *x, int incx
);
// ------------------------------------------------------------------------
// Level-3 matrix-matrix operations
// ------------------------------------------------------------------------
/**
@brief performs matrix-matrix addition and transposition
This method performs the matrix-matrix addition/transposition:
<tt>C = alpha * op(A) + beta * op(B)</tt>,
where @c alpha and @c beta are scalars, and @c A, @c B and @c C are matrices
stored in column-major format with dimensions @c op(A) as @c m by @c n,
@c op(B) as @c m by @c n and @c C as @c m by @c n, respectively.
The operation is out-of-place if @c C does not overlap @c A or @c B.
The in-place mode supports the following two operations:
1. <tt>C = alpha * C + beta * op(B)</tt>
2. <tt>C = alpha * op(A) + beta * C</tt>
For in-place mode, if @c C equals @c A, @c ldc equals @c lda and
@c ta equals @c CUBLAS_OP_N. If @c C equals @c B, @c ldc equals @c ldb
and @c tb equals CUBLAS_OP_N.
The operation includes the following special cases:
1. the user can reset matrix @c C to zero by setting @c alpha and
@c beta to 0
2. the user can transpose matrix @c A by setting @c alpha to 1 and
@c beta to 0
The input matrices are in column-major storage.
This method calls native @c cublas<t>geam with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param ta transpose operation @c op(A)
@param tb transpose operation @c op(B)
@param m number of rows of matrix @c C and @c op(A)
@param n number of columns of matrix @c C and @c op(B)
@param alpha pointer to the @c alpha scalar
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@param beta pointer to the @c beta scalar
@param B pointer to the address of @c B
@param ldb leading dimension of 2D array used to store the matrix @c B
@param C pointer to the address of @c C
@param ldc leading dimension of 2D array used to store the matrix @c C
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask geam(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *beta,
const T *B, int ldb,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::geam but on row-major layout
*/
template <typename T>
cudaTask c_geam(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *beta,
const T *B, int ldb,
T *C, int ldc
);
/**
@brief performs matrix-matrix multiplication
This function performs matrix-matrix multiplication:
<tt>C = alpha * op (A) * op (B) + beta * C</tt>,
where @c alpha and @c beta are scalars, and @c A, @c B, and @c C
are 2D matrices stored in column-major format
with dimension @c op(A) as @c m by @c k,
dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.
The input matrices are in column-major storage.
This method calls native @c cublas<t>gemm with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param ta transpose operation @c op(A)
@param tb transpose operation @c op(B)
@param m number of rows of matrix @c C and @c op(A)
@param n number of columns of matrix @c C and @c op(B)
@param k number of columns of @c op(A) and rows of @c op(B)
@param alpha pointer to the @c alpha scalar
@param A pointer to the address of @c A
@param lda leading dimension of 2D array used to store the matrix @c A
@param B pointer to the address of @c B
@param ldb leading dimension of 2D array used to store the matrix @c B
@param beta pointer to the @c beta scalar
@param C pointer to the address of @c C
@param ldc leading dimension of 2D array used to store the matrix @c C
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask gemm(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::gemm but operates on C-styled
row-major layout
*/
template <typename T>
cudaTask c_gemm(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief performs matrix-matrix multiplication over a batch of matrices
The batch must be @em uniform.
All instances in the batch must have the same dimensions <tt>(m, n, k)</tt>,
leading dimensions <tt>(lda, ldb, ldc)</tt> and transpositions
<tt>(ta, tb)</tt> for their respective @c A, @c B and @c C matrices.
The address of the input matrices and the output matrix of each instance
of the batch are read from arrays of pointers passed to the function
by the caller.
<tt>C[i]= alpha * op (A[i]) * op (B[i]) + beta * C[i], i in [0, bc)</tt>,
where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
are 2D matrices stored in column-major format
with dimension @c op(A) as @c m by @c k,
dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.
The input matrices are in column-major storage.
This method calls native @c cublas<t>gemmBatched with packed parameters,
<tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param ta transpose operation @c op(A[i])
@param tb transpose operation @c op(B[i])
@param m number of rows of matrix @c C[i] and @c op(A[i])
@param n number of columns of matrix @c C[i] and @c op(B[i])
@param k number of columns of @c op(A[i]) and rows of @c op(B[i])
@param alpha pointer to the @c alpha scalar
@param A array pointer to @c A batch
@param lda leading dimension of 2D array used to store the matrix @c A[i]
@param B array pointer to @c B batch
@param ldb leading dimension of 2D array used to store the matrix @c B[i]
@param beta pointer to the @c beta scalar
@param C array pointer to @c C batch
@param ldc leading dimension of 2D array used to store the matrix @c C[i]
@param bc batch size (number of matrices)
@return a tf::cudaTask handle
*/
template <typename T>
cudaTask gemm_batched(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A[], int lda,
const T *B[], int ldb,
const T *beta,
T *C[], int ldc,
int bc
);
/**
@brief similar to tf::cublasFlowCapturer::gemm_batched but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_gemm_batched(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A[], int lda,
const T *B[], int ldb,
const T *beta,
T *C[], int ldc,
int bc
);
/**
@brief performs matrix-matrix multiplication over a batch of matrices
with strided memory access
Here, we use @c A[i], @c B[i], @c C[i] as notation
for A, B and C matrices in the @c i-th instance of the batch,
implicitly assuming they are respectively address offsets
@c sA, @c sB, @c sC away from @c A[i-1], @c B[i-1], @c C[i-1].
The input matrices are in column-major storage.
This method calls native @c cublas<t>gemmStridedBatched with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param ta transpose operation @c op(A[i])
@param tb transpose operation @c op(B[i])
@param m number of rows of matrix @c C[i] and @c op(A[i])
@param n number of columns of matrix @c C[i] and @c op(B[i])
@param k number of columns of @c op(A[i]) and rows of @c op(B[i])
@param alpha pointer to the @c alpha scalar
@param A pointer to @c A batch
@param lda leading dimension of 2D array used to store the matrix @c A[i]
@param sA address offset between @c A[i] and @c A[i+1]
@param B pointer to @c B batch
@param ldb leading dimension of 2D array used to store the matrix @c B[i]
@param sB address offset between @c B[i] and @c B[i+1]
@param beta pointer to the @c beta scalar
@param C pointer to @c C batch
@param ldc leading dimension of 2D array used to store the matrix @c C[i]
@param sC address offset between @c C[i] and @c C[i+1]
@param bc batch size (number of matrices)
@return a tf::cudaTask handle
The batch must be @em uniform.
All instances in the batch must have the same dimensions <tt>(m, n, k)</tt>,
leading dimensions <tt>(lda, ldb, ldc)</tt> and transpositions
<tt>(ta, tb)</tt> for their respective @c A, @c B and @c C matrices.
Input matrices @c A, @c B and output matrix @c C for each instance of the batch
are located at fixed address offsets from their locations in the previous instance.
Pointers to @c A, @c B and @c C matrices for the first instance are passed
to the function by the user along with the address @em offsets -
@c sA, @c sB and @c sC that determine the locations
of input and output matrices in future instances.
<tt>C + i*sC = alpha * op (A + i*sA) * op (B + i*sB) + beta * (C + i*sC), i in [0, bc)</tt>,
where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
are 2D matrices stored in column-major format
with dimension @c op(A) as @c m by @c k,
dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.
On certain problem sizes, it might be advantageous to create multiple gemm tasks
to take advantage of concurrent kernels, rather than this method.
*/
template <typename T>
cudaTask gemm_sbatched(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A, int lda, long long int sA,
const T *B, int ldb, long long int sB,
const T *beta,
T *C, int ldc, long long int sC,
int bc
);
/**
@brief similar to tf::cublasFlowCapturer::gemm_sbatched but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_gemm_sbatched(
cublasOperation_t ta, cublasOperation_t tb,
int m, int n, int k,
const T *alpha,
const T *A, int lda, long long int sA,
const T *B, int ldb, long long int sB,
const T *beta,
T *C, int ldc, long long int sC,
int bc
);
/**
@brief performs the symmetric matrix-matrix multiplication
The method performs symmetric matrix-matrix multiplication:
<tt>C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT</tt>, or
<tt>C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT</tt>.
@c A is a symmetric matrix stored in lower or upper mode,
@c B and @c C are @c m by @c n matrices, and @c alpha and @c beta
are scalars.
This method calls native @c cublas<t>symm with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.
@tparam T data type
@param side indicates if matrix @c A is on the left or right of @c B.
@param uplo indicates if matrix @c A lower or upper part is stored,
the other symmetric part is not referenced and
is inferred from the stored elements.
@param m number of rows of matrix @c C and @c B,
with matrix @c A sized accordingly
@param n number of columns of matrix @c C and @c B,
with matrix @c A sized accordingly
@param alpha scalar used for multiplication
@param A pointer to the address of matrix @c A
@param lda leading dimension of the 2D array used to store A
@param B pointer to the address of matrix @c B
@param ldb leading dimension of the 2D array used to store B
@param beta scalar used for multiplication
@param C pointer to the address of matrix @c C
@param ldc leading dimension of the 2D array used to store C
*/
template <typename T>
cudaTask symm(
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::symm but operates on
C-styled row-major layout
*/
template <typename T>
cudaTask c_symm(
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief performs the symmetric rank-k update

This method performs the symmetric rank-k update:
<tt>C = alpha * op(A) * op(A)^T + beta * C</tt>,
where @c alpha and @c beta are scalars, @c C is a symmetric matrix
stored in lower or upper mode, and @c A is a matrix with dimension
@c op(A) @c n by @c k.
The result is stored to @c uplo part of @c C.

This method calls native @c cublas<t>syrk with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.

@tparam T data type (currently only @c float and @c double are supported)
@param uplo indicates if matrix @c C lower or upper part is stored,
the other symmetric part is not referenced and is
inferred from the stored elements.
@param tran transposition operation to apply to @c A
@param n number of rows of matrix @c C and @c op(A)
@param k number of columns of matrix @c op(A)
@param alpha scalar used for multiplication
@param A pointer to the address of @c A
@param lda leading dimension of the 2D array used to store @c A
@param beta scalar used for multiplication
@param C pointer to the address of @c C
@param ldc leading dimension of the 2D array used to store @c C

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask syrk(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *beta,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::syrk but operates on
C-styled row-major layout

@tparam T data type (currently only @c float and @c double are supported)

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask c_syrk(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *beta,
T *C, int ldc
);
/**
@brief performs the symmetric rank-2k update

This method performs the symmetric rank-2k update:
<tt>C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C</tt>,
where @c alpha and @c beta are scalars, @c C is a symmetric matrix
stored in lower or upper mode, and @c A and @c B are two matrices
with dimensions @c op(A) and @c op(B) @c n by @c k.
The result is stored to @c uplo part of @c C.

This method calls native @c cublas<t>syr2k with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.

@tparam T data type (currently only @c float and @c double are supported)
@param uplo indicates if matrix @c C lower or upper part is stored,
the other symmetric part is not referenced and is
inferred from the stored elements.
@param tran transposition operation to apply to @c A and @c B
(the formula applies the same @c op to both operands)
@param n number of rows of matrix @c C and @c op(A)
@param k number of columns of matrix @c op(A)
@param alpha scalar used for multiplication
@param A pointer to the address of @c A
@param lda leading dimension of the 2D array used to store @c A
@param B pointer to the address of @c B
@param ldb leading dimension of the 2D array used to store @c B
@param beta scalar used for multiplication
@param C pointer to the address of @c C
@param ldc leading dimension of the 2D array used to store @c C

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask syr2k(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::syr2k but operates on
C-styled row-major layout

@tparam T data type (currently only @c float and @c double are supported)

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask c_syr2k(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief performs a variation of the symmetric rank-k update

This method performs a variation of the symmetric rank-k update:
<tt>C = alpha * op(A) * op(B)^T + beta * C</tt>,
where @c alpha and @c beta are scalars, @c C is a symmetric matrix
stored in lower or upper mode, and @c A and @c B are two matrices
with dimensions @c op(A) and @c op(B) @c n by @c k.
The result is stored to @c uplo part of @c C.

This method calls native @c cublas<t>syrkx with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.

@tparam T data type (currently only @c float and @c double are supported)
@param uplo indicates if matrix @c C lower or upper part is stored,
the other symmetric part is not referenced and is
inferred from the stored elements.
@param tran transposition operation to apply to @c A and @c B
(the formula applies the same @c op to both operands)
@param n number of rows of matrix @c C and @c op(A)
@param k number of columns of matrix @c op(A)
@param alpha scalar used for multiplication
@param A pointer to the address of @c A
@param lda leading dimension of the 2D array used to store @c A
@param B pointer to the address of @c B
@param ldb leading dimension of the 2D array used to store @c B
@param beta scalar used for multiplication
@param C pointer to the address of @c C
@param ldc leading dimension of the 2D array used to store @c C

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask syrkx(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::syrkx but operates on
C-styled row-major layout

@tparam T data type (currently only @c float and @c double are supported)

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask c_syrkx(
cublasFillMode_t uplo, cublasOperation_t tran,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc
);
/**
@brief performs triangular matrix-matrix multiplication

This method performs triangular matrix-matrix multiplication:
<tt>C = alpha * op(A) * B</tt>, if <tt>side == CUBLAS_SIDE_LEFT</tt>, or
<tt>C = alpha * B * op(A)</tt>, if <tt>side == CUBLAS_SIDE_RIGHT</tt>,
where @c A is a triangular matrix stored in lower or upper mode with
or without the main diagonal, @c B and @c C are @c m by @c n matrix,
and @c alpha is a scalar.

This method calls native @c cublas<t>trmm with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.

@tparam T data type (currently only @c float and @c double are supported)
@param side indicates if matrix @c A is on the left or right of @c B
@param uplo indicates if matrix @c A lower or upper part is stored,
the other part is not referenced and is inferred from
the stored elements
@param tran transposition operation to apply to @c A
@param diag indicates if the elements on the main diagonal of matrix
@c A are unity and should not be accessed.
@param m number of rows of matrix @c B, with matrix @c A sized accordingly
@param n number of columns of matrix @c B, with matrix @c A sized accordingly
@param alpha scalar used for multiplication
@param A pointer to the address of matrix @c A
@param lda leading dimension of the 2D array used to store @c A
@param B pointer to the address of matrix @c B
@param ldb leading dimension of the 2D array used to store @c B
@param C pointer to the address of matrix @c C
@param ldc leading dimension of the 2D array used to store @c C

@return a tf::cudaTask handle

Notice that in this method, @c B and @c C can point to the same address
in which case the in-place implementation is performed
(with results written back to @c B).
*/
template <typename T>
cudaTask trmm(
cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
T *C, int ldc
);
/**
@brief similar to tf::cublasFlowCapturer::trmm but operates on C-styled
row-major layout

@tparam T data type (currently only @c float and @c double are supported)

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask c_trmm(
cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int m, int n,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
T *C, int ldc
);
/**
@brief solves the triangular linear system with multiple right-hand-sides

This method solves the triangular linear system with multiple
right-hand-sides:
<tt>op(A) * X = alpha * B</tt>, if <tt>side == CUBLAS_SIDE_LEFT</tt>, or
<tt>X * op(A) = alpha * B</tt>, if <tt>side == CUBLAS_SIDE_RIGHT</tt>,
where @c A is a triangular matrix stored in lower or upper mode
with or without the main diagonal, @c X and @c B are @c m by @c n matrices,
and @c alpha is a scalar.
The solution @c X overwrites the right-hand-sides @c B on exit.

This method calls native @c cublas<t>trsm with
packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
the %cublasFlowCapturer and @c args... are the given arguments.

@tparam T data type (currently only @c float and @c double are supported)
@param side indicates if @c A is on the left or right side of @c X
@param uplo indicates if matrix @c A lower or upper part is stored,
the other part is not referenced and is inferred from
the stored elements
@param tran transposition operation to apply to @c A
@param diag indicates if the elements on the main diagonal of matrix @c A
are unity and should not be accessed
@param m number of rows in matrix @c B, with matrix @c A sized accordingly
@param n number of columns in matrix @c B, with matrix @c A sized accordingly
@param alpha scalar to apply to @c B
@param A pointer to the address of matrix @c A
@param lda leading dimension of the 2D array used to store @c A
@param B pointer to the address of matrix @c B; overwritten with the
solution @c X on exit
@param ldb leading dimension of the 2D array used to store @c B

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask trsm(
cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int m, int n,
const T *alpha,
const T *A, int lda,
T *B, int ldb
);
/**
@brief similar to tf::cublasFlowCapturer::trsm but operates on C-styled
row-major layout

@tparam T data type (currently only @c float and @c double are supported)

@return a tf::cudaTask handle
*/
template <typename T>
cudaTask c_trsm(
cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t tran, cublasDiagType_t diag,
int m, int n,
const T *alpha,
const T *A, int lda,
T *B, int ldb
);
private:
cublasScopedPerThreadHandle _handle;
void _stream(cudaStream_t);
};
// Procedure: _stream
// Binds the given CUDA stream to the per-thread cuBLAS handle so that
// cuBLAS calls captured afterwards are enqueued on that stream
// (cuBLAS itself is stream-ordered via cublasSetStream).
// On failure, the error is reported through TF_CHECK_CUBLAS
// (presumably by throwing — see the macro definition).
inline void cublasFlowCapturer::_stream(cudaStream_t stream) {
TF_CHECK_CUBLAS(
cublasSetStream(_handle, stream), "failed to set cublas stream"
);
}
// Function: native_handle
// Exposes the raw cublasHandle_t owned by the per-thread scoped handle,
// allowing callers to invoke native cuBLAS routines directly.
inline cublasHandle_t cublasFlowCapturer::native_handle() {
// static_cast makes the (otherwise implicit) conversion explicit
return static_cast<cublasHandle_t>(_handle);
}
} // end of namespace tf -----------------------------------------------------