#pragma once

#include "cublas_handle.hpp"

/**
@file cublas_flow.hpp
@brief cublasFlowCapturer include file
*/

namespace tf {

// ----------------------------------------------------------------------------
// cublasFlowCapturer definition
// ----------------------------------------------------------------------------

/**
@class cublasFlowCapturer

@brief class to construct a cuBLAS task graph

%cublasFlowCapturer provides a higher-level interface over the @cuBLAS library
and hides concurrency details from users.
It inherits methods from tf::cudaFlowCapturerBase and must be used from
a tf::cudaFlowCapturer object.
All pointers passed to %cublasFlowCapturer methods must be in GPU memory space
or managed (i.e., @c cudaMallocManaged), including the scalars @c alpha and
@c beta, and the input and output data pointers.

The following example uses @c cublas<t>amax to find the (1-based) index of the
element with the maximum absolute magnitude in a vector.

@code{.cpp}
#include <taskflow/cublasflow.hpp>

int main() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  size_t N = 1024;

  float* x = nullptr;
  int* d_res;
  int h_res;

  std::vector<float> host(N, 0.0f);
  host[512] = 100.0f;  // artificially set the mid-position to the largest

  cudaMalloc(&x, N*sizeof(float));
  cudaMalloc(&d_res, sizeof(int));

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

    tf::cudaTask h2d      = capturer.copy(x, host.data(), N);
    tf::cudaTask find_max = cublas->amax(N, x, 1, d_res);
    tf::cudaTask d2h      = capturer.copy(&h_res, d_res, 1);

    h2d.precede(find_max);       // amax runs after the host-to-device copy
    find_max.precede(d2h);       // amax runs before the device-to-host copy
  });

  executor.run(taskflow).wait();

  assert(h_res == 513);  // amax reports a 1-based index (element 512 + 1)
}
@endcode

Currently, %cublasFlowCapturer supports only @c float and @c double data types.

We design most tf::cublasFlowCapturer methods on top of the native,
high-performance @cuBLAS library. You may refer to @cuBLAS for more details.
*/
class cublasFlowCapturer : public cudaFlowCapturerBase {

  public:

    /**
    @brief constructs a cublas flow capturer
    */
    cublasFlowCapturer() = default;

    /**
    @brief gets the native cublas handle associated with this %cublasFlowCapturer

    @return a native cublas handle of type cublasHandle_t
    */
    cublasHandle_t native_handle();

    /**
    @brief copies vector data from host to device

    This method copies @c n elements from a vector @c h in host memory space
    to a vector @c d in GPU memory space.
    The storage spacing between consecutive elements is given by @c inch for
    the source vector @c h and by @c incd for the destination vector @c d.

    This method calls native @c cublasSetVectorAsync with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements
    @param d target device pointer
    @param incd spacing between consecutive elements in @c d
    @param h source host pointer
    @param inch spacing between consecutive elements in @c h

    @return a tf::cudaTask handle
    */
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask vset(size_t n, const T* h, int inch, T* d, int incd);
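    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given a host array h_x and a device array d_x of n floats, a vset task
    // can be captured from a tf::cudaFlowCapturer as follows.
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     tf::cudaTask h2d = blas->vset(n, h_x, 1, d_x, 1);  // contiguous copy
    //   });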
    /**
    @brief copies vector data from device to host

    This method copies @c n elements from a vector @c d in GPU memory space
    to a vector @c h in host memory space.
    The storage spacing between consecutive elements is given by @c inch for
    the target vector @c h and by @c incd for the source vector @c d.

    This method calls native @c cublasGetVectorAsync with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements
    @param h target host pointer
    @param inch spacing between consecutive elements in @c h
    @param d source device pointer
    @param incd spacing between consecutive elements in @c d

    @return a tf::cudaTask handle
    */
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask vget(size_t n, const T* d, int incd, T* h, int inch);

    // ------------------------------------------------------------------------
    // Level-1 vector-vector operations
    // ------------------------------------------------------------------------

    /**
    @brief finds the smallest index of the element of the maximum absolute magnitude

    This method calls native @c cublas<t>amax with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting index (1-based indexing)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask amax(int n, const T* x, int incx, int* result);

    /**
    @brief finds the smallest index of the element of the minimum absolute magnitude

    This method calls native @c cublas<t>amin with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting index (1-based indexing)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask amin(int n, const T* x, int incx, int* result);

    /**
    @brief finds the sum of absolute values of the elements over a vector

    This method calls native @c cublas<t>asum with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the result

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask asum(int n, const T* x, int incx, T* result);

    /**
    @brief multiplies a vector by a scalar and adds it to a vector

    This function multiplies the vector @c x by the scalar @c alpha and adds
    it to the vector @c y, overwriting the latter with the result.
    Hence, the performed operation is:
    y[j] = alpha * x[k] + y[j], where @c j and @c k are indices of @c n
    elements with step sizes @c incy and @c incx.

    This method calls native @c cublas<t>axpy with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vectors @c x and @c y
    @param alpha scalar used for multiplication
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask axpy(int n, const T *alpha, const T *x, int incx, T *y, int incy);
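    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given device vectors d_x and d_y of n elements and a device/managed
    // scalar d_alpha, an axpy task (y = alpha*x + y) can be captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     tf::cudaTask saxpy = blas->axpy(n, d_alpha, d_x, 1, d_y, 1);
    //   });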
    /**
    @brief copies a vector to another vector

    This function copies @c n elements from a vector @c x of a step size
    @c incx to another vector @c y of a step size @c incy.
    Hence, the performed operation is:
    y[j] = x[k], where @c j and @c k are indices of @c n elements with step
    sizes @c incy and @c incx.

    This method calls native @c cublas<t>copy with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements to copy
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask vcopy(int n, const T* x, int incx, T* y, int incy);

    /**
    @brief computes the dot product of two vectors

    The performed operation is: sum += x[i] * y[i], over the @c n elements.

    This method calls native @c cublas<t>dot with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements to perform the dot product
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y
    @param result the resulting dot product

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask dot(int n, const T* x, int incx, const T* y, int incy, T* result);

    /**
    @brief computes the Euclidean norm of a vector

    This method calls native @c cublas<t>nrm2 with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the result

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask nrm2(int n, const T* x, int incx, T* result);

    /**
    @brief scales a vector by a scalar

    This method calls native @c cublas<t>scal with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements in vector @c x
    @param scalar scalar used for multiplication
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask scal(int n, const T* scalar, T* x, int incx);
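    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given device vectors d_x and d_y of n elements and device/managed
    // result scalars d_dot and d_nrm, independent dot and nrm2 tasks can be
    // captured in the same graph:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     tf::cudaTask dotp = blas->dot (n, d_x, 1, d_y, 1, d_dot);
    //     tf::cudaTask norm = blas->nrm2(n, d_x, 1, d_nrm);
    //   });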
    /**
    @brief swaps elements between two vectors

    This function interchanges the elements of vectors @c x and @c y.
    Hence, the performed operation is:
    y[j] <-> x[k], where @c j is the index of element in @c y with a step size
    @c incy and @c k is the index of element in @c x with a step size @c incx.

    This method calls native @c cublas<t>swap with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements to swap
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask swap(int n, T* x, int incx, T* y, int incy);

    // ------------------------------------------------------------------------
    // TODO Level-2 matrix-vector operations
    // ------------------------------------------------------------------------

    /**
    @brief performs matrix-vector multiplication

    This function performs matrix-vector multiplication:
    y = alpha * op(A) * x + beta * y,
    where @c alpha and @c beta are scalars, @c A is a 2D matrix stored in
    column-major format, and @c x, @c y are vectors.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemv with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param trans transposition operation @c op(A)
    @param m number of rows of matrix @c A
    @param n number of columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param x pointer to the address of @c x of at least
           (1 + (n - 1) * abs(incx)) elements if no transposition,
           or (1 + (m - 1) * abs(incx)) elements otherwise
    @param incx stride between consecutive elements of @c x
    @param beta pointer to the @c beta scalar
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask gemv(
      cublasOperation_t trans,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_gemv(
      cublasOperation_t trans,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );
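    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given a column-major m-by-n device matrix d_A (lda = m), device vectors
    // d_x (n elements) and d_y (m elements), and device/managed scalars
    // d_alpha and d_beta, a gemv task (y = alpha*A*x + beta*y) can be
    // captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->gemv(CUBLAS_OP_N, m, n, d_alpha, d_A, m, d_x, 1, d_beta, d_y, 1);
    //   });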
    /**
    @brief performs symmetric matrix-vector multiplication

    This function performs symmetric matrix-vector multiplication:
    y = alpha * A * x + beta * y,
    where @c alpha and @c beta are scalars, @c A is a 2D symmetric matrix
    stored in column-major format, and @c x, @c y are vectors.

    This method calls native @c cublas<t>symv with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param beta pointer to the @c beta scalar
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask symv(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief similar to tf::cublasFlowCapturer::symv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_symv(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief performs symmetric rank-1 update

    This function performs symmetric rank-1 update:
    A = alpha * x * x^T + A,
    where @c alpha is a scalar, @c A is a 2D symmetric matrix stored in
    column-major format, and @c x is a vector.
    The result is also symmetric and is stored in the @c uplo part of @c A.

    This method calls native @c cublas<t>syr with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      T *A, int lda
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_syr(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      T *A, int lda
    );

    /**
    @brief performs symmetric rank-2 update

    This function performs symmetric rank-2 update:
    A = alpha * (x * y^T + y * x^T) + A,
    where @c alpha is a scalar, @c A is a 2D symmetric matrix stored in
    column-major format, and @c x and @c y are vectors.
    The result is also symmetric and is stored in the @c uplo part of @c A.

    This method calls native @c cublas<t>syr2 with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.
    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr2(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      const T *y, int incy,
      T *A, int lda
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr2 but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_syr2(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      const T *y, int incy,
      T *A, int lda
    );

    /**
    @brief performs the triangular matrix-vector multiplication

    This method performs the triangular matrix-vector multiplication:
    x = op(A) * x,
    where @c A is a triangular matrix stored in lower or upper mode with or
    without the main diagonal, and @c x is a vector.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other part is not referenced and is inferred from the stored
           elements
    @param tran transposition operation @c op(A)
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity (i.e., all 1s) and need not be accessed
    @param n number of rows and columns of matrix @c A
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store matrix @c A
    @param x input vector, overwritten with the result on exit
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trmv(
      cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag,
      int n,
      const T* A, int lda,
      T *x, int incx
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trmv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trmv(
      cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag,
      int n,
      const T* A, int lda,
      T *x, int incx
    );
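    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given a column-major n-by-n lower-triangular device matrix d_A (lda = n)
    // and a device vector d_x of n elements, a trmv task (x = A*x) can be
    // captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->trmv(
    //       CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
    //       n, d_A, n, d_x, 1
    //     );
    //   });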
    /**
    @brief solves the triangular linear system with a single right-hand-side

    This method solves the triangular linear system with a single
    right-hand-side:
    op(A) x = b,
    where @c A is a triangular matrix stored in lower or upper mode with or
    without the main diagonal, and @c x and @c b are vectors.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other part is not referenced and is inferred from the stored
           elements
    @param tran transposition operation @c op(A)
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity (i.e., all 1s) and need not be accessed
    @param n number of rows and columns of matrix @c A
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store matrix @c A
    @param x on entry, the right-hand-side vector @c b; on exit, the solution
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trsv(
      cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag,
      int n,
      const T* A, int lda,
      T *x, int incx
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trsv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trsv(
      cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag,
      int n,
      const T* A, int lda,
      T *x, int incx
    );

    // ------------------------------------------------------------------------
    // Level-3 matrix-matrix operations
    // ------------------------------------------------------------------------

    /**
    @brief performs matrix-matrix addition and transposition

    This method performs the matrix-matrix addition/transposition:
    C = alpha * op(A) + beta * op(B),
    where @c alpha and @c beta are scalars, and @c A, @c B and @c C are
    matrices stored in column-major format with dimensions @c op(A) as
    @c m by @c n, @c op(B) as @c m by @c n and @c C as @c m by @c n,
    respectively.

    The operation is out-of-place if @c C does not overlap @c A or @c B.
    The in-place mode supports the following two operations:

      1. C = alpha * C + beta * op(B)
      2. C = alpha * op(A) + beta * C

    For the in-place mode, if @c C equals @c A, then @c ldc must equal @c lda
    and @c ta must be @c CUBLAS_OP_N; if @c C equals @c B, then @c ldc must
    equal @c ldb and @c tb must be @c CUBLAS_OP_N.

    The operation includes the following special cases:

      1. the user can reset matrix @c C to zero by setting @c alpha and
         @c beta to 0
      2. the user can transpose matrix @c A by setting @c alpha to 1 and
         @c beta to 0

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>geam with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.
    @tparam T data type
    @param ta transposition operation @c op(A)
    @param tb transposition operation @c op(B)
    @param m number of rows of matrix @c C and @c op(A)
    @param n number of columns of matrix @c C and @c op(B)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param beta pointer to the @c beta scalar
    @param B pointer to the address of @c B
    @param ldb leading dimension of 2D array used to store the matrix @c B
    @param C pointer to the address of @c C
    @param ldc leading dimension of 2D array used to store the matrix @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask geam(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::geam but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_geam(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief performs matrix-matrix multiplication

    This function performs matrix-matrix multiplication:
    C = alpha * op(A) * op(B) + beta * C,
    where @c alpha and @c beta are scalars, and @c A, @c B, and @c C are 2D
    matrices stored in column-major format with dimension @c op(A) as @c m by
    @c k, dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemm with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param ta transposition operation @c op(A)
    @param tb transposition operation @c op(B)
    @param m number of rows of matrix @c C and @c op(A)
    @param n number of columns of matrix @c C and @c op(B)
    @param k number of columns of @c op(A) and rows of @c op(B)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of 2D array used to store the matrix @c B
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of @c C
    @param ldc leading dimension of 2D array used to store the matrix @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask gemm(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_gemm(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );
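    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given column-major device matrices d_A (m-by-k), d_B (k-by-n),
    // d_C (m-by-n) and device/managed scalars d_alpha and d_beta, a gemm task
    // can be captured as below; the row-major equivalent uses c_gemm with
    // leading dimensions k, n and n instead.
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->gemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
    //                d_alpha, d_A, m, d_B, k, d_beta, d_C, m);
    //   });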
    /**
    @brief performs matrix-matrix multiplication over a batch of matrices

    The batch must be @em uniform.
    All instances in the batch must have the same dimensions (m, n, k),
    leading dimensions (lda, ldb, ldc) and transpositions (ta, tb) for their
    respective @c A, @c B and @c C matrices.
    The address of the input matrices and the output matrix of each instance
    of the batch are read from arrays of pointers passed to the function by
    the caller:

    C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], i in [0, bc),

    where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
    are 2D matrices stored in column-major format with dimension @c op(A) as
    @c m by @c k, dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemmBatched with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param ta transposition operation @c op(A[i])
    @param tb transposition operation @c op(B[i])
    @param m number of rows of matrix @c C[i] and @c op(A[i])
    @param n number of columns of matrix @c C[i] and @c op(B[i])
    @param k number of columns of @c op(A[i]) and rows of @c op(B[i])
    @param alpha pointer to the @c alpha scalar
    @param A array pointer to @c A batch
    @param lda leading dimension of 2D array used to store the matrix @c A[i]
    @param B array pointer to @c B batch
    @param ldb leading dimension of 2D array used to store the matrix @c B[i]
    @param beta pointer to the @c beta scalar
    @param C array pointer to @c C batch
    @param ldc leading dimension of 2D array used to store the matrix @c C[i]
    @param bc batch size (number of matrices)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask gemm_batched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A[], int lda,
      const T *B[], int ldb,
      const T *beta,
      T *C[], int ldc,
      int bc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm_batched but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_gemm_batched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A[], int lda,
      const T *B[], int ldb,
      const T *beta,
      T *C[], int ldc,
      int bc
    );
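    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given d_As, d_Bs and d_Cs, each a device array of bc pointers (e.g.,
    // const float** for the inputs, float** for the outputs) whose i-th entry
    // points to a column-major m-by-k, k-by-n and m-by-n matrix respectively,
    // and device/managed scalars d_alpha and d_beta, a batched gemm task can
    // be captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->gemm_batched(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
    //                        d_alpha, d_As, m, d_Bs, k, d_beta, d_Cs, m, bc);
    //   });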
    /**
    @brief performs matrix-matrix multiplication over a batch of matrices
           with strided memory access

    Here, we use @c A[i], @c B[i], @c C[i] as notation for A, B and C matrices
    in the @c i-th instance of the batch, implicitly assuming they are
    respectively address offsets @c sA, @c sB, @c sC away from
    @c A[i-1], @c B[i-1], @c C[i-1].

    The batch must be @em uniform.
    All instances in the batch must have the same dimensions (m, n, k),
    leading dimensions (lda, ldb, ldc) and transpositions (ta, tb) for their
    respective @c A, @c B and @c C matrices.
    Input matrices @c A, @c B and output matrix @c C for each instance of the
    batch are located at fixed address offsets from their locations in the
    previous instance.
    Pointers to @c A, @c B and @c C matrices for the first instance are passed
    to the function by the user along with the address @em offsets
    @c sA, @c sB and @c sC that determine the locations of input and output
    matrices in future instances:

    C + i*sC = alpha * op(A + i*sA) * op(B + i*sB) + beta * (C + i*sC), i in [0, bc),

    where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
    are 2D matrices stored in column-major format with dimension @c op(A) as
    @c m by @c k, dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemmStridedBatched with packed
    parameters, (handle, args...), where @c handle is managed by the
    %cublasFlowCapturer and @c args... are the given arguments.

    On certain problem sizes, it might be advantageous to create multiple
    gemm tasks to take advantage of concurrent kernels, rather than this
    method.

    @tparam T data type
    @param ta transposition operation @c op(A[i])
    @param tb transposition operation @c op(B[i])
    @param m number of rows of matrix @c C[i] and @c op(A[i])
    @param n number of columns of matrix @c C[i] and @c op(B[i])
    @param k number of columns of @c op(A[i]) and rows of @c op(B[i])
    @param alpha pointer to the @c alpha scalar
    @param A pointer to @c A batch
    @param lda leading dimension of 2D array used to store the matrix @c A[i]
    @param sA address offset between @c A[i] and @c A[i+1]
    @param B pointer to @c B batch
    @param ldb leading dimension of 2D array used to store the matrix @c B[i]
    @param sB address offset between @c B[i] and @c B[i+1]
    @param beta pointer to the @c beta scalar
    @param C pointer to @c C batch
    @param ldc leading dimension of 2D array used to store the matrix @c C[i]
    @param sC address offset between @c C[i] and @c C[i+1]
    @param bc batch size (number of matrices)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask gemm_sbatched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda, long long int sA,
      const T *B, int ldb, long long int sB,
      const T *beta,
      T *C, int ldc, long long int sC,
      int bc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm_sbatched but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_gemm_sbatched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda, long long int sA,
      const T *B, int ldb, long long int sB,
      const T *beta,
      T *C, int ldc, long long int sC,
      int bc
    );

    /**
    @brief performs the symmetric matrix-matrix multiplication

    The method performs symmetric matrix-matrix multiplication:

      C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT, or
      C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT,

    where @c A is a symmetric matrix stored in lower or upper mode,
    @c B and @c C are @c m by @c n matrices, and @c alpha and @c beta are
    scalars.

    This method calls native @c cublas<t>symm with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param side indicates if matrix @c A is on the left or right of @c B
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param m number of rows of matrix @c C and @c B, with matrix @c A sized
           accordingly
    @param n number of columns of matrix @c C and @c B, with matrix @c A
           sized accordingly
    @param alpha scalar used for multiplication
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta scalar used for multiplication
    @param C pointer to the address of matrix @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask symm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::symm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_symm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs the symmetric rank-k update

    This method performs the symmetric rank-k update:
    C = alpha * op(A) * op(A)^T + beta * C,
    where @c alpha and @c beta are scalars, @c C is a symmetric matrix stored
    in lower or upper mode, and @c A is a matrix with dimension @c op(A)
    @c n by @c k.
    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syrk with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param tran transposition operation to apply to @c A
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha scalar used for multiplication
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param beta scalar used for multiplication
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syrk(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syrk but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_syrk(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      T *C, int ldc
    );
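    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given a column-major n-by-k device matrix d_A (lda = n), an n-by-n
    // device matrix d_C (ldc = n) whose upper part holds the symmetric
    // result, and device/managed scalars d_alpha and d_beta, a syrk task
    // (C = alpha*A*A^T + beta*C) can be captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->syrk(CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, n, k,
    //                d_alpha, d_A, n, d_beta, d_C, n);
    //   });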
    /**
    @brief performs the symmetric rank-2k update

    This method performs the symmetric rank-2k update:
    C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C,
    where @c alpha and @c beta are scalars, @c C is a symmetric matrix stored
    in lower or upper mode, and @c A and @c B are two matrices with dimensions
    @c op(A) and @c op(B) @c n by @c k.
    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syr2k with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param tran transposition operation to apply to @c A
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha scalar used for multiplication
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta scalar used for multiplication
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr2k(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr2k but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_syr2k(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs a variation of the symmetric rank-k update

    This method performs a variation of the symmetric rank-k update:
    C = alpha * op(A) * op(B)^T + beta * C,
    where @c alpha and @c beta are scalars, @c C is a symmetric matrix stored
    in lower or upper mode, and @c A and @c B are two matrices with dimensions
    @c op(A) and @c op(B) @c n by @c k.
    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syrkx with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored, the
           other symmetric part is not referenced and is inferred from the
           stored elements
    @param tran transposition operation to apply to @c A
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha scalar used for multiplication
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta scalar used for multiplication
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syrkx(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syrkx but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_syrkx(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs triangular matrix-matrix multiplication

    This method performs triangular matrix-matrix multiplication:

      C = alpha * op(A) * B, if side == CUBLAS_SIDE_LEFT, or
      C = alpha * B * op(A), if side == CUBLAS_SIDE_RIGHT,

    where @c A is a triangular matrix stored in lower or upper mode with or
    without the main diagonal, @c B and @c C are @c m by @c n matrices, and
    @c alpha is a scalar.

    This method calls native @c cublas<t>trmm with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.
    @tparam T data type
    @param side indicates if matrix @c A is on the left or right of @c B
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other part is not referenced and is inferred from the stored
           elements
    @param tran transposition operation to apply to @c A
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity and should not be accessed
    @param m number of rows of matrix @c B, with matrix @c A sized accordingly
    @param n number of columns of matrix @c B, with matrix @c A sized
           accordingly
    @param alpha scalar used for multiplication
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param C pointer to the address of matrix @c C
    @param ldc leading dimension of the 2D array used to store @c C

    Notice that in this method, @c B and @c C can point to the same address,
    in which case the operation is performed in place (with the result
    written back to @c B).

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trmm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trmm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trmm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      T *C, int ldc
    );
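    // Illustrative sketch (placeholder names; data assumed already allocated):
    // given a column-major m-by-m lower-triangular device matrix d_A, m-by-n
    // device matrices d_B and d_C (leading dimension m), and a device/managed
    // scalar d_alpha, a trmm task (C = alpha*A*B) can be captured as:
    //
    //   taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    //     auto* blas = capturer.make_capturer<tf::cublasFlowCapturer>();
    //     blas->trmm(CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
    //                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
    //                m, n, d_alpha, d_A, m, d_B, m, d_C, m);
    //   });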
    /**
    @brief solves the triangular linear system with multiple right-hand-sides

    This method solves the triangular linear system with multiple
    right-hand-sides:

      op(A) * X = alpha * B, if side == CUBLAS_SIDE_LEFT, or
      X * op(A) = alpha * B, if side == CUBLAS_SIDE_RIGHT,

    where @c A is a triangular matrix stored in lower or upper mode with or
    without the main diagonal, @c X and @c B are @c m by @c n matrices, and
    @c alpha is a scalar.
    The solution @c X overwrites the right-hand-sides @c B on exit.

    This method calls native @c cublas<t>trsm with packed parameters,
    (handle, args...), where @c handle is managed by the %cublasFlowCapturer
    and @c args... are the given arguments.

    @tparam T data type
    @param side indicates if @c A is on the left or right side of @c X
    @param uplo indicates if matrix @c A lower or upper part is stored, the
           other part is not referenced and is inferred from the stored
           elements
    @param tran transposition operation to apply to @c A
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity and should not be accessed
    @param m number of rows in matrix @c B, with matrix @c A sized accordingly
    @param n number of columns in matrix @c B, with matrix @c A sized
           accordingly
    @param alpha scalar to apply to @c B
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trsm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      T *B, int ldb
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trsm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trsm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      T *B, int ldb
    );

  private:

    cublasScopedPerThreadHandle _handle;

    void _stream(cudaStream_t);
};

// Procedure: _stream
inline void cublasFlowCapturer::_stream(cudaStream_t stream) {
  TF_CHECK_CUBLAS(
    cublasSetStream(_handle, stream), "failed to set cublas stream"
  );
}

// Function: native_handle
inline cublasHandle_t cublasFlowCapturer::native_handle() {
  return _handle;
}

}  // end of namespace tf