#pragma once

#include "cublas_handle.hpp"

/**
@file cublas_flow.hpp
@brief cublasFlowCapturer include file
*/

namespace tf {

// ----------------------------------------------------------------------------
// cublasFlowCapturer definition
// ----------------------------------------------------------------------------

/**
@class cublasFlowCapturer

@brief class to construct a cuBLAS task graph

%cublasFlowCapturer provides a higher-level interface over the @cuBLAS library
and hides concurrency details from users.
It inherits methods from tf::cudaFlowCapturerBase and must be used from
a tf::cudaFlowCapturer object.
All pointers passed to %cublasFlowCapturer methods must be in GPU memory space
or managed (i.e., allocated by @c cudaMallocManaged),
including the scalars @c alpha and @c beta, and the input and output data pointers.
The following example uses @c cublas<t>amax to find the smallest index of the
element of the maximum absolute magnitude in a vector.

@code{.cpp}
#include <taskflow/cublasflow.hpp>

int main() {

  tf::Executor executor;
  tf::Taskflow taskflow;

  size_t N = 1024;
  float *x = nullptr;
  int *d_res;
  int h_res;

  std::vector<float> host(N, 0.0f);
  host[512] = 100.0f;  // artificially set the mid-position to the largest

  cudaMalloc(&x, N*sizeof(float));
  cudaMalloc(&d_res, sizeof(int));

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

    tf::cudaTask h2d = capturer.copy(x, host.data(), N);
    tf::cudaTask find_max = cublas->amax(N, x, 1, d_res);
    tf::cudaTask d2h = capturer.copy(&h_res, d_res, 1);

    h2d.precede(find_max);   // amax runs after the host-to-device copy
    find_max.precede(d2h);   // amax runs before the device-to-host copy
  });

  executor.run(taskflow).wait();

  assert(h_res == 513);  // amax reports a 1-based index (host[512])

}
@endcode

Currently, %cublasFlowCapturer supports only @c float and @c double data types.

We design most tf::cublasFlowCapturer methods on top of the native,
high-performance @cuBLAS library.
You may refer to @cuBLAS for more details.
*/
class cublasFlowCapturer : public cudaFlowCapturerBase {

  public:

    /**
    @brief constructs a cublas flow capturer
    */
    cublasFlowCapturer() = default;

    /**
    @brief gets the native cublas handle associated with this %cublasFlowCapturer

    @return a native cublas handle of type cublasHandle_t
    */
    cublasHandle_t native_handle();

    /**
    @brief copies vector data from host to device

    This method copies @c n elements from a vector @c h in host memory space
    to a vector @c d in GPU memory space.
    The storage spacing between consecutive elements is given by @c inch for
    the source vector @c h and by @c incd for the destination vector @c d.

    This method calls native @c cublasSetVectorAsync with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements
    @param d target device pointer
    @param incd spacing between consecutive elements in @c d
    @param h source host pointer
    @param inch spacing between consecutive elements in @c h

    @return a tf::cudaTask handle
    */
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask vset(size_t n, const T* h, int inch, T* d, int incd);

    /**
    @brief copies vector data from device to host

    This method copies @c n elements from a vector @c d in GPU memory space
    to a vector @c h in host memory space.
    The storage spacing between consecutive elements is given by @c inch for
    the target vector @c h and by @c incd for the source vector @c d.

    This method calls native @c cublasGetVectorAsync with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param n number of elements
    @param h target host pointer
    @param inch spacing between consecutive elements in @c h
    @param d source device pointer
    @param incd spacing between consecutive elements in @c d

    @return a tf::cudaTask handle
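
    A minimal round-trip sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, a caller-defined size @c n, and a
    device vector @c d_x of @c n floats allocated by the caller, e.g., with
    @c cudaMalloc):

    @code{.cpp}
    std::vector<float> h_in(n, 1.0f), h_out(n);
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    tf::cudaTask set = cublas->vset(n, h_in.data(), 1, d_x, 1);   // host -> device
    tf::cudaTask get = cublas->vget(n, d_x, 1, h_out.data(), 1);  // device -> host
    set.precede(get);
    @endcode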
    */
    template <typename T,
      std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
    >
    cudaTask vget(size_t n, const T* d, int incd, T* h, int inch);

    // ------------------------------------------------------------------------
    // Level-1 vector-vector operations
    // ------------------------------------------------------------------------

    /**
    @brief finds the smallest index of the element of the maximum
           absolute magnitude

    This method calls native @c cublas<t>amax with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting index (1-based indexing)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask amax(int n, const T* x, int incx, int* result);

    /**
    @brief finds the smallest index of the element of the minimum
           absolute magnitude

    This method calls native @c cublas<t>amin with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting index (1-based indexing)

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask amin(int n, const T* x, int incx, int* result);

    /**
    @brief finds the sum of absolute values of the elements over a vector

    This method calls native @c cublas<t>asum with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting sum of absolute values

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask asum(int n, const T* x, int incx, T* result);

    /**
    @brief multiplies a vector by a scalar and adds it to a vector

    This function multiplies the vector @c x by the scalar @c alpha and
    adds it to the vector @c y, overwriting the latter vector with the result.
    Hence, the performed operation is:

    <tt>y[j] = alpha * x[k] + y[j]</tt>,

    where @c j and @c k are indices of @c n elements with step sizes
    @c incy and @c incx.

    This method calls native @c cublas<t>axpy with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vectors @c x and @c y
    @param alpha scalar used for multiplication
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
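
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer and device-accessible pointers
    @c d_alpha, @c d_x, and @c d_y of @c n elements allocated by the caller):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // y = alpha * x + y
    tf::cudaTask saxpy = cublas->axpy(n, d_alpha, d_x, 1, d_y, 1);
    @endcode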
    */
    template <typename T>
    cudaTask axpy(
      int n, const T *alpha, const T *x, int incx, T *y, int incy
    );

    /**
    @brief copies a vector to another vector

    This function copies @c n elements from a vector @c x of a step size @c incx
    to another vector @c y of step size @c incy.
    The performed operation is:

    <tt>y[j] = x[k]</tt>,

    where @c j and @c k are indices of @c n elements with step sizes
    @c incy and @c incx.

    This method calls native @c cublas<t>copy with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements to copy
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask vcopy(int n, const T* x, int incx, T* y, int incy);

    /**
    @brief computes the dot product of two vectors

    This function computes the dot product of vectors @c x and @c y:

    <tt>result = sum over i of (x[i] * y[i])</tt>

    This method calls native @c cublas<t>dot with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements to perform the dot product
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y
    @param result the resulting dot product

    @return a tf::cudaTask handle
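
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer and device-accessible pointers
    @c d_x and @c d_y of @c n elements, plus @c d_dot of one element):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // d_dot = sum over i of (d_x[i] * d_y[i])
    tf::cudaTask dot = cublas->dot(n, d_x, 1, d_y, 1, d_dot);
    @endcode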
    */
    template <typename T>
    cudaTask dot(int n, const T* x, int incx, const T* y, int incy, T* result);

    /**
    @brief computes the Euclidean norm of a vector

    This method calls native @c cublas<t>nrm2 with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vector @c x
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x
    @param result the resulting Euclidean norm

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask nrm2(int n, const T* x, int incx, T* result);

    /**
    @brief scales a vector by a scalar

    This method calls native @c cublas<t>scal with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements in vector @c x
    @param scalar scalar used for multiplication
    @param x pointer to the memory address of the vector
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask scal(int n, const T* scalar, T* x, int incx);

    /**
    @brief swaps elements between two vectors

    This function interchanges the elements of vectors @c x and @c y.
    Hence, the performed operation is:

    <tt>y[j] <-> x[k]</tt>,

    where @c j is the index of element in @c y with a step size @c incy and
    @c k is the index of element in @c x with a step size @c incx.

    This method calls native @c cublas<t>swap with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param n number of elements to swap between @c x and @c y
    @param x pointer to the memory address of the vector @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the memory address of the vector @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask swap(int n, T* x, int incx, T* y, int incy);

    // ------------------------------------------------------------------------
    // TODO: Level-2 matrix-vector operations
    // ------------------------------------------------------------------------

    /**
    @brief performs matrix-vector multiplication

    This function performs matrix-vector multiplication:

    <tt>y = alpha * op(A) * x + beta * y</tt>,

    where @c alpha and @c beta are scalars, @c A
    is a 2D matrix stored in column-major format,
    and @c x, @c y are vectors.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemv with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param trans transpose operation @c op(A)
    @param m number of rows of matrix @c A
    @param n number of columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param x pointer to the address of @c x of at least
           <tt>(1 + (n - 1) * abs(incx))</tt> elements if no transposition,
           or <tt>(1 + (m - 1) * abs(incx))</tt> elements otherwise
    @param incx stride between consecutive elements of @c x
    @param beta pointer to the @c beta scalar
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
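
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, an @c m by @c n column-major matrix @c d_A,
    a vector @c d_x of @c n elements, and a vector @c d_y of @c m elements):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // y = alpha * A * x + beta * y (no transposition, lda = m)
    tf::cudaTask gemv = cublas->gemv(
      CUBLAS_OP_N, m, n, d_alpha, d_A, m, d_x, 1, d_beta, d_y, 1
    );
    @endcode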
    */
    template <typename T>
    cudaTask gemv(
      cublasOperation_t trans,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_gemv(
      cublasOperation_t trans,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief performs symmetric matrix-vector multiplication

    This function performs symmetric matrix-vector multiplication:

    <tt>y = alpha * A * x + beta * y</tt>,

    where @c alpha and @c beta are scalars, @c A
    is a 2D symmetric matrix stored in column-major format,
    and @c x, @c y are vectors.

    This method calls native @c cublas<t>symv with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other symmetric part is not referenced and is inferred
           from the stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param beta pointer to the @c beta scalar
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask symv(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief similar to tf::cublasFlowCapturer::symv but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_symv(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *A, int lda,
      const T *x, int incx,
      const T *beta,
      T *y, int incy
    );

    /**
    @brief performs symmetric rank-1 update

    This function performs symmetric rank-1 update:

    <tt>A = alpha * x * x^T + A</tt>,

    where @c alpha is a scalar, @c A
    is a 2D symmetric matrix stored in column-major format,
    and @c x is a vector.

    The result is also symmetric and is stored in the @c uplo part
    of @c A.

    This method calls native @c cublas<t>syr with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other symmetric part is not referenced and is inferred
           from the stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      T *A, int lda
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_syr(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      T *A, int lda
    );

    /**
    @brief performs symmetric rank-2 update

    This function performs symmetric rank-2 update:

    <tt>A = alpha * (x * y^T + y * x^T) + A</tt>,

    where @c alpha is a scalar, @c A
    is a 2D symmetric matrix stored in column-major format,
    and @c x and @c y are vectors.

    The result is also symmetric and is stored in the @c uplo part
    of @c A.

    This method calls native @c cublas<t>syr2 with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other symmetric part is not referenced and is inferred
           from the stored elements
    @param n number of rows and columns of matrix @c A
    @param alpha pointer to the @c alpha scalar
    @param x pointer to the address of @c x
    @param incx stride between consecutive elements of @c x
    @param y pointer to the address of @c y
    @param incy stride between consecutive elements of @c y
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr2(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      const T *y, int incy,
      T *A, int lda
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr2 but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_syr2(
      cublasFillMode_t uplo,
      int n,
      const T *alpha,
      const T *x, int incx,
      const T *y, int incy,
      T *A, int lda
    );

    /**
    @brief performs the triangular matrix-vector multiplication

    This method performs the triangular matrix-vector multiplication:

    <tt>x = op(A) * x</tt>,

    where @c A is a triangular matrix stored in lower or upper mode
    with or without the main diagonal, and @c x is a vector.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other part is not referenced and is inferred from
           the stored elements
    @param tran transpose operation @c op(A)
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity (i.e., all 1s) and of no need to be accessed
    @param n number of rows and columns of matrix @c A
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store matrix @c A
    @param x input vector @c x; it is overwritten with the result on exit
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trmv(
      cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int n, const T* A, int lda,
      T *x, int incx
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trmv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trmv(
      cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int n, const T* A, int lda,
      T *x, int incx
    );

    /**
    @brief solves the triangular linear system with a single right-hand-side

    This method solves the triangular linear system with a single right-hand-side:

    <tt>op(A) x = b</tt>,

    where @c A is a triangular matrix stored in lower or upper mode
    with or without the main diagonal, and @c x and @c b are vectors.

    @tparam T data type
    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other part is not referenced and is inferred from
           the stored elements
    @param tran transpose operation @c op(A)
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity (i.e., all 1s) and of no need to be accessed
    @param n number of rows and columns of matrix @c A
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store matrix @c A
    @param x holds the right-hand-side vector @c b on entry and is overwritten
           with the solution on exit
    @param incx stride between consecutive elements of @c x

    @return a tf::cudaTask handle
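
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, an @c n by @c n lower-triangular
    column-major matrix @c d_A, and a vector @c d_x holding @c b on entry):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // solves A * x = b in-place on d_x
    tf::cudaTask solve = cublas->trsv(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      n, d_A, n, d_x, 1
    );
    @endcode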
    */
    template <typename T>
    cudaTask trsv(
      cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int n, const T* A, int lda,
      T *x, int incx
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trsv but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trsv(
      cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int n, const T* A, int lda,
      T *x, int incx
    );

    // ------------------------------------------------------------------------
    // Level-3 matrix-matrix operations
    // ------------------------------------------------------------------------

    /**
    @brief performs matrix-matrix addition and transposition

    This method performs the matrix-matrix addition/transposition:

    <tt>C = alpha * op(A) + beta * op(B)</tt>,

    where @c alpha and @c beta are scalars, and @c A, @c B and @c C are matrices
    stored in column-major format with dimensions @c op(A) as @c m by @c n,
    @c op(B) as @c m by @c n and @c C as @c m by @c n, respectively.

    The operation is out-of-place if @c C does not overlap @c A or @c B.

    The in-place mode supports the following two operations:

    1. <tt>C = alpha * C + beta * op(B)</tt>
    2. <tt>C = alpha * op(A) + beta * C</tt>

    For in-place mode, if @c C equals @c A, then @c ldc must equal @c lda and
    @c ta must be @c CUBLAS_OP_N; if @c C equals @c B, then @c ldc must equal
    @c ldb and @c tb must be @c CUBLAS_OP_N.

    The operation includes the following special cases:

    1. the user can reset matrix @c C to zero by setting @c alpha and
       @c beta to 0
    2. the user can transpose matrix @c A by setting @c alpha to 1 and
       @c beta to 0

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>geam with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param ta transpose operation @c op(A)
    @param tb transpose operation @c op(B)
    @param m number of rows of matrix @c C and @c op(A)
    @param n number of columns of matrix @c C and @c op(B)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param beta pointer to the @c beta scalar
    @param B pointer to the address of @c B
    @param ldb leading dimension of 2D array used to store the matrix @c B
    @param C pointer to the address of @c C
    @param ldc leading dimension of 2D array used to store the matrix @c C

    @return a tf::cudaTask handle
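
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, and @c m by @c n column-major matrices
    @c d_A, @c d_B, and @c d_C):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C = alpha * A + beta * B (no transposition, all leading dimensions m)
    tf::cudaTask add = cublas->geam(
      CUBLAS_OP_N, CUBLAS_OP_N, m, n, d_alpha, d_A, m, d_beta, d_B, m, d_C, m
    );
    @endcode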
    */
    template <typename T>
    cudaTask geam(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::geam but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_geam(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief performs matrix-matrix multiplication

    This function performs matrix-matrix multiplication:

    <tt>C = alpha * op(A) * op(B) + beta * C</tt>,

    where @c alpha and @c beta are scalars, and @c A, @c B, and @c C
    are 2D matrices stored in column-major format
    with dimension @c op(A) as @c m by @c k,
    dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemm with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param ta transpose operation @c op(A)
    @param tb transpose operation @c op(B)
    @param m number of rows of matrix @c C and @c op(A)
    @param n number of columns of matrix @c C and @c op(B)
    @param k number of columns of @c op(A) and rows of @c op(B)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of 2D array used to store the matrix @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of 2D array used to store the matrix @c B
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of @c C
    @param ldc leading dimension of 2D array used to store the matrix @c C

    @return a tf::cudaTask handle
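
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, and column-major matrices @c d_A (@c m by @c k),
    @c d_B (@c k by @c n), and @c d_C (@c m by @c n)):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C = alpha * A * B + beta * C (no transposition)
    tf::cudaTask gemm = cublas->gemm(
      CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
      d_alpha, d_A, m, d_B, k, d_beta, d_C, m
    );
    @endcode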
    */
    template <typename T>
    cudaTask gemm(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_gemm(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs matrix-matrix multiplication over a batch of matrices

    The batch must be @em uniform.
    All instances in the batch must have the same dimensions <tt>(m, n, k)</tt>,
    leading dimensions <tt>(lda, ldb, ldc)</tt> and transpositions
    <tt>(ta, tb)</tt> for their respective @c A, @c B and @c C matrices.
    The addresses of the input matrices and the output matrix of each instance
    of the batch are read from arrays of pointers passed to the function
    by the caller.

    <tt>C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], i in [0, bc)</tt>,

    where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
    are 2D matrices stored in column-major format
    with dimension @c op(A) as @c m by @c k,
    dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    This method calls native @c cublas<t>gemmBatched with packed parameters,
    <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param ta transpose operation @c op(A[i])
    @param tb transpose operation @c op(B[i])
    @param m number of rows of matrix @c C[i] and @c op(A[i])
    @param n number of columns of matrix @c C[i] and @c op(B[i])
    @param k number of columns of @c op(A[i]) and rows of @c op(B[i])
    @param alpha pointer to the @c alpha scalar
    @param A array pointer to @c A batch
    @param lda leading dimension of 2D array used to store the matrix @c A[i]
    @param B array pointer to @c B batch
    @param ldb leading dimension of 2D array used to store the matrix @c B[i]
    @param beta pointer to the @c beta scalar
    @param C array pointer to @c C batch
    @param ldc leading dimension of 2D array used to store the matrix @c C[i]
    @param bc batch size (number of matrices)

    @return a tf::cudaTask handle
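
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, and device-resident pointer arrays @c d_As and
    @c d_Bs of type <tt>const T**</tt> and @c d_Cs of type <tt>T**</tt>, each
    with @c bc entries pointing to column-major @c m by @c k, @c k by @c n,
    and @c m by @c n matrices, respectively):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C[i] = alpha * A[i] * B[i] + beta * C[i] for i in [0, bc)
    tf::cudaTask bmm = cublas->gemm_batched(
      CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
      d_alpha, d_As, m, d_Bs, k, d_beta, d_Cs, m, bc
    );
    @endcode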
    */
    template <typename T>
    cudaTask gemm_batched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A[], int lda,
      const T *B[], int ldb,
      const T *beta,
      T *C[], int ldc,
      int bc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm_batched but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_gemm_batched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A[], int lda,
      const T *B[], int ldb,
      const T *beta,
      T *C[], int ldc,
      int bc
    );

    /**
    @brief performs matrix-matrix multiplication over a batch of matrices
           with strided memory access

    Here, we use @c A[i], @c B[i], @c C[i] as notation
    for the A, B, and C matrices in the @c i-th instance of the batch,
    implicitly assuming they are respectively address offsets
    @c sA, @c sB, @c sC away from @c A[i-1], @c B[i-1], @c C[i-1].

    The batch must be @em uniform.
    All instances in the batch must have the same dimensions <tt>(m, n, k)</tt>,
    leading dimensions <tt>(lda, ldb, ldc)</tt> and transpositions
    <tt>(ta, tb)</tt> for their respective @c A, @c B and @c C matrices.
    Input matrices @c A, @c B and output matrix @c C for each instance of the batch
    are located at fixed address offsets from their locations in the previous instance.
    Pointers to the @c A, @c B and @c C matrices for the first instance are passed
    to the function by the user along with the address @em offsets -
    @c sA, @c sB and @c sC - that determine the locations
    of input and output matrices in future instances.

    <tt>C + i*sC = alpha * op(A + i*sA) * op(B + i*sB) + beta * (C + i*sC), i in [0, bc)</tt>,

    where @c alpha and @c beta are scalars, and @c A[i], @c B[i], and @c C[i]
    are 2D matrices stored in column-major format
    with dimension @c op(A) as @c m by @c k,
    dimension @c op(B) as @c k by @c n, and @c C as @c m by @c n.

    The input matrices are in column-major storage.

    On certain problem sizes, it might be advantageous to create multiple gemm
    tasks to take advantage of concurrent kernels, rather than this method.

    This method calls native @c cublas<t>gemmStridedBatched with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type

    @param ta transpose operation @c op(A[i])
    @param tb transpose operation @c op(B[i])
    @param m number of rows of matrix @c C[i] and @c op(A[i])
    @param n number of columns of matrix @c C[i] and @c op(B[i])
    @param k number of columns of @c op(A[i]) and rows of @c op(B[i])
    @param alpha pointer to the @c alpha scalar
    @param A pointer to @c A batch
    @param lda leading dimension of 2D array used to store the matrix @c A[i]
    @param sA address offset between @c A[i] and @c A[i+1]
    @param B pointer to @c B batch
    @param ldb leading dimension of 2D array used to store the matrix @c B[i]
    @param sB address offset between @c B[i] and @c B[i+1]
    @param beta pointer to the @c beta scalar
    @param C pointer to @c C batch
    @param ldc leading dimension of 2D array used to store the matrix @c C[i]
    @param sC address offset between @c C[i] and @c C[i+1]
    @param bc batch size (number of matrices)

    @return a tf::cudaTask handle
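
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, and contiguous batches @c d_A, @c d_B, and
    @c d_C holding @c bc column-major matrices of sizes @c m by @c k,
    @c k by @c n, and @c m by @c n stored back to back):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C[i] = alpha * A[i] * B[i] + beta * C[i], with strides of one matrix each
    tf::cudaTask sbmm = cublas->gemm_sbatched(
      CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
      d_alpha, d_A, m, (long long int)m*k,
      d_B, k, (long long int)k*n,
      d_beta, d_C, m, (long long int)m*n,
      bc
    );
    @endcode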
    */
    template <typename T>
    cudaTask gemm_sbatched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda, long long int sA,
      const T *B, int ldb, long long int sB,
      const T *beta,
      T *C, int ldc, long long int sC,
      int bc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::gemm_sbatched but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_gemm_sbatched(
      cublasOperation_t ta, cublasOperation_t tb,
      int m, int n, int k,
      const T *alpha,
      const T *A, int lda, long long int sA,
      const T *B, int ldb, long long int sB,
      const T *beta,
      T *C, int ldc, long long int sC,
      int bc
    );

    /**
    @brief performs the symmetric matrix-matrix multiplication

    This method performs symmetric matrix-matrix multiplication:

    <tt>C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT</tt>, or

    <tt>C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT</tt>,

    where @c A is a symmetric matrix stored in lower or upper mode,
    @c B and @c C are @c m by @c n matrices, and @c alpha and @c beta
    are scalars.

    This method calls native @c cublas<t>symm with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param side indicates if matrix @c A is on the left or right of @c B
    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other symmetric part is not referenced and
           is inferred from the stored elements
    @param m number of rows of matrices @c C and @c B,
           with matrix @c A sized accordingly
    @param n number of columns of matrices @c C and @c B,
           with matrix @c A sized accordingly
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of matrix @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask symm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::symm but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_symm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs the symmetric rank-k update

    This method performs the symmetric rank-k update:

    <tt>C = alpha * op(A) * op(A)^T + beta * C</tt>,

    where @c alpha and @c beta are scalars, @c C is a symmetric matrix
    stored in lower or upper mode, and @c A is a matrix with dimension
    @c op(A) @c n by @c k.

    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syrk with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored,
           the other symmetric part is not referenced and is
           inferred from the stored elements
    @param tran transposition operation to apply to @c A
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
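
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, device-accessible scalars
    @c d_alpha and @c d_beta, an @c n by @c k column-major matrix @c d_A,
    and an @c n by @c n symmetric matrix @c d_C with its lower part stored):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // C = alpha * A * A^T + beta * C, updating the lower part of C
    tf::cudaTask rankk = cublas->syrk(
      CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, n, k,
      d_alpha, d_A, n, d_beta, d_C, n
    );
    @endcode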
    */
    template <typename T>
    cudaTask syrk(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syrk but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_syrk(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs the symmetric rank-2k update

    This method performs the symmetric rank-2k update:

    <tt>C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C</tt>,

    where @c alpha and @c beta are scalars, @c C is a symmetric matrix
    stored in lower or upper mode, and @c A and @c B are two matrices
    with dimensions @c op(A) and @c op(B) @c n by @c k.

    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syr2k with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored,
           the other symmetric part is not referenced and is
           inferred from the stored elements
    @param tran transposition operation to apply to @c A and @c B
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syr2k(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syr2k but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_syr2k(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs a variation of the symmetric rank-k update

    This method performs a variation of the symmetric rank-k update:

    <tt>C = alpha * op(A) * op(B)^T + beta * C</tt>,

    where @c alpha and @c beta are scalars, @c C is a symmetric matrix
    stored in lower or upper mode, and @c A and @c B are two matrices
    with dimensions @c op(A) and @c op(B) @c n by @c k.

    The result is stored in the @c uplo part of @c C.

    This method calls native @c cublas<t>syrkx with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param uplo indicates if matrix @c C lower or upper part is stored,
           the other symmetric part is not referenced and is
           inferred from the stored elements
    @param tran transposition operation to apply to @c A and @c B
    @param n number of rows of matrix @c C and @c op(A)
    @param k number of columns of matrix @c op(A)
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param beta pointer to the @c beta scalar
    @param C pointer to the address of @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask syrkx(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::syrkx but operates on
           C-styled row-major layout
    */
    template <typename T>
    cudaTask c_syrkx(
      cublasFillMode_t uplo, cublasOperation_t tran,
      int n, int k,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      const T *beta,
      T *C, int ldc
    );

    /**
    @brief performs triangular matrix-matrix multiplication

    This method performs triangular matrix-matrix multiplication:

    <tt>C = alpha * op(A) * B</tt>, if <tt>side == CUBLAS_SIDE_LEFT</tt>, or

    <tt>C = alpha * B * op(A)</tt>, if <tt>side == CUBLAS_SIDE_RIGHT</tt>,

    where @c A is a triangular matrix stored in lower or upper mode with
    or without the main diagonal, @c B and @c C are @c m by @c n matrices,
    and @c alpha is a scalar.

    Notice that @c B and @c C can point to the same address,
    in which case the in-place implementation is performed
    (with results written back to @c B).

    This method calls native @c cublas<t>trmm with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param side indicates if matrix @c A is on the left or right of @c B
    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other part is not referenced and is inferred from
           the stored elements
    @param tran transposition operation to apply to @c A
    @param diag indicates if the elements on the main diagonal of matrix
           @c A are unity and should not be accessed
    @param m number of rows of matrix @c B, with matrix @c A sized accordingly
    @param n number of columns of matrix @c B, with matrix @c A sized accordingly
    @param alpha pointer to the @c alpha scalar
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B
    @param C pointer to the address of matrix @c C
    @param ldc leading dimension of the 2D array used to store @c C

    @return a tf::cudaTask handle
    */
    template <typename T>
    cudaTask trmm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trmm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trmm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      const T *B, int ldb,
      T *C, int ldc
    );

    /**
    @brief solves the triangular linear system with multiple right-hand-sides

    This method solves the triangular linear system with multiple
    right-hand-sides:

    <tt>op(A) * X = alpha * B</tt>, if <tt>side == CUBLAS_SIDE_LEFT</tt>, or

    <tt>X * op(A) = alpha * B</tt>, if <tt>side == CUBLAS_SIDE_RIGHT</tt>,

    where @c A is a triangular matrix stored in lower or upper mode
    with or without the main diagonal, @c X and @c B are @c m by @c n matrices,
    and @c alpha is a scalar.

    The solution @c X overwrites the right-hand-sides @c B on exit.

    This method calls native @c cublas<t>trsm with
    packed parameters, <tt>(handle, args...)</tt>, where @c handle is managed by
    the %cublasFlowCapturer and @c args... are the given arguments.

    @tparam T data type
    @param side indicates if @c A is on the left or right side of @c X
    @param uplo indicates if matrix @c A lower or upper part is stored,
           the other part is not referenced and is inferred from
           the stored elements
    @param tran transposition operation to apply to @c A
    @param diag indicates if the elements on the main diagonal of matrix @c A
           are unity and should not be accessed
    @param m number of rows in matrix @c B, with matrix @c A sized accordingly
    @param n number of columns in matrix @c B, with matrix @c A sized accordingly
    @param alpha scalar to apply to @c B
    @param A pointer to the address of matrix @c A
    @param lda leading dimension of the 2D array used to store @c A
    @param B pointer to the address of matrix @c B
    @param ldb leading dimension of the 2D array used to store @c B

    @return a tf::cudaTask handle
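
    A minimal capture sketch (illustrative only; it assumes a
    tf::cudaFlowCapturer named @c capturer, a device-accessible scalar
    @c d_alpha, an @c m by @c m lower-triangular column-major matrix @c d_A,
    and an @c m by @c n matrix @c d_B holding the right-hand-sides):

    @code{.cpp}
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    // solves A * X = alpha * B; the solution X overwrites d_B
    tf::cudaTask solve = cublas->trsm(
      CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
      CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
      m, n, d_alpha, d_A, m, d_B, m
    );
    @endcode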
    */
    template <typename T>
    cudaTask trsm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      T *B, int ldb
    );

    /**
    @brief similar to tf::cublasFlowCapturer::trsm but operates on C-styled
           row-major layout
    */
    template <typename T>
    cudaTask c_trsm(
      cublasSideMode_t side, cublasFillMode_t uplo,
      cublasOperation_t tran, cublasDiagType_t diag,
      int m, int n,
      const T *alpha,
      const T *A, int lda,
      T *B, int ldb
    );

  private:

    cublasScopedPerThreadHandle _handle;

    void _stream(cudaStream_t);
};

// Procedure: _stream
inline void cublasFlowCapturer::_stream(cudaStream_t stream) {
  TF_CHECK_CUBLAS(
    cublasSetStream(_handle, stream), "failed to set cublas stream"
  );
}

// Function: native_handle
inline cublasHandle_t cublasFlowCapturer::native_handle() {
  return _handle;
}

}  // end of namespace tf -----------------------------------------------------