cocos-engine-external/sources/enoki/cuda.h


/*
    enoki/cuda.h -- CUDA-backed Enoki dynamic array with JIT compilation

    Enoki is a C++ template library that enables transparent vectorization
    of numerical kernels using SIMD instruction sets available on current
    processor architectures.

    Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>

    All rights reserved. Use of this source code is governed by a BSD-style
    license that can be found in the LICENSE file.
*/
#pragma once
#define ENOKI_CUDA_H 1
#include <enoki/array.h>
NAMESPACE_BEGIN(enoki)
// -----------------------------------------------------------------------
//! @{ \name Imports from libenoki-cuda.so
// -----------------------------------------------------------------------
/// Initialize the tracing JIT
extern ENOKI_IMPORT void cuda_init();
/// Delete the trace; a subsequent call to cuda_init() is required before further use
extern ENOKI_IMPORT void cuda_shutdown();
/// Compile and evaluate the trace up to the current instruction
extern ENOKI_IMPORT void cuda_eval(bool log_assembly /* = false */);
/// Invokes 'cuda_eval' if the given variable has not been evaluated yet
extern ENOKI_IMPORT void cuda_eval_var(uint32_t index, bool log_assembly = false);
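// A minimal lifecycle sketch for the entry points above (hedged; assumes the
// library is linked and a CUDA device is available):
//
//     cuda_init();                       // start the tracing JIT
//     /* build CUDAArray expressions -- nothing executes yet */
//     cuda_eval(false);                  // compile + launch the queued trace
//     cuda_shutdown();                   // tear down the JIT state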
/// Increase the reference count of a variable
extern ENOKI_IMPORT void cuda_inc_ref_ext(uint32_t);
/// Decrease the reference count of a variable
extern ENOKI_IMPORT void cuda_dec_ref_ext(uint32_t);
/// Return the size of a variable
extern ENOKI_IMPORT size_t cuda_var_size(uint32_t);
/// Return the pointer address of a variable (in device memory)
extern ENOKI_IMPORT void* cuda_var_ptr(uint32_t);
/// Retroactively adjust the recorded size of a variable
extern ENOKI_IMPORT uint32_t cuda_var_set_size(uint32_t index, size_t size, bool copy = false);
/// Mark a variable as dirty (e.g. due to scatter)
extern ENOKI_IMPORT void cuda_var_mark_dirty(uint32_t);
/// Attach a label to a variable (written to PTX assembly)
extern ENOKI_IMPORT void cuda_var_set_label(uint32_t, const char *);
/// Needed to mark certain instructions with side effects (e.g. scatter)
extern ENOKI_IMPORT void cuda_var_mark_side_effect(uint32_t);
/// Set the current scatter/gather operand array
extern ENOKI_IMPORT void cuda_set_scatter_gather_operand(uint32_t index, bool gather);
/// Append an operation to the trace (0 arguments)
extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type,
const char *op);
/// Append an operation to the trace (1 argument)
extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type,
const char *op,
uint32_t arg1);
/// Append an operation to the trace (2 arguments)
extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type,
const char *op,
uint32_t arg1,
uint32_t arg2);
/// Append an operation to the trace (3 arguments)
extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type,
const char *op,
uint32_t arg1,
uint32_t arg2,
uint32_t arg3);
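// How these are used throughout this header: 'op' is a PTX template in which
// $rN expands to the register holding the N-th variable ($r1 = result, the
// arguments follow) and $tN / $bN to its typed / untyped-bit PTX type. For
// instance, CUDAArray<float>::add_() further below effectively records
// (sketch; 'a' and 'b' are illustrative operands):
//
//     uint32_t idx = cuda_trace_append(EnokiType::Float32,
//                                      "add.rn.ftz.$t1 $r1, $r2, $r3",
//                                      a.index_(), b.index_());
//     // 'idx' now refers to a new, lazily evaluated variable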
/// Insert a "printf" instruction for the given instruction
extern ENOKI_IMPORT void cuda_trace_printf(const char *fmt, uint32_t narg,
uint32_t *arg);
/// Computes the prefix sum of a given memory region
template <typename T> extern ENOKI_IMPORT T* cuda_psum(size_t, const T *);
/// Computes the horizontal sum of a given memory region
template <typename T> extern ENOKI_IMPORT T* cuda_hsum(size_t, const T *);
/// Computes the horizontal product of a given memory region
template <typename T> extern ENOKI_IMPORT T* cuda_hprod(size_t, const T *);
/// Computes the horizontal maximum of a given memory region
template <typename T> extern ENOKI_IMPORT T* cuda_hmax(size_t, const T *);
/// Computes the horizontal minimum of a given memory region
template <typename T> extern ENOKI_IMPORT T* cuda_hmin(size_t, const T *);
/// Compute the number of entries set to 'true'
extern ENOKI_IMPORT size_t cuda_count(size_t, const bool *);
/// Compress a memory region, retaining only entries whose mask bit is set
template <typename T>
extern ENOKI_IMPORT void cuda_compress(size_t, const T *, const bool *mask,
                                       T **, size_t *);
/// Computes a horizontal reduction of a mask array via AND
extern ENOKI_IMPORT bool cuda_all(size_t, const bool *);
/// Computes a horizontal reduction of a mask array via OR
extern ENOKI_IMPORT bool cuda_any(size_t, const bool *);
/// Sort 'ptrs' and return unique instances and their count, as well as a permutation
extern ENOKI_IMPORT void cuda_partition(size_t size, const void **ptrs,
void ***unique_out,
uint32_t **counts_out,
uint32_t ***perm_out);
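// Output layout (as consumed by CUDAArray::partition_() further below):
// 'counts_out[0]' is the number of unique pointers, 'counts_out[i + 1]' the
// number of occurrences of '(*unique_out)[i]', and '(*perm_out)[i]' points to
// that many permutation indices. Hedged consumption sketch ('consume' is a
// placeholder):
//
//     void **unique = nullptr; uint32_t *counts = nullptr, **perm = nullptr;
//     cuda_partition(n, ptrs, &unique, &counts, &perm);
//     for (uint32_t i = 0; i < counts[0]; ++i)
//         consume(unique[i], perm[i], counts[i + 1]);
//     cuda_host_free(unique); cuda_host_free(counts); free(perm);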
/// Copy some host memory region to the device and wrap it in a variable
extern ENOKI_IMPORT uint32_t cuda_var_copy_to_device(EnokiType type,
size_t size, const void *value);
/// Create a variable that stores a pointer to some (device) memory region
extern ENOKI_IMPORT uint32_t cuda_var_register_ptr(const void *ptr);
/// Register a memory region (in device memory) as a variable
extern ENOKI_IMPORT uint32_t cuda_var_register(EnokiType type, size_t size,
void *ptr, bool dealloc);
/// Fetch a scalar value from a CUDA array (in device memory)
extern ENOKI_IMPORT void cuda_fetch_element(void *, uint32_t, size_t, size_t);
/// Copy a memory region to the device
extern ENOKI_IMPORT void cuda_memcpy_to_device(void *dst, const void *src, size_t size);
extern ENOKI_IMPORT void cuda_memcpy_to_device_async(void *dst, const void *src, size_t size);
/// Copy a memory region from the device
extern ENOKI_IMPORT void cuda_memcpy_from_device(void *dst, const void *src, size_t size);
extern ENOKI_IMPORT void cuda_memcpy_from_device_async(void *dst, const void *src, size_t size);
/// Return the free and total amount of memory (Wrapper around cudaMemGetInfo)
extern ENOKI_IMPORT void cuda_mem_get_info(size_t *free, size_t *total);
/// Allocate device-local memory (wrapper around cudaMalloc)
extern ENOKI_IMPORT void* cuda_malloc(size_t);
/// Allocate unified memory (wrapper around cudaMallocManaged)
extern ENOKI_IMPORT void* cuda_managed_malloc(size_t size);
/// Allocate host-pinned memory (wrapper around cudaMallocHost)
extern ENOKI_IMPORT void* cuda_host_malloc(size_t);
/// Fill a device memory region with a constant value (wrappers around analogues of cudaMemsetAsync)
extern ENOKI_IMPORT void cuda_fill(uint8_t *ptr, uint8_t value, size_t size);
extern ENOKI_IMPORT void cuda_fill(uint16_t *ptr, uint16_t value, size_t size);
extern ENOKI_IMPORT void cuda_fill(uint32_t *ptr, uint32_t value, size_t size);
extern ENOKI_IMPORT void cuda_fill(uint64_t *ptr, uint64_t value, size_t size);
/// Reverse an array
extern ENOKI_IMPORT void cuda_reverse(uint8_t *out, const uint8_t *in, size_t size);
extern ENOKI_IMPORT void cuda_reverse(uint16_t *out, const uint16_t *in, size_t size);
extern ENOKI_IMPORT void cuda_reverse(uint32_t *out, const uint32_t *in, size_t size);
extern ENOKI_IMPORT void cuda_reverse(uint64_t *out, const uint64_t *in, size_t size);
/// Release device-local or unified memory
extern ENOKI_IMPORT void cuda_free(void *);
/// Release host-local memory
extern ENOKI_IMPORT void cuda_host_free(void *);
/// Release any unused memory cached by the internal allocator back to CUDA
extern ENOKI_IMPORT void cuda_malloc_trim();
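// Hedged sketch of manual buffer management with the wrappers above
// ('host_data' and the size are placeholders; CUDAArray normally handles
// this internally):
//
//     float *buf = (float *) cuda_malloc(1024 * sizeof(float));
//     cuda_memcpy_to_device(buf, host_data, 1024 * sizeof(float));
//     /* ... use 'buf', e.g. wrap it via CUDAArray<float>::map(buf, 1024) ... */
//     cuda_free(buf);
//     cuda_malloc_trim();   // optionally release cached allocations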
/// Wait for all work queued on the device to finish
extern ENOKI_IMPORT void cuda_sync();
/// Print detailed information about currently allocated arrays
extern ENOKI_IMPORT char *cuda_whos();
/// Convert a variable into managed memory (if applicable)
extern ENOKI_IMPORT void cuda_make_managed(uint32_t);
/// Register a callback that will be invoked before cuda_eval()
extern void cuda_register_callback(void (*callback)(void *), void *payload);
/// Unregister a callback installed via 'cuda_register_callback()'
extern void cuda_unregister_callback(void (*callback)(void *), void *payload);
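// Callback sketch (the callback fires right before each cuda_eval();
// 'MyState' and 'flush_side_effects' are illustrative names):
//
//     static void flush_side_effects(void *payload) {
//         auto *state = (MyState *) payload;
//         /* e.g. record pending scatters held in 'state' */
//     }
//     cuda_register_callback(flush_side_effects, &state);
//     /* ... */
//     cuda_unregister_callback(flush_side_effects, &state);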
/**
 * \brief Set the current log level (0: none, 1: kernel launches,
 * 2: +ptxas statistics, 3: +ptx source, 4: +jit trace, 5: +ref counting)
 */
extern ENOKI_IMPORT void cuda_set_log_level(uint32_t);
/// Return the current log level
extern ENOKI_IMPORT uint32_t cuda_log_level();
//! @}
// -----------------------------------------------------------------------
template <typename Value>
struct CUDAArray : ArrayBase<value_t<Value>, CUDAArray<Value>> {
template <typename T> friend struct CUDAArray;
using Index = uint32_t;
static constexpr EnokiType Type = enoki_type_v<Value>;
static constexpr bool IsCUDA = true;
template <typename T> using ReplaceValue = CUDAArray<T>;
using MaskType = CUDAArray<bool>;
using ArrayType = CUDAArray;
CUDAArray() = default;
~CUDAArray() {
cuda_dec_ref_ext(m_index);
if constexpr (std::is_pointer_v<Value> || std::is_same_v<Value, uintptr_t>)
delete m_cached_partition;
}
CUDAArray(const CUDAArray &a) : m_index(a.m_index) {
cuda_inc_ref_ext(m_index);
}
CUDAArray(CUDAArray &&a) : m_index(a.m_index) {
a.m_index = 0;
if constexpr (std::is_pointer_v<Value> || std::is_same_v<Value, uintptr_t>) {
m_cached_partition = a.m_cached_partition;
a.m_cached_partition = nullptr;
}
}
template <typename T> CUDAArray(const CUDAArray<T> &v) {
const char *op;
if (std::is_floating_point_v<T> && std::is_integral_v<Value>)
op = "cvt.rzi.$t1.$t2 $r1, $r2";
else if (std::is_integral_v<T> && std::is_floating_point_v<Value>)
op = "cvt.rn.$t1.$t2 $r1, $r2";
else
op = "cvt.$t1.$t2 $r1, $r2";
m_index = cuda_trace_append(Type, op, v.index_());
}
template <typename T>
CUDAArray(const CUDAArray<T> &v, detail::reinterpret_flag) {
static_assert(sizeof(T) == sizeof(Value));
if (std::is_integral_v<T> != std::is_integral_v<Value>) {
m_index = cuda_trace_append(Type, "mov.$b1 $r1, $r2", v.index_());
} else {
m_index = v.index_();
cuda_inc_ref_ext(m_index);
}
}
template <typename T, enable_if_t<std::is_scalar_v<T>> = 0>
CUDAArray(const T &value, detail::reinterpret_flag)
: CUDAArray(memcpy_cast<Value>(value)) { }
template <typename T, enable_if_t<std::is_scalar_v<T>> = 0>
CUDAArray(T value) : CUDAArray((Value) value) { }
CUDAArray(Value value) {
const char *fmt = nullptr;
switch (Type) {
case EnokiType::Float16:
fmt = "mov.$t1 $r1, %04x";
break;
case EnokiType::Float32:
fmt = "mov.$t1 $r1, 0f%08x";
break;
case EnokiType::Float64:
fmt = "mov.$t1 $r1, 0d%016llx";
break;
case EnokiType::Bool:
fmt = "mov.$t1 $r1, %i";
break;
case EnokiType::Int8:
case EnokiType::UInt8:
fmt = "mov.$t1 $r1, 0x%02x";
break;
case EnokiType::Int16:
case EnokiType::UInt16:
fmt = "mov.$t1 $r1, 0x%04x";
break;
case EnokiType::Int32:
case EnokiType::UInt32:
fmt = "mov.$t1 $r1, 0x%08x";
break;
case EnokiType::Pointer:
case EnokiType::Int64:
case EnokiType::UInt64:
fmt = "mov.$t1 $r1, 0x%016llx";
break;
default:
fmt = "<<invalid format during cast>>";
break;
}
char tmp[32];
snprintf(tmp, 32, fmt, memcpy_cast<uint_array_t<Value>>(value));
m_index = cuda_trace_append(Type, tmp);
}
template <typename... Args, enable_if_t<(sizeof...(Args) > 1)> = 0>
CUDAArray(Args&&... args) {
Value data[] = { (Value) args... };
m_index = cuda_var_copy_to_device(Type, sizeof...(Args), data);
}
CUDAArray &operator=(const CUDAArray &a) {
cuda_inc_ref_ext(a.m_index);
cuda_dec_ref_ext(m_index);
m_index = a.m_index;
if constexpr (std::is_pointer_v<Value> || std::is_same_v<Value, uintptr_t>)
m_cached_partition = nullptr;
return *this;
}
CUDAArray &operator=(CUDAArray &&a) {
std::swap(m_index, a.m_index);
if constexpr (std::is_pointer_v<Value> || std::is_same_v<Value, uintptr_t>)
std::swap(m_cached_partition, a.m_cached_partition);
return *this;
}
CUDAArray add_(const CUDAArray &v) const {
const char *op = std::is_floating_point_v<Value>
? "add.rn.ftz.$t1 $r1, $r2, $r3"
: "add.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray sub_(const CUDAArray &v) const {
const char *op = std::is_floating_point_v<Value>
? "sub.rn.ftz.$t1 $r1, $r2, $r3"
: "sub.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray mul_(const CUDAArray &v) const {
const char *op = std::is_floating_point_v<Value>
? "mul.rn.ftz.$t1 $r1, $r2, $r3"
: "mul.lo.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray mulhi_(const CUDAArray &v) const {
return CUDAArray::from_index_(cuda_trace_append(
Type, "mul.hi.$t1 $r1, $r2, $r3", index_(), v.index_()));
}
CUDAArray div_(const CUDAArray &v) const {
const char *op = std::is_floating_point_v<Value>
? "div.rn.ftz.$t1 $r1, $r2, $r3"
: "div.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray mod_(const CUDAArray &v) const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"rem.$t1 $r1, $r2, $r3", index_(), v.index_()));
}
CUDAArray fmadd_(const CUDAArray &a, const CUDAArray &b) const {
const char *op = std::is_floating_point_v<Value>
? "fma.rn.ftz.$t1 $r1, $r2, $r3, $r4"
: "mad.lo.$t1 $r1, $r2, $r3, $r4";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), a.index_(), b.index_()));
}
CUDAArray fmsub_(const CUDAArray &a, const CUDAArray &b) const {
return fmadd_(a, -b);
}
CUDAArray fnmadd_(const CUDAArray &a, const CUDAArray &b) const {
return fmadd_(-a, b);
}
CUDAArray fnmsub_(const CUDAArray &a, const CUDAArray &b) const {
return -fmadd_(a, b);
}
CUDAArray max_(const CUDAArray &v) const {
// '.ftz' is only valid for floating-point operands; fall back to the plain
// form for integer types (mirrors add_/sub_/mul_ above)
const char *op = std::is_floating_point_v<Value>
? "max.ftz.$t1 $r1, $r2, $r3"
: "max.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray min_(const CUDAArray &v) const {
const char *op = std::is_floating_point_v<Value>
? "min.ftz.$t1 $r1, $r2, $r3"
: "min.$t1 $r1, $r2, $r3";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_(), v.index_()));
}
CUDAArray abs_() const {
const char *op = std::is_floating_point_v<Value>
? "abs.ftz.$t1 $r1, $r2"
: "abs.$t1 $r1, $r2";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_()));
}
CUDAArray neg_() const {
const char *op = std::is_floating_point_v<Value>
? "neg.ftz.$t1 $r1, $r2"
: "neg.$t1 $r1, $r2";
return CUDAArray::from_index_(
cuda_trace_append(Type, op, index_()));
}
CUDAArray sqrt_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"sqrt.rn.ftz.$t1 $r1, $r2", index_()));
}
CUDAArray exp_() const {
CUDAArray scaled = Value(1.4426950408889634074) * *this;
return CUDAArray::from_index_(cuda_trace_append(Type,
"ex2.approx.ftz.$t1 $r1, $r2", scaled.index_()));
}
CUDAArray log_() const {
return CUDAArray::from_index_(cuda_trace_append(
Type, "lg2.approx.ftz.$t1 $r1, $r2",
index_())) * Value(0.69314718055994530942);
}
CUDAArray sin_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"sin.approx.ftz.$t1 $r1, $r2", index_()));
}
CUDAArray cos_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"cos.approx.ftz.$t1 $r1, $r2", index_()));
}
std::pair<CUDAArray, CUDAArray> sincos_() const {
return { sin_(), cos_() };
}
CUDAArray rcp_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"rcp.approx.ftz.$t1 $r1, $r2", index_()));
}
CUDAArray rsqrt_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"rsqrt.approx.ftz.$t1 $r1, $r2", index_()));
}
CUDAArray floor_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"cvt.rmi.$t1.$t1 $r1, $r2", index_()));
}
CUDAArray ceil_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"cvt.rpi.$t1.$t1 $r1, $r2", index_()));
}
CUDAArray round_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"cvt.rni.$t1.$t1 $r1, $r2", index_()));
}
CUDAArray trunc_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"cvt.rzi.$t1.$t1 $r1, $r2", index_()));
}
template <typename T> T floor2int_() const {
return T::from_index_(cuda_trace_append(T::Type,
"cvt.rmi.$t1.$t2 $r1, $r2", index_()));
}
template <typename T> T ceil2int_() const {
return T::from_index_(cuda_trace_append(T::Type,
"cvt.rpi.$t1.$t2 $r1, $r2", index_()));
}
CUDAArray sl_(const CUDAArray &v) const {
if constexpr (sizeof(Value) == 4)
return CUDAArray::from_index_(cuda_trace_append(Type,
"shl.$b1 $r1, $r2, $r3", index_(), v.index_()));
else
return CUDAArray::from_index_(cuda_trace_append(Type,
"shl.$b1 $r1, $r2, $r3", index_(), CUDAArray<int32_t>(v).index_()));
}
CUDAArray sr_(const CUDAArray &v) const {
const char *op = std::is_signed_v<Value> ? "shr.$t1 $r1, $r2, $r3"
: "shr.$b1 $r1, $r2, $r3";
if constexpr (sizeof(Value) == 4)
return CUDAArray::from_index_(cuda_trace_append(Type,
op, index_(), v.index_()));
else
return CUDAArray::from_index_(cuda_trace_append(Type,
op, index_(), CUDAArray<int32_t>(v).index_()));
}
CUDAArray sl_(size_t value) const { return sl_(CUDAArray((Value) value)); }
CUDAArray sr_(size_t value) const { return sr_(CUDAArray((Value) value)); }
template <size_t Imm> CUDAArray sl_() const { return sl_(Imm); }
template <size_t Imm> CUDAArray sr_() const { return sr_(Imm); }
CUDAArray not_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"not.$b1 $r1, $r2", index_()));
}
CUDAArray popcnt_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"popc.$b1 $r1, $r2", index_()));
}
CUDAArray lzcnt_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"clz.$b1 $r1, $r2", index_()));
}
CUDAArray tzcnt_() const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"brev.$b1 $r1, $r2;\n clz.$b1 $r1, $r1", index_()));
}
template <typename T>
CUDAArray or_(const CUDAArray<T> &v) const {
Value all_ones = memcpy_cast<Value>(int_array_t<Value>(-1));
ENOKI_MARK_USED(all_ones);
if constexpr (std::is_same_v<T, Value>)
return CUDAArray::from_index_(cuda_trace_append(Type,
"or.$b1 $r1, $r2, $r3", index_(), v.index_()));
else
return CUDAArray::from_index_(cuda_trace_append(Type,
"selp.$t1 $r1, $r2, $r3, $r4", CUDAArray(all_ones).index_(),
index_(), v.index_()));
}
template <typename T>
CUDAArray and_(const CUDAArray<T> &v) const {
Value all_zeros = memcpy_cast<Value>(int_array_t<Value>(0));
ENOKI_MARK_USED(all_zeros);
if constexpr (std::is_same_v<T, Value>)
return CUDAArray::from_index_(cuda_trace_append(Type,
"and.$b1 $r1, $r2, $r3", index_(), v.index_()));
else
return CUDAArray::from_index_(cuda_trace_append(Type,
"selp.$t1 $r1, $r2, $r3, $r4", index_(),
CUDAArray(all_zeros).index_(), v.index_()));
}
template <typename T> CUDAArray andnot_(const CUDAArray<T> &v) const {
return and_(!v);
}
CUDAArray xor_(const CUDAArray &v) const {
return CUDAArray::from_index_(cuda_trace_append(Type,
"xor.$b1 $r1, $r2, $r3", index_(), v.index_()));
}
MaskType gt_(const CUDAArray &v) const {
const char *op = std::is_signed_v<Value>
? "setp.gt.$t2 $r1, $r2, $r3"
: "setp.hi.$t2 $r1, $r2, $r3";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
MaskType ge_(const CUDAArray &v) const {
const char *op = std::is_signed_v<Value>
? "setp.ge.$t2 $r1, $r2, $r3"
: "setp.hs.$t2 $r1, $r2, $r3";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
MaskType lt_(const CUDAArray &v) const {
const char *op = std::is_signed_v<Value>
? "setp.lt.$t2 $r1, $r2, $r3"
: "setp.lo.$t2 $r1, $r2, $r3";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
MaskType le_(const CUDAArray &v) const {
const char *op = std::is_signed_v<Value>
? "setp.le.$t2 $r1, $r2, $r3"
: "setp.ls.$t2 $r1, $r2, $r3";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
MaskType eq_(const CUDAArray &v) const {
const char *op = !std::is_same_v<Value, bool>
? "setp.eq.$t2 $r1, $r2, $r3" :
"xor.$t2 $r1, $r2, $r3;\n not.$t2 $r1, $r1";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
MaskType neq_(const CUDAArray &v) const {
const char *op = !std::is_same_v<Value, bool>
? "setp.ne.$t2 $r1, $r2, $r3" :
"xor.$t2 $r1, $r2, $r3";
return MaskType::from_index_(cuda_trace_append(
EnokiType::Bool, op, index_(), v.index_()));
}
static CUDAArray select_(const MaskType &m, const CUDAArray &t, const CUDAArray &f) {
if constexpr (!std::is_same_v<Value, bool>) {
return CUDAArray::from_index_(cuda_trace_append(Type,
"selp.$t1 $r1, $r2, $r3, $r4", t.index_(), f.index_(), m.index_()));
} else {
return (m & t) | (~m & f);
}
}
static CUDAArray arange_(ssize_t start, ssize_t stop, ssize_t step) {
size_t size = size_t((stop - start + step - (step > 0 ? 1 : -1)) / step);
using UInt32 = CUDAArray<uint32_t>;
// Variable index 2 refers to a special counter provided by the JIT that
// evaluates to the sequence 0, 1, ..., size-1 (hence it can be returned
// directly when start == 0 and step == 1)
UInt32 index = UInt32::from_index_(
cuda_trace_append(EnokiType::UInt32, "mov.u32 $r1, $r2", 2));
cuda_var_set_size(index.index_(), size);
if (start == 0 && step == 1)
return index;
else
return fmadd(index, CUDAArray((Value) step), CUDAArray((Value) start));
}
static CUDAArray linspace_(Value min, Value max, size_t size) {
using UInt32 = CUDAArray<uint32_t>;
UInt32 index = UInt32::from_index_(
cuda_trace_append(EnokiType::UInt32, "mov.u32 $r1, $r2", 2));
cuda_var_set_size(index.index_(), size);
Value step = (max - min) / Value(size - 1);
return fmadd(index, CUDAArray(step), CUDAArray(min));
}
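// Typical use goes through the Enoki free functions rather than these static
// methods (names assumed from the core headers, not declared in this file):
//
//     using FloatC = CUDAArray<float>;
//     FloatC i = arange<FloatC>(256);              // 0, 1, ..., 255
//     FloatC t = linspace<FloatC>(0.f, 1.f, 256);  // 256 evenly spaced values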
static CUDAArray empty_(size_t size) {
return CUDAArray::from_index_(cuda_var_register(
Type, size, cuda_malloc(size * sizeof(Value)), true));
}
static CUDAArray zero_(size_t size) {
if (size == 1) {
return CUDAArray(Value(0));
} else {
void *ptr = cuda_malloc(size * sizeof(Value));
cuda_fill((uint8_t *) ptr, 0, size * sizeof(Value));
uint32_t index = cuda_var_register(Type, size, ptr, true);
return CUDAArray::from_index_(index);
}
}
static CUDAArray full_(const Value &value, size_t size) {
if (size == 1) {
return CUDAArray(value);
} else {
using UInt = uint_array_t<Value>;
void *ptr = cuda_malloc(size * sizeof(Value));
cuda_fill((UInt *) ptr, memcpy_cast<UInt>(value), size);
uint32_t index = cuda_var_register(Type, size, ptr, true);
return CUDAArray::from_index_(index);
}
}
CUDAArray hsum_() const {
size_t n = size();
if (n == 1) {
return *this;
} else {
eval();
Value *result = cuda_hsum(n, (const Value *) cuda_var_ptr(m_index));
return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true));
}
}
CUDAArray reverse_() const {
using UInt = uint_array_t<Value>;
size_t n = size();
if (n <= 1)
return *this;
eval();
UInt *result = (UInt *) cuda_malloc(n * sizeof(Value));
cuda_reverse(result, (const UInt *) cuda_var_ptr(m_index), n);
return CUDAArray::from_index_(cuda_var_register(Type, n, result, true));
}
CUDAArray psum_() const {
size_t n = size();
if (n <= 1) {
return *this;
} else {
eval();
Value *result = cuda_psum(n, (const Value *) cuda_var_ptr(m_index));
return CUDAArray::from_index_(cuda_var_register(Type, n, result, true));
}
}
CUDAArray hprod_() const {
size_t n = size();
if (n == 1) {
return *this;
} else {
eval();
Value *result = cuda_hprod(n, (const Value *) cuda_var_ptr(m_index));
return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true));
}
}
CUDAArray hmax_() const {
size_t n = size();
if (n == 1) {
return *this;
} else {
eval();
Value *result = cuda_hmax(n, (const Value *) cuda_var_ptr(m_index));
return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true));
}
}
CUDAArray hmin_() const {
size_t n = size();
if (n == 1) {
return *this;
} else {
eval();
Value *result = cuda_hmin(n, (const Value *) cuda_var_ptr(m_index));
return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true));
}
}
bool all_() const {
size_t n = size();
if (n == 1) {
return coeff(0);
} else {
eval();
return cuda_all(n, (const Value *) cuda_var_ptr(m_index));
}
}
bool any_() const {
size_t n = size();
if (n == 1) {
return coeff(0);
} else {
eval();
return cuda_any(n, (const Value *) cuda_var_ptr(m_index));
}
}
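// Reduction usage sketch via the free-function wrappers (assumed from the
// core headers): hsum() yields a single-entry device array, while any()/all()
// force evaluation and return a host-side bool.
//
//     using FloatC = CUDAArray<float>;
//     FloatC x     = linspace<FloatC>(0.f, 1.f, 100);
//     FloatC total = hsum(x);
//     bool   hit   = any(x > 0.9f);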
CUDAArray &eval() {
cuda_eval_var(m_index);
return *this;
}
const CUDAArray &eval() const {
cuda_eval_var(m_index);
return *this;
}
size_t count_() const {
eval();
return cuda_count(cuda_var_size(m_index), (const Value *) cuda_var_ptr(m_index));
}
static CUDAArray map(void *ptr, size_t size, bool dealloc = false) {
return CUDAArray::from_index_(cuda_var_register(Type, size, ptr, dealloc));
}
static CUDAArray copy(const void *ptr, size_t size) {
return CUDAArray::from_index_(cuda_var_copy_to_device(Type, size, ptr));
}
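// Hedged sketch of wrapping externally managed memory ('device_buf' and
// 'host_buf' are placeholders):
//
//     // take ownership of an existing device allocation (freed on destruction)
//     auto a = CUDAArray<float>::map(device_buf, 1024, /* dealloc = */ true);
//     // upload a host-side buffer and wrap the resulting device copy
//     auto b = CUDAArray<float>::copy(host_buf, 1024);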
CUDAArray &managed() {
cuda_make_managed(m_index);
return *this;
}
const CUDAArray &managed() const {
cuda_make_managed(m_index);
return *this;
}
template <typename T = Value, enable_if_t<std::is_pointer_v<T> || std::is_same_v<T, uintptr_t>> = 0>
std::vector<std::pair<Value, CUDAArray<uint32_t>>> partition_() const {
if (!m_cached_partition) {
eval();
void **unique = nullptr;
uint32_t *counts = nullptr;
uint32_t **perm = nullptr;
cuda_partition(size(), (const void **) data(),
&unique, &counts, &perm);
uint32_t num_unique = counts[0];
m_cached_partition = new std::vector<std::pair<Value, CUDAArray<uint32_t>>>();
m_cached_partition->reserve(num_unique);
for (uint32_t i = 0; i < num_unique; ++i) {
m_cached_partition->emplace_back(
(Value) unique[i],
CUDAArray<uint32_t>::from_index_(cuda_var_register(
EnokiType::UInt32, counts[i + 1], perm[i], true)));
}
cuda_host_free(unique);
cuda_host_free(counts);
free(perm);
}
return *m_cached_partition;
}
template <size_t Stride, typename Index, typename Mask>
static CUDAArray gather_(const void *ptr_, const Index &index,
const Mask &mask) {
using UInt64 = CUDAArray<uint64_t>;
UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)),
addr = fmadd(UInt64(index), (uint64_t) Stride, ptr);
if constexpr (!std::is_same_v<Value, bool>) {
return CUDAArray::from_index_(cuda_trace_append(
Type,
"@$r3 ld.global.$t1 $r1, [$r2];\n @!$r3 mov.$b1 $r1, 0",
addr.index_(), mask.index_()));
} else {
return neq(CUDAArray<uint32_t>::from_index_(cuda_trace_append(
EnokiType::UInt32,
"@$r3 ld.global.u8 $r1, [$r2];\n @!$r3 mov.$b1 $r1, 0",
addr.index_(), mask.index_())), 0u);
}
}
template <size_t Stride, typename Index, typename Mask>
ENOKI_INLINE void scatter_(void *ptr_, const Index &index, const Mask &mask) const {
using UInt64 = CUDAArray<uint64_t>;
UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)),
addr = fmadd(UInt64(index), (uint64_t) Stride, ptr);
CUDAArray::Index var;
if constexpr (!std::is_same_v<Value, bool>) {
var = cuda_trace_append(EnokiType::UInt64,
"@$r4 st.global.$t3 [$r2], $r3",
addr.index_(), m_index, mask.index_()
);
} else {
using UInt32 = CUDAArray<uint32_t>;
UInt32 value = select(*this, UInt32(1), UInt32(0));
var = cuda_trace_append(EnokiType::UInt64,
"@$r4 st.global.u8 [$r2], $r3",
addr.index_(), value.index_(), mask.index_()
);
}
cuda_var_mark_side_effect(var);
}
template <size_t Stride, typename Index, typename Mask>
void scatter_add_(void *ptr_, const Index &index, const Mask &mask) const {
using UInt64 = CUDAArray<uint64_t>;
UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)),
addr = fmadd(UInt64(index), (uint64_t) Stride, ptr);
CUDAArray::Index var = cuda_trace_append(Type,
"@$r4 atom.global.add.$t1 $r1, [$r2], $r3",
addr.index_(), m_index, mask.index_()
);
cuda_var_mark_side_effect(var);
}
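// Gather/scatter usage sketch via the free functions gather(), scatter() and
// scatter_add() (assumed from the core headers; 'source' and 'target' are
// placeholders):
//
//     using FloatC  = CUDAArray<float>;
//     using UInt32C = CUDAArray<uint32_t>;
//     UInt32C idx = arange<UInt32C>(100);
//     FloatC  v   = gather<FloatC>(source, idx);   // v[i] = source[idx[i]]
//     scatter_add(target, v, idx);                 // target[idx[i]] += v[i]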
template <typename Mask> CUDAArray compress_(const Mask &mask) const {
if (mask.size() == 0)
return CUDAArray();
else if (size() == 1 && mask.size() != 0)
return *this;
else if (mask.size() != size())
throw std::runtime_error("CUDAArray::compress_(): size mismatch!");
eval();
mask.eval();
Value *ptr;
size_t new_size;
cuda_compress(size(), (const Value *) data(),
(const bool *) mask.data(), &ptr, &new_size);
return map(ptr, new_size, true);
}
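// Compression sketch, calling the method directly ('x' is an existing
// CUDAArray<float>, name illustrative; a free-function wrapper is assumed to
// exist in the core headers as well):
//
//     auto kept = x.compress_(x > 0.5f);   // kept.size() == number of set mask bits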
auto operator->() const {
using BaseType = std::decay_t<std::remove_pointer_t<Value>>;
return call_support<BaseType, CUDAArray>(*this);
}
Index index_() const { return m_index; }
size_t size() const { return cuda_var_size(m_index); }
bool empty() const { return size() == 0; }
const Value *data() const { return (const Value *) cuda_var_ptr(m_index); }
Value *data() { return (Value *) cuda_var_ptr(m_index); }
void resize(size_t size) {
m_index = cuda_var_set_size(m_index, size, true);
}
Value coeff(size_t i) const {
Value result = (Value) 0;
cuda_fetch_element(&result, m_index, i, sizeof(Value));
return result;
}
static CUDAArray from_index_(Index index) {
CUDAArray a;
a.m_index = index;
return a;
}
protected:
Index m_index = 0;
mutable std::vector<std::pair<Value, CUDAArray<uint32_t>>> *m_cached_partition = nullptr;
};
template <typename T, enable_if_t<!is_diff_array_v<T> && is_cuda_array_v<T>> = 0>
ENOKI_INLINE void set_label(const T& a, const char *label) {
if constexpr (array_depth_v<T> >= 2) {
for (size_t i = 0; i < T::Size; ++i)
set_label(a.coeff(i), (std::string(label) + "." + std::to_string(i)).c_str());
} else {
cuda_var_set_label(a.index_(), label);
}
}
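// Label sketch: the string is written next to the variable in the generated
// PTX, which helps when reading the source dumps enabled at log level >= 3
// ('albedo' is an illustrative name):
//
//     CUDAArray<float> albedo(0.18f);
//     set_label(albedo, "albedo");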
template <typename T> class cuda_managed_allocator {
public:
using value_type = T;
using reference = T &;
using const_reference = const T &;
cuda_managed_allocator() = default;
template <typename T2>
cuda_managed_allocator(const cuda_managed_allocator<T2> &) { }
value_type *allocate(size_t n) {
return (value_type *) cuda_managed_malloc(n * sizeof(T));
}
void deallocate(value_type *ptr, size_t) {
cuda_free(ptr);
}
bool operator==(const cuda_managed_allocator &) { return true; }
bool operator!=(const cuda_managed_allocator &) { return false; }
};
template <typename T> class cuda_host_allocator {
public:
using value_type = T;
using reference = T &;
using const_reference = const T &;
cuda_host_allocator() = default;
template <typename T2>
cuda_host_allocator(const cuda_host_allocator<T2> &) { }
value_type *allocate(size_t n) {
return (value_type *) cuda_host_malloc(n * sizeof(T));
}
void deallocate(value_type *ptr, size_t) {
cuda_host_free(ptr);
}
bool operator==(const cuda_host_allocator &) { return true; }
bool operator!=(const cuda_host_allocator &) { return false; }
};
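// Example: standard containers backed by CUDA unified or pinned host memory
// (a sketch; cudaMallocManaged memory is accessible from both host and device):
//
//     std::vector<float, cuda_managed_allocator<float>> params(1024);
//     // params.data() can be passed to kernels or wrapped via CUDAArray::map()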
#if defined(_MSC_VER)
# define ENOKI_CUDA_EXTERN
#else
# define ENOKI_CUDA_EXTERN extern
#endif
#if defined(ENOKI_AUTODIFF_H) && !defined(ENOKI_BUILD)
ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT Tape<CUDAArray<float>>;
ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT DiffArray<CUDAArray<float>>;
ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT Tape<CUDAArray<double>>;
ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT DiffArray<CUDAArray<double>>;
#endif
NAMESPACE_END(enoki)