/* enoki/cuda.h -- CUDA-backed Enoki dynamic array with JIT compilation Enoki is a C++ template library that enables transparent vectorization of numerical kernels using SIMD instruction sets available on current processor architectures. Copyrighe (c) 2019 Wenzel Jakob All rights reserved. Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. */ #pragma once #define ENOKI_CUDA_H 1 #include NAMESPACE_BEGIN(enoki) // ----------------------------------------------------------------------- //! @{ \name Imports from libenoki-cuda.so // ----------------------------------------------------------------------- /// Initialize the tracing JIT extern ENOKI_IMPORT void cuda_init(); /// Delete the trace, requires a subsequent call by cuda_init() extern ENOKI_IMPORT void cuda_shutdown(); /// Compile and evaluate the trace up to the current instruction extern ENOKI_IMPORT void cuda_eval(bool log_assembly /* = false */); /// Invokes 'cuda_eval' if the given variable has not been evaluated yet extern ENOKI_IMPORT void cuda_eval_var(uint32_t index, bool log_assembly = false); /// Increase the reference count of a variable extern ENOKI_IMPORT void cuda_inc_ref_ext(uint32_t); /// Decrease the reference count of a variable extern ENOKI_IMPORT void cuda_dec_ref_ext(uint32_t); /// Return the size of a variable extern ENOKI_IMPORT size_t cuda_var_size(uint32_t); /// Return the pointer address of a variable (in device memory) extern ENOKI_IMPORT void* cuda_var_ptr(uint32_t); /// Retroactively adjust the recorded size of a variable extern ENOKI_IMPORT uint32_t cuda_var_set_size(uint32_t index, size_t size, bool copy = false); /// Mark a variable as dirty (e.g. due to scatter) extern ENOKI_IMPORT void cuda_var_mark_dirty(uint32_t); /// Attach a label to a variable (written to PTX assembly) extern ENOKI_IMPORT void cuda_var_set_label(uint32_t, const char *); /// Needed to mark certain instructions with side effects (e.g. scatter) extern ENOKI_IMPORT void cuda_var_mark_side_effect(uint32_t); /// Set the current scatter/source operand array extern ENOKI_IMPORT void cuda_set_scatter_gather_operand(uint32_t index, bool gather); /// Append an operation to the trace (0 arguments) extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type, const char *op); /// Append an operation to the trace (1 argument) extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type, const char *op, uint32_t arg1); /// Append an operation to the trace (2 arguments) extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type, const char *op, uint32_t arg1, uint32_t arg2); /// Append an operation to the trace (3 arguments) extern ENOKI_IMPORT uint32_t cuda_trace_append(EnokiType type, const char *op, uint32_t arg1, uint32_t arg2, uint32_t arg3); /// Insert a "printf" instruction for the given instruction extern ENOKI_IMPORT void cuda_trace_printf(const char *fmt, uint32_t narg, uint32_t *arg); /// Computes the prefix sum of a given memory region template extern ENOKI_IMPORT T* cuda_psum(size_t, const T *); /// Computes the horizontal sum of a given memory region template extern ENOKI_IMPORT T* cuda_hsum(size_t, const T *); /// Computes the horizontal product of a given memory region template extern ENOKI_IMPORT T* cuda_hprod(size_t, const T *); /// Computes the horizontal maximum of a given memory region template extern ENOKI_IMPORT T* cuda_hmax(size_t, const T *); /// Computes the horizontal minimum of a given memory region template extern ENOKI_IMPORT T* cuda_hmin(size_t, const T *); /// Compute the number of entries set to 'true' extern ENOKI_IMPORT size_t cuda_count(size_t, const bool *); template extern ENOKI_IMPORT void cuda_compress(size_t, const T *, const bool *mask, T **, size_t *); /// Computes a horizontal reduction of a mask array via AND extern ENOKI_IMPORT bool cuda_all(size_t, const bool *); /// Computes a horizontal reduction of a mask array via OR extern ENOKI_IMPORT bool cuda_any(size_t, const bool *); /// Sort 'ptrs' and return unique instances and their count, as well as a permutation extern ENOKI_IMPORT void cuda_partition(size_t size, const void **ptrs, void ***unique_out, uint32_t **counts_out, uint32_t ***perm_out); /// Copy some host memory region to the device and wrap it in a variable extern ENOKI_IMPORT uint32_t cuda_var_copy_to_device(EnokiType type, size_t size, const void *value); /// Create a variable that stores a pointer to some (device) memory region extern ENOKI_IMPORT uint32_t cuda_var_register_ptr(const void *ptr); /// Register a memory region (in device memory) as a variable extern ENOKI_IMPORT uint32_t cuda_var_register(EnokiType type, size_t size, void *ptr, bool dealloc); /// Fetch a scalar value from a CUDA array (in device memory) extern ENOKI_IMPORT void cuda_fetch_element(void *, uint32_t, size_t, size_t); /// Copy a memory region to the device extern ENOKI_IMPORT void cuda_memcpy_to_device(void *dst, const void *src, size_t size); extern ENOKI_IMPORT void cuda_memcpy_to_device_async(void *dst, const void *src, size_t size); /// Copy a memory region from the device extern ENOKI_IMPORT void cuda_memcpy_from_device(void *dst, const void *src, size_t size); extern ENOKI_IMPORT void cuda_memcpy_from_device_async(void *dst, const void *src, size_t size); /// Return the free and total amount of memory (Wrapper around cudaMemGetInfo) extern ENOKI_IMPORT void cuda_mem_get_info(size_t *free, size_t *total); /// Allocate device-local memory (wrapper around cudaMalloc) extern ENOKI_IMPORT void* cuda_malloc(size_t); /// Allocate unified memory (wrapper around cudaMallocManaged) extern ENOKI_IMPORT void* cuda_managed_malloc(size_t size); /// Allocate host-pinned memory (wrapper around cudaMallocHost) extern ENOKI_IMPORT void* cuda_host_malloc(size_t); /// Allocate unified memory (wrapper around analogues of cudaMemsetAsync) extern ENOKI_IMPORT void cuda_fill(uint8_t *ptr, uint8_t value, size_t size); extern ENOKI_IMPORT void cuda_fill(uint16_t *ptr, uint16_t value, size_t size); extern ENOKI_IMPORT void cuda_fill(uint32_t *ptr, uint32_t value, size_t size); extern ENOKI_IMPORT void cuda_fill(uint64_t *ptr, uint64_t value, size_t size); /// Reverse an array extern ENOKI_IMPORT void cuda_reverse(uint8_t *out, const uint8_t *in, size_t size); extern ENOKI_IMPORT void cuda_reverse(uint16_t *out, const uint16_t *in, size_t size); extern ENOKI_IMPORT void cuda_reverse(uint32_t *out, const uint32_t *in, size_t size); extern ENOKI_IMPORT void cuda_reverse(uint64_t *out, const uint64_t *in, size_t size); /// Release device-local or unified memory extern ENOKI_IMPORT void cuda_free(void *); /// Release host-local memory extern ENOKI_IMPORT void cuda_host_free(void *); /// Release any unused held memory back to the device extern ENOKI_IMPORT void cuda_malloc_trim(); /// Wait for all work queued on the device to finish extern ENOKI_IMPORT void cuda_sync(); /// Print detailed information about currently allocated arrays extern ENOKI_IMPORT char *cuda_whos(); /// Convert a variable into managed memory (if applicable) extern ENOKI_IMPORT void cuda_make_managed(uint32_t); /// Register a callback that will be invoked before cuda_eval() extern void cuda_register_callback(void (*callback)(void *), void *payload); /// Unregister a callback installed via 'cuda_register_callback()' extern void cuda_unregister_callback(void (*callback)(void *), void *payload); /** * \brief Current log level (0: none, 1: kernel launches, * 2: +ptxas statistics, 3: +ptx source, 4: +jit trace, 5: +ref counting) */ extern ENOKI_IMPORT void cuda_set_log_level(uint32_t); extern ENOKI_IMPORT uint32_t cuda_log_level(); //! @} // ----------------------------------------------------------------------- template struct CUDAArray : ArrayBase, CUDAArray> { template friend struct CUDAArray; using Index = uint32_t; static constexpr EnokiType Type = enoki_type_v; static constexpr bool IsCUDA = true; template using ReplaceValue = CUDAArray; using MaskType = CUDAArray; using ArrayType = CUDAArray; CUDAArray() = default; ~CUDAArray() { cuda_dec_ref_ext(m_index); if constexpr (std::is_pointer_v || std::is_same_v) delete m_cached_partition; } CUDAArray(const CUDAArray &a) : m_index(a.m_index) { cuda_inc_ref_ext(m_index); } CUDAArray(CUDAArray &&a) : m_index(a.m_index) { a.m_index = 0; if constexpr (std::is_pointer_v || std::is_same_v) { m_cached_partition = a.m_cached_partition; a.m_cached_partition = nullptr; } } template CUDAArray(const CUDAArray &v) { const char *op; if (std::is_floating_point_v && std::is_integral_v) op = "cvt.rzi.$t1.$t2 $r1, $r2"; else if (std::is_integral_v && std::is_floating_point_v) op = "cvt.rn.$t1.$t2 $r1, $r2"; else op = "cvt.$t1.$t2 $r1, $r2"; m_index = cuda_trace_append(Type, op, v.index_()); } template CUDAArray(const CUDAArray &v, detail::reinterpret_flag) { static_assert(sizeof(T) == sizeof(Value)); if (std::is_integral_v != std::is_integral_v) { m_index = cuda_trace_append(Type, "mov.$b1 $r1, $r2", v.index_()); } else { m_index = v.index_(); cuda_inc_ref_ext(m_index); } } template > = 0> CUDAArray(const T &value, detail::reinterpret_flag) : CUDAArray(memcpy_cast(value)) { } template > = 0> CUDAArray(T value) : CUDAArray((Value) value) { } CUDAArray(Value value) { const char *fmt = nullptr; switch (Type) { case EnokiType::Float16: fmt = "mov.$t1 $r1, %04x"; break; case EnokiType::Float32: fmt = "mov.$t1 $r1, 0f%08x"; break; case EnokiType::Float64: fmt = "mov.$t1 $r1, 0d%016llx"; break; case EnokiType::Bool: fmt = "mov.$t1 $r1, %i"; break; case EnokiType::Int8: case EnokiType::UInt8: fmt = "mov.$t1 $r1, 0x%02x"; break; case EnokiType::Int16: case EnokiType::UInt16: fmt = "mov.$t1 $r1, 0x%04x"; break; case EnokiType::Int32: case EnokiType::UInt32: fmt = "mov.$t1 $r1, 0x%08x"; break; case EnokiType::Pointer: case EnokiType::Int64: case EnokiType::UInt64: fmt = "mov.$t1 $r1, 0x%016llx"; break; default: fmt = "<>"; break; } char tmp[32]; snprintf(tmp, 32, fmt, memcpy_cast>(value)); m_index = cuda_trace_append(Type, tmp); } template 1)> = 0> CUDAArray(Args&&... args) { Value data[] = { (Value) args... }; m_index = cuda_var_copy_to_device(Type, sizeof...(Args), data); } CUDAArray &operator=(const CUDAArray &a) { cuda_inc_ref_ext(a.m_index); cuda_dec_ref_ext(m_index); m_index = a.m_index; if constexpr (std::is_pointer_v || std::is_same_v) m_cached_partition = nullptr; return *this; } CUDAArray &operator=(CUDAArray &&a) { std::swap(m_index, a.m_index); if constexpr (std::is_pointer_v || std::is_same_v) std::swap(m_cached_partition, a.m_cached_partition); return *this; } CUDAArray add_(const CUDAArray &v) const { const char *op = std::is_floating_point_v ? "add.rn.ftz.$t1 $r1, $r2, $r3" : "add.$t1 $r1, $r2, $r3"; return CUDAArray::from_index_( cuda_trace_append(Type, op, index_(), v.index_())); } CUDAArray sub_(const CUDAArray &v) const { const char *op = std::is_floating_point_v ? "sub.rn.ftz.$t1 $r1, $r2, $r3" : "sub.$t1 $r1, $r2, $r3"; return CUDAArray::from_index_( cuda_trace_append(Type, op, index_(), v.index_())); } CUDAArray mul_(const CUDAArray &v) const { const char *op = std::is_floating_point_v ? "mul.rn.ftz.$t1 $r1, $r2, $r3" : "mul.lo.$t1 $r1, $r2, $r3"; return CUDAArray::from_index_( cuda_trace_append(Type, op, index_(), v.index_())); } CUDAArray mulhi_(const CUDAArray &v) const { return CUDAArray::from_index_(cuda_trace_append( Type, "mul.hi.$t1 $r1, $r2, $r3", index_(), v.index_())); } CUDAArray div_(const CUDAArray &v) const { const char *op = std::is_floating_point_v ? "div.rn.ftz.$t1 $r1, $r2, $r3" : "div.$t1 $r1, $r2, $r3"; return CUDAArray::from_index_( cuda_trace_append(Type, op, index_(), v.index_())); } CUDAArray mod_(const CUDAArray &v) const { return CUDAArray::from_index_(cuda_trace_append(Type, "rem.$t1 $r1, $r2, $r3", index_(), v.index_())); } CUDAArray fmadd_(const CUDAArray &a, const CUDAArray &b) const { const char *op = std::is_floating_point_v ? "fma.rn.ftz.$t1 $r1, $r2, $r3, $r4" : "mad.lo.$t1 $r1, $r2, $r3, $r4"; return CUDAArray::from_index_( cuda_trace_append(Type, op, index_(), a.index_(), b.index_())); } CUDAArray fmsub_(const CUDAArray &a, const CUDAArray &b) const { return fmadd_(a, -b); } CUDAArray fnmadd_(const CUDAArray &a, const CUDAArray &b) const { return fmadd_(-a, b); } CUDAArray fnmsub_(const CUDAArray &a, const CUDAArray &b) const { return -fmadd_(a, b); } CUDAArray max_(const CUDAArray &v) const { return CUDAArray::from_index_(cuda_trace_append(Type, "max.ftz.$t1 $r1, $r2, $r3", index_(), v.index_())); } CUDAArray min_(const CUDAArray &v) const { return CUDAArray::from_index_(cuda_trace_append(Type, "min.ftz.$t1 $r1, $r2, $r3", index_(), v.index_())); } CUDAArray abs_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "abs.ftz.$t1 $r1, $r2", index_())); } CUDAArray neg_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "neg.ftz.$t1 $r1, $r2", index_())); } CUDAArray sqrt_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "sqrt.rn.ftz.$t1 $r1, $r2", index_())); } CUDAArray exp_() const { CUDAArray scaled = Value(1.4426950408889634074) * *this; return CUDAArray::from_index_(cuda_trace_append(Type, "ex2.approx.ftz.$t1 $r1, $r2", scaled.index_())); } CUDAArray log_() const { return CUDAArray::from_index_(cuda_trace_append( Type, "lg2.approx.ftz.$t1 $r1, $r2", index_())) * Value(0.69314718055994530942); } CUDAArray sin_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "sin.approx.ftz.$t1 $r1, $r2", index_())); } CUDAArray cos_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "cos.approx.ftz.$t1 $r1, $r2", index_())); } std::pair sincos_() const { return { sin_(), cos_() }; } CUDAArray rcp_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "rcp.approx.ftz.$t1 $r1, $r2", index_())); } CUDAArray rsqrt_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "rsqrt.approx.ftz.$t1 $r1, $r2", index_())); } CUDAArray floor_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "cvt.rmi.$t1.$t1 $r1, $r2", index_())); } CUDAArray ceil_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "cvt.rpi.$t1.$t1 $r1, $r2", index_())); } CUDAArray round_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "cvt.rni.$t1.$t1 $r1, $r2", index_())); } CUDAArray trunc_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "cvt.rzi.$t1.$t1 $r1, $r2", index_())); } template T floor2int_() const { return T::from_index_(cuda_trace_append(T::Type, "cvt.rmi.$t1.$t2 $r1, $r2", index_())); } template T ceil2int_() const { return T::from_index_(cuda_trace_append(T::Type, "cvt.rpi.$t1.$t2 $r1, $r2", index_())); } CUDAArray sl_(const CUDAArray &v) const { if constexpr (sizeof(Value) == 4) return CUDAArray::from_index_(cuda_trace_append(Type, "shl.$b1 $r1, $r2, $r3", index_(), v.index_())); else return CUDAArray::from_index_(cuda_trace_append(Type, "shl.$b1 $r1, $r2, $r3", index_(), CUDAArray(v).index_())); } CUDAArray sr_(const CUDAArray &v) const { const char *op = std::is_signed_v ? "shr.$t1 $r1, $r2, $r3" : "shr.$b1 $r1, $r2, $r3"; if constexpr (sizeof(Value) == 4) return CUDAArray::from_index_(cuda_trace_append(Type, op, index_(), v.index_())); else return CUDAArray::from_index_(cuda_trace_append(Type, op, index_(), CUDAArray(v).index_())); } CUDAArray sl_(size_t value) const { return sl_(CUDAArray((Value) value)); } CUDAArray sr_(size_t value) const { return sr_(CUDAArray((Value) value)); } template CUDAArray sl_() const { return sl_(Imm); } template CUDAArray sr_() const { return sr_(Imm); } CUDAArray not_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "not.$b1 $r1, $r2", index_())); } CUDAArray popcnt_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "popc.$b1 $r1, $r2", index_())); } CUDAArray lzcnt_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "clz.$b1 $r1, $r2", index_())); } CUDAArray tzcnt_() const { return CUDAArray::from_index_(cuda_trace_append(Type, "brev.$b1 $r1, $r2;\n clz.$b1 $r1, $r1", index_())); } template CUDAArray or_(const CUDAArray &v) const { Value all_ones = memcpy_cast(int_array_t(-1)); ENOKI_MARK_USED(all_ones); if constexpr (std::is_same_v) return CUDAArray::from_index_(cuda_trace_append(Type, "or.$b1 $r1, $r2, $r3", index_(), v.index_())); else return CUDAArray::from_index_(cuda_trace_append(Type, "selp.$t1 $r1, $r2, $r3, $r4", CUDAArray(all_ones).index_(), index_(), v.index_())); } template CUDAArray and_(const CUDAArray &v) const { Value all_zeros = memcpy_cast(int_array_t(0)); ENOKI_MARK_USED(all_zeros); if constexpr (std::is_same_v) return CUDAArray::from_index_(cuda_trace_append(Type, "and.$b1 $r1, $r2, $r3", index_(), v.index_())); else return CUDAArray::from_index_(cuda_trace_append(Type, "selp.$t1 $r1, $r2, $r3, $r4", index_(), CUDAArray(all_zeros).index_(), v.index_())); } template CUDAArray andnot_(const CUDAArray &v) const { return and_(!v); } CUDAArray xor_(const CUDAArray &v) const { return CUDAArray::from_index_(cuda_trace_append(Type, "xor.$b1 $r1, $r2, $r3", index_(), v.index_())); } MaskType gt_(const CUDAArray &v) const { const char *op = std::is_signed_v ? "setp.gt.$t2 $r1, $r2, $r3" : "setp.hi.$t2 $r1, $r2, $r3"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } MaskType ge_(const CUDAArray &v) const { const char *op = std::is_signed_v ? "setp.ge.$t2 $r1, $r2, $r3" : "setp.hs.$t2 $r1, $r2, $r3"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } MaskType lt_(const CUDAArray &v) const { const char *op = std::is_signed_v ? "setp.lt.$t2 $r1, $r2, $r3" : "setp.lo.$t2 $r1, $r2, $r3"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } MaskType le_(const CUDAArray &v) const { const char *op = std::is_signed_v ? "setp.le.$t2 $r1, $r2, $r3" : "setp.ls.$t2 $r1, $r2, $r3"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } MaskType eq_(const CUDAArray &v) const { const char *op = !std::is_same_v ? "setp.eq.$t2 $r1, $r2, $r3" : "xor.$t2 $r1, $r2, $r3;\n not.$t2 $r1, $r1"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } MaskType neq_(const CUDAArray &v) const { const char *op = !std::is_same_v ? "setp.ne.$t2 $r1, $r2, $r3" : "xor.$t2 $r1, $r2, $r3"; return MaskType::from_index_(cuda_trace_append( EnokiType::Bool, op, index_(), v.index_())); } static CUDAArray select_(const MaskType &m, const CUDAArray &t, const CUDAArray &f) { if constexpr (!std::is_same_v) { return CUDAArray::from_index_(cuda_trace_append(Type, "selp.$t1 $r1, $r2, $r3, $r4", t.index_(), f.index_(), m.index_())); } else { return (m & t) | (~m & f); } } static CUDAArray arange_(ssize_t start, ssize_t stop, ssize_t step) { size_t size = size_t((stop - start + step - (step > 0 ? 1 : -1)) / step); using UInt32 = CUDAArray; UInt32 index = UInt32::from_index_( cuda_trace_append(EnokiType::UInt32, "mov.u32 $r1, $r2", 2)); cuda_var_set_size(index.index_(), size); if (start == 0 && step == 1) return index; else return fmadd(index, CUDAArray((Value) step), CUDAArray((Value) start)); } static CUDAArray linspace_(Value min, Value max, size_t size) { using UInt32 = CUDAArray; UInt32 index = UInt32::from_index_( cuda_trace_append(EnokiType::UInt32, "mov.u32 $r1, $r2", 2)); cuda_var_set_size(index.index_(), size); Value step = (max - min) / Value(size - 1); return fmadd(index, CUDAArray(step), CUDAArray(min)); } static CUDAArray empty_(size_t size) { return CUDAArray::from_index_(cuda_var_register( Type, size, cuda_malloc(size * sizeof(Value)), true)); } static CUDAArray zero_(size_t size) { if (size == 1) { return CUDAArray(Value(0)); } else { void *ptr = cuda_malloc(size * sizeof(Value)); cuda_fill((uint8_t *) ptr, 0, size * sizeof(Value)); uint32_t index = cuda_var_register(Type, size, ptr, true); return CUDAArray::from_index_(index); } } static CUDAArray full_(const Value &value, size_t size) { if (size == 1) { return CUDAArray(value); } else { using UInt = uint_array_t; void *ptr = cuda_malloc(size * sizeof(Value)); cuda_fill((UInt *) ptr, memcpy_cast(value), size); uint32_t index = cuda_var_register(Type, size, ptr, true); return CUDAArray::from_index_(index); } } CUDAArray hsum_() const { size_t n = size(); if (n == 1) { return *this; } else { eval(); Value *result = cuda_hsum(n, (const Value *) cuda_var_ptr(m_index)); return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true)); } } CUDAArray reverse_() const { using UInt = uint_array_t; size_t n = size(); if (n <= 1) return *this; eval(); UInt *result = (UInt *) cuda_malloc(n * sizeof(Value)); cuda_reverse(result, (const UInt *) cuda_var_ptr(m_index), n); return CUDAArray::from_index_(cuda_var_register(Type, n, result, true)); } CUDAArray psum_() const { size_t n = size(); if (n <= 1) { return *this; } else { eval(); Value *result = cuda_psum(n, (const Value *) cuda_var_ptr(m_index)); return CUDAArray::from_index_(cuda_var_register(Type, n, result, true)); } } CUDAArray hprod_() const { size_t n = size(); if (n == 1) { return *this; } else { eval(); Value *result = cuda_hprod(n, (const Value *) cuda_var_ptr(m_index)); return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true)); } } CUDAArray hmax_() const { size_t n = size(); if (n == 1) { return *this; } else { eval(); Value *result = cuda_hmax(n, (const Value *) cuda_var_ptr(m_index)); return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true)); } } CUDAArray hmin_() const { size_t n = size(); if (n == 1) { return *this; } else { eval(); Value *result = cuda_hmin(n, (const Value *) cuda_var_ptr(m_index)); return CUDAArray::from_index_(cuda_var_register(Type, 1, result, true)); } } bool all_() const { size_t n = size(); if (n == 1) { return coeff(0); } else { eval(); return cuda_all(n, (const Value *) cuda_var_ptr(m_index)); } } bool any_() const { size_t n = size(); if (n == 1) { return coeff(0); } else { eval(); return cuda_any(n, (const Value *) cuda_var_ptr(m_index)); } } CUDAArray &eval() { cuda_eval_var(m_index); return *this; } const CUDAArray &eval() const { cuda_eval_var(m_index); return *this; } size_t count_() const { eval(); return cuda_count(cuda_var_size(m_index), (const Value *) cuda_var_ptr(m_index)); } static CUDAArray map(void *ptr, size_t size, bool dealloc = false) { return CUDAArray::from_index_(cuda_var_register(Type, size, ptr, dealloc)); } static CUDAArray copy(const void *ptr, size_t size) { return CUDAArray::from_index_(cuda_var_copy_to_device(Type, size, ptr)); } CUDAArray &managed() { cuda_make_managed(m_index); return *this; } const CUDAArray &managed() const { cuda_make_managed(m_index); return *this; } template || std::is_same_v> = 0> std::vector>> partition_() const { if (!m_cached_partition) { eval(); void **unique = nullptr; uint32_t *counts = nullptr; uint32_t **perm = nullptr; cuda_partition(size(), (const void **) data(), &unique, &counts, &perm); uint32_t num_unique = counts[0]; m_cached_partition = new std::vector>>(num_unique); m_cached_partition->reserve(num_unique); for (uint32_t i = 0; i < num_unique; ++i) { m_cached_partition->emplace_back( (Value) unique[i], CUDAArray::from_index_(cuda_var_register( EnokiType::UInt32, counts[i + 1], perm[i], true))); } cuda_host_free(unique); cuda_host_free(counts); free(perm); } return *m_cached_partition; } template static CUDAArray gather_(const void *ptr_, const Index &index, const Mask &mask) { using UInt64 = CUDAArray; UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)), addr = fmadd(UInt64(index), (uint64_t) Stride, ptr); if constexpr (!std::is_same_v) { return CUDAArray::from_index_(cuda_trace_append( Type, "@$r3 ld.global.$t1 $r1, [$r2];\n @!$r3 mov.$b1 $r1, 0", addr.index_(), mask.index_())); } else { return neq(CUDAArray::from_index_(cuda_trace_append( EnokiType::UInt32, "@$r3 ld.global.u8 $r1, [$r2];\n @!$r3 mov.$b1 $r1, 0", addr.index_(), mask.index_())), 0u); } } template ENOKI_INLINE void scatter_(void *ptr_, const Index &index, const Mask &mask) const { using UInt64 = CUDAArray; UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)), addr = fmadd(UInt64(index), (uint64_t) Stride, ptr); CUDAArray::Index var; if constexpr (!std::is_same_v) { var = cuda_trace_append(EnokiType::UInt64, "@$r4 st.global.$t3 [$r2], $r3", addr.index_(), m_index, mask.index_() ); } else { using UInt32 = CUDAArray; UInt32 value = select(*this, UInt32(1), UInt32(0)); var = cuda_trace_append(EnokiType::UInt64, "@$r4 st.global.u8 [$r2], $r3", addr.index_(), value.index_(), mask.index_() ); } cuda_var_mark_side_effect(var); } template void scatter_add_(void *ptr_, const Index &index, const Mask &mask) const { using UInt64 = CUDAArray; UInt64 ptr = UInt64::from_index_(cuda_var_register_ptr(ptr_)), addr = fmadd(UInt64(index), (uint64_t) Stride, ptr); CUDAArray::Index var = cuda_trace_append(Type, "@$r4 atom.global.add.$t1 $r1, [$r2], $r3", addr.index_(), m_index, mask.index_() ); cuda_var_mark_side_effect(var); } template CUDAArray compress_(const Mask &mask) const { if (mask.size() == 0) return CUDAArray(); else if (size() == 1 && mask.size() != 0) return *this; else if (mask.size() != size()) throw std::runtime_error("CUDAArray::compress_(): size mismatch!"); eval(); mask.eval(); Value *ptr; size_t new_size; cuda_compress(size(), (const Value *) data(), (const bool *) mask.data(), &ptr, &new_size); return map(ptr, new_size, true); } auto operator->() const { using BaseType = std::decay_t>; return call_support(*this); } Index index_() const { return m_index; } size_t size() const { return cuda_var_size(m_index); } bool empty() const { return size() == 0; } const Value *data() const { return (const Value *) cuda_var_ptr(m_index); } Value *data() { return (Value *) cuda_var_ptr(m_index); } void resize(size_t size) { m_index = cuda_var_set_size(m_index, size, true); } Value coeff(size_t i) const { Value result = (Value) 0; cuda_fetch_element(&result, m_index, i, sizeof(Value)); return result; } static CUDAArray from_index_(Index index) { CUDAArray a; a.m_index = index; return a; } protected: Index m_index = 0; mutable std::vector>> *m_cached_partition = nullptr; }; template && is_cuda_array_v> = 0> ENOKI_INLINE void set_label(const T& a, const char *label) { if constexpr (array_depth_v >= 2) { for (size_t i = 0; i < T::Size; ++i) set_label(a.coeff(i), (std::string(label) + "." + std::to_string(i)).c_str()); } else { cuda_var_set_label(a.index_(), label); } } template class cuda_managed_allocator { public: using value_type = T; using reference = T &; using const_reference = const T &; cuda_managed_allocator() = default; template cuda_managed_allocator(const cuda_managed_allocator &) { } value_type *allocate(size_t n) { return (value_type *) cuda_managed_malloc(n * sizeof(T)); } void deallocate(value_type *ptr, size_t) { cuda_free(ptr); } bool operator==(const cuda_managed_allocator &) { return true; } bool operator!=(const cuda_managed_allocator &) { return false; } }; template class cuda_host_allocator { public: using value_type = T; using reference = T &; using const_reference = const T &; cuda_host_allocator() = default; template cuda_host_allocator(const cuda_host_allocator &) { } value_type *allocate(size_t n) { return (value_type *) cuda_host_malloc(n * sizeof(T)); } void deallocate(value_type *ptr, size_t) { cuda_host_free(ptr); } bool operator==(const cuda_host_allocator &) { return true; } bool operator!=(const cuda_host_allocator &) { return false; } }; #if defined(_MSC_VER) # define ENOKI_CUDA_EXTERN #else # define ENOKI_CUDA_EXTERN extern #endif #if defined(ENOKI_AUTODIFF_H) && !defined(ENOKI_BUILD) ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT Tape>; ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT DiffArray>; ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT Tape>; ENOKI_CUDA_EXTERN template struct ENOKI_IMPORT DiffArray>; #endif NAMESPACE_END(enoki)