enoki headers (#140)
This commit is contained in:
parent
c52c374c45
commit
8b90d7f9eb
|
|
@ -47,7 +47,7 @@ set(SOCKET_ROCKET_SOURCES
|
||||||
|
|
||||||
set(SOCKET_ROCKET_SOURCES_M ${SOCKET_ROCKET_SOURCES})
|
set(SOCKET_ROCKET_SOURCES_M ${SOCKET_ROCKET_SOURCES})
|
||||||
list(FILTER SOCKET_ROCKET_SOURCES_M INCLUDE REGEX ".*m$")
|
list(FILTER SOCKET_ROCKET_SOURCES_M INCLUDE REGEX ".*m$")
|
||||||
set_source_files_properties(${SOCKET_ROCKET_SOURCES_M} PROPERTIES COMPILE_FLAGS
|
set_source_files_properties(${SOCKET_ROCKET_SOURCES_M} PROPERTIES COMPILE_FLAGS
|
||||||
-fobjc-arc
|
-fobjc-arc
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -62,5 +62,5 @@ list(APPEND CC_EXTERNAL_PRIVATE_INCLUDES
|
||||||
${CMAKE_CURRENT_LIST_DIR}/Internal/Proxy
|
${CMAKE_CURRENT_LIST_DIR}/Internal/Proxy
|
||||||
)
|
)
|
||||||
|
|
||||||
list(APPEND CC_EXTERNAL_SROUCES ${SOCKET_ROCKET_SOURCES})
|
list(APPEND CC_EXTERNAL_SOURCES ${SOCKET_ROCKET_SOURCES})
|
||||||
list(APPEND CC_EXTERNAL_INCLUDES ${CMAKE_CURRENT_LIST_DIR})
|
list(APPEND CC_EXTERNAL_INCLUDES ${CMAKE_CURRENT_LIST_DIR})
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,182 @@
|
||||||
|
/*
|
||||||
|
enoki/array.h -- Main header file for the Enoki array class and
|
||||||
|
various template specializations
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# pragma warning(push)
|
||||||
|
# pragma warning(disable: 4146) // warning C4146: unary minus operator applied to unsigned type, result still unsigned
|
||||||
|
# pragma warning(disable: 4554) // warning C4554: '>>': check operator precedence for possible error; use parentheses to clarify precedence
|
||||||
|
# pragma warning(disable: 4702) // warning C4702: unreachable code
|
||||||
|
# pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
|
||||||
|
# pragma warning(disable: 4310) // warning C4310: cast truncates constant value
|
||||||
|
# pragma warning(disable: 4127) // warning C4127: conditional expression is constant
|
||||||
|
#elif defined(__GNUC__) && !defined(__clang__)
|
||||||
|
# pragma GCC diagnostic push
|
||||||
|
# pragma GCC diagnostic ignored "-Wclass-memaccess"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
|
||||||
|
#include <enoki/array_math.h>
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_NEON) || defined(ENOKI_X86_SSE42)
|
||||||
|
# include <enoki/array_recursive.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
# include <enoki/array_kmask.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
# include <enoki/array_sse42.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX)
|
||||||
|
# include <enoki/array_avx.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2)
|
||||||
|
# include <enoki/array_avx2.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
# include <enoki/array_avx512.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_NEON)
|
||||||
|
# include <enoki/array_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <enoki/array_idiv.h>
|
||||||
|
#include <enoki/array_call.h>
|
||||||
|
#include <enoki/array_enum.h>
|
||||||
|
#include <enoki/array_utils.h>
|
||||||
|
#include <enoki/array_macro.h>
|
||||||
|
|
||||||
|
#include <enoki/half.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct Array : StaticArrayImpl<Value_, Size_, false, Array<Value_, Size_>> {
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<Value_, Size_, false, Array<Value_, Size_>>;
|
||||||
|
|
||||||
|
using ArrayType = Array;
|
||||||
|
using MaskType = Mask<Value_, Size_>;
|
||||||
|
|
||||||
|
/// Type alias for creating a similar-shaped array over a different type
|
||||||
|
template <typename T> using ReplaceValue = Array<T, Size_>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT(Base, Array)
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct Mask : StaticArrayImpl<Value_, Size_, true, Mask<Value_, Size_>> {
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<Value_, Size_, true, Mask<Value_, Size_>>;
|
||||||
|
|
||||||
|
using ArrayType = Array<Value_, Size_>;
|
||||||
|
using MaskType = Mask;
|
||||||
|
|
||||||
|
/// Type alias for creating a similar-shaped array over a different type
|
||||||
|
template <typename T> using ReplaceValue = Mask<T, Size_>;
|
||||||
|
|
||||||
|
Mask() = default;
|
||||||
|
|
||||||
|
template <typename T> Mask(T &&value)
|
||||||
|
: Base(std::forward<T>(value), detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
template <typename T> Mask(T &&value, detail::reinterpret_flag)
|
||||||
|
: Base(std::forward<T>(value), detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
/// Construct from sub-arrays
|
||||||
|
template <typename T1, typename T2, typename T = Mask, enable_if_t<
|
||||||
|
array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == Base::Size1 &&
|
||||||
|
array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == Base::Size2 &&
|
||||||
|
Base::Size2 != 0> = 0>
|
||||||
|
Mask(const T1 &a1, const T2 &a2)
|
||||||
|
: Base(a1, a2) { }
|
||||||
|
|
||||||
|
template <typename... Ts,
|
||||||
|
enable_if_t<(sizeof...(Ts) == Base::Size || sizeof...(Ts) == Base::ActualSize) && Size_ != 1 &&
|
||||||
|
std::conjunction_v<detail::is_not_reinterpret_flag<Ts>...>> = 0>
|
||||||
|
Mask(Ts&&... ts) : Base(std::forward<Ts>(ts)...) { }
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT_BASIC(Base, Mask)
|
||||||
|
using Base::operator=;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct Packet : StaticArrayImpl<Value_, Size_, false, Packet<Value_, Size_>> {
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<Value_, Size_, false, Packet<Value_, Size_>>;
|
||||||
|
|
||||||
|
using ArrayType = Packet;
|
||||||
|
using MaskType = PacketMask<Value_, Size_>;
|
||||||
|
|
||||||
|
static constexpr bool BroadcastPreferOuter = false;
|
||||||
|
|
||||||
|
/// Type alias for creating a similar-shaped array over a different type
|
||||||
|
template <typename T> using ReplaceValue = Packet<T, Size_>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT(Base, Packet)
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct PacketMask : StaticArrayImpl<Value_, Size_, true, PacketMask<Value_, Size_>> {
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<Value_, Size_, true, PacketMask<Value_, Size_>>;
|
||||||
|
|
||||||
|
static constexpr bool BroadcastPreferOuter = false;
|
||||||
|
|
||||||
|
using ArrayType = Packet<Value_, Size_>;
|
||||||
|
using MaskType = PacketMask;
|
||||||
|
|
||||||
|
/// Type alias for creating a similar-shaped array over a different type
|
||||||
|
template <typename T> using ReplaceValue = PacketMask<T, Size_>;
|
||||||
|
|
||||||
|
PacketMask() = default;
|
||||||
|
|
||||||
|
template <typename T> PacketMask(T &&value)
|
||||||
|
: Base(std::forward<T>(value), detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
template <typename T> PacketMask(T &&value, detail::reinterpret_flag)
|
||||||
|
: Base(std::forward<T>(value), detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
/// Construct from sub-arrays
|
||||||
|
template <typename T1, typename T2, typename T = PacketMask, enable_if_t<
|
||||||
|
array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == Base::Size1 &&
|
||||||
|
array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == Base::Size2 &&
|
||||||
|
Base::Size2 != 0> = 0>
|
||||||
|
PacketMask(const T1 &a1, const T2 &a2)
|
||||||
|
: Base(a1, a2) { }
|
||||||
|
|
||||||
|
template <typename... Ts,
|
||||||
|
enable_if_t<(sizeof...(Ts) == Base::Size || sizeof...(Ts) == Base::ActualSize) && Size_ != 1 &&
|
||||||
|
std::conjunction_v<detail::is_not_reinterpret_flag<Ts>...>> = 0>
|
||||||
|
PacketMask(Ts&&... ts) : Base(std::forward<Ts>(ts)...) { }
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT_BASIC(Base, PacketMask)
|
||||||
|
using Base::operator=;
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# pragma warning(pop)
|
||||||
|
#elif defined(__GNUC__) && !defined(__clang__)
|
||||||
|
# pragma GCC diagnostic pop
|
||||||
|
#endif
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,240 @@
|
||||||
|
/*
|
||||||
|
enoki/array_base.h -- Base class of all Enoki arrays
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <enoki/array_router.h>
|
||||||
|
#include <enoki/array_masked.h>
|
||||||
|
#include <enoki/array_struct.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Value_, typename Derived_> struct ArrayBase {
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Curiously Recurring Template design pattern
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Alias to the derived type
|
||||||
|
using Derived = Derived_;
|
||||||
|
|
||||||
|
/// Cast to derived type
|
||||||
|
ENOKI_INLINE Derived &derived() { return (Derived &) *this; }
|
||||||
|
|
||||||
|
/// Cast to derived type (const version)
|
||||||
|
ENOKI_INLINE const Derived &derived() const { return (Derived &) *this; }
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Basic declarations
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Actual type underlying the derived array
|
||||||
|
using Value = Value_;
|
||||||
|
|
||||||
|
/// Scalar data type all the way at the lowest level
|
||||||
|
using Scalar = scalar_t<Value_>;
|
||||||
|
|
||||||
|
/// Specifies how deeply nested this array is
|
||||||
|
static constexpr size_t Depth = 1 + array_depth_v<Value>;
|
||||||
|
|
||||||
|
/// Is this a mask type?
|
||||||
|
static constexpr bool IsMask = is_mask_v<Value_>;
|
||||||
|
|
||||||
|
/// Is this a dynamically allocated array (no by default)
|
||||||
|
static constexpr bool IsDynamic = is_dynamic_v<Value_>;
|
||||||
|
|
||||||
|
/// Does this array compute derivatives using automatic differentation?
|
||||||
|
static constexpr bool IsDiff = is_diff_array_v<Value_>;
|
||||||
|
|
||||||
|
/// Does this array reside on the GPU? (via CUDA)
|
||||||
|
static constexpr bool IsCUDA = is_cuda_array_v<Value_>;
|
||||||
|
|
||||||
|
/// Does this array map operations onto native vector instructions?
|
||||||
|
static constexpr bool IsNative = false;
|
||||||
|
|
||||||
|
/// Is this an AVX512-style 'k' mask register?
|
||||||
|
static constexpr bool IsKMask = false;
|
||||||
|
|
||||||
|
/// Is the storage representation of this array implemented recursively?
|
||||||
|
static constexpr bool IsRecursive = false;
|
||||||
|
|
||||||
|
/// Always prefer broadcasting to the outer dimensions of a N-D array
|
||||||
|
static constexpr bool BroadcastPreferOuter = true;
|
||||||
|
|
||||||
|
/// Does this array represent a fixed size vector?
|
||||||
|
static constexpr bool IsVector = false;
|
||||||
|
|
||||||
|
/// Does this array represent a complex number?
|
||||||
|
static constexpr bool IsComplex = false;
|
||||||
|
|
||||||
|
/// Does this array represent a quaternion?
|
||||||
|
static constexpr bool IsQuaternion = false;
|
||||||
|
|
||||||
|
/// Does this array represent a matrix?
|
||||||
|
static constexpr bool IsMatrix = false;
|
||||||
|
|
||||||
|
/// Does this array represent the result of a 'masked(...)' epxpression?
|
||||||
|
static constexpr bool IsMaskedArray = false;
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Iterators
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
ENOKI_INLINE auto begin() const { return derived().data(); }
|
||||||
|
ENOKI_INLINE auto begin() { return derived().data(); }
|
||||||
|
ENOKI_INLINE auto end() const { return derived().data() + derived().size(); }
|
||||||
|
ENOKI_INLINE auto end() { return derived().data() + derived().size(); }
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Element access
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Array indexing operator with bounds checks in debug mode
|
||||||
|
ENOKI_INLINE decltype(auto) operator[](size_t i) {
|
||||||
|
#if !defined(NDEBUG) && !defined(ENOKI_DISABLE_RANGE_CHECK)
|
||||||
|
if (i >= derived().size())
|
||||||
|
throw std::out_of_range(
|
||||||
|
"ArrayBase: out of range access (tried to access index " +
|
||||||
|
std::to_string(i) + " in an array of size " +
|
||||||
|
std::to_string(derived().size()) + ")");
|
||||||
|
#endif
|
||||||
|
return derived().coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Array indexing operator with bounds checks in debug mode, const version
|
||||||
|
ENOKI_INLINE decltype(auto) operator[](size_t i) const {
|
||||||
|
#if !defined(NDEBUG) && !defined(ENOKI_DISABLE_RANGE_CHECK)
|
||||||
|
if (i >= derived().size())
|
||||||
|
throw std::out_of_range(
|
||||||
|
"ArrayBase: out of range access (tried to access index " +
|
||||||
|
std::to_string(i) + " in an array of size " +
|
||||||
|
std::to_string(derived().size()) + ")");
|
||||||
|
#endif
|
||||||
|
return derived().coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Mask, enable_if_mask_t<Mask> = 0>
|
||||||
|
ENOKI_INLINE auto operator[](const Mask &m) {
|
||||||
|
return detail::MaskedArray<Derived>{ derived(), (const mask_t<Derived> &) m };
|
||||||
|
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Fallback implementations for masked operations
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
#define ENOKI_MASKED_OPERATOR_FALLBACK(name, expr) \
|
||||||
|
template <typename T, typename Mask> \
|
||||||
|
ENOKI_INLINE void m##name##_(const T &e, const Mask &m) { \
|
||||||
|
derived() = select(m, expr, derived()); \
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(assign, e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(add, derived() + e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(sub, derived() - e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(mul, derived() * e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(div, derived() / e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(or, derived() | e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(and, derived() & e)
|
||||||
|
ENOKI_MASKED_OPERATOR_FALLBACK(xor, derived() ^ e)
|
||||||
|
|
||||||
|
#undef ENOKI_MASKED_OPERATOR_FALLBACK
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Dot product fallback implementation
|
||||||
|
ENOKI_INLINE auto dot_(const Derived &a) const { return hsum(derived() * a); }
|
||||||
|
|
||||||
|
/// Horizontal mean fallback implementation
|
||||||
|
ENOKI_INLINE auto hmean_() const {
|
||||||
|
return hsum(derived()) * (1.f / derived().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Stride, typename Index, typename Mask>
|
||||||
|
ENOKI_INLINE void scatter_add_(void *mem, const Index &index,
|
||||||
|
const Mask &mask) const {
|
||||||
|
transform<Derived, Stride>(
|
||||||
|
mem, index, [](auto &a, auto &b, auto &) { a += b; },
|
||||||
|
derived(), mask);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE bool convert_mask(T value) {
|
||||||
|
if constexpr (std::is_same_v<T, bool>)
|
||||||
|
return value;
|
||||||
|
else
|
||||||
|
return memcpy_cast<typename type_chooser<sizeof(T)>::UInt>(value) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Stream, typename Array, size_t N, typename... Indices>
|
||||||
|
void print(Stream &os, const Array &a, bool abbrev,
|
||||||
|
const std::array<size_t, N> &size, Indices... indices) {
|
||||||
|
ENOKI_MARK_USED(size);
|
||||||
|
ENOKI_MARK_USED(abbrev);
|
||||||
|
if constexpr (sizeof...(Indices) == N) {
|
||||||
|
os << a.derived().coeff(indices...);
|
||||||
|
} else {
|
||||||
|
constexpr size_t k = N - sizeof...(Indices) - 1;
|
||||||
|
os << "[";
|
||||||
|
for (size_t i = 0; i < size[k]; ++i) {
|
||||||
|
if constexpr (is_dynamic_array_v<Array>) {
|
||||||
|
if (size[k] > 20 && i == 5 && abbrev) {
|
||||||
|
if (k > 0) {
|
||||||
|
os << ".. " << size[k] - 10 << " skipped ..,\n";
|
||||||
|
for (size_t j = 0; j <= sizeof...(Indices); ++j)
|
||||||
|
os << " ";
|
||||||
|
} else {
|
||||||
|
os << ".. " << size[k] - 10 << " skipped .., ";
|
||||||
|
}
|
||||||
|
i = size[k] - 6;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print(os, a, abbrev, size, i, indices...);
|
||||||
|
if (i + 1 < size[k]) {
|
||||||
|
if constexpr (k == 0) {
|
||||||
|
os << ", ";
|
||||||
|
} else {
|
||||||
|
os << ",\n";
|
||||||
|
for (size_t j = 0; j <= sizeof...(Indices); ++j)
|
||||||
|
os << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
os << "]";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Value, typename Derived>
|
||||||
|
ENOKI_NOINLINE std::ostream &operator<<(std::ostream &os, const ArrayBase<Value, Derived> &a) {
|
||||||
|
if (ragged(a))
|
||||||
|
os << "[ragged array]";
|
||||||
|
else
|
||||||
|
detail::print(os, a, true, shape(a));
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,291 @@
|
||||||
|
/*
|
||||||
|
enoki/array_call.h -- Enoki arrays of pointers, support for
|
||||||
|
array (virtual) method calls
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Class, typename Storage> struct call_support {
|
||||||
|
call_support(const Storage &) { }
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_>
|
||||||
|
struct StaticArrayImpl<Value_, Size_, IsMask_, Derived_,
|
||||||
|
enable_if_t<detail::array_config<Value_, Size_>::use_pointer_impl>>
|
||||||
|
: StaticArrayImpl<uintptr_t, Size_, IsMask_, Derived_> {
|
||||||
|
|
||||||
|
using UnderlyingType = std::uintptr_t;
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<UnderlyingType, Size_, IsMask_, Derived_>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_DEFAULTS(StaticArrayImpl)
|
||||||
|
using Base::derived;
|
||||||
|
|
||||||
|
using Value = std::conditional_t<IsMask_, typename Base::Value, Value_>;
|
||||||
|
using Scalar = std::conditional_t<IsMask_, typename Base::Scalar, Value_>;
|
||||||
|
|
||||||
|
StaticArrayImpl() = default;
|
||||||
|
StaticArrayImpl(Value value) : Base(UnderlyingType(value)) { }
|
||||||
|
StaticArrayImpl(std::nullptr_t) : Base(UnderlyingType(0)) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_pointer_v<T>> = 0>
|
||||||
|
StaticArrayImpl(const T &b) : Base(b) { }
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
StaticArrayImpl(const T &b, detail::reinterpret_flag)
|
||||||
|
: Base(b, detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
template <typename T1, typename T2, typename T = StaticArrayImpl, enable_if_t<
|
||||||
|
array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == Base::Size1 &&
|
||||||
|
array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == Base::Size2 &&
|
||||||
|
Base::Size2 != 0> = 0>
|
||||||
|
StaticArrayImpl(const T1 &a1, const T2 &a2)
|
||||||
|
: Base(a1, a2) { }
|
||||||
|
|
||||||
|
ENOKI_INLINE decltype(auto) coeff(size_t i) const {
|
||||||
|
using Coeff = decltype(Base::coeff(i));
|
||||||
|
if constexpr (std::is_same_v<Coeff, const typename Base::Value &>)
|
||||||
|
return (const Value &) Base::coeff(i);
|
||||||
|
else
|
||||||
|
return Base::coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE decltype(auto) coeff(size_t i) {
|
||||||
|
using Coeff = decltype(Base::coeff(i));
|
||||||
|
if constexpr (std::is_same_v<Coeff, typename Base::Value &>)
|
||||||
|
return (Value &) Base::coeff(i);
|
||||||
|
else
|
||||||
|
return Base::coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename Mask>
|
||||||
|
ENOKI_INLINE size_t compress_(T *&ptr, const Mask &mask) const {
|
||||||
|
return Base::compress_((UnderlyingType *&) ptr, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto operator->() const {
|
||||||
|
using BaseType = std::decay_t<std::remove_pointer_t<scalar_t<Derived_>>>;
|
||||||
|
return call_support<BaseType, Derived_>(derived());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> Derived_& operator=(T&& t) {
|
||||||
|
ENOKI_MARK_USED(t);
|
||||||
|
if constexpr (std::is_same_v<T, std::nullptr_t>)
|
||||||
|
return (Derived_ &) Base::operator=(UnderlyingType(0));
|
||||||
|
else if constexpr (std::is_convertible_v<T, Value>)
|
||||||
|
return (Derived_ &) Base::operator=(UnderlyingType(t));
|
||||||
|
else
|
||||||
|
return (Derived_ &) Base::operator=(std::forward<T>(t));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
template <typename, template <typename...> typename T, typename... Args>
|
||||||
|
struct is_callable : std::false_type {};
|
||||||
|
template <template <typename...> typename T, typename... Args>
|
||||||
|
struct is_callable<std::void_t<T<Args...>>, T, Args...> : std::true_type { };
|
||||||
|
template <template <typename...> typename T, typename... Args>
|
||||||
|
constexpr bool is_callable_v = is_callable<void, T, Args...>::value;
|
||||||
|
|
||||||
|
template <typename Guide, typename Result, typename = int> struct vectorize_result {
|
||||||
|
using type = Result;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Guide, typename Result> struct vectorize_result<Guide, Result, enable_if_t<is_scalar_v<Result>>> {
|
||||||
|
using type = replace_scalar_t<array_t<Guide>, Result, false>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, typename Perm>
|
||||||
|
decltype(auto) gather_helper(T&& v, const Perm &perm) {
|
||||||
|
ENOKI_MARK_USED(perm);
|
||||||
|
using DT = std::decay_t<T>;
|
||||||
|
if constexpr (!is_cuda_array_v<DT> && !std::is_class_v<DT>)
|
||||||
|
return v;
|
||||||
|
else
|
||||||
|
return gather<std::decay_t<DT>, 0, true, true>(v, perm);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Storage_> struct call_support_base {
|
||||||
|
using Storage = Storage_;
|
||||||
|
using InstancePtr = value_t<Storage_>;
|
||||||
|
using Mask = mask_t<Storage_>;
|
||||||
|
call_support_base(const Storage &self) : self(self) { }
|
||||||
|
const Storage &self;
|
||||||
|
|
||||||
|
template <typename Func, typename InputMask,
|
||||||
|
typename Tuple, size_t ... Indices>
|
||||||
|
ENOKI_INLINE auto dispatch(Func func, InputMask mask_, Tuple tuple,
|
||||||
|
std::index_sequence<Indices...>) const {
|
||||||
|
Mask mask = Mask(mask_) & neq(self, nullptr);
|
||||||
|
|
||||||
|
using FuncResult = decltype(func(
|
||||||
|
std::declval<InstancePtr>(),
|
||||||
|
mask,
|
||||||
|
std::get<Indices>(tuple)...
|
||||||
|
));
|
||||||
|
|
||||||
|
if constexpr (!std::is_void_v<FuncResult>) {
|
||||||
|
using Result = typename vectorize_result<Mask, FuncResult>::type;
|
||||||
|
Result result = zero<Result>(self.size());
|
||||||
|
|
||||||
|
if constexpr (!is_cuda_array_v<Storage>) {
|
||||||
|
while (any(mask)) {
|
||||||
|
InstancePtr value = extract(self, mask);
|
||||||
|
Mask active = mask & eq(self, value);
|
||||||
|
mask = andnot(mask, active);
|
||||||
|
masked(result, active) = func(value, active, std::get<Indices>(tuple)...);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto partitioned = partition(self);
|
||||||
|
|
||||||
|
if (partitioned.size() == 1 && partitioned[0].first != nullptr) {
|
||||||
|
result = func(partitioned[0].first, true,
|
||||||
|
std::get<Indices>(tuple)...);
|
||||||
|
} else {
|
||||||
|
for (auto [value, permutation] : partitioned) {
|
||||||
|
if (value == nullptr)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Result temp = func(value, gather_helper(mask, permutation),
|
||||||
|
gather_helper(std::get<Indices>(tuple),
|
||||||
|
permutation)...);
|
||||||
|
|
||||||
|
scatter<0, true, true>(result, temp, permutation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
if constexpr (!is_cuda_array_v<Storage>) {
|
||||||
|
while (any(mask)) {
|
||||||
|
InstancePtr value = extract(self, mask);
|
||||||
|
Mask active = mask & eq(self, value);
|
||||||
|
mask = andnot(mask, active);
|
||||||
|
func(value, active, std::get<Indices>(tuple)...);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
auto partitioned = partition(self);
|
||||||
|
|
||||||
|
if (partitioned.size() == 1 && partitioned[0].first != nullptr) {
|
||||||
|
func(partitioned[0].first, true, std::get<Indices>(tuple)...);
|
||||||
|
} else {
|
||||||
|
for (auto [value, permutation] : partitioned) {
|
||||||
|
if (value == nullptr)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
func(value, gather_helper(mask, permutation),
|
||||||
|
gather_helper(std::get<Indices>(tuple),
|
||||||
|
permutation)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#if defined(__GNUC__)
|
||||||
|
# pragma GCC diagnostic push
|
||||||
|
# pragma GCC diagnostic ignored "-Wunused-value"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename... Ts>
|
||||||
|
inline constexpr bool last_of(Ts... values) { return (false, ..., values); }
|
||||||
|
|
||||||
|
#if defined(__GNUC__)
|
||||||
|
# pragma GCC diagnostic pop
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_FRIEND() \
|
||||||
|
template <typename, typename> friend struct enoki::call_support;
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_BEGIN(Class_) \
|
||||||
|
namespace enoki { \
|
||||||
|
template <typename Storage> \
|
||||||
|
struct call_support<Class_, Storage> : detail::call_support_base<Storage> {\
|
||||||
|
using Base = detail::call_support_base<Storage>; \
|
||||||
|
using Base::Base; \
|
||||||
|
using typename Base::Mask; \
|
||||||
|
using Class = Class_; \
|
||||||
|
using typename Base::InstancePtr; \
|
||||||
|
using Base::self; \
|
||||||
|
auto operator-> () { return this; }
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_TEMPLATE_BEGIN(Class_) \
|
||||||
|
namespace enoki { \
|
||||||
|
template <typename Storage, typename... Ts> \
|
||||||
|
struct call_support<Class_<Ts...>, Storage> \
|
||||||
|
: detail::call_support_base<Storage> { \
|
||||||
|
using Base = detail::call_support_base<Storage>; \
|
||||||
|
using Base::Base; \
|
||||||
|
using typename Base::Mask; \
|
||||||
|
using Class = Class_<Ts...>; \
|
||||||
|
using typename Base::InstancePtr; \
|
||||||
|
using Base::self; \
|
||||||
|
auto operator-> () { return this; }
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_METHOD(func) \
|
||||||
|
private: \
|
||||||
|
template <typename... Args> \
|
||||||
|
using __##func##_t = \
|
||||||
|
decltype(std::declval<InstancePtr>()->func(std::declval<Args>()...)); \
|
||||||
|
\
|
||||||
|
public: \
|
||||||
|
template <typename... Args> auto func(Args&&... args) const { \
|
||||||
|
auto lambda = [](InstancePtr instance, const Mask &mask, \
|
||||||
|
auto &&... a) ENOKI_INLINE_LAMBDA { \
|
||||||
|
ENOKI_MARK_USED(mask); \
|
||||||
|
/* Does the method accept a mask argument? If so, provide. */ \
|
||||||
|
if constexpr (detail::is_callable_v<__##func##_t, decltype(a)..., \
|
||||||
|
Mask>) \
|
||||||
|
return instance->func(a..., mask); \
|
||||||
|
else \
|
||||||
|
return instance->func(a...); \
|
||||||
|
}; \
|
||||||
|
/* Was a mask provided to this function? If not, set to all ones. */ \
|
||||||
|
auto args_tuple = std::tie(args...); \
|
||||||
|
if constexpr (detail::last_of(is_mask_v<Args>...)) { \
|
||||||
|
return Base::dispatch( \
|
||||||
|
lambda, std::get<sizeof...(Args) - 1>(args_tuple), args_tuple, \
|
||||||
|
std::make_index_sequence<sizeof...(Args) - 1>()); \
|
||||||
|
} else { \
|
||||||
|
return Base::dispatch( \
|
||||||
|
lambda, true, args_tuple, \
|
||||||
|
std::make_index_sequence<sizeof...(Args)>()); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, type) \
|
||||||
|
template < \
|
||||||
|
typename Field = decltype(Class::field), \
|
||||||
|
typename Return = replace_scalar_t<Storage, type, false>> \
|
||||||
|
Return name(Mask mask = true) const { \
|
||||||
|
using IntType = replace_scalar_t<Storage, std::uintptr_t, false>; \
|
||||||
|
auto offset = \
|
||||||
|
IntType(self) + (std::uintptr_t) &(((Class *) nullptr)->field); \
|
||||||
|
mask &= neq(self, nullptr); \
|
||||||
|
return gather<Return, 1>(nullptr, offset, mask); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_GETTER(name, field) \
|
||||||
|
ENOKI_CALL_SUPPORT_GETTER_TYPE(name, field, Field)
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_END(Name) \
|
||||||
|
}; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ENOKI_CALL_SUPPORT_TEMPLATE_END(Name) \
|
||||||
|
ENOKI_CALL_SUPPORT_END(Name)
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
enoki/array_call.h -- Enoki arrays of pointers, support for
|
||||||
|
array (virtual) method calls
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_>
|
||||||
|
struct StaticArrayImpl<Value_, Size_, IsMask_, Derived_,
|
||||||
|
enable_if_t<detail::array_config<Value_, Size_>::use_enum_impl>>
|
||||||
|
: StaticArrayImpl<std::underlying_type_t<Value_>, Size_, IsMask_, Derived_> {
|
||||||
|
|
||||||
|
using UnderlyingType = std::underlying_type_t<Value_>;
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<UnderlyingType, Size_, IsMask_, Derived_>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_DEFAULTS(StaticArrayImpl)
|
||||||
|
using Base::derived;
|
||||||
|
|
||||||
|
using Value = std::conditional_t<IsMask_, typename Base::Value, Value_>;
|
||||||
|
using Scalar = std::conditional_t<IsMask_, typename Base::Scalar, Value_>;
|
||||||
|
|
||||||
|
StaticArrayImpl() = default;
|
||||||
|
StaticArrayImpl(Value value) : Base(UnderlyingType(value)) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_enum_v<T>> = 0>
|
||||||
|
StaticArrayImpl(const T &b) : Base(b) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!is_array_v<T>> = 0>
|
||||||
|
StaticArrayImpl(const T &v1, const T &v2) : Base(v1, v2) { }
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
StaticArrayImpl(const T &b, detail::reinterpret_flag)
|
||||||
|
: Base(b, detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
template <typename T1, typename T2, typename T = StaticArrayImpl, enable_if_t<
|
||||||
|
array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == Base::Size1 &&
|
||||||
|
array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == Base::Size2 &&
|
||||||
|
Base::Size2 != 0> = 0>
|
||||||
|
StaticArrayImpl(const T1 &a1, const T2 &a2)
|
||||||
|
: Base(a1, a2) { }
|
||||||
|
|
||||||
|
ENOKI_INLINE decltype(auto) coeff(size_t i) const {
|
||||||
|
using Coeff = decltype(Base::coeff(i));
|
||||||
|
if constexpr (std::is_same_v<Coeff, const typename Base::Value &>)
|
||||||
|
return (const Value &) Base::coeff(i);
|
||||||
|
else
|
||||||
|
return Base::coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE decltype(auto) coeff(size_t i) {
|
||||||
|
using Coeff = decltype(Base::coeff(i));
|
||||||
|
if constexpr (std::is_same_v<Coeff, typename Base::Value &>)
|
||||||
|
return (Value &) Base::coeff(i);
|
||||||
|
else
|
||||||
|
return Base::coeff(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename Mask>
|
||||||
|
ENOKI_INLINE size_t compress_(T *&ptr, const Mask &mask) const {
|
||||||
|
return Base::compress_((UnderlyingType *&) ptr, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> Derived_& operator=(T&& t) {
|
||||||
|
ENOKI_MARK_USED(t);
|
||||||
|
if constexpr (std::is_same_v<T, std::nullptr_t>)
|
||||||
|
return (Derived_ &) Base::operator=(UnderlyingType(0));
|
||||||
|
else if constexpr (std::is_convertible_v<T, Value>)
|
||||||
|
return (Derived_ &) Base::operator=(UnderlyingType(t));
|
||||||
|
else
|
||||||
|
return (Derived_ &) Base::operator=(std::forward<T>(t));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,546 @@
|
||||||
|
/*
|
||||||
|
enoki/array_fallbacks.h -- Scalar fallback implementations of various
|
||||||
|
operations
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_intrin.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
/// Reciprocal (scalar fallback)
|
||||||
|
template <typename T> ENOKI_INLINE T rcp_scalar(const T &a) {
|
||||||
|
#if defined(ENOKI_X86_AVX512ER)
|
||||||
|
if (std::is_same_v<T, float>) {
|
||||||
|
__m128 v = _mm_set_ss((float) a);
|
||||||
|
return T(_mm_cvtss_f32(_mm_rcp28_ss(v, v))); /* rel error < 2^-28 */
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
__m128 v = _mm_set_ss((float) a), r;
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
r = _mm_rcp14_ss(v, v); /* rel error < 2^-14 */
|
||||||
|
#else
|
||||||
|
r = _mm_rcp_ss(v); /* rel error < 1.5*2^-12 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Refine using one Newton-Raphson iteration */
|
||||||
|
__m128 ro = r;
|
||||||
|
|
||||||
|
__m128 t0 = _mm_add_ss(r, r);
|
||||||
|
__m128 t1 = _mm_mul_ss(r, v);
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_FMA)
|
||||||
|
r = _mm_fnmadd_ss(r, t1, t0);
|
||||||
|
#else
|
||||||
|
r = _mm_sub_ss(t0, _mm_mul_ss(r, t1));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
(void) ro;
|
||||||
|
r = _mm_fixupimm_ss(r, v, _mm_set1_epi32(0x0087A622), 0);
|
||||||
|
#else
|
||||||
|
r = _mm_blendv_ps(r, ro, t1); /* mask bit is '1' iff t1 == nan */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return T(_mm_cvtss_f32(r));
|
||||||
|
#elif defined(ENOKI_ARM_NEON) && defined(ENOKI_ARM_64)
|
||||||
|
float v = (float) a;
|
||||||
|
float r = vrecpes_f32(v);
|
||||||
|
r *= vrecpss_f32(r, v);
|
||||||
|
r *= vrecpss_f32(r, v);
|
||||||
|
return T(r);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F) || defined(ENOKI_X86_AVX512ER)
|
||||||
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
|
__m128d v = _mm_set_sd((double) a), r;
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512ER)
|
||||||
|
r = _mm_rcp28_sd(v, v); /* rel error < 2^-28 */
|
||||||
|
#elif defined(ENOKI_X86_AVX512F)
|
||||||
|
r = _mm_rcp14_sd(v, v); /* rel error < 2^-14 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__m128d ro = r, t0, t1;
|
||||||
|
|
||||||
|
/* Refine using 1-2 Newton-Raphson iterations */
|
||||||
|
ENOKI_UNROLL for (int i = 0; i < (has_avx512er ? 1 : 2); ++i) {
|
||||||
|
t0 = _mm_add_sd(r, r);
|
||||||
|
t1 = _mm_mul_sd(r, v);
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_FMA)
|
||||||
|
r = _mm_fnmadd_sd(t1, r, t0);
|
||||||
|
#else
|
||||||
|
r = _mm_sub_sd(t0, _mm_mul_sd(r, t1));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
r = _mm_blendv_pd(r, ro, t1); /* mask bit is '1' iff t1 == nan */
|
||||||
|
|
||||||
|
return T(_mm_cvtsd_f64(r));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return T(1) / a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reciprocal square root (scalar fallback)
|
||||||
|
template <typename T> ENOKI_INLINE T rsqrt_scalar(const T &a) {
|
||||||
|
#if defined(ENOKI_X86_AVX512ER)
|
||||||
|
if (std::is_same_v<T, float>) {
|
||||||
|
__m128 v = _mm_set_ss((float) a);
|
||||||
|
return T(_mm_cvtss_f32(_mm_rsqrt28_ss(v, v))); /* rel error < 2^-28 */
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
__m128 v = _mm_set_ss((float) a), r;
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
r = _mm_rsqrt14_ss(v, v); /* rel error < 2^-14 */
|
||||||
|
#else
|
||||||
|
r = _mm_rsqrt_ss(v); /* rel error < 1.5*2^-12 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Refine using one Newton-Raphson iteration */
|
||||||
|
const __m128 c0 = _mm_set_ss(0.5f),
|
||||||
|
c1 = _mm_set_ss(3.0f);
|
||||||
|
|
||||||
|
__m128 t0 = _mm_mul_ss(r, c0),
|
||||||
|
t1 = _mm_mul_ss(r, v),
|
||||||
|
ro = r;
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_FMA)
|
||||||
|
r = _mm_mul_ss(_mm_fnmadd_ss(t1, r, c1), t0);
|
||||||
|
#else
|
||||||
|
r = _mm_mul_ss(_mm_sub_ss(c1, _mm_mul_ss(t1, r)), t0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
(void) ro;
|
||||||
|
r = _mm_fixupimm_ss(r, v, _mm_set1_epi32(0x0383A622), 0);
|
||||||
|
#else
|
||||||
|
r = _mm_blendv_ps(r, ro, t1); /* mask bit is '1' iff t1 == nan */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return T(_mm_cvtss_f32(r));
|
||||||
|
#elif defined(ENOKI_ARM_NEON) && defined(ENOKI_ARM_64)
|
||||||
|
float v = (float) a;
|
||||||
|
float r = vrsqrtes_f32(v);
|
||||||
|
r *= vrsqrtss_f32(r*r, v);
|
||||||
|
r *= vrsqrtss_f32(r*r, v);
|
||||||
|
return r;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F) || defined(ENOKI_X86_AVX512ER)
|
||||||
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
|
__m128d v = _mm_set_sd((double) a), r;
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512ER)
|
||||||
|
r = _mm_rsqrt28_sd(v, v); /* rel error < 2^-28 */
|
||||||
|
#elif defined(ENOKI_X86_AVX512F)
|
||||||
|
r = _mm_rsqrt14_sd(v, v); /* rel error < 2^-14 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const __m128d c0 = _mm_set_sd(0.5),
|
||||||
|
c1 = _mm_set_sd(3.0);
|
||||||
|
|
||||||
|
__m128d ro = r, t0, t1;
|
||||||
|
|
||||||
|
/* Refine using 1-2 Newton-Raphson iterations */
|
||||||
|
ENOKI_UNROLL for (int i = 0; i < (has_avx512er ? 1 : 2); ++i) {
|
||||||
|
t0 = _mm_mul_sd(r, c0);
|
||||||
|
t1 = _mm_mul_sd(r, v);
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_FMA)
|
||||||
|
r = _mm_mul_sd(_mm_fnmadd_sd(t1, r, c1), t0);
|
||||||
|
#else
|
||||||
|
r = _mm_mul_sd(_mm_sub_sd(c1, _mm_mul_sd(t1, r)), t0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
r = _mm_blendv_pd(r, ro, t1); /* mask bit is '1' iff t1 == nan */
|
||||||
|
|
||||||
|
return T(_mm_cvtsd_f64(r));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return T(1) / std::sqrt(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Population count (number of set bits) — scalar fallback.
template <typename T> ENOKI_INLINE T popcnt_scalar(T v) {
    static_assert(std::is_integral_v<T>, "popcnt(): requires an integer argument!");
#if defined(ENOKI_X86_SSE42)
    if constexpr (sizeof(T) <= 4) {
        return (T) _mm_popcnt_u32((unsigned int) v);
    } else {
#if defined(ENOKI_X86_64)
        return (T) _mm_popcnt_u64((unsigned long long) v);
#else
        /* 32-bit build: count the two halves separately */
        unsigned long long v_ = (unsigned long long) v;
        unsigned int lo = (unsigned int) v_;
        unsigned int hi = (unsigned int) (v_ >> 32);
        return (T) (_mm_popcnt_u32(lo) + _mm_popcnt_u32(hi));
#endif
    }
#elif defined(_MSC_VER)
    /* Branch-free SWAR bit counting (Hacker's Delight) */
    if constexpr (sizeof(T) <= 4) {
        uint32_t w = (uint32_t) v;
        w -= (w >> 1) & 0x55555555;
        w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
        w = (w + (w >> 4)) & 0x0F0F0F0F;
        w = (w * 0x01010101) >> 24;
        return (T) w;
    } else {
        uint64_t w = (uint64_t) v;
        w -= (w >> 1) & 0x5555555555555555ull;
        w = (w & 0x3333333333333333ull) + ((w >> 2) & 0x3333333333333333ull);
        w = (w + (w >> 4)) & 0x0F0F0F0F0F0F0F0Full;
        w = (w * 0x0101010101010101ull) >> 56;
        return (T) w;
    }
#else
    if constexpr (sizeof(T) <= 4)
        return (T) __builtin_popcount((unsigned int) v);
    else
        return (T) __builtin_popcountll((unsigned long long) v);
#endif
}
|
||||||
|
|
||||||
|
/// Count leading zero bits — scalar fallback. Returns the full bit width
/// (32/64) for an input of zero.
template <typename T> ENOKI_INLINE T lzcnt_scalar(T v) {
    static_assert(std::is_integral_v<T>, "lzcnt(): requires an integer argument!");
#if defined(ENOKI_X86_AVX2)
    if constexpr (sizeof(T) <= 4) {
        return (T) _lzcnt_u32((unsigned int) v);
    } else {
#if defined(ENOKI_X86_64)
        return (T) _lzcnt_u64((unsigned long long) v);
#else
        /* 32-bit build: combine counts of the two halves */
        unsigned long long v_ = (unsigned long long) v;
        unsigned int lo = (unsigned int) v_;
        unsigned int hi = (unsigned int) (v_ >> 32);
        return (T) (hi != 0 ? _lzcnt_u32(hi) : (_lzcnt_u32(lo) + 32));
#endif
    }
#elif defined(_MSC_VER)
    unsigned long result;
    if constexpr (sizeof(T) <= 4) {
        /* _BitScanReverse yields the index of the highest set bit */
        _BitScanReverse(&result, (unsigned long) v);
        return (v != 0) ? (31 - result) : 32;
    } else {
        _BitScanReverse64(&result, (unsigned long long) v);
        return (v != 0) ? (63 - result) : 64;
    }
#else
    /* __builtin_clz(0) is undefined, hence the explicit zero check */
    if constexpr (sizeof(T) <= 4)
        return (T) (v != 0 ? __builtin_clz((unsigned int) v) : 32);
    else
        return (T) (v != 0 ? __builtin_clzll((unsigned long long) v) : 64);
#endif
}
|
||||||
|
|
||||||
|
/// Count trailing zero bits — scalar fallback. Returns the full bit width
/// (32/64) for an input of zero.
///
/// Fix: the size dispatch used plain `if (sizeof(T) <= 4)` (unlike the
/// sibling lzcnt_scalar), which instantiated the 64-bit intrinsic path for
/// 32-bit types as well; changed to `if constexpr` with an explicit else.
template <typename T> ENOKI_INLINE T tzcnt_scalar(T v) {
    static_assert(std::is_integral_v<T>, "tzcnt(): requires an integer argument!");
#if defined(ENOKI_X86_AVX2)
    if constexpr (sizeof(T) <= 4) {
        return (T) _tzcnt_u32((unsigned int) v);
    } else {
#if defined(ENOKI_X86_64)
        return (T) _tzcnt_u64((unsigned long long) v);
#else
        /* 32-bit build: combine counts of the two halves */
        unsigned long long v_ = (unsigned long long) v;
        unsigned int lo = (unsigned int) v_;
        unsigned int hi = (unsigned int) (v_ >> 32);
        return (T) (lo != 0 ? _tzcnt_u32(lo) : (_tzcnt_u32(hi) + 32));
#endif
    }
#elif defined(_MSC_VER)
    unsigned long result;
    if constexpr (sizeof(T) <= 4) {
        /* _BitScanForward yields the index of the lowest set bit */
        _BitScanForward(&result, (unsigned long) v);
        return (v != 0) ? result : 32;
    } else {
        _BitScanForward64(&result, (unsigned long long) v);
        return (v != 0) ? result : 64;
    }
#else
    /* __builtin_ctz(0) is undefined, hence the explicit zero check */
    if constexpr (sizeof(T) <= 4)
        return (T) (v != 0 ? __builtin_ctz((unsigned int) v) : 32);
    else
        return (T) (v != 0 ? __builtin_ctzll((unsigned long long) v) : 64);
#endif
}
|
||||||
|
|
||||||
|
template <typename T1, typename T2>
|
||||||
|
ENOKI_INLINE T1 ldexp_scalar(const T1 &a1, const T2 &a2) {
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
if constexpr (std::is_same_v<T1, float>) {
|
||||||
|
__m128 v1 = _mm_set_ss((float) a1),
|
||||||
|
v2 = _mm_set_ss((float) a2);
|
||||||
|
return T1(_mm_cvtss_f32(_mm_scalef_ss(v1, v2)));
|
||||||
|
} else if constexpr (std::is_same_v<T1, double>) {
|
||||||
|
__m128d v1 = _mm_set_sd((double) a1),
|
||||||
|
v2 = _mm_set_sd((double) a2);
|
||||||
|
return T1(_mm_cvtsd_f64(_mm_scalef_sd(v1, v2)));
|
||||||
|
} else {
|
||||||
|
return std::ldexp(a1, int(a2));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
return std::ldexp(a1, int(a2));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Break floating-point number into normalized fraction and power of 2 (scalar fallback)
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE std::pair<T, T> frexp_scalar(const T &a) {
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
|
__m128 v = _mm_set_ss((float) a);
|
||||||
|
return std::make_pair(
|
||||||
|
T(_mm_cvtss_f32(_mm_getmant_ss(v, v, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src))),
|
||||||
|
T(_mm_cvtss_f32(_mm_getexp_ss(v, v))));
|
||||||
|
} else if constexpr (std::is_same_v<T, double>) {
|
||||||
|
__m128d v = _mm_set_sd((double) a);
|
||||||
|
return std::make_pair(
|
||||||
|
T(_mm_cvtsd_f64(_mm_getmant_sd(v, v, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src))),
|
||||||
|
T(_mm_cvtsd_f64(_mm_getexp_sd(v, v))));
|
||||||
|
} else {
|
||||||
|
int tmp;
|
||||||
|
T result = std::frexp(a, &tmp);
|
||||||
|
return std::make_pair(result, T(tmp) - T(1));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
int tmp;
|
||||||
|
T result = std::frexp(a, &tmp);
|
||||||
|
return std::make_pair(result, T(tmp) - T(1));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/// High 32 bits of the signed 32x32 -> 64 bit product.
ENOKI_INLINE int32_t mulhi_scalar(int32_t x, int32_t y) {
    int64_t rl = (int64_t) x * (int64_t) y;
    return (int32_t) (rl >> 32);
}
|
||||||
|
|
||||||
|
/// High 32 bits of the unsigned 32x32 -> 64 bit product.
ENOKI_INLINE uint32_t mulhi_scalar(uint32_t x, uint32_t y) {
    uint64_t rl = (uint64_t) x * (uint64_t) y;
    return (uint32_t) (rl >> 32);
}
|
||||||
|
|
||||||
|
/// High 64 bits of the unsigned 64x64 -> 128 bit product. Uses a compiler
/// intrinsic / 128-bit type when available, otherwise schoolbook splitting.
ENOKI_INLINE uint64_t mulhi_scalar(uint64_t x, uint64_t y) {
#if defined(_MSC_VER) && defined(ENOKI_X86_64)
    return __umulh(x, y);
#elif defined(__SIZEOF_INT128__)
    __uint128_t rl = (__uint128_t) x * (__uint128_t) y;
    return (uint64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t) (x & mask), x1 = (uint32_t) (x >> 32);
    const uint32_t y0 = (uint32_t) (y & mask), y1 = (uint32_t) (y >> 32);
    const uint32_t x0y0_hi = mulhi_scalar(x0, y0);
    const uint64_t x0y1 = x0 * (uint64_t) y1;
    const uint64_t x1y0 = x1 * (uint64_t) y0;
    const uint64_t x1y1 = x1 * (uint64_t) y1;
    const uint64_t temp = x1y0 + x0y0_hi;
    const uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;

    return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
#endif
}
|
||||||
|
|
||||||
|
/// High 64 bits of the signed 64x64 -> 128 bit product.
ENOKI_INLINE int64_t mulhi_scalar(int64_t x, int64_t y) {
#if defined(_MSC_VER) && defined(_M_X64)
    return __mulh(x, y);
#elif defined(__SIZEOF_INT128__)
    __int128_t rl = (__int128_t) x * (__int128_t) y;
    return (int64_t)(rl >> 64);
#else
    // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
    const uint32_t mask = 0xFFFFFFFF;
    const uint32_t x0 = (uint32_t) (x & mask), y0 = (uint32_t) (y & mask);
    const int32_t x1 = (int32_t) (x >> 32), y1 = (int32_t) (y >> 32);
    const uint32_t x0y0_hi = mulhi_scalar(x0, y0);
    const int64_t t = x1 * (int64_t) y0 + x0y0_hi;
    const int64_t w1 = x0 * (int64_t) y1 + (t & mask);

    return x1 * (int64_t) y1 + (t >> 32) + (w1 >> 32);
#endif
}
|
||||||
|
|
||||||
|
/// Absolute value (scalar fallback); unsigned types pass through unchanged.
template <typename T> ENOKI_INLINE T abs_scalar(const T &a) {
    if constexpr (std::is_signed_v<T>)
        return std::abs(a);
    else
        return a;
}
|
||||||
|
|
||||||
|
template <typename T1, typename T2, typename T3,
|
||||||
|
typename E = expr_t<T1, T2, T3>> ENOKI_INLINE E fmadd_scalar(T1 a1, T2 a2, T3 a3) {
|
||||||
|
#if defined(ENOKI_X86_FMA) || defined(ENOKI_ARM_FMA)
|
||||||
|
if constexpr (std::is_floating_point_v<E>)
|
||||||
|
return (E) std::fma((E) a1, (E) a2, (E) a3);
|
||||||
|
#endif
|
||||||
|
return (E) a1 * (E) a2 + (E) a3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Round up and convert to integer type T in one step. AVX512 provides a
/// direct round-towards-positive-infinity conversion; otherwise ceil+cast.
template <typename T, typename Arg>
T ceil2int_scalar(Arg x) {
#if defined(ENOKI_X86_AVX512F)
    if constexpr (std::is_same_v<Arg, float>) {
        __m128 y = _mm_set_ss(x);
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundss_i32(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundss_u32(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
        } else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundss_i64(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundss_u64(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
        }
    } else if constexpr (std::is_same_v<Arg, double>) {
        __m128d y = _mm_set_sd(x);
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundsd_i32(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundsd_u32(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
        } else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundsd_i64(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundsd_u64(y, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
        }
    }
#endif
    return T(std::ceil(x));
}
|
||||||
|
|
||||||
|
/// Round down and convert to integer type T in one step. AVX512 provides a
/// direct round-towards-negative-infinity conversion; otherwise floor+cast.
template <typename T, typename Arg>
T floor2int_scalar(Arg x) {
#if defined(ENOKI_X86_AVX512F)
    if constexpr (std::is_same_v<Arg, float>) {
        __m128 y = _mm_set_ss(x);
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundss_i32(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundss_u32(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
        } else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundss_i64(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundss_u64(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
        }
    } else if constexpr (std::is_same_v<Arg, double>) {
        __m128d y = _mm_set_sd(x);
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundsd_i32(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundsd_u32(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
        } else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed_v<T>)
                return _mm_cvt_roundsd_i64(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
            else
                return _mm_cvt_roundsd_u64(y, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
        }
    }
#endif
    return T(std::floor(x));
}
|
||||||
|
|
||||||
|
template <typename T> auto or_(const T &a1, const T &a2) {
|
||||||
|
using Int = int_array_t<T, false>;
|
||||||
|
|
||||||
|
if constexpr (is_array_v<T> || std::is_integral_v<T>)
|
||||||
|
return a1 | a2;
|
||||||
|
else
|
||||||
|
return memcpy_cast<T>(memcpy_cast<Int>(a1) | memcpy_cast<Int>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> auto and_(const T &a1, const T &a2) {
|
||||||
|
using Int = int_array_t<T, false>;
|
||||||
|
|
||||||
|
if constexpr (is_array_v<T> || std::is_integral_v<T>)
|
||||||
|
return a1 & a2;
|
||||||
|
else
|
||||||
|
return memcpy_cast<T>(memcpy_cast<Int>(a1) & memcpy_cast<Int>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> auto andnot_(const T &a1, const T &a2) {
|
||||||
|
using Int = int_array_t<T, false>;
|
||||||
|
|
||||||
|
if constexpr (is_array_v<T>)
|
||||||
|
return andnot(a1, a2);
|
||||||
|
else if constexpr (std::is_same_v<T, bool>)
|
||||||
|
return a1 && !a2;
|
||||||
|
else if constexpr (std::is_integral_v<T>)
|
||||||
|
return a1 & ~a2;
|
||||||
|
else
|
||||||
|
return memcpy_cast<T>(memcpy_cast<Int>(a1) & ~memcpy_cast<Int>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> auto xor_(const T &a1, const T &a2) {
|
||||||
|
using Int = int_array_t<T, false>;
|
||||||
|
|
||||||
|
if constexpr (is_array_v<T> || std::is_integral_v<T>)
|
||||||
|
return a1 ^ a2;
|
||||||
|
else
|
||||||
|
return memcpy_cast<T>(memcpy_cast<Int>(a1) ^ memcpy_cast<Int>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_same_v<T, bool>> = 0> auto or_(const T &a, const bool &b) {
|
||||||
|
using Scalar = scalar_t<T>;
|
||||||
|
using Int = int_array_t<Scalar>;
|
||||||
|
return or_(a, b ? memcpy_cast<Scalar>(Int(-1)) : memcpy_cast<Scalar>(Int(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_same_v<T, bool>> = 0> auto and_(const T &a, const bool &b) {
|
||||||
|
using Scalar = scalar_t<T>;
|
||||||
|
using Int = int_array_t<Scalar>;
|
||||||
|
return and_(a, b ? memcpy_cast<Scalar>(Int(-1)) : memcpy_cast<Scalar>(Int(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_same_v<T, bool>> = 0> auto andnot_(const T &a, const bool &b) {
|
||||||
|
using Scalar = scalar_t<T>;
|
||||||
|
using Int = int_array_t<Scalar>;
|
||||||
|
return andnot_(a, b ? memcpy_cast<Scalar>(Int(-1)) : memcpy_cast<Scalar>(Int(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<!std::is_same_v<T, bool>> = 0> auto xor_(const T &a, const bool &b) {
|
||||||
|
using Scalar = scalar_t<T>;
|
||||||
|
using Int = int_array_t<Scalar>;
|
||||||
|
return xor_(a, b ? memcpy_cast<Scalar>(Int(-1)) : memcpy_cast<Scalar>(Int(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mixed-type overloads: when at least one operand is an enoki array, defer
/// to the array's own operators (which handle broadcasting).
template <typename T1, typename T2, enable_if_array_any_t<T1, T2> = 0>
auto or_(const T1 &a1, const T2 &a2) { return a1 | a2; }

template <typename T1, typename T2, enable_if_array_any_t<T1, T2> = 0>
auto and_(const T1 &a1, const T2 &a2) { return a1 & a2; }

template <typename T1, typename T2, enable_if_array_any_t<T1, T2> = 0>
auto andnot_(const T1 &a1, const T2 &a2) { return andnot(a1, a2); }

template <typename T1, typename T2, enable_if_array_any_t<T1, T2> = 0>
auto xor_(const T1 &a1, const T2 &a2) { return a1 ^ a2; }
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,626 @@
|
||||||
|
/*
    enoki/array_generic.h -- Generic array implementation that forwards
    all operations to the underlying data type (usually without making use of
    hardware vectorization)

    Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>

    All rights reserved. Use of this source code is governed by a BSD-style
    license that can be found in the LICENSE file.
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_static.h>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(nanogui)
|
||||||
|
template <typename Value, size_t Size> struct Array;
|
||||||
|
NAMESPACE_END(nanogui)
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
template <typename StorageType, typename T>
|
||||||
|
using is_constructible = std::bool_constant<
|
||||||
|
std::is_constructible_v<StorageType, T> &&
|
||||||
|
!std::is_same_v<std::decay_t<T>, reinterpret_flag>>;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using is_not_reinterpret_flag = std::bool_constant<
|
||||||
|
!std::is_same_v<std::decay_t<T>, reinterpret_flag>>;
|
||||||
|
|
||||||
|
template <typename Source, typename Target>
|
||||||
|
constexpr bool broadcast =
|
||||||
|
!is_static_array_v<Source> || array_size_v<Source> != Target::Size ||
|
||||||
|
!(array_depth_v<Source> == array_depth_v<Target> ||
|
||||||
|
(array_depth_v<Source> < array_depth_v<Target> &&
|
||||||
|
detail::array_broadcast_outer_v<Source>));
|
||||||
|
|
||||||
|
template <typename Value, size_t Size, typename = int>
|
||||||
|
struct is_native {
|
||||||
|
static constexpr bool value = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value, size_t Size>
|
||||||
|
constexpr bool is_native_v = is_native<Value, Size>::value;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief The class StaticArrayImpl has several different implementations.
|
||||||
|
* This class specifies which one to use.
|
||||||
|
*/
|
||||||
|
template <typename Value, size_t Size>
|
||||||
|
struct array_config {
|
||||||
|
/// Use SSE/AVX/NEON implementation
|
||||||
|
static constexpr bool use_native_impl =
|
||||||
|
is_native_v<Value, Size>;
|
||||||
|
|
||||||
|
/// Reduce to several recursive operations
|
||||||
|
static constexpr bool use_recursive_impl =
|
||||||
|
!use_native_impl &&
|
||||||
|
is_std_type_v<Value> &&
|
||||||
|
has_vectorization &&
|
||||||
|
Size > 3;
|
||||||
|
|
||||||
|
/// Special case for arrays of enumerations
|
||||||
|
static constexpr bool use_enum_impl =
|
||||||
|
std::is_enum_v<Value>;
|
||||||
|
|
||||||
|
/// Special case for arrays of pointers of classes
|
||||||
|
static constexpr bool use_pointer_impl =
|
||||||
|
std::is_pointer_v<Value> &&
|
||||||
|
!std::is_arithmetic_v<std::remove_pointer_t<Value>>;
|
||||||
|
|
||||||
|
/// Catch-all for anything that wasn't matched so far
|
||||||
|
static constexpr bool use_generic_impl =
|
||||||
|
!use_native_impl &&
|
||||||
|
!use_recursive_impl &&
|
||||||
|
!use_enum_impl &&
|
||||||
|
!use_pointer_impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using has_bitmask = decltype(std::declval<T>().bitmask_());
|
||||||
|
template <typename T>
|
||||||
|
constexpr bool has_bitmask_v = is_detected_v<has_bitmask, T>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Macro to initialize uninitialized floating point arrays with 1 bits (NaN/-1) in debug mode
#if defined(NDEBUG)
#define ENOKI_TRIVIAL_CONSTRUCTOR(Value)                                       \
    template <typename T = Value,                                              \
              enable_if_t<std::is_default_constructible_v<T>> = 0>             \
    ENOKI_INLINE StaticArrayImpl() { }
#else
#define ENOKI_TRIVIAL_CONSTRUCTOR(Value)                                       \
    /* Debug builds: poison scalar arrays with an all-ones pattern */          \
    template <typename T = Value, enable_if_t<std::is_scalar_v<T>> = 0>        \
    ENOKI_INLINE StaticArrayImpl()                                             \
        : StaticArrayImpl(memcpy_cast<T>(int_array_t<T>(-1))) { }              \
    template <typename T = Value,                                              \
              enable_if_t<!std::is_scalar_v<T> &&                              \
                           std::is_default_constructible_v<T>> = 0>            \
    ENOKI_INLINE StaticArrayImpl() {}
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/// SFINAE macro for constructors that convert from another type
#define ENOKI_CONVERT(Value)                                                   \
    template <typename Value2, typename Derived2,                              \
              enable_if_t<detail::is_same_v<Value2, Value>> = 0>               \
    ENOKI_INLINE StaticArrayImpl(                                              \
        const StaticArrayBase<Value2, Size, IsMask_, Derived2> &a)
|
||||||
|
|
||||||
|
/// SFINAE macro for constructors that reinterpret another type
#define ENOKI_REINTERPRET(Value)                                               \
    template <typename Value2, typename Derived2, bool IsMask2,                \
              enable_if_t<detail::is_same_v<Value2, Value>> = 0>               \
    ENOKI_INLINE StaticArrayImpl(                                              \
        const StaticArrayBase<Value2, Size, IsMask2, Derived2> &a,             \
        detail::reinterpret_flag)
|
||||||
|
|
||||||
|
/// Declare defaulted copy/move constructors and assignment operators
#define ENOKI_ARRAY_DEFAULTS(Array)                                            \
    Array(const Array &) = default;                                            \
    Array(Array &&) = default;                                                 \
    Array &operator=(const Array &) = default;                                 \
    Array &operator=(Array &&) = default;
|
||||||
|
|
||||||
|
/// Import the essentials when declaring an array subclass
#define ENOKI_ARRAY_IMPORT_BASIC(Base, Array)                                  \
    ENOKI_ARRAY_DEFAULTS(Array)                                                \
    using typename Base::Derived;                                              \
    using typename Base::Value;                                                \
    using typename Base::Scalar;                                               \
    using Base::Size;                                                          \
    using Base::derived;
|
||||||
|
|
||||||
|
/// Import the essentials when declaring an array subclass (+constructor/assignment op)
#define ENOKI_ARRAY_IMPORT(Base, Array)                                        \
    ENOKI_ARRAY_IMPORT_BASIC(Base, Array)                                      \
    using Base::Base;                                                          \
    using Base::operator=;
|
||||||
|
|
||||||
|
|
||||||
|
/// Internal macro for native StaticArrayImpl overloads (SSE, AVX, ..).
/// Declares the SIMD register member plus the standard constructor /
/// assignment / coefficient-access boilerplate shared by all native arrays.
///
/// Fix: the generic operator= contained an unreachable `return *this;`
/// after `return operator=(Derived(v));`; the dead statement was removed.
#define ENOKI_NATIVE_ARRAY(Value_, Size_, Register_)                           \
    using Base =                                                               \
        StaticArrayBase<Value_, Size_, IsMask_, Derived_>;                     \
    ENOKI_ARRAY_IMPORT_BASIC(Base, StaticArrayImpl)                            \
    using typename Base::Array1;                                               \
    using typename Base::Array2;                                               \
    using Base::ActualSize;                                                    \
    using Ref = const Derived &;                                               \
    using Register = Register_;                                                \
    static constexpr bool IsNative = true;                                     \
    Register m;                                                                \
    ENOKI_TRIVIAL_CONSTRUCTOR(Value_)                                          \
    ENOKI_INLINE StaticArrayImpl(Register value) : m(value) {}                 \
    ENOKI_INLINE StaticArrayImpl(Register value, detail::reinterpret_flag)     \
        : m(value) { }                                                         \
    /* Reinterpret a boolean as an all-ones / all-zeros bit pattern */         \
    ENOKI_INLINE StaticArrayImpl(bool b, detail::reinterpret_flag)             \
        : StaticArrayImpl(b ? memcpy_cast<Value_>(int_array_t<Value>(-1))      \
                            : memcpy_cast<Value_>(int_array_t<Value>(0))) { }  \
    template <typename Value2, size_t Size2, typename Derived2,                \
              enable_if_t<is_scalar_v<Value2>> = 0>                            \
    ENOKI_INLINE StaticArrayImpl(                                              \
        const StaticArrayBase<Value2, Size2, IsMask_, Derived2> &a)            \
        : Base(a) { }                                                          \
    ENOKI_INLINE StaticArrayImpl &operator=(const Derived &v) {                \
        m = v.m;                                                               \
        return *this;                                                          \
    }                                                                          \
    template <typename T> ENOKI_INLINE StaticArrayImpl &operator=(const T &v) {\
        return operator=(Derived(v));                                          \
    }                                                                          \
    /* Type-punned element access into the SIMD register */                    \
    ENOKI_INLINE Value& raw_coeff_(size_t i) {                                 \
        union Data {                                                           \
            Register value;                                                    \
            Value data[Size_];                                                 \
        };                                                                     \
        return ((Data *) &m)->data[i];                                         \
    }                                                                          \
    ENOKI_INLINE const Value& raw_coeff_(size_t i) const {                     \
        union Data {                                                           \
            Register value;                                                    \
            Value data[Size_];                                                 \
        };                                                                     \
        return ((const Data *) &m)->data[i];                                   \
    }                                                                          \
    /* Masks expose proxy bits; value arrays expose raw references */          \
    ENOKI_INLINE decltype(auto) coeff(size_t i) {                              \
        if constexpr (Derived::IsMask)                                         \
            return MaskBit<Derived &>(derived(), i);                           \
        else                                                                   \
            return raw_coeff_(i);                                              \
    }                                                                          \
    ENOKI_INLINE decltype(auto) coeff(size_t i) const {                        \
        if constexpr (Derived::IsMask)                                         \
            return MaskBit<const Derived &>(derived(), i);                     \
        else                                                                   \
            return raw_coeff_(i);                                              \
    }                                                                          \
    ENOKI_INLINE bool bit_(size_t i) const {                                   \
        return detail::convert_mask(raw_coeff_(i));                            \
    }                                                                          \
    ENOKI_INLINE void set_bit_(size_t i, bool value) {                         \
        raw_coeff_(i) = reinterpret_array<Value>(value);                       \
    }
|
||||||
|
|
||||||
|
/// Internal macro for native StaticArrayImpl overloads -- 3D special case.
/// A 3D array is stored in a 4-lane hardware register; the constructors
/// below pad the unused fourth lane with (Value) 0 and provide conversions
/// to/from both genuine 3D arrays and their 4-lane backing representation.
#define ENOKI_DECLARE_3D_ARRAY(Array)                                          \
    ENOKI_ARRAY_DEFAULTS(Array)                                                \
    using typename Base::Value;                                                \
    using typename Base::Derived;                                              \
    using typename Base::Ref;                                                  \
    using Base::m;                                                             \
    using Base::coeff;                                                         \
    static constexpr size_t Size = 3;                                          \
    Array() = default;                                                         \
    ENOKI_INLINE Array(Value v) : Base(v) { }                                  \
    ENOKI_INLINE Array(Value f1, Value f2, Value f3)                           \
        : Base(f1, f2, f3, (Value) 0) { }                                      \
    ENOKI_INLINE Array(Value f1, Value f2, Value f3, Value f4)                 \
        : Base(f1, f2, f3, f4) { }                                             \
    ENOKI_INLINE Array(typename Base::Register r) : Base(r) { }                \
    ENOKI_INLINE Array(typename Base::Register r, detail::reinterpret_flag)    \
        : Base(r, detail::reinterpret_flag()) { }                              \
    ENOKI_INLINE Array(bool b, detail::reinterpret_flag)                       \
        : Base(b, detail::reinterpret_flag()) { }                              \
    template <typename Value2, typename Derived2>                              \
    ENOKI_INLINE Array(const StaticArrayBase<Value2, 4, IsMask_, Derived2> &a) \
        : Base(a) { }                                                          \
    template <typename Value2, bool IsMask2, typename Derived2>                \
    ENOKI_INLINE Array(const StaticArrayBase<Value2, 4, IsMask2, Derived2> &a, \
                       detail::reinterpret_flag)                               \
        : Base(a, detail::reinterpret_flag()) { }                              \
    template <typename Value2, typename Derived2>                              \
    ENOKI_INLINE Array(const StaticArrayBase<Value2, 3, IsMask_, Derived2>&a) {\
        ENOKI_TRACK_SCALAR("Constructor (conversion, 3D case)");               \
        Base::operator=(Derived(Value(a.derived().coeff(0)),                   \
                                Value(a.derived().coeff(1)),                   \
                                Value(a.derived().coeff(2))));                 \
    }                                                                          \
    template <typename Value2, typename Derived2, bool IsMask2>                \
    ENOKI_INLINE Array(const StaticArrayBase<Value2, 3, IsMask2, Derived2> &a, \
                       detail::reinterpret_flag) {                             \
        ENOKI_TRACK_SCALAR("Constructor (reinterpreting, 3D case)");           \
        Base::operator=(                                                       \
            Derived(reinterpret_array<Value>(a.derived().coeff(0)),            \
                    reinterpret_array<Value>(a.derived().coeff(1)),            \
                    reinterpret_array<Value>(a.derived().coeff(2))));          \
    }                                                                          \
    template <typename T> Array &operator=(T &&value) {                        \
        return (Array&) Base::operator=(Derived(value));                       \
    }
// Primary template; specialized below and (elsewhere) by the native
// hardware-backed implementations.
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_, typename = int>
struct StaticArrayImpl;

/// Generic fallback implementation of a fixed-size array: element storage in
/// a plain std::array, element-wise loops for all operations. Selected when
/// detail::array_config decides that no native (SIMD register) backing is
/// available or desirable for this Value/Size combination.
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_>
struct StaticArrayImpl<
    Value_, Size_, IsMask_, Derived_,
    enable_if_t<detail::array_config<Value_, Size_>::use_generic_impl>>
    : StaticArrayBase<std::conditional_t<IsMask_, mask_t<Value_>, Value_>,
                      Size_, IsMask_, Derived_> {

    using Base =
        StaticArrayBase<std::conditional_t<IsMask_, mask_t<Value_>, Value_>,
                        Size_, IsMask_, Derived_>;

    using typename Base::Derived;
    using typename Base::Value;
    using typename Base::Scalar;
    using typename Base::Array1;
    using typename Base::Array2;

    using Base::Size;
    using Base::derived;

    // Reference-valued arrays wrap each element in std::reference_wrapper so
    // they remain assignable/storable inside std::array.
    using StorageType =
        std::conditional_t<std::is_reference_v<Value> && Size_ != 0,
                           std::reference_wrapper<std::remove_reference_t<Value>>,
                           std::remove_reference_t<Value>>;

    using Ref = std::remove_reference_t<Value> &;
    using ConstRef = const std::remove_reference_t<Value> &;

    StaticArrayImpl(const StaticArrayImpl &) = default;
    StaticArrayImpl(StaticArrayImpl &&) = default;

    /// Trivial constructor
    ENOKI_TRIVIAL_CONSTRUCTOR(Value)

#if defined(_MSC_VER)
#  pragma warning(push)
#  pragma warning(disable:4244) // warning C4244: 'argument': conversion from 'int' to 'Value_', possible loss of data
#  pragma warning(disable:4554) // warning C4554: '>>': check operator precedence for possible error; use parentheses to clarify precedence
#  pragma warning(disable:4702) // warning C4702: unreachable code
#elif defined(__GNUC__)
    // Don't be so noisy about sign conversion in constructor
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Wsign-conversion"
#  pragma GCC diagnostic ignored "-Wdouble-promotion"
#  pragma GCC diagnostic ignored "-Wunused-value"
#endif

    // Chooses how an incoming element of type 'Src' is converted when stored:
    // perform a value conversion to expr_t<Value> when the types differ (or
    // Value is scalar), otherwise forward the reference/value unchanged.
    template <typename Src>
    using cast_t = std::conditional_t<
        std::is_scalar_v<Value> ||
        !std::is_same_v<std::decay_t<Value>, std::decay_t<Src>>,
        expr_t<Value>,
        std::conditional_t<std::is_reference_v<Src>, Src, Src &&>>;

    /// Construct from component values
    template <typename... Ts, enable_if_t<sizeof...(Ts) == Size_ && Size_ != 1 &&
              std::conjunction_v<detail::is_constructible<StorageType, Ts>...>> = 0>
    ENOKI_INLINE StaticArrayImpl(Ts&&... ts)
        : m_data{{ cast_t<Ts>(ts)... }} {
        ENOKI_CHKSCALAR("Constructor (component values)");
    }

    /// Construct from a scalar or another array
    /// (variant for non-default-constructible storage, e.g. references:
    /// elements must be initialized in the member-init list)
    template <typename T, typename ST = StorageType,
              enable_if_t<!std::is_default_constructible_v<ST>> = 0>
    ENOKI_INLINE StaticArrayImpl(T &&value)
        : StaticArrayImpl(std::forward<T>(value),
                          std::make_index_sequence<Derived::Size>()) { }

    template <typename T, typename ST = StorageType,
              enable_if_t<!std::is_default_constructible_v<ST>> = 0>
    ENOKI_INLINE StaticArrayImpl(T &&value, detail::reinterpret_flag)
        : StaticArrayImpl(std::forward<T>(value),
                          std::make_index_sequence<Derived::Size>()) { }

    /// Construct from a scalar or another array (potential optimizations)
    template <typename T, typename ST = StorageType,
              enable_if_t<std::is_default_constructible_v<ST>> = 0>
    ENOKI_INLINE StaticArrayImpl(T &&value) {
        if constexpr (Derived::IsMask) {
            // Masks are filled by reinterpretation rather than conversion
            derived() = Derived(value, detail::reinterpret_flag());
        } else if constexpr (is_recursive_array_v<T> &&
                             array_depth_v<T> == array_depth_v<Derived>) {
            // Recursive source of matching depth: convert halves separately
            derived() = Derived(Array1(low(value)), Array2(high(value)));
        } else {
            assign_(std::forward<T>(value),
                    std::make_index_sequence<Derived::Size>());
        }
    }

    /// Reinterpret another array (potential optimizations)
    template <typename T, typename ST = StorageType,
              enable_if_t<std::is_default_constructible_v<ST>> = 0>
    ENOKI_INLINE StaticArrayImpl(T&& value, detail::reinterpret_flag) {
        if constexpr (is_recursive_array_v<T> &&
                      array_depth_v<T> == array_depth_v<Derived>) {
            derived() = Derived(reinterpret_array<Array1>(low(value)),
                                reinterpret_array<Array2>(high(value)));
        } else {
            assign_(std::forward<T>(value), detail::reinterpret_flag(),
                    std::make_index_sequence<Derived::Size>());
        }
    }

    /// Generic assignment from a scalar or another array
    template <typename T> ENOKI_INLINE StaticArrayImpl &operator=(T &&value) {
        assign_(std::forward<T>(value),
                std::make_index_sequence<Derived::Size>());
        return *this;
    }

    // The three explicit overloads below prevent the template above from
    // hijacking assignment from StaticArrayImpl itself.
    StaticArrayImpl& operator=(const StaticArrayImpl& value) {
        assign_(value, std::make_index_sequence<Derived::Size>());
        return *this;
    }

    StaticArrayImpl& operator=(StaticArrayImpl& value) {
        assign_(value, std::make_index_sequence<Derived::Size>());
        return *this;
    }

    StaticArrayImpl& operator=(StaticArrayImpl&& value) {
        assign_(std::move(value), std::make_index_sequence<Derived::Size>());
        return *this;
    }

    /// Construct from sub-arrays
    template <typename T1, typename T2, typename T = StaticArrayImpl, enable_if_t<
        array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == Base::Size1 &&
        array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == Base::Size2 &&
        Base::Size2 != 0> = 0>
    StaticArrayImpl(const T1 &a1, const T2 &a2)
        : StaticArrayImpl(a1, a2, std::make_index_sequence<Base::Size1>(),
                          std::make_index_sequence<Base::Size2>()) { }

private:
    // Element-wise converting constructor (non-broadcast case)
    template <typename T, size_t... Is, enable_if_t<!detail::broadcast<T, Derived>> = 0>
    ENOKI_INLINE StaticArrayImpl(T&& value, std::index_sequence<Is...>)
        : m_data{{ cast_t<decltype(value.coeff(0))>(value.coeff(Is))... }} {
        ENOKI_CHKSCALAR("Copy constructor");
    }

    // Broadcast constructor: replicates 'value' into every lane
    // ('(Is, value)' uses the comma operator to expand the pack)
    template <typename T, enable_if_t<detail::broadcast<T, Derived>> = 0, size_t... Is>
    ENOKI_INLINE StaticArrayImpl(T&& value, std::index_sequence<Is...>)
        : m_data{{ (Is, value)... }} {
        ENOKI_CHKSCALAR("Copy constructor (broadcast)");
    }

    // Concatenating constructor used by the public sub-array constructor
    template <typename T1, typename T2, size_t... Index1, size_t... Index2>
    ENOKI_INLINE StaticArrayImpl(const T1 &a1, const T2 &a2,
                                 std::index_sequence<Index1...>,
                                 std::index_sequence<Index2...>)
        : m_data{{ a1.coeff(Index1)..., a2.coeff(Index2)... }} {
        ENOKI_CHKSCALAR("Copy constructor (from 2 components)");
    }

    // Core element-wise assignment. Contains vectorized fast paths for
    // float->half conversion (F16C / AVX512F) before falling back to a
    // per-lane loop expanded from the index sequence.
    template <typename T, size_t... Is>
    ENOKI_INLINE void assign_(T&& value, std::index_sequence<Is...>) {
        if constexpr (std::is_same_v<array_shape_t<T>, array_shape_t<Derived>> &&
                      std::is_same_v<Value, half>) {
#if defined(ENOKI_X86_F16C)
            using Value2 = value_t<T>;

            if constexpr (std::is_same_v<Value2, double>) {
                // No direct double->half path: go through float first
                derived() = float32_array_t<T, false>(value);
                return;
            } else if constexpr (std::is_same_v<Value2, float>) {
                if constexpr (Size == 4) {
                    long long result = detail::mm_cvtsi128_si64(_mm_cvtps_ph(
                        value.derived().m, _MM_FROUND_CUR_DIRECTION));
                    memcpy(m_data.data(), &result, sizeof(long long));
                    return;
                } else if constexpr (Size == 8) {
                    __m128i result = _mm256_cvtps_ph(value.derived().m,
                                                     _MM_FROUND_CUR_DIRECTION);
                    _mm_storeu_si128((__m128i *) m_data.data(), result);
                    return;
                }
#if defined(ENOKI_X86_AVX512F)
                if constexpr (Size == 16) {
                    __m256i result = _mm512_cvtps_ph(value.derived().m,
                                                     _MM_FROUND_CUR_DIRECTION);
                    _mm256_storeu_si256((__m256i *) m_data.data(), result);
                    return;
                }
#endif
            }
#endif
        }

        // Moving lanes is only worthwhile for non-scalar values of an
        // exactly-matching value type taken from an rvalue source
        constexpr bool Move = !std::is_lvalue_reference_v<T> && !is_scalar_v<Value> &&
                              std::is_same_v<value_t<T>, value_t<Derived>>;
        ENOKI_MARK_USED(Move);

        if constexpr (std::is_same_v<std::decay_t<T>, nanogui::Array<Value, Size>>) {
            // Interop with nanogui's array type: plain element-wise copy
            for (size_t i = 0; i < Size; ++i)
                coeff(i) = value[i];
        } else if constexpr (detail::broadcast<T, Derived>) {
            auto s = static_cast<cast_t<T>>(value);
            bool unused[] = { (coeff(Is) = s, false)..., false };
            (void) unused; (void) s;
        } else {
            if constexpr (Move) {
                bool unused[] = { (coeff(Is) = std::move(value.derived().coeff(Is)), false)..., false };
                (void) unused;
            } else {
                using Src = decltype(value.derived().coeff(0));
                bool unused[] = { (coeff(Is) = cast_t<Src>(value.derived().coeff(Is)), false)..., false };
                (void) unused;
            }
        }
    }

    // Reinterpreting assignment. Contains fast paths that expand a packed
    // bit mask into per-lane booleans (AVX512VL maskz ops, or BMI2 PDEP)
    // before falling back to per-lane reinterpret_array.
    template <typename T, size_t... Is>
    ENOKI_INLINE void assign_(T&& value, detail::reinterpret_flag, std::index_sequence<Is...>) {
        if constexpr (std::is_same_v<array_shape_t<T>, array_shape_t<Derived>> &&
                      std::is_same_v<Value, bool> && detail::has_bitmask_v<T>) {
#if defined(ENOKI_X86_AVX512VL)
            if constexpr (Size == 16) {
                _mm_storeu_si128((__m128i *) data(),
                    _mm_maskz_set1_epi8((__mmask16) value.bitmask_(), (char) 1));
                return;
            } else if constexpr (Size == 8) {
                uint64_t result = (uint64_t) detail::mm_cvtsi128_si64(
                    _mm_maskz_set1_epi8((__mmask8) value.bitmask_(), (char) 1));
                memcpy(data(), &result, sizeof(uint64_t));
                return;
            } else if constexpr (Size == 4) {
                uint32_t result = (uint32_t) _mm_cvtsi128_si32(
                    _mm_maskz_set1_epi8((__mmask8) value.bitmask_(), (char) 1));
                memcpy(data(), &result, sizeof(uint32_t));
                return;
            }
#elif defined(ENOKI_X86_AVX2) && defined(ENOKI_X86_64)
            uint32_t k = value.bitmask_();
            if constexpr (Size == 16) {
                uint64_t low = (uint64_t) _pdep_u64(k, 0x0101010101010101ull);
                uint64_t hi = (uint64_t) _pdep_u64(k >> 8, 0x0101010101010101ull);
                memcpy((uint8_t *) data(), &low, sizeof(uint64_t));
                memcpy((uint8_t *) data() + sizeof(uint64_t), &hi, sizeof(uint64_t));
                return;
            } else if constexpr (Size == 8) {
                uint64_t result = (uint64_t) _pdep_u64(k, 0x0101010101010101ull);
                memcpy(data(), &result, sizeof(uint64_t));
                return;
            } else if constexpr (Size == 4) {
                uint32_t result = (uint32_t) _pdep_u32(k, 0x01010101ull);
                memcpy(data(), &result, sizeof(uint32_t));
                return;
            }
#endif
        }

        if constexpr(detail::broadcast<T, Derived>) {
            bool unused[] = { (coeff(Is) = reinterpret_array<Value>(value), false)..., false };
            (void) unused;
        } else {
            bool unused[] = { (coeff(Is) = reinterpret_array<Value>(value.coeff(Is)), false)..., false };
            (void) unused;
        }
    }

#if defined(_MSC_VER)
#  pragma warning(pop)
#elif defined(__GNUC__)
#  pragma GCC diagnostic pop
#endif

public:
    /// Return the size in bytes
    size_t nbytes() const {
        if constexpr (is_dynamic_v<Value>) {
            // Dynamic elements own their storage: sum their sizes
            size_t result = 0;
            for (size_t i = 0; i < Derived::Size; ++i)
                result += coeff(i).nbytes();
            return result;
        } else {
            return Base::nbytes();
        }
    }

    /// Arithmetic NOT operation (logical '!' for masks, bitwise '~' otherwise)
    ENOKI_INLINE Derived not_() const {
        Derived result;
        ENOKI_CHKSCALAR("not");
        for (size_t i = 0; i < Derived::Size; ++i) {
            if constexpr (IsMask_)
                (Value &) result.coeff(i) = !(Value) derived().coeff(i);
            else
                (Value &) result.coeff(i) = ~(Value) derived().coeff(i);
        }
        return result;
    }

    /// Arithmetic unary negation operation
    ENOKI_INLINE Derived neg_() const {
        Derived result;
        ENOKI_CHKSCALAR("neg");
        for (size_t i = 0; i < Derived::Size; ++i)
            (Value &) result.coeff(i) = - (Value) derived().coeff(i);
        return result;
    }

    /// Array indexing operator
    ENOKI_INLINE Ref coeff(size_t i) {
        ENOKI_CHKSCALAR("coeff");
        return m_data[i];
    }

    /// Array indexing operator (const)
    ENOKI_INLINE ConstRef coeff(size_t i) const {
        ENOKI_CHKSCALAR("coeff");
        return m_data[i];
    }

    /// Recursive array indexing operator (const)
    template <typename... Args, enable_if_t<(sizeof...(Args) >= 1)> = 0>
    ENOKI_INLINE decltype(auto) coeff(size_t i0, Args... other) const {
        return coeff(i0).coeff(size_t(other)...);
    }

    /// Recursive array indexing operator
    template <typename... Args, enable_if_t<(sizeof...(Args) >= 1)> = 0>
    ENOKI_INLINE decltype(auto) coeff(size_t i0, Args... other) {
        return coeff(i0).coeff(size_t(other)...);
    }

    /// Pointer to the contiguous element storage
    StorageType *data() { return m_data.data(); }
    const StorageType *data() const { return m_data.data(); }

private:
    std::array<StorageType, Size> m_data;
};
struct BitRef {
|
||||||
|
private:
|
||||||
|
struct BitWrapper {
|
||||||
|
virtual bool get() = 0;
|
||||||
|
virtual void set(bool value) = 0;
|
||||||
|
virtual ~BitWrapper() = default;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<BitWrapper> accessor;
|
||||||
|
public:
|
||||||
|
BitRef(bool &b) {
|
||||||
|
struct BoolWrapper : BitWrapper {
|
||||||
|
BoolWrapper(bool& data) : data(data) { }
|
||||||
|
bool get() override { return data; }
|
||||||
|
void set(bool value) override { data = value; }
|
||||||
|
bool &data;
|
||||||
|
};
|
||||||
|
accessor = std::make_unique<BoolWrapper>(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
BitRef(MaskBit<T> b) {
|
||||||
|
struct MaskBitWrapper : BitWrapper {
|
||||||
|
MaskBitWrapper(MaskBit<T> data) : data(data) { }
|
||||||
|
bool get() override { return (bool) data; }
|
||||||
|
void set(bool value) override { data = value; }
|
||||||
|
MaskBit<T> data;
|
||||||
|
};
|
||||||
|
accessor = std::make_unique<MaskBitWrapper>(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
operator bool() const { return accessor->get(); }
|
||||||
|
BitRef& operator=(bool value) { accessor->set(value); return *this; }
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,327 @@
|
||||||
|
/*
|
||||||
|
enoki/array_idiv.h -- fast precomputed integer division by constants based
|
||||||
|
on libdivide (https://github.com/ridiculousfish/libdivide)
|
||||||
|
|
||||||
|
Copyright (C) 2010 ridiculous_fish
|
||||||
|
|
||||||
|
This software is provided 'as-is', without any express or implied
|
||||||
|
warranty. In no event will the authors be held liable for any damages
|
||||||
|
arising from the use of this software.
|
||||||
|
|
||||||
|
Permission is granted to anyone to use this software for any purpose,
|
||||||
|
including commercial applications, and to alter it and redistribute it
|
||||||
|
freely, subject to the following restrictions:
|
||||||
|
|
||||||
|
1. The origin of this software must not be misrepresented; you must not
|
||||||
|
claim that you wrote the original software. If you use this software
|
||||||
|
in a product, an acknowledgment in the product documentation would be
|
||||||
|
appreciated but is not required.
|
||||||
|
2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
misrepresented as being the original software.
|
||||||
|
3. This notice may not be removed or altered from any source distribution.
|
||||||
|
|
||||||
|
libdivide@ridiculousfish.com
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Precomputation for division by integer constants
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Divide the 64-bit value (u1:u0) — 'u1' is the high word, 'u0' the low
/// word — by the 32-bit divisor 'v', returning { quotient, remainder }.
/// When 'UseIntrinsic' is set on a GCC-compatible x86 compiler, the 'divl'
/// instruction is used directly; otherwise a plain 64-bit division runs.
template <bool UseIntrinsic = false>
std::pair<uint32_t, uint32_t> div_wide(uint32_t u1, uint32_t u0, uint32_t v) {
#if defined(__GNUC__) && (defined(ENOKI_X86_32) || defined(ENOKI_X86_64))
    if constexpr (UseIntrinsic) {
        uint32_t quotient, remainder;
        __asm__("divl %[v]"
                : "=a"(quotient), "=d"(remainder)
                : [v] "r"(v), "a"(u0), "d"(u1));
        return { quotient, remainder };
    }
#endif

    // Portable fallback: reassemble the dividend in 64 bits
    uint64_t dividend = (((uint64_t) u1) << 32) | u0;
    uint32_t quotient  = (uint32_t) (dividend / v),
             remainder = (uint32_t) (dividend % v);

    return { quotient, remainder };
}
/// Divide the 128-bit value (u1:u0) — 'u1' high word, 'u0' low word — by the
/// 64-bit divisor 'd', returning { quotient, remainder }. Uses the 'divq'
/// instruction when requested on x86-64/GCC, a native __uint128_t division
/// when the compiler provides one, and otherwise a portable long-division
/// routine from Hacker's Delight (which returns {-1, -1} on overflow, i.e.
/// when u1 >= d).
template <bool UseIntrinsic = false>
std::pair<uint64_t, uint64_t> div_wide(uint64_t u1, uint64_t u0, uint64_t d) {
#if defined(__GNUC__) && defined(ENOKI_X86_64)
    if constexpr (UseIntrinsic) {
        uint64_t res, rem;
        __asm__("divq %[v]"
                : "=a"(res), "=d"(rem)
                : [v]"r"(d), "a"(u0), "d"(u1));
        return { res, rem };
    }
#endif

#if defined(__SIZEOF_INT128__)
    __uint128_t n = (((__uint128_t) u1) << 64) | u0;
    return {
        (uint64_t) (n / d),
        (uint64_t) (n % d)
    };
#else
    // Code taken from Hacker's Delight:
    // http://www.hackersdelight.org/HDcode/divlu.c.
    // License permits inclusion here per:
    // http://www.hackersdelight.org/permissions.htm

    const uint64_t b = (1ULL << 32); // Number base (16 bits).
    uint64_t un1, un0,        // Norm. dividend LSD's.
             vn1, vn0,        // Norm. divisor digits.
             q1, q0,          // Quotient digits.
             un64, un21, un10,// Dividend digit pairs.
             rhat;            // A remainder.
    int s;                    // Shift amount for norm.

    if (u1 >= d) // overflow
        return { (uint64_t) -1, (uint64_t) -1 };

    // count leading zeros
    s = (int) (63 - log2i(d)); // 0 <= s <= 63.
    if (s > 0) {
        d = d << s; // Normalize divisor.
        // '& uint64_t(-s >> 31)' masks the term to zero when s == 0,
        // avoiding an undefined 64-bit shift below
        un64 = (u1 << s) | ((u0 >> (64 - s)) & uint64_t(-s >> 31));
        un10 = u0 << s; // Shift dividend left.
    } else {
        // Avoid undefined behavior.
        un64 = u1 | u0;
        un10 = u0;
    }

    vn1 = d >> 32;          // Break divisor up into
    vn0 = d & 0xFFFFFFFF;   // two 32-bit digits.

    un1 = un10 >> 32;        // Break right half of
    un0 = un10 & 0xFFFFFFFF; // dividend into two digits.

    q1 = un64/vn1;           // Compute the first
    rhat = un64 - q1*vn1;    // quotient digit, q1.

again1:
    if (q1 >= b || q1*vn0 > b*rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b)
            goto again1;
    }

    un21 = un64*b + un1 - q1*d; // Multiply and subtract.

    q0 = un21/vn1;           // Compute the second
    rhat = un21 - q0*vn1;    // quotient digit, q0.

again2:
    if (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b)
            goto again2;
    }

    // Remainder is denormalized back by the shift applied above
    return {
        q1*b + q0,
        (un21*b + un0 - q0*d) >> s
    };
#endif
}
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# pragma pack(push)
|
||||||
|
# pragma pack(1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Precomputed magic-number divisor for *unsigned* integer types (libdivide
/// scheme): the constructor derives a multiplier and shift amount from 'd'
/// so that operator() can later divide arbitrary values by 'd' using only a
/// high multiply, a subtract/shift round, and a final shift.
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_unsigned_v<T>>> {
    T multiplier;   // Magic multiplicand (0 marks the power-of-two case)
    uint8_t shift;  // Post-multiply shift amount

    divisor() = default;

    ENOKI_INLINE divisor(T d) {
        /* Division by +/-1 is not supported by the
           precomputation-based approach */
        assert(d != 1);
        shift = (uint8_t) log2i(d);

        if ((d & (d - 1)) == 0) {
            /* Power of two */
            multiplier = 0;
            shift--;
        } else {
            /* General case: magic = floor(2^(w+shift) / d) * 2 + 1,
               rounded up when the doubled remainder wraps or reaches d */
            auto [m, rem] =
                detail::div_wide<UseIntrinsic>(T(1) << shift, T(0), d);
            multiplier = m * 2 + 1;
            assert(rem > 0 && rem < d);

            T rem2 = rem * 2;
            if (rem2 >= d || rem2 < rem)
                multiplier += 1;
        }
    }

    /// Divide 'value' (scalar or array) by the precomputed constant
    template <typename T2>
    ENOKI_INLINE auto operator()(const T2 &value) const {
        using Expr = decltype(value + value);
        auto q = mulhi(Expr(multiplier), value);
        // Average-with-quotient step keeps the intermediate within range
        auto t = sr<1>(value - q) + q;
        return t >> shift;
    }
} ENOKI_PACK;
/// Precomputed magic-number divisor for *signed* integer types. Works on the
/// absolute value of 'd'; the divisor's sign is recorded in the top bit of
/// 'shift' and re-applied in operator() by conditionally negating the result
/// (truncated division semantics, matching C++'s '/').
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_signed_v<T>>> {
    using U = std::make_unsigned_t<T>;

    T multiplier;   // Magic multiplicand (0 marks the power-of-two case)
    uint8_t shift;  // Low 6 bits: shift amount; bit 7: divisor was negative

    divisor() = default;

    ENOKI_INLINE divisor(T d) {
        /* Division by +/-1 is not supported by the
           precomputation-based approach */
        assert(d != 1 && d != -1);

        U ad = d < 0 ? (U) -d : (U) d;
        shift = (uint8_t) log2i(ad);

        if ((ad & (ad - 1)) == 0) {
            /* Power of two */
            multiplier = 0;
        } else {
            /* General case: magic computed from |d|, with round-up when the
               doubled remainder wraps or reaches |d| */
            auto [m, rem] =
                detail::div_wide<UseIntrinsic>(U(1) << (shift - 1), U(0), ad);
            multiplier = T(m * 2 + 1);

            U rem2 = rem * 2;
            if (rem2 >= ad || rem2 < rem)
                multiplier += 1;
        }
        if (d < 0)
            shift |= 0x80; // Remember the divisor's sign in the top bit
    }

    /// Divide 'value' (scalar or array) by the precomputed constant
    template <typename T2>
    ENOKI_INLINE auto operator()(const T2 &value) const {
        using Expr = decltype(value + value);
        uint8_t shift_ = shift & 0x3f;         // strip the sign flag
        Expr sign(int8_t(shift) >> 7);         // all-ones if divisor < 0

        auto q = mulhi(Expr(multiplier), value) + value;
        // Bias negative dividends so the shift truncates toward zero
        auto q_sign = sr<sizeof(T) * 8 - 1>(q);
        q += q_sign & ((T(1) << shift_) - (multiplier == 0 ? 1 : 0));

        // Conditionally negate via xor/subtract with the sign mask
        return ((q >> shift_) ^ sign) - sign;
    }
} ENOKI_PACK;
/// Stores *both* the original divisor + magic number
/// (needed by operator% below, which reconstructs the remainder as
/// a - (a / d) * d and therefore must remember 'd' itself)
template <typename T> struct divisor_ext : divisor<T> {
    T value;  // The original divisor
    ENOKI_INLINE divisor_ext(T value) : divisor<T>(value), value(value) { }
} ENOKI_PACK;
#if defined(_MSC_VER)
|
||||||
|
# pragma pack(pop)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Divide an integer value/array by a precomputed magic-number divisor
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator/(const T &a, const divisor<scalar_t<T>> &div) {
    return div(a);
}
/// Remainder w.r.t. a precomputed divisor: a % d == a - (a / d) * d
/// (requires divisor_ext, which retains the original divisor value)
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator%(const T &a, const divisor_ext<scalar_t<T>> &div) {
    return a - div(a) * div.value;
}
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Arithmetic operations for pointer arrays
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Subtraction involving pointer-valued arrays. Two cases:
///  * pointer - pointer: yields an integer array holding the element
///    distance (byte difference divided by the pointee size);
///  * pointer - integer: yields a pointer array offset backwards by that
///    many elements (integer scaled up by the pointee size).
/// Power-of-two pointee sizes use shifts instead of multiplies/divides.
template <typename T1, typename T2,
          typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
          enable_if_t<std::is_pointer_v<S1> || std::is_pointer_v<S2>> = 0,
          enable_if_array_any_t<T1, T2> = 0>
ENOKI_INLINE auto operator-(const T1 &a1_, const T2 &a2_) {
    // Pointer-width signed integer type for the address arithmetic
    using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
    using T1i = replace_scalar_t<T1, Int, false>;
    using T2i = replace_scalar_t<T2, Int, false>;
    using Ti = expr_t<T1i, T2i>;
    using T = expr_t<T1, T2>;

    constexpr Int InstanceSize = sizeof(std::remove_pointer_t<scalar_t<T1>>),
                  LogInstanceSize = detail::clog2i(InstanceSize);

    constexpr bool PointerDiff = std::is_pointer_v<S1> &&
                                 std::is_pointer_v<S2>;

    // Pointer difference returns integers, pointer - int returns pointers
    using Ret = std::conditional_t<PointerDiff, Ti, T>;
    Ti a1 = Ti((T1i) a1_),
       a2 = Ti((T2i) a2_);

    if constexpr (InstanceSize == 1) {
        // Byte-sized pointee: no scaling needed
        return Ret(a1.sub_(a2));
    } else if constexpr ((1 << LogInstanceSize) == InstanceSize) {
        // Power-of-two pointee size: scale via shifts
        if constexpr (PointerDiff)
            return Ret(a1.sub_(a2).template sr_<LogInstanceSize>());
        else
            return Ret(a1.sub_(a2.template sl_<LogInstanceSize>()));
    } else {
        // General pointee size: scale via division/multiplication
        if constexpr (PointerDiff)
            return Ret(a1.sub_(a2) / InstanceSize);
        else
            return Ret(a1.sub_(a2 * InstanceSize));
    }
}
|
||||||
|
template <typename T1, typename T2,
|
||||||
|
typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
|
||||||
|
enable_if_t<std::is_pointer_v<S1> && !std::is_pointer_v<S2>> = 0,
|
||||||
|
enable_if_array_any_t<T1, T2> = 0>
|
||||||
|
ENOKI_INLINE auto operator+(const T1 &a1_, const T2 &a2_) {
|
||||||
|
using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
|
||||||
|
using T1i = replace_scalar_t<T1, Int, false>;
|
||||||
|
using T2i = replace_scalar_t<T2, Int, false>;
|
||||||
|
using Ti = expr_t<T1i, T2i>;
|
||||||
|
using Ret = expr_t<T1, T2>;
|
||||||
|
|
||||||
|
constexpr Int InstanceSize = sizeof(std::remove_pointer_t<scalar_t<T1>>),
|
||||||
|
LogInstanceSize = detail::clog2i(InstanceSize);
|
||||||
|
|
||||||
|
Ti a1 = Ti((T1i) a1_),
|
||||||
|
a2 = Ti((T2i) a2_);
|
||||||
|
|
||||||
|
if constexpr (InstanceSize == 1)
|
||||||
|
return Ret(a1.add_(a2));
|
||||||
|
if constexpr ((1 << LogInstanceSize) == InstanceSize)
|
||||||
|
return Ret(a1.add_(a2.template sl_<LogInstanceSize>()));
|
||||||
|
else
|
||||||
|
return Ret(a1.add_(a2 * InstanceSize));
|
||||||
|
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,326 @@
|
||||||
|
/*
|
||||||
|
enoki/array_kmask.h -- Hardware-specific intrinsics and compatibility
|
||||||
|
wrappers
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using ENOKI instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include <enoki/fwd.h>
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_64) || defined(ENOKI_X86_32)
|
||||||
|
# if defined(__GNUC__) && !defined(__clang__)
|
||||||
|
# pragma GCC diagnostic push
|
||||||
|
# pragma GCC diagnostic ignored "-Wconversion"
|
||||||
|
# pragma GCC diagnostic ignored "-Wuninitialized"
|
||||||
|
# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||||
|
# endif
|
||||||
|
# include <immintrin.h>
|
||||||
|
# if defined(__GNUC__) && !defined(__clang__)
|
||||||
|
# pragma GCC diagnostic pop
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_NEON)
|
||||||
|
# include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# include <intrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Available instruction sets
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
static constexpr bool has_avx512f = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512f = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512CD)
|
||||||
|
static constexpr bool has_avx512cd = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512cd = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512DQ)
|
||||||
|
static constexpr bool has_avx512dq = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512dq = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512VL)
|
||||||
|
static constexpr bool has_avx512vl = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512vl = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512BW)
|
||||||
|
static constexpr bool has_avx512bw = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512bw = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512PF)
|
||||||
|
static constexpr bool has_avx512pf = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512pf = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512ER)
|
||||||
|
static constexpr bool has_avx512er = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512er = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__AVX512VBMI__)
|
||||||
|
static constexpr bool has_avx512vbmi = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512vbmi = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512VPOPCNTDQ)
|
||||||
|
static constexpr bool has_avx512vpopcntdq = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx512vpopcntdq = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2)
|
||||||
|
static constexpr bool has_avx2 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx2 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_FMA) || defined(ENOKI_ARM_FMA)
|
||||||
|
static constexpr bool has_fma = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_fma = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_F16C)
|
||||||
|
static constexpr bool has_f16c = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_f16c = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX)
|
||||||
|
static constexpr bool has_avx = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_avx = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
static constexpr bool has_sse42 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_sse42 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_32)
|
||||||
|
static constexpr bool has_x86_32 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_x86_32 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_64)
|
||||||
|
static constexpr bool has_x86_64 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_x86_64 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_NEON)
|
||||||
|
static constexpr bool has_neon = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_neon = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_32)
|
||||||
|
static constexpr bool has_arm_32 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_arm_32 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ARM_64)
|
||||||
|
static constexpr bool has_arm_64 = true;
|
||||||
|
#else
|
||||||
|
static constexpr bool has_arm_64 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static constexpr bool has_vectorization = has_sse42 || has_neon;
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
/// Flush denormalized numbers to zero
|
||||||
|
inline void set_flush_denormals(bool value) {
|
||||||
|
_MM_SET_FLUSH_ZERO_MODE(value ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
|
||||||
|
_MM_SET_DENORMALS_ZERO_MODE(value ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool flush_denormals() {
|
||||||
|
return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
inline void set_flush_denormals(bool) { }
|
||||||
|
inline bool flush_denormals() { return false; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct scoped_flush_denormals {
|
||||||
|
public:
|
||||||
|
scoped_flush_denormals(bool value) {
|
||||||
|
m_old_value = flush_denormals();
|
||||||
|
set_flush_denormals(value);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
~scoped_flush_denormals() {
|
||||||
|
set_flush_denormals(m_old_value);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
bool m_old_value;
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Helper routines to merge smaller arrays into larger ones
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX)
|
||||||
|
ENOKI_INLINE __m256 concat(__m128 l, __m128 h) {
|
||||||
|
return _mm256_insertf128_ps(_mm256_castps128_ps256(l), h, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m256d concat(__m128d l, __m128d h) {
|
||||||
|
return _mm256_insertf128_pd(_mm256_castpd128_pd256(l), h, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m256i concat(__m128i l, __m128i h) {
|
||||||
|
return _mm256_insertf128_si256(_mm256_castsi128_si256(l), h, 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
ENOKI_INLINE __m512 concat(__m256 l, __m256 h) {
|
||||||
|
#if defined(ENOKI_X86_AVX512DQ)
|
||||||
|
return _mm512_insertf32x8(_mm512_castps256_ps512(l), h, 1);
|
||||||
|
#else
|
||||||
|
return _mm512_castpd_ps(
|
||||||
|
_mm512_insertf64x4(_mm512_castps_pd(_mm512_castps256_ps512(l)),
|
||||||
|
_mm256_castps_pd(h), 1));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m512d concat(__m256d l, __m256d h) {
|
||||||
|
return _mm512_insertf64x4(_mm512_castpd256_pd512(l), h, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m512i concat(__m256i l, __m256i h) {
|
||||||
|
return _mm512_inserti64x4(_mm512_castsi256_si512(l), h, 1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Mask conversion routines for various platforms
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX)
|
||||||
|
ENOKI_INLINE __m256i mm256_cvtepi32_epi64(__m128i x) {
|
||||||
|
#if defined(ENOKI_X86_AVX2)
|
||||||
|
return _mm256_cvtepi32_epi64(x);
|
||||||
|
#else
|
||||||
|
/* This version is only suitable for mask conversions */
|
||||||
|
__m128i xl = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 0, 0));
|
||||||
|
__m128i xh = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 2, 2));
|
||||||
|
return detail::concat(xl, xh);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m128i mm256_cvtepi64_epi32(__m256i x) {
|
||||||
|
#if defined(ENOKI_X86_AVX512VL)
|
||||||
|
return _mm256_cvtepi64_epi32(x);
|
||||||
|
#else
|
||||||
|
__m128i x0 = _mm256_castsi256_si128(x);
|
||||||
|
__m128i x1 = _mm256_extractf128_si256(x, 1);
|
||||||
|
return _mm_castps_si128(_mm_shuffle_ps(
|
||||||
|
_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), _MM_SHUFFLE(2, 0, 2, 0)));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m256i mm512_cvtepi64_epi32(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
|
||||||
|
__m128i y0 = _mm_castps_si128(_mm_shuffle_ps(
|
||||||
|
_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), _MM_SHUFFLE(2, 0, 2, 0)));
|
||||||
|
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(
|
||||||
|
_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), _MM_SHUFFLE(2, 0, 2, 0)));
|
||||||
|
return detail::concat(y0, y1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m256i mm512_cvtepi64_epi32(__m256i x0, __m256i x1) {
|
||||||
|
__m128i y0 = _mm256_castsi256_si128(x0);
|
||||||
|
__m128i y1 = _mm256_extractf128_si256(x0, 1);
|
||||||
|
__m128i y2 = _mm256_castsi256_si128(x1);
|
||||||
|
__m128i y3 = _mm256_extractf128_si256(x1, 1);
|
||||||
|
return mm512_cvtepi64_epi32(y0, y1, y2, y3);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_SSE42)
|
||||||
|
|
||||||
|
ENOKI_INLINE __m128i mm256_cvtepi64_epi32(__m128i x0, __m128i x1) {
|
||||||
|
return _mm_castps_si128(_mm_shuffle_ps(
|
||||||
|
_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), _MM_SHUFFLE(2, 0, 2, 0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE __m128i mm_cvtsi64_si128(long long a) {
|
||||||
|
#if defined(ENOKI_X86_64)
|
||||||
|
return _mm_cvtsi64_si128(a);
|
||||||
|
#else
|
||||||
|
alignas(16) long long x[2] = { a, 0ll };
|
||||||
|
return _mm_load_si128((__m128i *) x);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE long long mm_cvtsi128_si64(__m128i m) {
|
||||||
|
#if defined(ENOKI_X86_64)
|
||||||
|
return _mm_cvtsi128_si64(m);
|
||||||
|
#else
|
||||||
|
alignas(16) long long x[2];
|
||||||
|
_mm_store_si128((__m128i *) x, m);
|
||||||
|
return x[0];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int Imm8>
|
||||||
|
ENOKI_INLINE long long mm_extract_epi64(__m128i m) {
|
||||||
|
#if defined(ENOKI_X86_64)
|
||||||
|
return _mm_extract_epi64(m, Imm8);
|
||||||
|
#else
|
||||||
|
alignas(16) long long x[2];
|
||||||
|
_mm_store_si128((__m128i *) x, m);
|
||||||
|
return x[Imm8];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,296 @@
|
||||||
|
/*
|
||||||
|
enoki/array_kmask.h -- Abstraction around AVX512 'k' mask registers
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using ENOKI instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// SFINAE macro for constructors that reinterpret another type
|
||||||
|
#define ENOKI_REINTERPRET_KMASK(Value) \
|
||||||
|
template <typename Value2, typename Derived2, bool IsMask2, \
|
||||||
|
enable_if_t<detail::is_same_v<Value2, Value>> = 0> \
|
||||||
|
ENOKI_INLINE KMaskBase( \
|
||||||
|
const StaticArrayBase<Value2, Size, IsMask2, Derived2> &a, \
|
||||||
|
detail::reinterpret_flag)
|
||||||
|
|
||||||
|
#define ENOKI_REINTERPRET_KMASK_SIZE(Value, Size) \
|
||||||
|
template <typename Value2, typename Derived2, bool IsMask2, \
|
||||||
|
enable_if_t<detail::is_same_v<Value2, Value>> = 0> \
|
||||||
|
ENOKI_INLINE KMaskBase( \
|
||||||
|
const StaticArrayBase<Value2, Size, IsMask2, Derived2> &a, \
|
||||||
|
detail::reinterpret_flag)
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_> struct KMask;
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_, typename Derived_>
|
||||||
|
struct KMaskBase : StaticArrayBase<Value_, Size_, true, Derived_> {
|
||||||
|
using Register = std::conditional_t<(Size_ > 8), __mmask16, __mmask8>;
|
||||||
|
using Derived = Derived_;
|
||||||
|
using Base = StaticArrayBase<Value_, Size_, true, Derived_>;
|
||||||
|
using Base::Size;
|
||||||
|
using Base::derived;
|
||||||
|
static constexpr bool IsNative = true;
|
||||||
|
static constexpr bool IsKMask = true;
|
||||||
|
static constexpr Register BitMask = Register((1 << Size_) - 1);
|
||||||
|
|
||||||
|
ENOKI_ARRAY_DEFAULTS(KMaskBase)
|
||||||
|
|
||||||
|
#if defined(NDEBUG)
|
||||||
|
KMaskBase() = default;
|
||||||
|
#else
|
||||||
|
KMaskBase() : k(BitMask) { }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename Array, enable_if_t<std::is_same_v<Register, typename Array::Derived::Register>> = 0>
|
||||||
|
ENOKI_INLINE KMaskBase(const Array &other, detail::reinterpret_flag) : k(other.derived().k) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<std::is_same_v<bool, T> || std::is_same_v<int, T>> = 0>
|
||||||
|
ENOKI_INLINE KMaskBase(const T &b, detail::reinterpret_flag)
|
||||||
|
: k(bool(b) ? BitMask : Register(0)) { }
|
||||||
|
|
||||||
|
ENOKI_REINTERPRET_KMASK(bool) {
|
||||||
|
__m128i value;
|
||||||
|
if constexpr (Size == 16)
|
||||||
|
value = _mm_loadu_si128((__m128i *) a.derived().data());
|
||||||
|
else if constexpr (Size == 8)
|
||||||
|
value = _mm_loadl_epi64((const __m128i *) a.derived().data());
|
||||||
|
else if constexpr (Size == 4 || Size == 3)
|
||||||
|
value = _mm_cvtsi32_si128(*((const int *) a.derived().data()));
|
||||||
|
else if constexpr (Size == 2)
|
||||||
|
value = _mm_cvtsi32_si128((int) *((const short *) a.derived().data()));
|
||||||
|
else
|
||||||
|
static_assert(detail::false_v<Value2>, "Unsupported number of elements");
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX512VL) && defined(ENOKI_X86_AVX512BW)
|
||||||
|
k = (Register) _mm_test_epi8_mask(value, _mm_set1_epi8((char) 0xFF));
|
||||||
|
#else
|
||||||
|
k = (Register) _mm512_test_epi32_mask(_mm512_cvtepi8_epi32(value),
|
||||||
|
_mm512_set1_epi8((char) 0xFF));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if !defined(ENOKI_X86_AVX512VL)
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(float, 8) : k((Register) _mm256_movemask_ps(a.derived().m)) { }
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(int32_t, 8) : k((Register) _mm256_movemask_ps(_mm256_castsi256_ps(a.derived().m))) { }
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(uint32_t, 8) : k((Register) _mm256_movemask_ps(_mm256_castsi256_ps(a.derived().m))) { }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(double, 16) { k = _mm512_kunpackb(high(a).k, low(a).k); }
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(int64_t, 16) { k = _mm512_kunpackb(high(a).k, low(a).k); }
|
||||||
|
ENOKI_REINTERPRET_KMASK_SIZE(uint64_t, 16) { k = _mm512_kunpackb(high(a).k, low(a).k); }
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE static Derived from_k(const T &k) {
|
||||||
|
Derived result;
|
||||||
|
result.k = (Register) k;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived eq_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kxnor(k, a.k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(~(k ^ a.k));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived neq_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kxor(k, a.k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(k ^ a.k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived or_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kor(k, a.k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(k | a.k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived and_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kand(k, a.k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(k & a.k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived andnot_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kandn(a.k, k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(k & ~a.k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived xor_(const Derived &a) const {
|
||||||
|
if constexpr (Size_ == 16) /* Use intrinsic if possible */
|
||||||
|
return Derived::from_k(_mm512_kxor(k, a.k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(k ^ a.k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived not_() const {
|
||||||
|
if constexpr (Size_ == 16)
|
||||||
|
return Derived::from_k(_mm512_knot(k));
|
||||||
|
else
|
||||||
|
return Derived::from_k(~k);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ENOKI_INLINE Derived select_(const Derived &m, const Derived &t, const Derived &f) {
|
||||||
|
if constexpr (Size_ == 16)
|
||||||
|
return Derived::from_k(_mm512_kor(_mm512_kand (m.k, t.k),
|
||||||
|
_mm512_kandn(m.k, f.k)));
|
||||||
|
else
|
||||||
|
return Derived::from_k((m.k & t.k) | (~m.k & f.k));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE bool all_() const {
|
||||||
|
if constexpr (Size_ == 16)
|
||||||
|
return _mm512_kortestc(k, k);
|
||||||
|
else if constexpr (Size_ == 8)
|
||||||
|
return k == BitMask;
|
||||||
|
else
|
||||||
|
return (k & BitMask) == BitMask;
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE bool any_() const {
|
||||||
|
if constexpr (Size_ == 16)
|
||||||
|
return !_mm512_kortestz(k, k);
|
||||||
|
else if constexpr (Size_ == 8)
|
||||||
|
return k != 0;
|
||||||
|
else
|
||||||
|
return (k & BitMask) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE uint32_t bitmask_() const {
|
||||||
|
if constexpr (Size_ == 8 || Size_ == 16)
|
||||||
|
return (uint32_t) k;
|
||||||
|
else
|
||||||
|
return (uint32_t) (k & BitMask);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE size_t count_() const {
|
||||||
|
return (size_t) _mm_popcnt_u32(bitmask_());
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE bool bit_(size_t i) const {
|
||||||
|
return (k & ((Register) 1 << i)) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE void set_bit_(size_t i, bool value) {
|
||||||
|
k = (Register) (k ^ ((-value ^ k) & ((Register) 1 << i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE auto coeff(size_t i) const {
|
||||||
|
return MaskBit<const Derived &>(derived(), i);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE auto coeff(size_t i) {
|
||||||
|
return MaskBit<Derived &>(derived(), i);
|
||||||
|
}
|
||||||
|
|
||||||
|
static Derived zero_() { return Derived::from_k(0); }
|
||||||
|
|
||||||
|
template <typename Return = KMask<Value_, Size_ / 2>>
|
||||||
|
ENOKI_INLINE Return low_() const {
|
||||||
|
if constexpr (Size == 16)
|
||||||
|
return Return::from_k(__mmask8(k));
|
||||||
|
else
|
||||||
|
return Return::from_k(Return::BitMask & k);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Return = KMask<Value_, Size_ / 2>>
|
||||||
|
ENOKI_INLINE Return high_() const {
|
||||||
|
return Return::from_k(k >> (Size_ / 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE void store_(void *ptr) const {
|
||||||
|
store_unaligned_(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE void store_unaligned_(void *ptr) const {
|
||||||
|
memcpy(ptr, &k, sizeof(Register));
|
||||||
|
}
|
||||||
|
|
||||||
|
static ENOKI_INLINE Derived load_(const void *ptr) {
|
||||||
|
return load_unaligned_(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ENOKI_INLINE Derived load_unaligned_(const void *ptr) {
|
||||||
|
Derived result;
|
||||||
|
memcpy(&result.k, ptr, sizeof(Register));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Stride, typename Index, typename Mask>
|
||||||
|
static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index_, const Mask &mask) {
|
||||||
|
using UInt32 = Array<uint32_t, Size>;
|
||||||
|
|
||||||
|
UInt32 index_32 = UInt32(index_),
|
||||||
|
index, offset;
|
||||||
|
|
||||||
|
if (Size == 2) {
|
||||||
|
index = sr<1>(index_32);
|
||||||
|
offset = Index(1) << (index_32 & (uint32_t) 0x1);
|
||||||
|
} else if (Size == 4) {
|
||||||
|
index = sr<2>(index_32);
|
||||||
|
offset = Index(1) << (index_32 & (uint32_t) 0x3);
|
||||||
|
} else {
|
||||||
|
index = sr<3>(index_32);
|
||||||
|
offset = Index(1) << (index_32 & (uint32_t) 0x7);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
const uint8_t *in = (const uint8_t *) ptr;
|
||||||
|
Register bit = 1, accum = 0;
|
||||||
|
for (size_t i = 0; i < Size; ++i) {
|
||||||
|
if ((bool) mask.coeff(i) && (in[index.coeff(i)] & offset.coeff(i)) != 0)
|
||||||
|
accum |= bit;
|
||||||
|
bit <<= 1;
|
||||||
|
}
|
||||||
|
return Derived::from_k(accum);
|
||||||
|
#else
|
||||||
|
return Derived(neq(gather<UInt32, 1>(ptr, index, mask) & offset, (uint32_t) 0));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Array, enable_if_t<std::is_same_v<Register, typename Array::Derived::Register>> = 0>
|
||||||
|
ENOKI_INLINE Derived& operator=(const Array &other) {
|
||||||
|
k = other.derived().k;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<std::is_same_v<bool, T> || std::is_same_v<int, T>> = 0>
|
||||||
|
ENOKI_INLINE Derived& operator=(const T &b) {
|
||||||
|
k = bool(b) ? BitMask : Register(0);
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
Register k;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct KMask : KMaskBase<Value_, Size_, KMask<Value_, Size_>> {
|
||||||
|
using Base = KMaskBase<Value_, Size_, KMask<Value_, Size_>>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT(Base, KMask)
|
||||||
|
};
|
||||||
|
|
||||||
|
#define ENOKI_DECLARE_KMASK(Type, Size, Derived, SFINAE) \
|
||||||
|
struct StaticArrayImpl<Type, Size, true, Derived, SFINAE> \
|
||||||
|
: KMaskBase<Type, Size, Derived> { \
|
||||||
|
using Base = KMaskBase<Type, Size, Derived>; \
|
||||||
|
ENOKI_ARRAY_DEFAULTS(StaticArrayImpl) \
|
||||||
|
using Base::Base; \
|
||||||
|
using Base::operator=; \
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,419 @@
|
||||||
|
/*
|
||||||
|
enoki/array_macro.h -- Code generation macros for custom data structures
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro
|
||||||
|
// (C) William Swanson, Paul Fultz
|
||||||
|
#define ENOKI_EVAL_0(...) __VA_ARGS__
|
||||||
|
#define ENOKI_EVAL_1(...) ENOKI_EVAL_0(ENOKI_EVAL_0(ENOKI_EVAL_0(__VA_ARGS__)))
|
||||||
|
#define ENOKI_EVAL_2(...) ENOKI_EVAL_1(ENOKI_EVAL_1(ENOKI_EVAL_1(__VA_ARGS__)))
|
||||||
|
#define ENOKI_EVAL_3(...) ENOKI_EVAL_2(ENOKI_EVAL_2(ENOKI_EVAL_2(__VA_ARGS__)))
|
||||||
|
#define ENOKI_EVAL_4(...) ENOKI_EVAL_3(ENOKI_EVAL_3(ENOKI_EVAL_3(__VA_ARGS__)))
|
||||||
|
#define ENOKI_EVAL(...) ENOKI_EVAL_4(ENOKI_EVAL_4(ENOKI_EVAL_4(__VA_ARGS__)))
|
||||||
|
#define ENOKI_MAP_END(...)
|
||||||
|
#define ENOKI_MAP_OUT
|
||||||
|
#define ENOKI_MAP_COMMA ,
|
||||||
|
#define ENOKI_MAP_GET_END() 0, ENOKI_MAP_END
|
||||||
|
#define ENOKI_MAP_NEXT_0(test, next, ...) next ENOKI_MAP_OUT
|
||||||
|
#define ENOKI_MAP_NEXT_1(test, next) ENOKI_MAP_NEXT_0(test, next, 0)
|
||||||
|
#define ENOKI_MAP_NEXT(test, next) ENOKI_MAP_NEXT_1(ENOKI_MAP_GET_END test, next)
|
||||||
|
#define ENOKI_EXTRACT_0(next, ...) next
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) // MSVC is not as eager to expand macros, hence this workaround
|
||||||
|
#define ENOKI_MAP_EXPR_NEXT_1(test, next) \
|
||||||
|
ENOKI_EVAL_0(ENOKI_MAP_NEXT_0(test, ENOKI_MAP_COMMA next, 0))
|
||||||
|
#define ENOKI_MAP_STMT_NEXT_1(test, next) \
|
||||||
|
ENOKI_EVAL_0(ENOKI_MAP_NEXT_0(test, next, 0))
|
||||||
|
#else
|
||||||
|
#define ENOKI_MAP_EXPR_NEXT_1(test, next) \
|
||||||
|
ENOKI_MAP_NEXT_0(test, ENOKI_MAP_COMMA next, 0)
|
||||||
|
#define ENOKI_MAP_STMT_NEXT_1(test, next) \
|
||||||
|
ENOKI_MAP_NEXT_0(test, next, 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_NEXT(test, next) \
|
||||||
|
ENOKI_MAP_EXPR_NEXT_1 (ENOKI_MAP_GET_END test, next)
|
||||||
|
#define ENOKI_MAP_STMT_NEXT(test, next) \
|
||||||
|
ENOKI_MAP_STMT_NEXT_1 (ENOKI_MAP_GET_END test, next)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_TEMPLATE_FWD_0(x, peek, ...) \
|
||||||
|
typename T##x ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_TEMPLATE_FWD_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_TEMPLATE_FWD_1(x, peek, ...) \
|
||||||
|
typename T##x ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_TEMPLATE_FWD_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_DECL_FWD_0(x, peek, ...) \
|
||||||
|
T##x &&x ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_DECL_FWD_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_DECL_FWD_1(x, peek, ...) \
|
||||||
|
T##x &&x ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_DECL_FWD_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_BASE_FWD_0(x, peek, ...) \
|
||||||
|
std::forward<T##x>(x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_BASE_FWD_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_BASE_FWD_1(x, peek, ...) \
|
||||||
|
std::forward<T##x>(x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_BASE_FWD_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_FWD_0(x, peek, ...) \
|
||||||
|
x(std::forward<T##x>(x)) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_FWD_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_FWD_1(x, peek, ...) \
|
||||||
|
x(std::forward<T##x>(x)) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_FWD_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_COPY_0(x, peek, ...) \
|
||||||
|
x(x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_COPY_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_COPY_1(x, peek, ...) \
|
||||||
|
x(x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_COPY_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_COPY_V_0(v, x, peek, ...) \
|
||||||
|
x(v.x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_COPY_V_1)(v, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_COPY_V_1(v, x, peek, ...) \
|
||||||
|
x(v.x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_COPY_V_0)(v, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_MOVE_V_0(v, x, peek, ...) \
|
||||||
|
x(std::move(v.x)) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_MOVE_V_1)(v, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_MOVE_V_1(v, x, peek, ...) \
|
||||||
|
x(std::move(v.x)) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_MOVE_V_0)(v, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_STMT_ASSIGN_0(v, x, peek, ...) \
|
||||||
|
this->x = v.x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_MAP_STMT_ASSIGN_1)(v, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_STMT_ASSIGN_1(v, x, peek, ...) \
|
||||||
|
this->x = v.x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_MAP_STMT_ASSIGN_0)(v, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_STMT_MOVE_0(v, x, peek, ...) \
|
||||||
|
this->x = std::move(v.x); \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_MAP_STMT_MOVE_1)(v, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_STMT_MOVE_1(v, x, peek, ...) \
|
||||||
|
this->x = std::move(v.x); \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_MAP_STMT_MOVE_0)(v, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_F1_0(f, v, x, peek, ...) \
|
||||||
|
f(v.x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F1_1)(f, v, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_F1_1(f, v, x, peek, ...) \
|
||||||
|
f(v.x) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F1_0)(f, v, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_F2_0(f, v, t, x, peek, ...) \
|
||||||
|
f(v.x, t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F2_1)(f, v, t, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_F2_1(f, v, t, x, peek, ...) \
|
||||||
|
f(v.x, t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F2_0)(f, v, t, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_F3_0(f, m, v, t, x, peek, ...) \
|
||||||
|
f(m.x, v.x, t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F3_1)(f, m, v, t, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_F3_1(f, m, v, t, x, peek, ...) \
|
||||||
|
f(m.x, v.x, t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_F3_0)(f, m, v, t, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_T2_0(f, t, x, peek, ...) \
|
||||||
|
f<decltype(Value::x)>(t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_T2_1)(f, t, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_T2_1(f, t, x, peek, ...) \
|
||||||
|
f<decltype(Value::x)>(t) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_T2_0)(f, t, peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_GATHER_0(x, peek, ...) \
|
||||||
|
enoki::gather<decltype(Value::x)>(src.x, index, mask) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_GATHER_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_GATHER_1(x, peek, ...) \
|
||||||
|
enoki::gather<decltype(Value::x)>(src.x, index, mask) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_GATHER_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_MAP_EXPR_SCATTER_0(x, peek, ...) \
|
||||||
|
enoki::scatter(dst.x, value.x, index, mask) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_SCATTER_1)(peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_MAP_EXPR_SCATTER_1(x, peek, ...) \
|
||||||
|
enoki::scatter(dst.x, value.x, index, mask) ENOKI_MAP_EXPR_NEXT(peek, ENOKI_MAP_EXPR_SCATTER_0)(peek, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define ENOKI_USING_MEMBERS_0(base, x, peek, ...) \
|
||||||
|
using base::x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_MEMBERS_1)(base, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_USING_MEMBERS_1(base, x, peek, ...) \
|
||||||
|
using base::x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_MEMBERS_0)(base, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_USING_MEMBERS_2(base, peek, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_MEMBERS_0)(base, peek, __VA_ARGS__))
|
||||||
|
|
||||||
|
#define ENOKI_USING_TYPES_0(base, x, peek, ...) \
|
||||||
|
using x = typename base::x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_TYPES_1)(base, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_USING_TYPES_1(base, x, peek, ...) \
|
||||||
|
using x = typename base::x; \
|
||||||
|
ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_TYPES_0)(base, peek, __VA_ARGS__)
|
||||||
|
#define ENOKI_USING_TYPES_2(base, peek, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_STMT_NEXT(peek, ENOKI_USING_TYPES_0)(base, peek, __VA_ARGS__))
|
||||||
|
|
||||||
|
// ENOKI_MAP_TEMPLATE_FWD(a1, a2, ...) expands to typename Ta1, typename Ta2, ...
|
||||||
|
#define ENOKI_MAP_TEMPLATE_FWD(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_TEMPLATE_FWD_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_DECL_FWD(a1, a2, ...) expands to Ta1 &&a1, Ta2&& a2...
|
||||||
|
#define ENOKI_MAP_EXPR_DECL_FWD(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_DECL_FWD_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_BASE_FWD(a1, a2, ...) expands to std::forward<Ta1>(a1), std::std::forward<Ta2>(a2), ...
|
||||||
|
#define ENOKI_MAP_EXPR_BASE_FWD(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_BASE_FWD_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_FWD(a1, a2, ...) expands to a1(std::forward<Ta1>(a1)), a2(std::std::forward<Ta2>(a2)), ...
|
||||||
|
#define ENOKI_MAP_EXPR_FWD(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_FWD_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_COPY(a1, a2, ...) expands to a1(a1), a2(a2), ...
|
||||||
|
#define ENOKI_MAP_EXPR_COPY(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_COPY_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_COPY_V(v, a1, a2, ...) expands to a1(v.a1), a2(v.a2), ...
|
||||||
|
#define ENOKI_MAP_EXPR_COPY_V(v, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_COPY_V_0(v, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_MOVE_V(v, a1, a2, ...) expands to a1(std::move(v.a1)), a2(std::move(v.a2)), ...
|
||||||
|
#define ENOKI_MAP_EXPR_MOVE_V(v, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_MOVE_V_0(v, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_STMT_ASSIGN(v, a1, a2, ...) expands to this->a1 = v.a1; ..
|
||||||
|
#define ENOKI_MAP_STMT_ASSIGN(v, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_STMT_ASSIGN_0(v, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_STMT_MOVE(v, a1, a2, ...) expands to this->a1 = std::move(v.a1); ..
|
||||||
|
#define ENOKI_MAP_STMT_MOVE(v, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_STMT_MOVE_0(v, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_F1(f, v, a1, a2, ...) expands to f(v.a1), f(v.a2), ...
|
||||||
|
#define ENOKI_MAP_EXPR_F1(f, v, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_F1_0(f, v, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_F2(f, v, t, a1, a2, ...) expands to f(v.a1, t), f(v.a2, t), ...
|
||||||
|
#define ENOKI_MAP_EXPR_F2(f, v, t, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_F2_0(f, v, t, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_T2(f, v, t, a1, a2, ...) expands to f<decltype(Value::a1)>(t), f<decltype(Value::a2>>(t), ...
|
||||||
|
#define ENOKI_MAP_EXPR_T2(f, v, t, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_T2_0(f, v, t, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_F3(f, m, v, t, a1, a2, ...) expands to f(m.a1, v.a1, t), f(m.a2, v.a2, t), ...
|
||||||
|
#define ENOKI_MAP_EXPR_F3(f, v, t, ...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_F3_0(f, v, t, __VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_GATHER(a1, a2, ...) expands to enoki::gather<decltype(Value::a1)>(src.a1, index, mask), ..
|
||||||
|
#define ENOKI_MAP_EXPR_GATHER(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_GATHER_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_MAP_EXPR_SCATTER(a1, a2, ...) expands to enoki::scatter(dst.a1, src.a1, index, mask), ..
|
||||||
|
#define ENOKI_MAP_EXPR_SCATTER(...) \
|
||||||
|
ENOKI_EVAL(ENOKI_MAP_EXPR_SCATTER_0(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_USING_TYPES(base, a1, a2, ...) expands to using a1 = typename base::a1; using a2 = typename base::a2; ...
|
||||||
|
#define ENOKI_USING_TYPES(...) \
|
||||||
|
ENOKI_EVAL_0(ENOKI_USING_TYPES_2(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
// ENOKI_USING_MEMBERS(base, a1, a2, ...) expands to using base::a1; using base::a2; ...
|
||||||
|
#define ENOKI_USING_MEMBERS(...) \
|
||||||
|
ENOKI_EVAL_0(ENOKI_USING_MEMBERS_2(__VA_ARGS__, (), 0))
|
||||||
|
|
||||||
|
|
||||||
|
#define ENOKI_STRUCT(Struct, ...) \
|
||||||
|
Struct() = default; \
|
||||||
|
template <ENOKI_MAP_TEMPLATE_FWD(__VA_ARGS__)> \
|
||||||
|
ENOKI_INLINE Struct(ENOKI_MAP_EXPR_DECL_FWD(__VA_ARGS__)) \
|
||||||
|
: ENOKI_MAP_EXPR_FWD(__VA_ARGS__) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct(const Struct<Args...> &value) \
|
||||||
|
: ENOKI_MAP_EXPR_COPY_V(value, __VA_ARGS__) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct(Struct<Args...> &&value) \
|
||||||
|
: ENOKI_MAP_EXPR_MOVE_V(value, __VA_ARGS__) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct &operator=(const Struct<Args...> &value) { \
|
||||||
|
ENOKI_MAP_STMT_ASSIGN(value, __VA_ARGS__) \
|
||||||
|
return *this; \
|
||||||
|
} \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct &operator=(Struct<Args...> &&value) { \
|
||||||
|
ENOKI_MAP_STMT_MOVE(value, __VA_ARGS__) \
|
||||||
|
return *this; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ENOKI_BASE_FIELDS(...) __VA_ARGS__
|
||||||
|
#define ENOKI_DERIVED_FIELDS(...) __VA_ARGS__
|
||||||
|
|
||||||
|
#define ENOKI_DERIVED_STRUCT(Struct, Base, BaseFields, StructFields) \
|
||||||
|
Struct() = default; \
|
||||||
|
template <ENOKI_MAP_TEMPLATE_FWD(BaseFields), \
|
||||||
|
ENOKI_MAP_TEMPLATE_FWD(StructFields)> \
|
||||||
|
ENOKI_INLINE Struct(ENOKI_MAP_EXPR_DECL_FWD(BaseFields), \
|
||||||
|
ENOKI_MAP_EXPR_DECL_FWD(StructFields)) \
|
||||||
|
: Base(ENOKI_MAP_EXPR_BASE_FWD(BaseFields)), \
|
||||||
|
ENOKI_MAP_EXPR_FWD(StructFields) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct(const Struct<Args...> &value) \
|
||||||
|
: Base(value), ENOKI_MAP_EXPR_COPY_V(value, StructFields) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct(Struct<Args...> &&value) \
|
||||||
|
: Base(std::move(value)), \
|
||||||
|
ENOKI_MAP_EXPR_MOVE_V(value, StructFields) { } \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct &operator=(const Struct<Args...> &value) { \
|
||||||
|
Base::operator=(value); \
|
||||||
|
ENOKI_MAP_STMT_ASSIGN(value, StructFields) \
|
||||||
|
return *this; \
|
||||||
|
} \
|
||||||
|
template <typename... Args> \
|
||||||
|
ENOKI_INLINE Struct &operator=(Struct<Args...> &&value) { \
|
||||||
|
Base::operator=(std::move(value)); \
|
||||||
|
ENOKI_MAP_STMT_MOVE(value, StructFields) \
|
||||||
|
return *this; \
|
||||||
|
} \
|
||||||
|
template <typename Mask, enoki::enable_if_mask_t<Mask> = 0> \
|
||||||
|
auto operator[](const Mask &m) { return masked(*this, m); } \
|
||||||
|
|
||||||
|
|
||||||
|
#define ENOKI_STRUCT_SUPPORT(Struct, ...) \
|
||||||
|
NAMESPACE_BEGIN(enoki) \
|
||||||
|
template <typename... Args> struct struct_support<Struct<Args...>> { \
|
||||||
|
static constexpr bool IsDynamic = \
|
||||||
|
std::disjunction_v<enoki::is_dynamic<Args>...>; \
|
||||||
|
using Dynamic = Struct<enoki::make_dynamic_t<Args>...>; \
|
||||||
|
using Value = Struct<Args...>; \
|
||||||
|
template <typename T, typename Arg> \
|
||||||
|
using ArgType = \
|
||||||
|
std::conditional_t<std::is_const_v<std::remove_reference_t<T>>, \
|
||||||
|
const Arg &, Arg &>; \
|
||||||
|
static ENOKI_INLINE size_t packets(const Value &value) { \
|
||||||
|
return enoki::packets( \
|
||||||
|
value.ENOKI_EVAL_0(ENOKI_EXTRACT_0(__VA_ARGS__))); \
|
||||||
|
} \
|
||||||
|
static ENOKI_INLINE size_t slices(const Value &value) { \
|
||||||
|
return enoki::slices( \
|
||||||
|
value.ENOKI_EVAL_0(ENOKI_EXTRACT_0(__VA_ARGS__))); \
|
||||||
|
} \
|
||||||
|
static void set_slices(Value &value, size_t size) { \
|
||||||
|
ENOKI_MAP_EXPR_F2(enoki::set_slices, value, size, __VA_ARGS__); \
|
||||||
|
} \
|
||||||
|
template <typename Mem, typename Mask> \
|
||||||
|
static ENOKI_INLINE size_t compress(Mem &mem, const Value &value, \
|
||||||
|
const Mask &mask) { \
|
||||||
|
return ENOKI_MAP_EXPR_F3(enoki::compress, mem, value, \
|
||||||
|
mask, __VA_ARGS__); \
|
||||||
|
} \
|
||||||
|
template <typename Src, typename Index, typename Mask> \
|
||||||
|
static ENOKI_INLINE Value gather(Src &src, const Index &index, \
|
||||||
|
const Mask &mask) { \
|
||||||
|
return Value(ENOKI_MAP_EXPR_GATHER(__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename Dst, typename Index, typename Mask> \
|
||||||
|
static void scatter(Dst &dst, const Value &value, const Index &index, \
|
||||||
|
const Mask &mask) { \
|
||||||
|
ENOKI_MAP_EXPR_SCATTER(__VA_ARGS__); \
|
||||||
|
} \
|
||||||
|
template <typename T> \
|
||||||
|
static ENOKI_INLINE auto slice(T &&value, size_t index) { \
|
||||||
|
using Value = Struct<decltype(enoki::slice(std::declval< \
|
||||||
|
ArgType<T, Args>>(), index))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F2(enoki::slice, value, index, \
|
||||||
|
__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename T> \
|
||||||
|
static ENOKI_INLINE auto slice_ptr(T &&value, size_t index) { \
|
||||||
|
using Value = Struct<decltype(enoki::slice_ptr(std::declval< \
|
||||||
|
ArgType<T, Args>>(), index))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F2(enoki::slice_ptr, value, index, \
|
||||||
|
__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename T> \
|
||||||
|
static ENOKI_INLINE auto packet(T &&value, size_t index) { \
|
||||||
|
using Value = Struct<decltype(enoki::packet(std::declval< \
|
||||||
|
ArgType<T, Args>>(), index))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F2(enoki::packet, value, index, \
|
||||||
|
__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename T> static ENOKI_INLINE auto ref_wrap(T &&value) { \
|
||||||
|
using Value = Struct<decltype(enoki::ref_wrap(std::declval< \
|
||||||
|
ArgType<T, Args>>()))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F1(enoki::ref_wrap, value, \
|
||||||
|
__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename T> static ENOKI_INLINE auto detach(T &&value) { \
|
||||||
|
using Value = Struct<decltype(enoki::detach(std::declval< \
|
||||||
|
ArgType<T, Args>>()))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F1(enoki::detach, value, \
|
||||||
|
__VA_ARGS__)); \
|
||||||
|
} \
|
||||||
|
template <typename T, typename M> static ENOKI_INLINE \
|
||||||
|
auto masked(T& value, const M & mask) { \
|
||||||
|
using Value = Struct<decltype(enoki::masked( \
|
||||||
|
std::declval<Args &>(), mask))...>; \
|
||||||
|
return Value(ENOKI_MAP_EXPR_F2(enoki::masked, \
|
||||||
|
value, mask, __VA_ARGS__) ); \
|
||||||
|
} \
|
||||||
|
static ENOKI_INLINE auto zero(size_t size) { \
|
||||||
|
return Value(ENOKI_EVAL_0( \
|
||||||
|
ENOKI_MAP_EXPR_T2(enoki::zero, size, __VA_ARGS__))); \
|
||||||
|
} \
|
||||||
|
static ENOKI_INLINE auto empty(size_t size) { \
|
||||||
|
return Value(ENOKI_EVAL_0( \
|
||||||
|
ENOKI_MAP_EXPR_T2(enoki::empty, size, __VA_ARGS__))); \
|
||||||
|
} \
|
||||||
|
}; \
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
||||||
|
#define ENOKI_PINNED_OPERATOR_NEW(Type) \
|
||||||
|
void *operator new(size_t size) { \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
return enoki::cuda_host_malloc(size); \
|
||||||
|
else \
|
||||||
|
return ::operator new(size); \
|
||||||
|
} \
|
||||||
|
void *operator new(size_t size, std::align_val_t align) { \
|
||||||
|
ENOKI_MARK_USED(align); \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
return enoki::cuda_host_malloc(size); \
|
||||||
|
else \
|
||||||
|
return ::operator new(size, align); \
|
||||||
|
} \
|
||||||
|
void *operator new[](size_t size) { \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
return enoki::cuda_host_malloc(size); \
|
||||||
|
else \
|
||||||
|
return ::operator new[](size); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void *operator new[](size_t size, std::align_val_t align) { \
|
||||||
|
ENOKI_MARK_USED(align); \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
return enoki::cuda_host_malloc(size); \
|
||||||
|
else \
|
||||||
|
return ::operator new[](size, align); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void operator delete(void *ptr) { \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
enoki::cuda_host_free(ptr); \
|
||||||
|
else \
|
||||||
|
return ::operator delete(ptr); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void operator delete(void *ptr, std::align_val_t align) { \
|
||||||
|
ENOKI_MARK_USED(align); \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
enoki::cuda_host_free(ptr); \
|
||||||
|
else \
|
||||||
|
return ::operator delete(ptr, align); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void operator delete[](void *ptr) { \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
enoki::cuda_host_free(ptr); \
|
||||||
|
else \
|
||||||
|
return ::operator delete[](ptr); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void operator delete[](void *ptr, std::align_val_t align) { \
|
||||||
|
ENOKI_MARK_USED(align); \
|
||||||
|
if constexpr (enoki::is_cuda_array_v<Type>) \
|
||||||
|
enoki::cuda_host_free(ptr); \
|
||||||
|
else \
|
||||||
|
return ::operator delete[](ptr, align); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
/*
|
||||||
|
enoki/array_masked.h -- Helper classes for masked assignments and
|
||||||
|
in-place operators
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using ENOKI instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Masked array helper classes
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
template <typename T> struct MaskedValue {
|
||||||
|
MaskedValue(T &d, bool m) : d(d), m(m) { }
|
||||||
|
|
||||||
|
template <typename T2> ENOKI_INLINE void operator =(const T2 &value) { if (m) d = value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator+=(const T2 &value) { if (m) d += value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator-=(const T2 &value) { if (m) d -= value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator*=(const T2 &value) { if (m) d *= value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator/=(const T2 &value) { if (m) d /= value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator|=(const T2 &value) { if (m) d |= value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator&=(const T2 &value) { if (m) d &= value; }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator^=(const T2 &value) { if (m) d ^= value; }
|
||||||
|
|
||||||
|
T &d;
|
||||||
|
bool m;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct MaskedArray : ArrayBase<value_t<T>, MaskedArray<T>> {
|
||||||
|
using Mask = mask_t<T>;
|
||||||
|
using Scalar = MaskedValue<scalar_t<T>>;
|
||||||
|
using MaskType = MaskedArray<Mask>;
|
||||||
|
using Value = std::conditional_t<is_scalar_v<value_t<T>>,
|
||||||
|
MaskedValue<value_t<T>>,
|
||||||
|
MaskedArray<value_t<T>>>;
|
||||||
|
using UnderlyingValue = value_t<T>;
|
||||||
|
static constexpr size_t Size = array_size_v<T>;
|
||||||
|
static constexpr bool IsMaskedArray = true;
|
||||||
|
|
||||||
|
MaskedArray(T &d, const Mask &m) : d(d), m(m) { }
|
||||||
|
|
||||||
|
template <typename T2> ENOKI_INLINE void operator =(const T2 &value) { d.massign_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator+=(const T2 &value) { d.madd_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator-=(const T2 &value) { d.msub_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator*=(const T2 &value) { d.mmul_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator/=(const T2 &value) { d.mdiv_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator|=(const T2 &value) { d.mor_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator&=(const T2 &value) { d.mand_(value, m); }
|
||||||
|
template <typename T2> ENOKI_INLINE void operator^=(const T2 &value) { d.mxor_(value, m); }
|
||||||
|
|
||||||
|
/// Type alias for a similar-shaped array over a different type
|
||||||
|
template <typename T2> using ReplaceValue = MaskedArray<typename T::template ReplaceValue<T2>>;
|
||||||
|
|
||||||
|
T &d;
|
||||||
|
Mask m;
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct Array<detail::MaskedArray<Value_>, Size_>
|
||||||
|
: detail::MaskedArray<Array<Value_, Size_>> {
|
||||||
|
using Base = detail::MaskedArray<Array<Value_, Size_>>;
|
||||||
|
using Base::Base;
|
||||||
|
using Base::operator=;
|
||||||
|
Array(const Base &b) : Base(b) { }
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, typename Mask>
|
||||||
|
ENOKI_INLINE auto masked(T &value, const Mask &mask) {
|
||||||
|
if constexpr (std::is_same_v<Mask, bool>)
|
||||||
|
return detail::MaskedValue<T>{ value, mask };
|
||||||
|
else
|
||||||
|
return struct_support_t<T>::masked(value, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,556 @@
|
||||||
|
/*
|
||||||
|
enoki/array_recursive.h -- Template specialization that recursively
|
||||||
|
instantiates Array instances with smaller sizes when the requested packet
|
||||||
|
float array size is not directly supported by the processor's SIMD
|
||||||
|
instructions
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using ENOKI instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_>
|
||||||
|
struct StaticArrayImpl<Value_, Size_, IsMask_, Derived_,
|
||||||
|
enable_if_t<detail::array_config<Value_, Size_>::use_recursive_impl>>
|
||||||
|
: StaticArrayBase<Value_, Size_, IsMask_, Derived_> {
|
||||||
|
|
||||||
|
using Base = StaticArrayBase<Value_, Size_, IsMask_, Derived_>;
|
||||||
|
|
||||||
|
ENOKI_ARRAY_IMPORT_BASIC(Base, StaticArrayImpl)
|
||||||
|
|
||||||
|
using typename Base::Array1;
|
||||||
|
using typename Base::Array2;
|
||||||
|
using Base::Size1;
|
||||||
|
using Base::Size2;
|
||||||
|
using Ref = const Derived &;
|
||||||
|
static constexpr bool IsRecursive = true;
|
||||||
|
|
||||||
|
StaticArrayImpl() = default;
|
||||||
|
|
||||||
|
/// Initialize all entries with a constant
|
||||||
|
ENOKI_INLINE StaticArrayImpl(const Value &value) : a1(value), a2(value) { }
|
||||||
|
|
||||||
|
/// Initialize from a list of component values
|
||||||
|
template <typename... Ts, enable_if_t<sizeof...(Ts) == Size &&
|
||||||
|
std::conjunction_v<detail::is_constructible<Value, Ts>...>> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(Ts... args) {
|
||||||
|
alignas(alignof(Array1)) Value storage[Size] = { (Value) args... };
|
||||||
|
a1 = load<Array1>(storage);
|
||||||
|
a2 = load<Array2>(storage + Size1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct from the two sub-components
|
||||||
|
template <typename T1, typename T2,
|
||||||
|
enable_if_t<T1::Size == Size1 && T2::Size == Size2> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(const T1 &a1, const T2 &a2)
|
||||||
|
: a1(a1), a2(a2) { }
|
||||||
|
|
||||||
|
/// Cast another array
|
||||||
|
template <size_t Size2, typename Value2,
|
||||||
|
typename Derived2, enable_if_t<Derived2::Size == Size_> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(
|
||||||
|
const StaticArrayBase<Value2, Size2, IsMask_, Derived2> &a)
|
||||||
|
: a1(low(a)), a2(high(a)) { }
|
||||||
|
|
||||||
|
/// Reinterpret another array
|
||||||
|
template <typename Value2, size_t Size2,
|
||||||
|
bool IsMask2, typename Derived2, enable_if_t<Derived2::Size == Size_> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(
|
||||||
|
const StaticArrayBase<Value2, Size2, IsMask2, Derived2> &a,
|
||||||
|
detail::reinterpret_flag)
|
||||||
|
: a1(low (a), detail::reinterpret_flag()),
|
||||||
|
a2(high(a), detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
/// Reinterpret another array (masks)
|
||||||
|
template <bool M = IsMask_, enable_if_t<M> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(bool b, detail::reinterpret_flag)
|
||||||
|
: a1(b, detail::reinterpret_flag()),
|
||||||
|
a2(b, detail::reinterpret_flag()) { }
|
||||||
|
|
||||||
|
template <bool M = IsMask_, enable_if_t<!M> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl &operator=(Value_ v) {
|
||||||
|
*this = StaticArrayImpl(v);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool M = IsMask_, enable_if_t<M> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl &operator=(bool v) {
|
||||||
|
*this = StaticArrayImpl(v, detail::reinterpret_flag());
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Vertical operations
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived add_(Ref a) const { return Derived(a1 + a.a1, a2 + a.a2); }
|
||||||
|
ENOKI_INLINE Derived sub_(Ref a) const { return Derived(a1 - a.a1, a2 - a.a2); }
|
||||||
|
ENOKI_INLINE Derived mul_(Ref a) const { return Derived(a1 * a.a1, a2 * a.a2); }
|
||||||
|
ENOKI_INLINE Derived div_(Ref a) const { return Derived(a1 / a.a1, a2 / a.a2); }
|
||||||
|
ENOKI_INLINE Derived mod_(Ref a) const { return Derived(a1 % a.a1, a2 % a.a2); }
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived mulhi_(Ref a) const {
|
||||||
|
return Derived(mulhi(a1, a.a1), mulhi(a2, a.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fmod_(Ref a) const {
|
||||||
|
return Derived(fmod(a1, a.a1), fmod(a2, a.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE auto lt_ (Ref a) const { return mask_t<Derived>(a1 < a.a1, a2 < a.a2); }
|
||||||
|
ENOKI_INLINE auto gt_ (Ref a) const { return mask_t<Derived>(a1 > a.a1, a2 > a.a2); }
|
||||||
|
ENOKI_INLINE auto le_ (Ref a) const { return mask_t<Derived>(a1 <= a.a1, a2 <= a.a2); }
|
||||||
|
ENOKI_INLINE auto ge_ (Ref a) const { return mask_t<Derived>(a1 >= a.a1, a2 >= a.a2); }
|
||||||
|
ENOKI_INLINE auto eq_ (Ref a) const { return mask_t<Derived>(eq(a1, a.a1), eq(a2, a.a2)); }
|
||||||
|
ENOKI_INLINE auto neq_(Ref a) const { return mask_t<Derived>(neq(a1, a.a1), neq(a2, a.a2)); }
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived min_(Ref a) const { return Derived(min(a1, a.a1), min(a2, a.a2)); }
|
||||||
|
ENOKI_INLINE Derived max_(Ref a) const { return Derived(max(a1, a.a1), max(a2, a.a2)); }
|
||||||
|
ENOKI_INLINE Derived abs_() const { return Derived(abs(a1), abs(a2)); }
|
||||||
|
ENOKI_INLINE Derived ceil_() const { return Derived(ceil(a1), ceil(a2)); }
|
||||||
|
ENOKI_INLINE Derived floor_() const { return Derived(floor(a1), floor(a2)); }
|
||||||
|
ENOKI_INLINE Derived sqrt_() const { return Derived(sqrt(a1), sqrt(a2)); }
|
||||||
|
ENOKI_INLINE Derived round_() const { return Derived(round(a1), round(a2)); }
|
||||||
|
ENOKI_INLINE Derived trunc_() const { return Derived(trunc(a1), trunc(a2)); }
|
||||||
|
ENOKI_INLINE Derived rcp_() const { return Derived(rcp(a1), rcp(a2)); }
|
||||||
|
ENOKI_INLINE Derived rsqrt_() const { return Derived(rsqrt(a1), rsqrt(a2)); }
|
||||||
|
ENOKI_INLINE Derived not_() const { return Derived(~a1, ~a2); }
|
||||||
|
ENOKI_INLINE Derived neg_() const { return Derived(-a1, -a2); }
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fmadd_(Ref b, Ref c) const {
|
||||||
|
return Derived(fmadd(a1, b.a1, c.a1), fmadd(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fnmadd_(Ref b, Ref c) const {
|
||||||
|
return Derived(fnmadd(a1, b.a1, c.a1), fnmadd(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fmsub_(Ref b, Ref c) const {
|
||||||
|
return Derived(fmsub(a1, b.a1, c.a1), fmsub(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fnmsub_(Ref b, Ref c) const {
|
||||||
|
return Derived(fnmsub(a1, b.a1, c.a1), fnmsub(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fmaddsub_(Ref b, Ref c) const {
|
||||||
|
return Derived(fmaddsub(a1, b.a1, c.a1), fmaddsub(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived fmsubadd_(Ref b, Ref c) const {
|
||||||
|
return Derived(fmsubadd(a1, b.a1, c.a1), fmsubadd(a2, b.a2, c.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Derived or_(const T &a) const {
|
||||||
|
return Derived(a1 | low(a), a2 | high(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Derived andnot_(const T &a) const {
|
||||||
|
return Derived(andnot(a1, low(a)), andnot(a2, high(a)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Derived and_(const T &a) const {
|
||||||
|
return Derived(a1 & low(a), a2 & high(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Derived xor_(const T &a) const {
|
||||||
|
return Derived(a1 ^ low(a), a2 ^ high(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived sl_() const {
|
||||||
|
return Derived(sl<Imm>(a1), sl<Imm>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived sl_(size_t k) const {
|
||||||
|
return Derived(a1 << k, a2 << k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived sl_(Ref a) const {
|
||||||
|
return Derived(a1 << a.a1, a2 << a.a2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived sr_() const {
|
||||||
|
return Derived(sr<Imm>(a1), sr<Imm>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived sr_(size_t k) const {
|
||||||
|
return Derived(a1 >> k, a2 >> k);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived sr_(Ref a) const {
|
||||||
|
return Derived(a1 >> a.a1, a2 >> a.a2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived rol_() const {
|
||||||
|
return Derived(rol<Imm>(a1), rol<Imm>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived ror_() const {
|
||||||
|
return Derived(ror<Imm>(a1), ror<Imm>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived rol_(Ref arg) const {
|
||||||
|
return Derived(rol(a1, arg.a1), rol(a2, arg.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_INLINE Derived ror_(Ref arg) const {
|
||||||
|
return Derived(ror(a1, arg.a1), ror(a2, arg.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Mask>
|
||||||
|
static ENOKI_INLINE Derived select_(const Mask &m, Ref t, Ref f) {
|
||||||
|
return Derived(select(m.a1, t.a1, f.a1),
|
||||||
|
select(m.a2, t.a2, f.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived ror_array_() const {
|
||||||
|
if constexpr (Size1 == Size2) {
|
||||||
|
static_assert(
|
||||||
|
Imm <= Size1 && Imm <= Size2,
|
||||||
|
"ror_array(): Refusing to rotate a recursively defined array by an "
|
||||||
|
"amount that is larger than the recursive array sizes.");
|
||||||
|
const mask_t<Array1> mask = arange<Array1>() >= Scalar(Imm);
|
||||||
|
|
||||||
|
Array1 a1_r = ror_array<Imm>(a1);
|
||||||
|
Array2 a2_r = ror_array<Imm>(a2);
|
||||||
|
|
||||||
|
return Derived(
|
||||||
|
select(mask, a1_r, a2_r),
|
||||||
|
select(mask, a2_r, a1_r)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return Base::template ror_array_<Imm>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Imm> ENOKI_INLINE Derived rol_array_() const {
|
||||||
|
if constexpr (Size1 == Size2) {
|
||||||
|
static_assert(
|
||||||
|
Imm <= Size1 && Imm <= Size2,
|
||||||
|
"rol_array(): Refusing to rotate a recursively defined array "
|
||||||
|
"by an amount that is larger than the recursive array sizes.");
|
||||||
|
const mask_t<Array1> mask = arange<Array1>() < Scalar(Size1 - Imm);
|
||||||
|
|
||||||
|
Array1 a1_r = rol_array<Imm>(a1);
|
||||||
|
Array2 a2_r = rol_array<Imm>(a2);
|
||||||
|
|
||||||
|
return Derived(
|
||||||
|
select(mask, a1_r, a2_r),
|
||||||
|
select(mask, a2_r, a1_r)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return Base::template rol_array_<Imm>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Derived ldexp_(Ref a) const {
|
||||||
|
return Derived(ldexp(a1, a.a1), ldexp(a2, a.a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<Derived, Derived> frexp_() const {
|
||||||
|
auto r1 = frexp(a1);
|
||||||
|
auto r2 = frexp(a2);
|
||||||
|
return std::make_pair<Derived, Derived>(
|
||||||
|
Derived(r1.first, r2.first),
|
||||||
|
Derived(r1.second, r2.second)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE auto ceil2int_() const {
|
||||||
|
return T(ceil2int<typename T::Array1>(a1),
|
||||||
|
ceil2int<typename T::Array2>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE auto floor2int_() const {
|
||||||
|
return T(floor2int<typename T::Array1>(a1),
|
||||||
|
floor2int<typename T::Array2>(a2));
|
||||||
|
}
|
||||||
|
|
||||||
|
Derived lzcnt_() const { return Derived(lzcnt(a1), lzcnt(a2)); }
|
||||||
|
Derived tzcnt_() const { return Derived(tzcnt(a1), tzcnt(a2)); }
|
||||||
|
Derived popcnt_() const { return Derived(popcnt(a1), popcnt(a2)); }
|
||||||
|
|
||||||
|
template<size_t... Is, size_t ... Is2>
|
||||||
|
static constexpr auto split_(std::index_sequence<Is...>,
|
||||||
|
std::index_sequence<Is2...>) {
|
||||||
|
constexpr std::array<size_t, sizeof...(Is)> out { Is ... };
|
||||||
|
return std::make_pair(std::index_sequence<out[Is2]...>(),
|
||||||
|
std::index_sequence<out[Is2 + Size1]...>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t... Indices> ENOKI_INLINE Derived shuffle_() const {
|
||||||
|
if constexpr (Size1 != Size2) {
|
||||||
|
return Base::template shuffle_<Indices...>();
|
||||||
|
} else {
|
||||||
|
constexpr auto indices = split_(std::index_sequence<Indices...>(),
|
||||||
|
std::make_index_sequence<Size1>());
|
||||||
|
return shuffle_impl_(indices.first, indices.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shuffle worker for the Size1 == Size2 case: each output half gathers
/// from *both* input halves. For every half we compute a shuffle of a1
/// (indices clamped into range), a shuffle of a2 (indices rebased by
/// -Size1 and clamped at 0), and then select per lane based on whether
/// the original index referred to the low or high half.
template <size_t... Indices1, typename T= size_t, size_t... Indices2>
ENOKI_INLINE Derived shuffle_impl_(std::index_sequence<Indices1...>,
                                   std::index_sequence<Indices2...>) const {
    using Int = int_array_t<Array1>;
    Array1 a1l = a1.template shuffle_<(size_t) std::min(Size1 - 1, Indices1)...>(),
           a1h = a2.template shuffle_<(size_t) std::max((ssize_t) 0, (ssize_t) Indices1 - (ssize_t) Size1)...>(),
           a1f = select(Int(Indices1...) < Int(Size1), a1l, a1h);

    // NOTE(review): unlike the Indices1 case above, the std::min below has
    // no explicit (size_t) cast -- presumably fine since Indices2 come from
    // make_index_sequence<Size1> and are already size_t; confirm.
    Array2 a2l = a1.template shuffle_<std::min(Size1 - 1, Indices2)...>(),
           a2h = a2.template shuffle_<(size_t) std::max((ssize_t) 0, (ssize_t) Indices2 - (ssize_t) Size1)...>(),
           a2f = select(Int(Indices2...) < Int(Size1), a2l, a2h);

    return Derived(a1f, a2f);
}
|
||||||
|
|
||||||
|
/// Runtime shuffle with a dynamic index array. Mirrors the compile-time
/// version: when the halves have equal size, each output half selects per
/// lane between a shuffle of a1 (index as-is) and a shuffle of a2 (index
/// rebased by -Size1), keyed on index < Size1.
template <typename Index> ENOKI_INLINE Derived shuffle_(const Index &index) const {
    if constexpr (Size1 != Size2) {
        return Base::shuffle_(index);
    } else {
        auto il = low(index), ih = high(index);

        decltype(il) size = scalar_t<Index>(Size1);

        Array1 a1l = a1.shuffle_(il),
               a1h = a2.shuffle_(il - size),
               a1f = select(il < size, a1l, a1h);

        Array2 a2l = a1.shuffle_(ih),
               a2h = a2.shuffle_(ih - size),
               a2f = select(ih < size, a2l, a2h);

        return Derived(a1f, a2f);
    }
}
|
||||||
|
|
||||||
|
/// Generates a masked in-place operation (e.g. 'massign_', 'madd_') that
/// forwards the low/high halves of 'value' and 'mask' to the two sub-arrays.
#define ENOKI_MASKED_OPERATOR(name) \
    template <typename Mask> \
    ENOKI_INLINE void m##name##_(Ref value, const Mask &mask) { \
        a1.m##name##_(low(value), low(mask)); \
        a2.m##name##_(high(value), high(mask)); \
    }

ENOKI_MASKED_OPERATOR(assign)
ENOKI_MASKED_OPERATOR(add)
ENOKI_MASKED_OPERATOR(sub)
ENOKI_MASKED_OPERATOR(mul)
ENOKI_MASKED_OPERATOR(div)
ENOKI_MASKED_OPERATOR(and)
ENOKI_MASKED_OPERATOR(or)
ENOKI_MASKED_OPERATOR(xor)

#undef ENOKI_MASKED_OPERATOR
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Horizontal operations
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Horizontal sum. With equally sized halves, add them element-wise
/// first so that only a single reduction is needed.
ENOKI_INLINE Value hsum_() const {
    if constexpr (Size1 != Size2)
        return hsum(a1) + hsum(a2);
    else
        return hsum(a1 + a2);
}
|
||||||
|
|
||||||
|
/// Horizontal product. With equally sized halves, multiply them
/// element-wise first so that only a single reduction is needed.
ENOKI_INLINE Value hprod_() const {
    if constexpr (Size1 != Size2)
        return hprod(a1) * hprod(a2);
    else
        return hprod(a1 * a2);
}
|
||||||
|
|
||||||
|
/// Horizontal minimum. With equally sized halves, take the element-wise
/// minimum first so that only a single reduction is needed.
ENOKI_INLINE Value hmin_() const {
    if constexpr (Size1 != Size2)
        return min(hmin(a1), hmin(a2));
    else
        return hmin(min(a1, a2));
}
|
||||||
|
|
||||||
|
/// Horizontal maximum. With equally sized halves, take the element-wise
/// maximum first so that only a single reduction is needed.
ENOKI_INLINE Value hmax_() const {
    if constexpr (Size1 != Size2)
        return max(hmax(a1), hmax(a2));
    else
        return hmax(max(a1, a2));
}
|
||||||
|
|
||||||
|
/// Dot product. With equally sized halves, fuse the two partial products
/// via fmadd and reduce once; otherwise reduce per half and add.
ENOKI_INLINE Value dot_(Ref a) const {
    if constexpr (Size1 != Size2)
        return dot(a1, a.a1) + dot(a2, a.a2);
    else
        return hsum(fmadd(a1, a.a1, a2 * a.a2));
}
|
||||||
|
|
||||||
|
/// True if every lane of the mask is set; equally sized halves are
/// AND-ed element-wise before a single reduction.
ENOKI_INLINE bool all_() const {
    if constexpr (Size1 != Size2)
        return all(a1) && all(a2);
    else
        return all(a1 & a2);
}
|
||||||
|
|
||||||
|
/// True if any lane of the mask is set; equally sized halves are
/// OR-ed element-wise before a single reduction.
ENOKI_INLINE bool any_() const {
    if constexpr (Size1 != Size2)
        return any(a1) || any(a2);
    else
        return any(a1 | a2);
}
|
||||||
|
|
||||||
|
/// Number of set mask lanes, summed over both halves
ENOKI_INLINE size_t count_() const { return count(a1) + count(a2); }
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Initialization, loading/writing data
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Aligned store: the low half occupies the first sizeof(Array1) bytes.
ENOKI_INLINE void store_(void *mem) const {
    uint8_t *p = (uint8_t *) mem;
    store(p, a1);
    store(p + sizeof(Array1), a2);
}
|
||||||
|
|
||||||
|
template <typename Mask>
|
||||||
|
ENOKI_INLINE void store_(void *mem, const Mask &mask) const {
|
||||||
|
store((uint8_t *) mem, a1, low(mask));
|
||||||
|
store((uint8_t *) mem + sizeof(Array1), a2, high(mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unaligned store: low half first, high half sizeof(Array1) bytes later.
ENOKI_INLINE void store_unaligned_(void *mem) const {
    uint8_t *p = (uint8_t *) mem;
    store_unaligned(p, a1);
    store_unaligned(p + sizeof(Array1), a2);
}
|
||||||
|
|
||||||
|
template <typename Mask>
|
||||||
|
ENOKI_INLINE void store_unaligned_(void *mem, const Mask &mask) const {
|
||||||
|
store_unaligned((uint8_t *) mem, a1, low(mask));
|
||||||
|
store_unaligned((uint8_t *) mem + sizeof(Array1), a2, high(mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Aligned load: the low half comes from the first sizeof(Array1) bytes.
static ENOKI_INLINE Derived load_(const void *mem) {
    uint8_t *p = (uint8_t *) mem;
    return Derived(load<Array1>(p),
                   load<Array2>(p + sizeof(Array1)));
}
|
||||||
|
|
||||||
|
template <typename Mask>
|
||||||
|
static ENOKI_INLINE Derived load_(const void *mem, const Mask &mask) {
|
||||||
|
return Derived(
|
||||||
|
load<Array1>((uint8_t *) mem, low(mask)),
|
||||||
|
load<Array2>((uint8_t *) mem + sizeof(Array1), high(mask))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unaligned load: low half first, high half sizeof(Array1) bytes later.
static ENOKI_INLINE Derived load_unaligned_(const void *a) {
    uint8_t *p = (uint8_t *) a;
    return Derived(load_unaligned<Array1>(p),
                   load_unaligned<Array2>(p + sizeof(Array1)));
}
|
||||||
|
|
||||||
|
template <typename Mask>
|
||||||
|
static ENOKI_INLINE Derived load_unaligned_(const void *a, const Mask &mask) {
|
||||||
|
return Derived(
|
||||||
|
load_unaligned<Array1>((uint8_t *) a, low(mask)),
|
||||||
|
load_unaligned<Array2>((uint8_t *) a + sizeof(Array1), high(mask))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zero-initialized array, built from zero-initialized halves
static ENOKI_INLINE Derived zero_() {
    return Derived(zero<Array1>(), zero<Array2>());
}
|
||||||
|
|
||||||
|
/// Masked gather-prefetch: forwards the split index/mask halves to the
/// two sub-arrays (Write/Level/Stride are passed through unchanged).
template <bool Write, size_t Level, size_t Stride, typename Index, typename Mask>
static ENOKI_INLINE void prefetch_(const void *ptr, const Index &index, const Mask &mask) {
    prefetch<Array1, Write, Level, Stride>(ptr, low(index), low(mask));
    prefetch<Array2, Write, Level, Stride>(ptr, high(index), high(mask));
}
|
||||||
|
|
||||||
|
template <size_t Stride, typename Index, typename Mask>
|
||||||
|
static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) {
|
||||||
|
return Derived(
|
||||||
|
gather<Array1, Stride>(ptr, low(index), low(mask)),
|
||||||
|
gather<Array2, Stride>(ptr, high(index), high(mask))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Masked scatter: each half scatters with its half of the index/mask.
template <size_t Stride, typename Index, typename Mask>
ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
    scatter<Stride>(ptr, a1, low(index), low(mask));
    scatter<Stride>(ptr, a2, high(index), high(mask));
}
|
||||||
|
|
||||||
|
/// Masked scatter-transform: applies 'func' to the memory locations
/// selected by each index half, splitting any extra array arguments.
/// NOTE(review): the Mask parameter is accepted but unnamed/unused here;
/// presumably masking is handled inside the per-half transform -- confirm.
template <size_t Stride, typename Index, typename Func, typename... Args, typename Mask>
static ENOKI_INLINE void transform_(void *ptr, const Index &index, const Mask &,
                                    const Func &func, const Args &... args) {
    transform<Array1, Stride>(ptr, low(index), func, low(args)...);
    transform<Array2, Stride>(ptr, high(index), func, high(args)...);
}
|
||||||
|
|
||||||
|
/// Extracts the first active element indicated by 'mask'. With equally
/// sized halves this is done branchlessly by merging the two halves;
/// otherwise a branch prefers the low half when it has any active lane.
template <typename Mask>
ENOKI_INLINE Value extract_(const Mask &mask) const {
    if constexpr (Size1 == Size2) {
        return extract(select(low(mask), a1, a2), low(mask) | high(mask));
    } else {
        if (ENOKI_LIKELY(any(low(mask))))
            return extract(a1, low(mask));
        else
            return extract(a2, high(mask));
    }
}
|
||||||
|
|
||||||
|
/// Stream-compaction: writes the active lanes of both halves to '*ptr'.
/// The two calls must stay as separate statements in this order, since
/// each call advances 'ptr' past the elements it wrote.
template <typename T, typename Mask>
ENOKI_INLINE size_t compress_(T *&ptr, const Mask &mask) const {
    size_t r0 = compress(ptr, a1, low(mask));
    size_t r1 = compress(ptr, a2, high(mask));
    return r0 + r1;  // total number of elements written
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Component access
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Access the low/high halves of the recursive array pair
ENOKI_INLINE const Array1& low_() const { return a1; }
ENOKI_INLINE const Array2& high_() const { return a2; }
|
||||||
|
|
||||||
|
/// Read-only scalar access: entries [0, Size1) live in a1, the rest in
/// a2. With equal half sizes the half is selected first and 'i % Size1'
/// indexes into it; otherwise an explicit rebase by Size1 is needed.
ENOKI_INLINE decltype(auto) coeff(size_t i) const {
    if constexpr (Size1 == Size2)
        return ((i < Size1) ? a1 : a2).coeff(i % Size1);
    else
        return (i < Size1) ? a1.coeff(i) : a2.coeff(i - Size1);
}
|
||||||
|
|
||||||
|
/// Mutable scalar access (same layout convention as the const overload:
/// entries [0, Size1) in a1, the remainder in a2).
ENOKI_INLINE decltype(auto) coeff(size_t i) {
    if constexpr (Size1 == Size2)
        return ((i < Size1) ? a1 : a2).coeff(i % Size1);
    else
        return (i < Size1) ? a1.coeff(i) : a2.coeff(i - Size1);
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
Array1 a1;  ///< Low half of the recursive array pair
Array2 a2;  ///< High half of the recursive array pair
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,156 @@
|
||||||
|
/*
|
||||||
|
enoki/array_round.h -- Fallback for nonstandard rounding modes
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using ENOKI instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_64) || defined(ENOKI_X86_32)
/// RAII wrapper that saves and restores the FP Control/Status Register
/// (MXCSR): the constructor installs the rounding mode 'Mode', the
/// destructor restores the previously active control word.
template <RoundingMode Mode> struct set_rounding_mode {
    set_rounding_mode() : value(_mm_getcsr()) {
        // Clear the rounding-control bits, then set the requested mode
        unsigned int csr = value & ~(unsigned int) _MM_ROUND_MASK;
        switch (Mode) {
            case RoundingMode::Nearest: csr |= _MM_ROUND_NEAREST; break;
            case RoundingMode::Down:    csr |= _MM_ROUND_DOWN; break;
            case RoundingMode::Up:      csr |= _MM_ROUND_UP; break;
            case RoundingMode::Zero:    csr |= _MM_ROUND_TOWARD_ZERO; break;
        }
        _mm_setcsr(csr);
    }

    ~set_rounding_mode() {
        _mm_setcsr(value);
    }

    unsigned int value;  ///< Saved MXCSR contents, restored on destruction
};
#else
/// No-op fallback on non-x86 targets
template <RoundingMode Mode> struct set_rounding_mode {
    // Don't know how to change rounding modes on this platform :(
};
#endif
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_, bool Approx_, RoundingMode Mode_, bool IsMask_, typename Derived_>
|
||||||
|
struct StaticArrayImpl<Value_, Size_, Approx_, Mode_, IsMask_, Derived_,
|
||||||
|
enable_if_t<detail::array_config<Value_, Size_, Mode_>::use_rounding_fallback_impl>>
|
||||||
|
: StaticArrayImpl<Value_, Size_, Approx_, RoundingMode::Default, IsMask_, Derived_> {
|
||||||
|
|
||||||
|
using Base = StaticArrayImpl<Value_, Size_, Approx_, RoundingMode::Default, IsMask_, Derived_>;
|
||||||
|
using Derived = Derived_;
|
||||||
|
|
||||||
|
using Base::derived;
|
||||||
|
|
||||||
|
/// Rounding mode of arithmetic operations
|
||||||
|
static constexpr RoundingMode Mode = Mode_;
|
||||||
|
|
||||||
|
template <typename Arg, enable_if_t<std::is_same_v<value_t<Arg>, Value_>> = 0>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(Arg&& arg) : Base(std::forward<Arg>(arg)) { }
|
||||||
|
|
||||||
|
template <typename... Args>
|
||||||
|
ENOKI_INLINE StaticArrayImpl(Args&&... args) : Base(std::forward<Args>(args)...) { }
|
||||||
|
|
||||||
|
template <typename Arg, enable_if_t<!std::is_same_v<value_t<Arg>, Value_>> = 0>
|
||||||
|
ENOKI_NOINLINE StaticArrayImpl(Arg&& arg) {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
using Base2 = std::conditional_t<IsMask_,
|
||||||
|
Array<Value_, Size_, Approx_, RoundingMode::Default>,
|
||||||
|
Packet<Value_, Size_, Approx_, RoundingMode::Default>>;
|
||||||
|
Base::operator=(Base2(std::forward<Arg>(arg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Arg, enable_if_t<std::is_same_v<value_t<Arg>, Value_>> = 0>
|
||||||
|
ENOKI_NOINLINE Derived& operator=(Arg&& arg) {
|
||||||
|
Base::operator=(std::forward<Arg>(arg));
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Arg, enable_if_t<!std::is_same_v<value_t<Arg>, Value_>> = 0>
|
||||||
|
ENOKI_NOINLINE Derived& operator=(Arg&& arg) {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
using Base2 = std::conditional_t<IsMask_,
|
||||||
|
Array<Value_, Size_, Approx_, RoundingMode::Default>,
|
||||||
|
Packet<Value_, Size_, Approx_, RoundingMode::Default>>;
|
||||||
|
Base::operator=(Base2(std::forward<Arg>(arg)));
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived add_(const Derived &a) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::add_(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived sub_(const Derived &a) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::sub_(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived mul_(const Derived &a) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::mul_(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived div_(const Derived &a) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::div_(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived sqrt_() const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::sqrt_();
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fmadd_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fmadd_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fmsub_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fmsub_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fnmadd_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fnmadd_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fnmsub_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fnmsub_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fmsubadd_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fmsubadd_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Derived fmaddsub_(const Derived &b, const Derived &c) const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::fmaddsub_(b, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Value_ hsum() const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::hsum_();
|
||||||
|
}
|
||||||
|
|
||||||
|
ENOKI_NOINLINE Value_ hprod() const {
|
||||||
|
set_rounding_mode<Mode_> mode; (void) mode;
|
||||||
|
return Base::hprod_();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,544 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Type trait: does 'T' (or, for structured types, any of its members)
/// use dynamically sized storage? Derived from its struct_support record.
template <typename T> using is_dynamic = std::bool_constant<struct_support_t<T>::IsDynamic>;
template <typename T> constexpr bool is_dynamic_v = is_dynamic<T>::value;
|
||||||
|
|
||||||
|
/// Gather operations with an array or other data structure as source
|
||||||
|
/// Gather operations with an array or other data structure as source.
/// For depth-1 sources this registers the source with the differentiable /
/// CUDA backends (so the gather can be recorded/traced), performs a raw
/// pointer-based gather on source.data(), and then clears the registration.
/// Deeper / structured sources are routed through struct_support.
template <typename Array, size_t Stride = 0, bool Packed = true,
          bool IsPermute = false, typename Source, typename Index,
          typename Mask = mask_t<Index>, enable_if_t<is_dynamic_v<Source>> = 0>
ENOKI_INLINE Array gather(const Source &source, const Index &index,
                          const identity_t<Mask> &mask = true) {
    if constexpr (array_depth_v<Source> == 1) {

        // Scalar/broadcast special case: a source with at most one entry
        // gathers to the (masked) source value itself
        if constexpr (is_dynamic_v<Array> && is_dynamic_v<Source> &&
                      array_depth_v<Source> >= array_depth_v<Mask>) {
            if (source.size() <= 1)
                return source & mask;
        }

        // Register 'source' with the autodiff and/or CUDA tracing backends
        if constexpr (is_diff_array_v<Source>) {
            Source::set_scatter_gather_operand_(source, IsPermute);
            if constexpr (is_cuda_array_v<Source>)
                cuda_set_scatter_gather_operand(source.value_().index_(), true);
        } else if constexpr (is_cuda_array_v<Source>) {
            cuda_set_scatter_gather_operand(source.index_(), true);
        }

        Array result = gather<Array, Stride, Packed>(source.data(), index, mask);

        // Undo the registration performed above
        if constexpr (is_diff_array_v<Source>) {
            Source::clear_scatter_gather_operand_();
            if constexpr (is_cuda_array_v<Source>)
                cuda_set_scatter_gather_operand(0);
        } else if constexpr (is_cuda_array_v<Source>) {
            cuda_set_scatter_gather_operand(0);
        }

        return result;
    } else {
        // Structured / nested source: delegate to the type's adapter
        return struct_support_t<Array>::gather(source, index, mask);
    }
}
|
||||||
|
|
||||||
|
/// Broadcasting fallback: when the source is a plain (non-dynamic,
/// non-pointer) value, a "gather" simply converts it to the requested
/// array type; the index and mask are ignored.
template <typename Array, size_t = 0, bool = true, bool = false,
          typename Source, typename Index, typename Mask = mask_t<Index>,
          enable_if_t<!is_dynamic_v<Source> && !std::is_pointer_v<std::decay_t<Source>> &&
                      !std::is_same_v<std::decay_t<Source>, std::nullptr_t>> = 0>
ENOKI_INLINE Array gather(Source &&source, const Index &index,
                          const identity_t<Mask> &mask= true) {
    ENOKI_MARK_USED(index);
    ENOKI_MARK_USED(mask);
    return (Array) source;
}
|
||||||
|
|
||||||
|
/// Scatter operations with an array or other data structure as target
|
||||||
|
/// Scatter operations with an array or other data structure as target.
/// Depth-1 targets are registered with the autodiff/CUDA backends, written
/// through a raw pointer scatter on target.data(), marked dirty (CUDA) and
/// unregistered. Structured targets are routed through struct_support.
template <size_t Stride = 0, bool Packed = true, bool IsPermute = false,
          typename Target, typename Index, typename Value,
          typename Mask = mask_t<Index>, enable_if_t<is_dynamic_v<Target>> = 0>
ENOKI_INLINE void scatter(Target &target,
                          const Value &value,
                          const Index &index,
                          const identity_t<Mask> &mask = true) {
    if constexpr (array_depth_v<Target> == 1) {
        // Register 'target' with the autodiff and/or CUDA tracing backends
        if constexpr (is_diff_array_v<Target>) {
            Target::set_scatter_gather_operand_(target, IsPermute);
            if constexpr (is_cuda_array_v<Target>)
                cuda_set_scatter_gather_operand(target.value_().index_());
        } else if constexpr (is_cuda_array_v<Target>) {
            cuda_set_scatter_gather_operand(target.index_());
        }

        scatter<Stride, Packed>(target.data(), value, index, mask);

        // Mark the CUDA variable dirty and undo the registration
        if constexpr (is_diff_array_v<Target>) {
            Target::clear_scatter_gather_operand_();
            if constexpr (is_cuda_array_v<Target>) {
                cuda_var_mark_dirty(target.value_().index_());
                cuda_set_scatter_gather_operand(0);
            }
        } else if constexpr (is_cuda_array_v<Target>) {
            cuda_var_mark_dirty(target.index_());
            cuda_set_scatter_gather_operand(0);
        }
    } else {
        struct_support_t<Target>::scatter(target, value, index, mask);
    }
}
|
||||||
|
|
||||||
|
/// Scatter-add operations with an array or other data structure as target
|
||||||
|
/// Scatter-add operations with an array or other data structure as target.
/// Same backend bookkeeping as scatter() above, but the raw operation
/// atomically/accumulatively adds instead of overwriting.
template <size_t Stride = 0, bool Packed = true, bool IsPermute = false,
          typename Target, typename Index, typename Value,
          typename Mask = mask_t<Index>, enable_if_t<is_dynamic_v<Target>> = 0>
ENOKI_INLINE void scatter_add(Target &target,
                              const Value &value,
                              const Index &index,
                              const identity_t<Mask> &mask = true) {
    if constexpr (array_depth_v<Target> == 1) {
        // Register 'target' with the autodiff and/or CUDA tracing backends
        if constexpr (is_diff_array_v<Target>) {
            Target::set_scatter_gather_operand_(target, IsPermute);
            if constexpr (is_cuda_array_v<Target>)
                cuda_set_scatter_gather_operand(target.value_().index_());
        } else if constexpr (is_cuda_array_v<Target>) {
            cuda_set_scatter_gather_operand(target.index_());
        }

        scatter_add<Stride>(target.data(), value, index, mask);

        // Mark the CUDA variable dirty and undo the registration
        if constexpr (is_diff_array_v<Target>) {
            Target::clear_scatter_gather_operand_();
            if constexpr (is_cuda_array_v<Target>) {
                cuda_var_mark_dirty(target.value_().index_());
                cuda_set_scatter_gather_operand(0);
            }
        } else if constexpr (is_cuda_array_v<Target>) {
            cuda_var_mark_dirty(target.index_());
            cuda_set_scatter_gather_operand(0);
        }
    } else {
        struct_support_t<Target>::scatter_add(target, value, index, mask);
    }
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Adapter and routing functions for dynamic data structures
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Fallback 'struct_support' adapter for plain (non-array) types: such a
/// value behaves like a single slice/packet of itself and is never dynamic.
template <typename T, typename>
struct struct_support {
    static constexpr bool IsDynamic = false;
    using Dynamic = T;  // a plain type is its own "dynamic" counterpart

    static ENOKI_INLINE size_t slices(const T &) { return 1; }
    static ENOKI_INLINE size_t packets(const T &) { return 1; }
    static ENOKI_INLINE void set_slices(const T &, size_t) { }

    // Slicing/packet access degenerate to the value itself (or its address)
    template <typename T2> static ENOKI_INLINE decltype(auto) slice(T2&& value, size_t) { return value; }
    template <typename T2> static ENOKI_INLINE decltype(auto) slice_ptr(T2&& value, size_t) { return &value; }
    template <typename T2> static ENOKI_INLINE decltype(auto) packet(T2&& value, size_t) { return value; }
    template <typename T2> static ENOKI_INLINE decltype(auto) ref_wrap(T2&& value) { return value; }
    template <typename T2> static ENOKI_INLINE decltype(auto) detach(T2&& value) { return value; }

    /// Scalar compress: write 'value' and advance 'mem' only when the
    /// (boolean) mask is set; returns the number of elements written (0/1)
    template <typename Mem>
    static ENOKI_INLINE size_t compress(Mem &mem, const T &value, bool mask) {
        size_t count = mask ? 1 : 0;
        *mem = value;
        mem += count;
        return count;
    }

    static ENOKI_INLINE T zero(size_t) { return T(0); }
    static ENOKI_INLINE T empty(size_t) { T x; return x; }  // intentionally uninitialized

    static ENOKI_INLINE detail::MaskedValue<T> masked(T &value, bool mask) {
        return detail::MaskedValue<T>{ value, mask };
    }
};
|
||||||
|
|
||||||
|
/// Specialization for 'void' (only the Dynamic alias is meaningful)
template <>
struct struct_support<void, int> { using Dynamic = void; };
|
||||||
|
|
||||||
|
/// Create a zero-initialized instance of 'T' with 'size' slices
template <typename T> ENOKI_INLINE T zero(size_t size) {
    return struct_support_t<T>::zero(size);
}

/// Create an uninitialized instance of 'T' with 'size' slices
template <typename T> ENOKI_INLINE T empty(size_t size) {
    return struct_support_t<T>::empty(size);
}

/// Number of packets stored in the given value
template <typename T> ENOKI_INLINE size_t packets(const T &value) {
    return struct_support_t<T>::packets(value);
}

/// Number of slices stored in the given value
template <typename T> ENOKI_INLINE size_t slices(const T &value) {
    return struct_support_t<T>::slices(value);
}
|
||||||
|
|
||||||
|
/// Resize 'value' to the given number of slices (no-op for static types)
template <typename T> ENOKI_NOINLINE void set_slices(T &value, size_t size) {
    ENOKI_MARK_USED(value); ENOKI_MARK_USED(size);
    if constexpr (is_dynamic_v<T>)
        struct_support_t<T>::set_slices(value, size);
}

/// Access the i-th packet of 'value' (the value itself for static types)
template <typename T> ENOKI_INLINE decltype(auto) packet(T &&value, size_t i) {
    ENOKI_MARK_USED(i);
    if constexpr (is_dynamic_v<T>)
        return struct_support_t<T>::packet(value, i);
    else
        return value;
}
|
||||||
|
|
||||||
|
/// Access the i-th slice of 'value'
template <typename T> ENOKI_INLINE decltype(auto) slice(T &value, size_t i) {
    return struct_support_t<T>::slice(value, i);
}

/// Pointer to the i-th slice of 'value'
template <typename T> ENOKI_INLINE decltype(auto) slice_ptr(T &value, size_t i) {
    return struct_support_t<T>::slice_ptr(value, i);
}

/// Wrap 'value' in a reference-holding view (dynamic types only;
/// static values are passed through unchanged)
template <typename T> ENOKI_INLINE decltype(auto) ref_wrap(T &value) {
    if constexpr (is_dynamic_v<T>)
        return struct_support_t<T>::ref_wrap(value);
    else
        return value;
}
|
||||||
|
|
||||||
|
/// Stream-compaction into a memory cursor: writes the lanes of 'value'
/// selected by 'mask' to 'mem' (which is advanced) and returns the count
template <typename Mem, typename Value, typename Mask>
ENOKI_INLINE size_t compress(Mem &mem, const Value &value, const Mask& mask) {
    return struct_support_t<Value>::compress(mem, value, mask);
}

/// Value-returning compaction: returns a copy of 'value' containing
/// only the lanes selected by 'mask'
template <typename Value, typename Mask>
ENOKI_INLINE Value compress(const Value &value, const Mask& mask) {
    return struct_support_t<Value>::compress(value, mask);
}
|
||||||
|
|
||||||
|
template <typename T> using enable_if_dynamic_t = enable_if_t<is_dynamic_v<T>>;
|
||||||
|
template <typename T> using enable_if_static_t = enable_if_t<!is_dynamic_v<T>>;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using make_dynamic_t = typename struct_support_t<T>::Dynamic;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
struct struct_support<T, enable_if_static_array_t<T>> {
|
||||||
|
static constexpr bool IsDynamic = is_dynamic_v<value_t<T>>;
|
||||||
|
static constexpr size_t Size = T::Size;
|
||||||
|
|
||||||
|
using Dynamic = std::conditional_t<
|
||||||
|
array_depth_v<T> == 1,
|
||||||
|
std::conditional_t<
|
||||||
|
is_mask_v<T>,
|
||||||
|
DynamicMask<std::decay_t<T>>,
|
||||||
|
DynamicArray<std::decay_t<T>>
|
||||||
|
>,
|
||||||
|
typename T::template ReplaceValue<make_dynamic_t<value_t<T>>>>;
|
||||||
|
|
||||||
|
/// Slice count of a static array: taken from the first component
/// (all components are kept at the same size); 0 for empty arrays
static ENOKI_INLINE size_t slices(const T &value) {
    if constexpr (Size == 0)
        return 0;
    else
        return enoki::slices(value.x());
}

/// Packet count, likewise determined by the first component
static ENOKI_INLINE size_t packets(const T& value) {
    if constexpr (Size == 0)
        return 0;
    else
        return enoki::packets(value.x());
}
|
||||||
|
|
||||||
|
/// Resize every component of the static array to 'size' slices
static ENOKI_INLINE void set_slices(T &value, size_t size) {
    for (size_t i = 0; i < Size; ++i)
        enoki::set_slices(value.coeff(i), size);
}
|
||||||
|
|
||||||
|
/// Zero-initialized instance: leaf arrays use their native zero_();
/// nested arrays zero each component with the requested slice count
static ENOKI_INLINE T zero(size_t size) {
    ENOKI_MARK_USED(size);
    if constexpr (array_depth_v<T> == 1) {
        return T::zero_();
    } else {
        T result;
        for (size_t i = 0; i < Size; ++i)
            result.coeff(i) = enoki::zero<value_t<T>>(size);
        return result;
    }
}
|
||||||
|
|
||||||
|
/// Uninitialized instance, shaped like zero() above
static ENOKI_INLINE T empty(size_t size) {
    ENOKI_MARK_USED(size);
    if constexpr (array_depth_v<T> == 1) {
        return T::empty_();
    } else {
        T result;
        for (size_t i = 0; i < Size; ++i)
            result.coeff(i) = enoki::empty<value_t<T>>(size);
        return result;
    }
}
|
||||||
|
|
||||||
|
/// Wraps (value, mask) so that assignments only affect selected lanes
static ENOKI_INLINE auto masked(T &value, const mask_t<T> &mask) {
    return detail::MaskedArray<T>{ value, mask };
}
|
||||||
|
|
||||||
|
/// Per-component packet access; static types pass through unchanged
template <typename T2>
static ENOKI_INLINE decltype(auto) packet(T2 &value, size_t i) {
    ENOKI_MARK_USED(i);
    if constexpr (!is_dynamic_v<T>)
        return value;
    else
        return packet(value, i, std::make_index_sequence<Size>());
}
|
||||||
|
|
||||||
|
/// Strip autodiff tracking from every component (identity for
/// non-differentiable types)
template <typename T2>
static ENOKI_INLINE decltype(auto) detach(T2 &value) {
    if constexpr (!is_diff_array_v<T>)
        return value;
    else
        return detach(value, std::make_index_sequence<Size>());
}

/// Per-component gradient access (identity for non-differentiable types)
template <typename T2>
static ENOKI_INLINE decltype(auto) gradient(T2 &value) {
    if constexpr (!is_diff_array_v<T>)
        return value;
    else
        return gradient(value, std::make_index_sequence<Size>());
}
|
||||||
|
|
||||||
|
/// i-th slice: a scalar coefficient at depth 1, otherwise applied
/// component-wise via the index-sequence helper
template <typename T2>
static ENOKI_INLINE decltype(auto) slice(T2 &value, size_t i) {
    if constexpr (array_depth_v<T> == 1)
        return value.coeff(i);
    else
        return slice(value, i, std::make_index_sequence<Size>());
}

/// Pointer to the i-th slice (same depth-1 vs. nested split as slice())
template <typename T2>
static ENOKI_INLINE decltype(auto) slice_ptr(T2 &value, size_t i) {
    if constexpr (array_depth_v<T> == 1)
        return value.data() + i;
    else
        return slice_ptr(value, i, std::make_index_sequence<Size>());
}
|
||||||
|
|
||||||
|
/// Reference-holding view; static types pass through unchanged
template <typename T2>
static ENOKI_INLINE decltype(auto) ref_wrap(T2 &value) {
    if constexpr (!is_dynamic_v<T>)
        return value;
    else
        return ref_wrap(value, std::make_index_sequence<Size>());
}
|
||||||
|
|
||||||
|
/// Compaction into a memory cursor. With an array of cursors, each
/// component is compressed against its own cursor/mask component
/// (returning the count of the last component); otherwise delegate to
/// the array's native compress_().
template <typename Mem>
static ENOKI_INLINE size_t compress(Mem &mem, const expr_t<T>& value, const mask_t<expr_t<T>> &mask) {
    if constexpr (is_array_v<Mem>) {
        size_t result = 0;
        for (size_t i = 0; i < Size; ++i)
            result = enoki::compress(mem.coeff(i), value.coeff(i), mask.coeff(i));
        return result;
    } else {
        return value.compress_(mem, mask);
    }
}

/// Value-returning compaction, applied component-wise
static ENOKI_INLINE T compress(const T &value, const mask_t<T> &mask) {
    T result;
    for (size_t i = 0; i < Size; ++i)
        result.coeff(i) = enoki::compress(value.coeff(i), mask.coeff(i));
    return result;
}
|
||||||
|
|
||||||
|
/// Component-wise gather/scatter/scatter-add routers: each expands the
/// operation over the components via an index-sequence helper.
template <typename Src, typename Index, typename Mask>
static ENOKI_INLINE T gather(const Src &src, const Index &index, const Mask &mask) {
    return gather(src, index, mask, std::make_index_sequence<Size>());
}

template <typename Dst, typename Index, typename Mask>
static ENOKI_INLINE void scatter(Dst &dst, const T &value, const Index &index, const Mask &mask) {
    scatter(dst, value, index, mask, std::make_index_sequence<Size>());
}

template <typename Dst, typename Index, typename Mask>
static ENOKI_INLINE void scatter_add(Dst &dst, const T &value, const Index &index, const Mask &mask) {
    scatter_add(dst, value, index, mask, std::make_index_sequence<Size>());
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// Per-coefficient implementation: extract packet 'i' from every coefficient
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) packet(T2 &value, size_t i, std::index_sequence<Is...>) {
    using Value = decltype(enoki::packet(value.coeff(0), i));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::packet(value.coeff(Is), i)...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation: extract slice 'i' from every coefficient
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) slice(T2 &value, size_t i, std::index_sequence<Is...>) {
    using Value = decltype(enoki::slice(value.coeff(0), i));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::slice(value.coeff(Is), i)...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation: pointer to slice 'i' of every coefficient
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) slice_ptr(T2 &value, size_t i, std::index_sequence<Is...>) {
    using Value = decltype(enoki::slice_ptr(value.coeff(0), i));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::slice_ptr(value.coeff(Is), i)...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation: wrap every coefficient as a reference
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) ref_wrap(T2 &value, std::index_sequence<Is...>) {
    using Value = decltype(enoki::ref_wrap(value.coeff(0)));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::ref_wrap(value.coeff(Is))...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation: gather each coefficient with the same index/mask
template <typename Src, typename Index, typename Mask, size_t... Is>
static ENOKI_INLINE T gather(const Src &src, const Index &index, const Mask &mask,
                             std::index_sequence<Is...>) {
    return T(enoki::gather<value_t<T>>(src.coeff(Is), index, mask)...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation of detach()
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) detach(T2 &a, std::index_sequence<Is...>) {
    using Value = decltype(enoki::detach(a.coeff(0)));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::detach(a.coeff(Is))...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient implementation of gradient()
template <typename T2, size_t... Is>
static ENOKI_INLINE decltype(auto) gradient(T2 &a, std::index_sequence<Is...>) {
    using Value = decltype(enoki::gradient(a.coeff(0)));
    using Return = typename T::template ReplaceValue<Value>;
    return Return(enoki::gradient(a.coeff(Is))...);
}
|
||||||
|
|
||||||
|
/// Per-coefficient scatter (parameter is the scatter *destination* despite
/// being named 'src'); the array trick forces left-to-right pack expansion.
template <typename Dst, typename Index, typename Mask, size_t... Is>
static ENOKI_INLINE void scatter(Dst &src, const T &value, const Index &index,
                                 const Mask &mask, std::index_sequence<Is...>) {
    bool unused[] = { (enoki::scatter(src.coeff(Is), value.coeff(Is), index, mask), false) ... , false };
    ENOKI_MARK_USED(unused);
}
|
||||||
|
|
||||||
|
/// Per-coefficient scatter-add (parameter is the *destination* despite the
/// name 'src'); the array trick forces left-to-right pack expansion.
template <typename Dst, typename Index, typename Mask, size_t... Is>
static ENOKI_INLINE void scatter_add(Dst &src, const T &value, const Index &index,
                                     const Mask &mask, std::index_sequence<Is...>) {
    bool unused[] = { (enoki::scatter_add(src.coeff(Is), value.coeff(Is), index, mask), false) ... , false };
    ENOKI_MARK_USED(unused);
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/// struct_support specialization for dynamically-sized Enoki arrays:
/// everything forwards to the dynamic array's own implementation.
template <typename T>
struct struct_support<T, enable_if_dynamic_array_t<T>> {
    static constexpr bool IsDynamic = true;
    using Dynamic = T;

    static ENOKI_INLINE T zero(size_t size) { return T::zero_(size); }
    static ENOKI_INLINE T empty(size_t size) { return T::empty_(size); }

    /// Bundle 'value' with a write mask (used by masked assignments)
    static ENOKI_INLINE auto masked(T &value, const mask_t<T> &mask) {
        return detail::MaskedArray<T>{ value, mask };
    }

    // Size / element accessors
    static ENOKI_INLINE size_t packets(const T &value) { return value.packets(); }
    static ENOKI_INLINE size_t slices(const T &value) { return value.size(); }
    static ENOKI_INLINE void set_slices(T &value, size_t size) { value.resize(size); }
    static ENOKI_INLINE decltype(auto) packet(const T &value, size_t i) { return value.packet(i); }
    static ENOKI_INLINE decltype(auto) packet(T &value, size_t i) { return value.packet(i); }
    static ENOKI_INLINE decltype(auto) slice(const T &value, size_t i) { return value.coeff(i); }
    static ENOKI_INLINE decltype(auto) slice(T &value, size_t i) { return value.coeff(i); }
    static ENOKI_INLINE decltype(auto) slice_ptr(const T &value, size_t i) { return value.data() + i; }
    static ENOKI_INLINE decltype(auto) slice_ptr(T &value, size_t i) { return value.data() + i; }
    // Plain dynamic arrays carry no AD information -> detach is the identity
    static ENOKI_INLINE decltype(auto) detach(const T &value) { return value; }
    static ENOKI_INLINE decltype(auto) detach(T &value) { return value; }
    static ENOKI_INLINE auto ref_wrap(T &value) { return value.ref_wrap_(); }
    static ENOKI_INLINE auto ref_wrap(const T &value) { return value.ref_wrap_(); }

    /// Mask-compress 'value' into 'mem'
    template <typename Mem>
    static ENOKI_INLINE size_t compress(Mem &mem, const T& value, const mask_t<T> &mask) {
        return value.compress_(mem, mask);
    }
    /// Value-returning compress
    static ENOKI_INLINE T compress(const T &value, const mask_t<T> &mask) {
        return value.compress_(mask);
    }
};
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
/// Recursive helper function used by enoki::shape. Writes one size entry per
/// nesting level into 'out'; only coeff(0) is inspected at each level, so
/// ragged inputs must be detected separately (see is_ragged_recursive).
template <typename T>
void extract_shape_recursive(size_t *out, size_t i, const T &array) {
    ENOKI_MARK_USED(out); ENOKI_MARK_USED(i); ENOKI_MARK_USED(array);
    using Value = value_t<T>;

    if constexpr (is_array_v<T>) {
        *out = array.derived().size();
        if constexpr (is_array_v<Value>) {
            if (*out > 0)
                extract_shape_recursive(out + 1, i + 1, array.derived().coeff(0));
        }
    }
}
|
||||||
|
|
||||||
|
/// Check whether 'a' deviates anywhere from the reference 'shape' vector
/// (i.e. whether the nested array is ragged)
template <typename T>
bool is_ragged_recursive(const T &a, const size_t *shape) {
    ENOKI_MARK_USED(shape);
    if constexpr (is_array_v<T>) {
        size_t size = a.derived().size();
        if (*shape != size)
            return true;

        bool match = true;
        using Value = value_t<T>;
        // Only static arrays of dynamic elements can disagree per-coefficient;
        // all other nestings are uniform by construction.
        if constexpr (is_static_array_v<T> && is_dynamic_v<Value>) {
            for (size_t i = 0; i < size; ++i)
                match &= !is_ragged_recursive(a.derived().coeff(i), shape + 1);
        }

        return !match;
    } else {
        // Scalars terminate the recursion and are never ragged
        return false;
    }
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE void set_shape_recursive(T &&a, const size_t *shape) {
|
||||||
|
ENOKI_MARK_USED(shape);
|
||||||
|
if constexpr (is_array_v<T>) {
|
||||||
|
size_t size = a.derived().size();
|
||||||
|
a.resize(*shape);
|
||||||
|
|
||||||
|
if (is_dynamic_array_v<T>) {
|
||||||
|
/* done. */
|
||||||
|
} else if (is_dynamic_v<value_t<T>>) {
|
||||||
|
for (size_t i = 0; i < size; ++i)
|
||||||
|
set_shape_recursive(a.derived().coeff(i), shape + 1);
|
||||||
|
} else {
|
||||||
|
if (size > 0)
|
||||||
|
set_shape_recursive(a.derived().coeff(0), shape + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the shape of a nested array as an std::array (one entry per
/// nesting level). Only coeff(0) is inspected per level -- use ragged() to
/// verify the result is valid for the entire array.
template <typename T, typename Result = std::array<size_t, array_depth_v<T>>>
Result shape(const T &array) {
    Result result{0};
    detail::extract_shape_recursive(result.data(), 0, array);
    return result;
}
|
||||||
|
|
||||||
|
/// Resize the nested array 'a' to the given per-level shape
template <typename T>
void set_shape(T &a, const std::array<size_t, array_depth_v<T>> &value) {
    detail::set_shape_recursive(a, value.data());
}
|
||||||
|
|
||||||
|
/// Does the given nested array have inconsistent sizes at some level?
template <typename T> bool ragged(const T &a) {
    return detail::is_ragged_recursive(a, shape(a).data());
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,615 @@
|
||||||
|
/*
|
||||||
|
enoki/array_traits.h -- Type traits for Enoki arrays
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "fwd.h"
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cassert>
|
||||||
|
#include <array>
|
||||||
|
#include <limits>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <tuple>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name General type traits (not specific to Enoki arrays)
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Convenience wrapper around std::enable_if (yields 'int' so it can be used
/// as a defaulted non-type template parameter)
template <bool B> using enable_if_t = std::enable_if_t<B, int>;

/// Sentinel size marking dynamically-sized arrays
constexpr size_t Dynamic = (size_t) -1;
|
||||||
|
|
||||||
|
namespace detail {
    /// Identity function for types
    template <typename T, typename...> struct identity {
        using type = T;
    };

    /// Does 'T' derive from *some* instantiation of the template 'B'?
    template <template <typename...> typename B, typename T>
    struct is_base_of_impl {
    private:
        // Overload resolution picks the true_type overload iff T* converts
        // to const B<Ts...>* for some Ts...
        template <typename... Ts>
        static constexpr std::true_type test(const B<Ts...> *);
        static constexpr std::false_type test(...);

    public:
        using type = decltype(test(std::declval<T *>()));
    };

    /// Detection idiom: true iff Op<Ts...> is a well-formed type
    template <typename, template <typename...> typename Op, typename... Ts>
    struct detector : std::false_type { };

    template <template <typename...> typename Op, typename... Ts>
    struct detector<std::void_t<Op<Ts...>>, Op, Ts...>
        : std::true_type { };

    // Dependent 'false' for static_assert in discarded constexpr branches
    template <typename... > constexpr bool false_v = false;
}
|
||||||
|
|
||||||
|
/// Shorthand for detail::identity
template <typename... Ts> using identity_t = typename detail::identity<Ts...>::type;

/// Detection idiom entry point (cf. std::experimental::is_detected)
template <template<typename ...> class Op, class... Args>
constexpr bool is_detected_v = detail::detector<void, Op, Args...>::value;

/// Check if 'T' is a subtype of a given template 'B'
template <template <typename...> typename B, typename T>
using is_base_of = typename detail::is_base_of_impl<B, T>::type;

template <template <typename...> typename B, typename T>
constexpr bool is_base_of_v = is_base_of<B, T>::value;
|
||||||
|
|
||||||
|
/// Check if T is an integer of a given size (supports both 'int' and 'long' family)
|
||||||
|
// Width-based integer classification ('int' and 'long' families both match).
// The '_v' values are primary; the alias templates wrap them for tag dispatch.
template <typename T> constexpr bool is_int8_v  = sizeof(T) == 1 && std::is_integral_v<T>;
template <typename T> using is_int8  = std::bool_constant<is_int8_v<T>>;

template <typename T> constexpr bool is_int16_v = sizeof(T) == 2 && std::is_integral_v<T>;
template <typename T> using is_int16 = std::bool_constant<is_int16_v<T>>;

template <typename T> constexpr bool is_int32_v = sizeof(T) == 4 && std::is_integral_v<T>;
template <typename T> using is_int32 = std::bool_constant<is_int32_v<T>>;

template <typename T> constexpr bool is_int64_v = sizeof(T) == 8 && std::is_integral_v<T>;
template <typename T> using is_int64 = std::bool_constant<is_int64_v<T>>;

// Exact matches against the builtin floating point types
template <typename T> constexpr bool is_float_v = std::is_same_v<T, float>;
template <typename T> constexpr bool is_double_v = std::is_same_v<T, double>;

// 'float' or 'double'
template <typename T> constexpr bool is_std_float_v = is_float_v<T> || is_double_v<T>;
template <typename T> using is_std_float = std::bool_constant<is_std_float_v<T>>;

// 32- or 64-bit integer
template <typename T> constexpr bool is_std_int_v = is_int32_v<T> || is_int64_v<T>;
template <typename T> using is_std_int = std::bool_constant<is_std_int_v<T>>;

// Any of the above
template <typename T> constexpr bool is_std_type_v = is_std_int_v<T> || is_std_float_v<T>;
template <typename T> using is_std_type = std::bool_constant<is_std_type_v<T>>;
|
||||||
|
|
||||||
|
// SFINAE helpers keyed on the width traits above
template <typename T> using enable_if_int32_t = enable_if_t<is_int32_v<T>>;
template <typename T> using enable_if_int64_t = enable_if_t<is_int64_v<T>>;
// NOTE(review): the next three are alias templates (types) yet carry a '_v'
// suffix; '_t' would match the file's convention, but renaming would break
// callers, so they are documented rather than changed.
template <typename T> using enable_if_std_int_v = enable_if_t<is_std_int_v<T>>;
template <typename T> using enable_if_std_float_v = enable_if_t<is_std_float_v<T>>;
template <typename T> using enable_if_std_type_v = enable_if_t<is_std_type_v<T>>;

/// Is T (after decay) a scalar in the std::is_scalar sense?
template <typename T> constexpr bool is_scalar_v = std::is_scalar_v<std::decay_t<T>>;
|
||||||
|
|
||||||
|
namespace detail {
    /// Value equivalence between arithmetic type to work around subtle issues between 'long' vs 'long long' on OSX
    template <typename T0, typename T1>
    struct is_same {
        static constexpr bool value =
            sizeof(T0) == sizeof(T1) &&
            std::is_floating_point_v<T0> == std::is_floating_point_v<T1> &&
            std::is_signed_v<T0> == std::is_signed_v<T1> &&
            std::is_arithmetic_v<T0> == std::is_arithmetic_v<T1>;
    };

    template <typename T0, typename T1>
    static constexpr bool is_same_v = is_same<T0, T1>::value;

    // Well-formed only when T::Size exists and differs from 'Dynamic';
    // combined with the detection idiom this distinguishes static arrays.
    template <typename T> using has_size = std::enable_if_t<std::decay_t<T>::Size != Dynamic>;
    template <typename T> constexpr bool has_size_v = is_detected_v<has_size, T>;

    // Well-formed only when T::IsMaskedArray is a truthy constant
    template <typename T> using is_masked_array = std::enable_if_t<T::IsMaskedArray>;
    template <typename T> constexpr bool is_masked_array_v = is_detected_v<is_masked_array, T>;
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Type traits for Enoki arrays
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Is 'T' an Enoki array? (any variant)
template <typename T> using is_array = is_base_of<ArrayBase, std::decay_t<T>>;
template <typename T> constexpr bool is_array_v = is_array<T>::value;
template <typename T> using enable_if_array_t = enable_if_t<is_array_v<T>>;
template <typename T> using enable_if_not_array_t = enable_if_t<!is_array_v<T>>;

/// Is at least one of Ts... an Enoki array?
template <typename... Ts> using is_array_any = std::disjunction<is_array<Ts>...>;
template <typename... Ts> constexpr bool is_array_any_v = is_array_any<Ts...>::value;
template <typename... Ts> using enable_if_array_any_t = enable_if_t<is_array_any_v<Ts...>>;

/// Statically-sized Enoki array (compile-time 'Size' member)
template <typename T> using is_static_array = std::bool_constant<is_array_v<T> && detail::has_size_v<T>>;
template <typename T> constexpr bool is_static_array_v = is_static_array<T>::value;
template <typename T> using enable_if_static_array_t = enable_if_t<is_static_array_v<T>>;

/// Dynamically-sized Enoki array
template <typename T> using is_dynamic_array = std::bool_constant<is_array_v<T> && !detail::has_size_v<T>>;
template <typename T> constexpr bool is_dynamic_array_v = is_dynamic_array<T>::value;
template <typename T> using enable_if_dynamic_array_t = enable_if_t<is_dynamic_array_v<T>>;
|
||||||
|
|
||||||
|
namespace detail {
    // Fallback: the "value type" of a non-array is the decayed type itself
    template <typename T, typename = int> struct value {
        using type = std::decay_t<T>;
    };

    template <typename T, typename = int> struct packet_ {
        using type = std::decay_t<T>;
    };

    template <typename T> struct value<T, enable_if_array_t<T>> {
        using type = typename std::decay_t<T>::Derived::Value;
    };

    // 'packet_' mirrors 'value' but looks through masked-array wrappers
    template <typename T>
    struct packet_<
        T, enable_if_t<is_array_v<T> && !detail::is_masked_array_v<T>>> {
        using type = typename std::decay_t<T>::Derived::Value;
    };

    template <typename T>
    struct packet_<
        T, enable_if_t<is_array_v<T> && detail::is_masked_array_v<T>>> {
        using type = typename std::decay_t<T>::Derived::UnderlyingValue;
    };
}
|
||||||
|
|
||||||
|
/// Type trait to access the value type of an array
template <typename T> using value_t = typename detail::value<T>::type;

/// Is 'T' an Enoki mask or a boolean?
template <typename T, typename = int> struct is_mask {
    static constexpr bool value = std::is_same_v<std::decay_t<T>, bool>;
};

// A single bit extracted from a packed mask also counts as a mask
template <typename T> struct is_mask<MaskBit<T>> {
    static constexpr bool value = true;
};

template <typename T> struct is_mask<T, enable_if_array_t<T>> {
    static constexpr bool value = std::decay_t<T>::Derived::IsMask;
};

template <typename T> constexpr bool is_mask_v = is_mask<T>::value;
template <typename T> using enable_if_mask_t = enable_if_t<is_mask_v<T>>;
template <typename T> using enable_if_not_mask_t = enable_if_t<!is_mask_v<T>>;
|
||||||
|
|
||||||
|
/// Is 'T' implemented using a recursive implementation?
template <typename T, typename = int> struct is_recursive_array {
    static constexpr bool value = false;
};

template <typename T> struct is_recursive_array<T, enable_if_array_t<T>> {
    static constexpr bool value = std::decay_t<T>::Derived::IsRecursive;
};

template <typename T> constexpr bool is_recursive_array_v = is_recursive_array<T>::value;
template <typename T> using enable_if_recursive_t = enable_if_t<is_recursive_array_v<T>>;

/// Does this array compute derivatives using automatic differentiation?
template <typename T, typename = int> struct is_diff_array {
    static constexpr bool value = false;
};

template <typename T> struct is_diff_array<T, enable_if_array_t<T>> {
    static constexpr bool value = std::decay_t<T>::Derived::IsDiff;
};

template <typename T> constexpr bool is_diff_array_v = is_diff_array<T>::value;
template <typename T> using enable_if_diff_array_t = enable_if_t<is_diff_array_v<T>>;

/// Does this array reside on the GPU (via CUDA)?
template <typename T, typename = int> struct is_cuda_array {
    static constexpr bool value = false;
};

template <typename T> struct is_cuda_array<T, enable_if_array_t<T>> {
    static constexpr bool value = std::decay_t<T>::Derived::IsCUDA;
};

template <typename T> constexpr bool is_cuda_array_v = is_cuda_array<T>::value;
template <typename T> using enable_if_cuda_t = enable_if_t<is_cuda_array_v<T>>;
|
||||||
|
|
||||||
|
/// Determine the depth of a nested Enoki array (scalars evaluate to zero)
template <typename T, typename = int> struct array_depth {
    static constexpr size_t value = 0;
};

template <typename T> struct array_depth<T, enable_if_array_t<T>> {
    static constexpr size_t value = std::decay_t<T>::Derived::Depth;
};

template <typename T> constexpr size_t array_depth_v = array_depth<T>::value;

/// Determine the size of a nested Enoki array (scalars evaluate to one)
template <typename T, typename = int> struct array_size {
    static constexpr size_t value = 1;
};

template <typename T> struct array_size<T, enable_if_static_array_t<T>> {
    static constexpr size_t value = std::decay_t<T>::Derived::Size;
};

// Dynamic arrays report the 'Dynamic' sentinel rather than a real size
template <typename T> struct array_size<T, enable_if_dynamic_array_t<T>> {
    static constexpr size_t value = Dynamic;
};

template <typename T> constexpr size_t array_size_v = array_size<T>::value;
|
||||||
|
|
||||||
|
namespace detail {
    /// Metafunction that pushes a compile-time value onto the front of an
    /// index_sequence; intentionally has no 'type' member for other arguments.
    template <typename Seq, size_t>
    struct prepend_index { };

    template <size_t... Rest, size_t Head>
    struct prepend_index<std::index_sequence<Rest...>, Head> {
        using type = std::index_sequence<Head, Rest...>;
    };

    /// prepend_index_t<index_sequence<1, 2>, 0> == index_sequence<0, 1, 2>
    template <typename Seq, size_t Head>
    using prepend_index_t = typename prepend_index<Seq, Head>::type;
}
|
||||||
|
|
||||||
|
/// Determine the shape of an array as a compile-time index_sequence
/// (empty sequence for non-array types)
template <typename T, typename = int> struct array_shape {
    using type = std::index_sequence<>;
};

template <typename T>
using array_shape_t = typename array_shape<T>::type;

// Arrays prepend their own size to the shape of their value type
template <typename T> struct array_shape<T, enable_if_array_t<T>> {
    using type = detail::prepend_index_t<array_shape_t<value_t<T>>, array_size_v<T>>;
};
|
||||||
|
|
||||||
|
namespace detail {
    // Fallback: the scalar type of a non-array is the decayed type itself
    template <typename T, typename = int> struct scalar {
        using type = std::decay_t<T>;
    };

    template <typename T> struct scalar<T, enable_if_array_t<T>> {
        using type = typename std::decay_t<T>::Derived::Scalar;
    };

    /// Value type that looks through MaskedArray wrappers
    template <typename T> using packet_t = typename detail::packet_<T>::type;
}
|
||||||
|
|
||||||
|
/// Type trait to access the base scalar type underlying a potentially nested array
template <typename T> using scalar_t = typename detail::scalar<T>::type;

// Forward declaration: reference to a single bit within a packed mask
struct BitRef;
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
/// Copy modifier flags (const/pointer/lvalue/rvalue reference from 'S' to 'T')
|
||||||
|
template <typename S, typename T> struct copy_flags {
|
||||||
|
private:
|
||||||
|
using R = std::remove_reference_t<S>;
|
||||||
|
using T1 = std::conditional_t<std::is_const_v<R>, std::add_const_t<T>, T>;
|
||||||
|
using T2 = std::conditional_t<std::is_pointer_v<S>,
|
||||||
|
std::add_pointer_t<T1>, T1>;
|
||||||
|
using T3 = std::conditional_t<std::is_lvalue_reference_v<S>,
|
||||||
|
std::add_lvalue_reference_t<T2>, T2>;
|
||||||
|
using T4 = std::conditional_t<std::is_rvalue_reference_v<S>,
|
||||||
|
std::add_rvalue_reference_t<T3>, T3>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using type = T4;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename S, typename T>
|
||||||
|
using copy_flags_t = typename detail::copy_flags<S, T>::type;
|
||||||
|
|
||||||
|
template <typename T, bool CopyFlags, typename = int> struct mask {
|
||||||
|
using type = bool;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, bool CopyFlags> struct mask<T&, CopyFlags, enable_if_t<is_scalar_v<T>>> {
|
||||||
|
using type = BitRef;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, bool CopyFlags> struct mask<T, CopyFlags, enable_if_array_t<T>> {
|
||||||
|
private:
|
||||||
|
using Mask = copy_flags_t<T, typename std::decay_t<T>::Derived::MaskType>;
|
||||||
|
public:
|
||||||
|
using type = std::conditional_t<CopyFlags, detail::copy_flags_t<T, Mask>, Mask>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, bool CopyFlags, typename = int> struct array { };
|
||||||
|
|
||||||
|
template <typename T, bool CopyFlags> struct array<T, CopyFlags, enable_if_array_t<T>> {
|
||||||
|
private:
|
||||||
|
using Array = copy_flags_t<T, typename std::decay_t<T>::Derived::ArrayType>;
|
||||||
|
public:
|
||||||
|
using type = std::conditional_t<CopyFlags, detail::copy_flags_t<T, Array>, Array>;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Type trait to access the mask type underlying an array
template <typename T, bool CopyFlags = true> using mask_t = typename detail::mask<T, CopyFlags>::type;

/// Type trait to access the array type underlying a mask
template <typename T, bool CopyFlags = true> using array_t = typename detail::array<T, CopyFlags>::type;
|
||||||
|
|
||||||
|
/// Extract the most deeply nested Enoki array type from a list of arguments
template <typename... Args> struct deepest_array;
template <> struct deepest_array<> { using type = void; };

template <typename Arg, typename... Args> struct deepest_array<Arg, Args...> {
private:
    using T0 = Arg;
    using T1 = typename deepest_array<Args...>::type;

    // Give precedence to dynamic arrays
    // NOTE(review): the tie-break below actually compares *depth* (and skips
    // non-arrays via D0 == 0); confirm the comment's claim about dynamic
    // arrays against callers.
    static constexpr size_t D0 = array_depth_v<T0>;
    static constexpr size_t D1 = array_depth_v<T1>;

public:
    using type = std::conditional_t<(D1 > D0 || D0 == 0), T1, T0>;
};

template <typename... Args> using deepest_array_t = typename deepest_array<Args...>::type;
|
||||||
|
|
||||||
|
namespace detail {
    // Forward declaration; the definition appears once its ingredients exist
    template <typename... Ts> struct expr;
}

/// Type trait to compute the type of an arithmetic expression involving Ts...
template <typename... Ts> using expr_t = typename detail::expr<Ts...>::type;
|
||||||
|
|
||||||
|
namespace detail {
    /// Type trait to compute the result of a unary expression
    template <typename Array, typename T> struct expr_1;

    // Array case: recursively replace the value type by its expression type
    template <typename T> struct expr_1<T, T> {
    private:
        using Td = std::decay_t<T>;
        using Entry = value_t<T>;
        using EntryExpr = expr_t<Entry>;

    public:
        using type = std::conditional_t<
            std::is_same_v<Entry, EntryExpr>,
            Td, typename Td::Derived::template ReplaceValue<EntryExpr>
        >;
    };

    // Non-array case ('Array' resolved to void): just decay
    template <typename T>
    struct expr_1<void, T> { using type = std::decay_t<T>; };

    /// Type trait to compute the result of a n-ary expression involving types (T, Ts...)
    template <typename Array, typename T, typename... Ts>
    struct expr_n {
    private:
        using Value = expr_t<detail::packet_t<T>, detail::packet_t<Ts>...>;
    public:
        using type = typename std::decay_t<Array>::Derived::template ReplaceValue<Value>;
    };

    // Scalar n-ary case: rely on the language's own '+' promotion rules
    template <typename T, typename... Ts>
    struct expr_n<void, T, Ts...> {
        using type = decltype(std::declval<T>() + std::declval<expr_t<Ts...>>());
    };

    // Pointer arithmetic and other combinations '+' does not cover
    template <typename T1, typename T2> struct expr_n<void, T1*, T2*> { using type = std::common_type_t<T1*, T2*>; };
    template <typename T> struct expr_n<void, T*, std::nullptr_t> { using type = T*; };
    template <typename T> struct expr_n<void, T*, unsigned long long> { using type = T*; };
    template <typename T> struct expr_n<void, T*, unsigned long> { using type = T*; };
    template <typename T> struct expr_n<void, std::nullptr_t, T*> { using type = T*; };
    // Division by a precomputed divisor yields the dividend's type
    template <typename T, typename T2> struct expr_n<void, T, enoki::divisor_ext<T2>> { using type = T2; };
    template <typename T, typename T2> struct expr_n<void, T, enoki::divisor<T2>> { using type = T2; };
    // Avoid bool -> int promotion through '+'
    template <> struct expr_n<void, bool, bool> { using type = bool; };

    /// Type trait to compute the result of arbitrary expressions
    template <typename... Ts> struct expr : detail::expr_n<deepest_array_t<Ts...>, Ts...> { };
    template <typename T> struct expr<T> : detail::expr_1<deepest_array_t<T>, T> { };
}
|
||||||
|
|
||||||
|
namespace detail {
    /// Should broadcasts of this type prefer the outermost dimension?
    template <typename T, typename = int> struct array_broadcast_outer {
        static constexpr bool value = true;
    };

    template <typename T> struct array_broadcast_outer<T, enable_if_array_t<T>> {
        static constexpr bool value = std::decay_t<T>::Derived::BroadcastPreferOuter;
    };

    template <typename T> constexpr bool array_broadcast_outer_v = array_broadcast_outer<T>::value;

    /// Convenience class to choose an arithmetic type based on its size and flavor
    template <size_t Size> struct type_chooser { };

    // No 1-byte 'Float' member: there is no 8-bit floating point type here
    template <> struct type_chooser<1> {
        using Int = int8_t;
        using UInt = uint8_t;
    };

    template <> struct type_chooser<2> {
        using Int = int16_t;
        using UInt = uint16_t;
        using Float = half;
    };

    template <> struct type_chooser<4> {
        using Int = int32_t;
        using UInt = uint32_t;
        using Float = float;
    };

    template <> struct type_chooser<8> {
        using Int = int64_t;
        using UInt = uint64_t;
        using Float = double;
    };
}
|
||||||
|
|
||||||
|
/// Replace the base scalar type of a (potentially nested) array
template <typename T, typename Value, bool CopyFlags = true, typename = int>
struct replace_scalar { };

template <typename T, typename Value, bool CopyFlags = true>
using replace_scalar_t = typename replace_scalar<T, Value, CopyFlags>::type;

// Scalars: replace outright (optionally keeping const/ref/pointer flags)
template <typename T, typename Value, bool CopyFlags> struct replace_scalar<T, Value, CopyFlags, enable_if_not_array_t<T>> {
    using type = std::conditional_t<CopyFlags, detail::copy_flags_t<T, Value>, Value>;
};

// Arrays: recurse into the value type, then rebuild the array around it
template <typename T, typename Value, bool CopyFlags> struct replace_scalar<T, Value, CopyFlags, enable_if_array_t<T>> {
private:
    using Entry = replace_scalar_t<detail::packet_t<T>, Value, CopyFlags>;
    using Array = typename std::decay_t<T>::Derived::template ReplaceValue<Entry>;
public:
    using type = std::conditional_t<CopyFlags, detail::copy_flags_t<T, Array>, Array>;
};
|
||||||
|
|
||||||
|
/// Integer-based version of a given array class (same element width)
template <typename T, bool CopyFlags = true>
using int_array_t = replace_scalar_t<T, typename detail::type_chooser<sizeof(scalar_t<T>)>::Int, CopyFlags>;

/// Unsigned integer-based version of a given array class
template <typename T, bool CopyFlags = true>
using uint_array_t = replace_scalar_t<T, typename detail::type_chooser<sizeof(scalar_t<T>)>::UInt, CopyFlags>;

/// Floating point-based version of a given array class
template <typename T, bool CopyFlags = true>
using float_array_t = replace_scalar_t<T, typename detail::type_chooser<sizeof(scalar_t<T>)>::Float, CopyFlags>;

// Fixed-width / fixed-type variants
template <typename T, bool CopyFlags = true> using int32_array_t = replace_scalar_t<T, int32_t, CopyFlags>;
template <typename T, bool CopyFlags = true> using uint32_array_t = replace_scalar_t<T, uint32_t, CopyFlags>;
template <typename T, bool CopyFlags = true> using int64_array_t = replace_scalar_t<T, int64_t, CopyFlags>;
template <typename T, bool CopyFlags = true> using uint64_array_t = replace_scalar_t<T, uint64_t, CopyFlags>;
template <typename T, bool CopyFlags = true> using float16_array_t = replace_scalar_t<T, half, CopyFlags>;
template <typename T, bool CopyFlags = true> using float32_array_t = replace_scalar_t<T, float, CopyFlags>;
template <typename T, bool CopyFlags = true> using float64_array_t = replace_scalar_t<T, double, CopyFlags>;
template <typename T, bool CopyFlags = true> using bool_array_t = replace_scalar_t<T, bool, CopyFlags>;
template <typename T, bool CopyFlags = true> using size_array_t = replace_scalar_t<T, size_t, CopyFlags>;
// NOTE(review): ssize_t is POSIX, not ISO C++ -- presumably provided by a
// platform shim elsewhere in the project; confirm for MSVC builds.
template <typename T, bool CopyFlags = true> using ssize_array_t = replace_scalar_t<T, ssize_t, CopyFlags>;
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename T> using struct_support_t = struct_support<std::decay_t<T>>;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Type enumeration
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
enum class EnokiType { Invalid = 0, Int8, UInt8, Int16, UInt16,
|
||||||
|
Int32, UInt32, Int64, UInt64, Float16,
|
||||||
|
Float32, Float64, Bool, Pointer };
|
||||||
|
|
||||||
|
template <typename T, typename = int> struct enoki_type {
|
||||||
|
static constexpr EnokiType value = EnokiType::Invalid;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T, enable_if_t<is_int8_v<T>>> {
|
||||||
|
static constexpr EnokiType value =
|
||||||
|
std::is_signed_v<T> ? EnokiType::Int8 : EnokiType::UInt8;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T, enable_if_t<is_int16_v<T>>> {
|
||||||
|
static constexpr EnokiType value =
|
||||||
|
std::is_signed_v<T> ? EnokiType::Int16 : EnokiType::UInt16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T, enable_if_t<is_int32_v<T>>> {
|
||||||
|
static constexpr EnokiType value =
|
||||||
|
std::is_signed_v<T> ? EnokiType::Int32 : EnokiType::UInt32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T, enable_if_t<is_int64_v<T>>> {
|
||||||
|
static constexpr EnokiType value =
|
||||||
|
std::is_signed_v<T> ? EnokiType::Int64 : EnokiType::UInt64;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T, enable_if_t<std::is_enum_v<T>>> {
|
||||||
|
static constexpr EnokiType value = enoki_type<std::underlying_type_t<T>>::value;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct enoki_type<half> {
|
||||||
|
static constexpr EnokiType value = EnokiType::Float16;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct enoki_type<float> {
|
||||||
|
static constexpr EnokiType value = EnokiType::Float32;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct enoki_type<double> {
|
||||||
|
static constexpr EnokiType value = EnokiType::Float64;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct enoki_type<bool> {
|
||||||
|
static constexpr EnokiType value = EnokiType::Bool;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> struct enoki_type<T *> {
|
||||||
|
static constexpr EnokiType value = EnokiType::Pointer;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> constexpr EnokiType enoki_type_v = enoki_type<T>::value;
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Type trait to inspect the return/argument types of functions
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename T, typename SFINAE = void> struct function_traits { };
|
||||||
|
|
||||||
|
// Vanilla function
|
||||||
|
template <typename R, typename... A> struct function_traits<R(*)(A...)> {
|
||||||
|
using Args = std::tuple<A...>;
|
||||||
|
using Return = R;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Method
|
||||||
|
template <typename C, typename R, typename... A> struct function_traits<R(C::*)(A...)> {
|
||||||
|
using Class = C;
|
||||||
|
using Args = std::tuple<A...>;
|
||||||
|
using Return = R;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Method (const)
|
||||||
|
template <typename C, typename R, typename... A> struct function_traits<R(C::*)(A...) const> {
|
||||||
|
using Class = C;
|
||||||
|
using Args = std::tuple<A...>;
|
||||||
|
using Return = R;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Lambda function -- strip lambda closure and delegate back to ``function_traits``
|
||||||
|
template <typename F>
|
||||||
|
struct function_traits<
|
||||||
|
F, std::enable_if_t<std::is_member_function_pointer_v<decltype(
|
||||||
|
&std::remove_reference_t<F>::operator())>>>
|
||||||
|
: function_traits<decltype(&std::remove_reference_t<F>::operator())> { };
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,200 @@
|
||||||
|
/*
|
||||||
|
enoki/array_router.h -- Helper functions which route function calls
|
||||||
|
in the enoki namespace to the intended recipients
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_generic.h>
|
||||||
|
#include <enoki/array_idiv.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Analagous to meshgrid() in NumPy or MATLAB; for dynamic arrays
|
||||||
|
template <typename T, enable_if_dynamic_array_t<T> = 0>
|
||||||
|
Array<T, 2> meshgrid(const T &x, const T &y) {
|
||||||
|
if constexpr (is_cuda_array_v<T> || is_diff_array_v<T>) {
|
||||||
|
x.eval(); y.eval();
|
||||||
|
|
||||||
|
if (x.size() == 1) {
|
||||||
|
T x2(x);
|
||||||
|
set_slices(x2, slices(y));
|
||||||
|
return Array<T, 2>(
|
||||||
|
std::move(x2),
|
||||||
|
y
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t n = (uint32_t) x.size() * (uint32_t) y.size();
|
||||||
|
divisor<uint32_t> div((uint32_t) x.size());
|
||||||
|
|
||||||
|
using UInt32 = uint32_array_t<T>;
|
||||||
|
UInt32 index = arange<UInt32>(n),
|
||||||
|
yi = div(index),
|
||||||
|
xi = index - yi * (uint32_t) x.size();
|
||||||
|
|
||||||
|
return Array<T, 2>(
|
||||||
|
gather<T>(x, xi),
|
||||||
|
gather<T>(y, yi)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
T X, Y;
|
||||||
|
set_slices(X, x.size() * y.size());
|
||||||
|
set_slices(Y, x.size() * y.size());
|
||||||
|
|
||||||
|
size_t pos = 0;
|
||||||
|
|
||||||
|
if (x.size() % T::PacketSize == 0) {
|
||||||
|
/* Fast path */
|
||||||
|
|
||||||
|
for (size_t i = 0; i < y.size(); ++i) {
|
||||||
|
for (size_t j = 0; j < packets(x); ++j) {
|
||||||
|
packet(X, pos) = packet(x, j);
|
||||||
|
packet(Y, pos) = y.coeff(i);
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < y.size(); ++i) {
|
||||||
|
for (size_t j = 0; j < x.size(); ++j) {
|
||||||
|
X.coeff(pos) = x.coeff(j);
|
||||||
|
Y.coeff(pos) = y.coeff(i);
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Array<T, 2>(std::move(X), std::move(Y));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Vectorized N-dimensional 'range' iterable with automatic mask computation
|
||||||
|
template <typename Value> struct range {
|
||||||
|
static constexpr size_t Dimension = array_depth_v<Value> == 2 ?
|
||||||
|
array_size_v<Value> : 1;
|
||||||
|
static constexpr size_t PacketSize = array_depth_v<Value> == 2 ?
|
||||||
|
array_size_v<value_t<Value>> : array_size_v<Value>;
|
||||||
|
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
using Packet = Array<Scalar, PacketSize>;
|
||||||
|
using Size = Array<Scalar, Dimension>;
|
||||||
|
|
||||||
|
struct iterator {
|
||||||
|
iterator(size_t index) : index(index) { }
|
||||||
|
iterator(size_t index, Size size)
|
||||||
|
: index(index), index_p(arange<Packet>()), size(size) {
|
||||||
|
for (size_t i = 0; i < Dimension - 1; ++i)
|
||||||
|
div[i] = size[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator==(const iterator &it) const { return it.index == index; }
|
||||||
|
bool operator!=(const iterator &it) const { return it.index != index; }
|
||||||
|
|
||||||
|
iterator &operator++() {
|
||||||
|
index += 1;
|
||||||
|
index_p += Scalar(Packet::Size);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<Value, mask_t<Packet>> operator*() const {
|
||||||
|
if constexpr (array_depth_v<Value> == 1) {
|
||||||
|
return { index_p, index_p < size[0] };
|
||||||
|
} else {
|
||||||
|
Value value;
|
||||||
|
value[0] = index_p;
|
||||||
|
ENOKI_UNROLL for (size_t i = 0; i < Dimension - 1; ++i)
|
||||||
|
value[i + 1] = div[i](value[i]);
|
||||||
|
Packet offset = zero<Packet>();
|
||||||
|
ENOKI_UNROLL for (size_t i = Dimension - 2; ; --i) {
|
||||||
|
offset = size[i] * (value[i + 1] + offset);
|
||||||
|
value[i] -= offset;
|
||||||
|
if (i == 0)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return { value, value[Dimension - 1] < size[Dimension - 1] };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t index;
|
||||||
|
Packet index_p;
|
||||||
|
Size size;
|
||||||
|
divisor<Scalar> div[Dimension > 1 ? (Dimension - 1) : 1];
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename... Args>
|
||||||
|
range(Args&&... args) : size(args...) { }
|
||||||
|
|
||||||
|
iterator begin() {
|
||||||
|
return iterator(0, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
iterator end() {
|
||||||
|
return iterator((hprod(size) + Packet::Size - 1) / Packet::Size);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
Size size;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Predicate,
|
||||||
|
typename Args = typename function_traits<Predicate>::Args,
|
||||||
|
typename Index = std::decay_t<std::tuple_element_t<0, Args>>>
|
||||||
|
Index binary_search(scalar_t<Index> start_,
|
||||||
|
scalar_t<Index> end_,
|
||||||
|
const Predicate &pred) {
|
||||||
|
Index start(start_), end(end_);
|
||||||
|
|
||||||
|
scalar_t<Index> iterations = (start_ < end_) ?
|
||||||
|
(log2i(end_ - start_) + 1) : 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < iterations; ++i) {
|
||||||
|
Index middle = sr<1>(start + end);
|
||||||
|
|
||||||
|
mask_t<Index> cond = pred(middle);
|
||||||
|
|
||||||
|
masked(start, cond) = min(middle + 1, end);
|
||||||
|
masked(end, !cond) = middle;
|
||||||
|
}
|
||||||
|
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Stack memory allocation
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Wrapper around alloca(), which returns aligned (and, optionally,
|
||||||
|
* zero-initialized) memory
|
||||||
|
*/
|
||||||
|
#define ENOKI_ALIGNED_ALLOCA(Array, Count, Clear) \
|
||||||
|
enoki::detail::alloca_helper<Array, Clear>((uint8_t *) alloca( \
|
||||||
|
sizeof(Array) * (Count) + enoki::max_packet_size - 4), \
|
||||||
|
sizeof(Array) * (Count))
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
template <typename Array, bool Clear>
|
||||||
|
ENOKI_INLINE Array *alloca_helper(uint8_t *ptr, size_t size) {
|
||||||
|
(uintptr_t &) ptr +=
|
||||||
|
((max_packet_size - (uintptr_t) ptr) % max_packet_size);
|
||||||
|
if constexpr (Clear)
|
||||||
|
memset(ptr, 0, size);
|
||||||
|
return (Array *) ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,95 @@
|
||||||
|
/*
|
||||||
|
enoki/color.h -- Color space transformations (only sRGB so far)
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename T> expr_t<T> linear_to_srgb(const T &x) {
|
||||||
|
using Value = expr_t<T>;
|
||||||
|
using Mask = mask_t<Value>;
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
constexpr bool Single = std::is_same_v<Scalar, float>;
|
||||||
|
|
||||||
|
Value r = Scalar(12.92);
|
||||||
|
Mask large_mask = x > Scalar(0.0031308);
|
||||||
|
|
||||||
|
if (ENOKI_LIKELY(any(large_mask))) {
|
||||||
|
Value y = sqrt(x), p, q;
|
||||||
|
|
||||||
|
if constexpr (Single) {
|
||||||
|
p = poly5(y, -0.0016829072605308378, 0.03453868659826638,
|
||||||
|
0.7642611304733891, 2.0041169284241644,
|
||||||
|
0.7551545191665577, -0.016202083165206348);
|
||||||
|
q = poly5(y, 4.178892964897981e-7, -0.00004375359692957097,
|
||||||
|
0.03467195408529984, 0.6085338522168684,
|
||||||
|
1.8970238036421054, 1.);
|
||||||
|
} else {
|
||||||
|
p = poly10(y, -3.7113872202050023e-6, -0.00021805827098915798,
|
||||||
|
0.002531335520959116, 0.2263810267005674,
|
||||||
|
3.0477578489880823, 15.374469584296442,
|
||||||
|
32.44669922192121, 27.901125077137042, 8.450947414259522,
|
||||||
|
0.5838023820686707, -0.0031151377052754843);
|
||||||
|
q = poly10(y, 2.2380622409188757e-11, -8.387527630781522e-9,
|
||||||
|
0.00007045228641004039, 0.007244514696840552,
|
||||||
|
0.21749170309546628, 2.575446652731678,
|
||||||
|
13.297981743005433, 30.50364355650628, 29.70548706952188,
|
||||||
|
10.723011300050162, 1.);
|
||||||
|
}
|
||||||
|
|
||||||
|
masked(r, large_mask) = p / q;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r * x;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> expr_t<T> srgb_to_linear(const T &x) {
|
||||||
|
using Value = expr_t<T>;
|
||||||
|
using Mask = mask_t<Value>;
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
constexpr bool Single = std::is_same_v<Scalar, float>;
|
||||||
|
|
||||||
|
Value r = Scalar(1.0 / 12.92);
|
||||||
|
Mask large_mask = x > Scalar(0.04045);
|
||||||
|
|
||||||
|
if (ENOKI_LIKELY(any(large_mask))) {
|
||||||
|
Value p, q;
|
||||||
|
|
||||||
|
if constexpr (Single) {
|
||||||
|
p = poly4(x, -0.0163933279112946, -0.7386328024653209,
|
||||||
|
-11.199318357635072, -47.46726633009393,
|
||||||
|
-36.04572663838034);
|
||||||
|
q = poly4(x, -0.004261480793199332, -19.140923959601675,
|
||||||
|
-59.096406619244426, -18.225745396846637, 1.);
|
||||||
|
} else {
|
||||||
|
p = poly9(x, -0.008042950896814532, -0.5489744177844188,
|
||||||
|
-14.786385491859248, -200.19589605282445,
|
||||||
|
-1446.951694673217, -5548.704065887224,
|
||||||
|
-10782.158977031822, -9735.250875334352,
|
||||||
|
-3483.4445569178347, -342.62884098034357);
|
||||||
|
q = poly9(x, -2.2132610916769585e-8, -9.646075249097724,
|
||||||
|
-237.47722999429413, -2013.8039726540235,
|
||||||
|
-7349.477378676199, -11916.470977597566,
|
||||||
|
-8059.219012060384, -1884.7738197074218,
|
||||||
|
-84.8098437770271, 1.);
|
||||||
|
}
|
||||||
|
|
||||||
|
masked(r, large_mask) = p / q;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r * x;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,289 @@
|
||||||
|
/*
|
||||||
|
enoki/complex.h -- Complex number data structure
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// SFINAE helper for complex numbers
|
||||||
|
template <typename T> using is_complex_helper = enable_if_t<std::decay_t<T>::IsComplex>;
|
||||||
|
template <typename T> constexpr bool is_complex_v = is_detected_v<is_complex_helper, T>;
|
||||||
|
template <typename T> using enable_if_complex_t = enable_if_t<is_complex_v<T>>;
|
||||||
|
template <typename T> using enable_if_not_complex_t = enable_if_t<!is_complex_v<T>>;
|
||||||
|
|
||||||
|
template <typename Value_>
|
||||||
|
struct Complex : StaticArrayImpl<Value_, 2, false, Complex<Value_>> {
|
||||||
|
using Base = StaticArrayImpl<Value_, 2, false, Complex<Value_>>;
|
||||||
|
ENOKI_ARRAY_IMPORT_BASIC(Base, Complex);
|
||||||
|
using Base::operator=;
|
||||||
|
|
||||||
|
static constexpr bool IsComplex = true;
|
||||||
|
static constexpr bool IsVector = false;
|
||||||
|
|
||||||
|
using ArrayType = Complex;
|
||||||
|
using MaskType = Mask<Value_, 2>;
|
||||||
|
|
||||||
|
template <typename T> using ReplaceValue = Complex<T>;
|
||||||
|
|
||||||
|
Complex() = default;
|
||||||
|
|
||||||
|
template <typename T, enable_if_complex_t<T> = 0>
|
||||||
|
ENOKI_INLINE Complex(T&& z) : Base(z) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<(array_depth_v<T> < Base::Depth && (is_scalar_v<T> || is_array_v<T>))> = 0,
|
||||||
|
enable_if_not_complex_t<T> = 0>
|
||||||
|
ENOKI_INLINE Complex(T &&v) : Base(v, zero<Value_>()) { }
|
||||||
|
|
||||||
|
template <typename T, enable_if_t<(array_depth_v<T> == Base::Depth || !(is_scalar_v<T> || is_array_v<T>))> = 0,
|
||||||
|
enable_if_not_complex_t<T> = 0>
|
||||||
|
ENOKI_INLINE Complex(T &&v) : Base(std::forward<T>(v)) { }
|
||||||
|
|
||||||
|
ENOKI_INLINE Complex(const Value_ &v1, const Value_ &v2) : Base(v1, v2) { }
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE static Complex full_(const T &value, size_t size) {
|
||||||
|
return Array<Value, 2>::full_(value, size);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T, enable_if_complex_t<T> = 0>
|
||||||
|
ENOKI_INLINE T identity(size_t size = 1) {
|
||||||
|
using Value = value_t<T>;
|
||||||
|
return T(full<Value>(1.f, size), zero<Value>(size));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE expr_t<T> real(const Complex<T> &z) { return z.x(); }
|
||||||
|
template <typename T> ENOKI_INLINE expr_t<T> imag(const Complex<T> &z) { return z.y(); }
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE expr_t<T> squared_norm(const Complex<T> &z) {
|
||||||
|
return squared_norm(Array<expr_t<T>, 2>(z));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE expr_t<T> norm(const Complex<T> &z) {
|
||||||
|
return norm(Array<expr_t<T>, 2>(z));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> normalize(const Complex<T> &q) {
|
||||||
|
return enoki::normalize(Array<expr_t<T>, 2>(q));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> rcp(const Complex<T> &z) {
|
||||||
|
auto scale = rcp(squared_norm(z));
|
||||||
|
return Complex<expr_t<T>>(
|
||||||
|
real(z) * scale,
|
||||||
|
-imag(z) * scale
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Complex<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const Complex<T0> &z0, const Complex<T1> &z1) {
|
||||||
|
using Base = Array<Value, 2>;
|
||||||
|
|
||||||
|
Base z1_perm = shuffle<1, 0>(z1),
|
||||||
|
z0_im = shuffle<1, 1>(z0),
|
||||||
|
z0_re = shuffle<0, 0>(z0);
|
||||||
|
|
||||||
|
return fmaddsub(z0_re, z1, z0_im * z1_perm);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>,
|
||||||
|
typename Result = Complex<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const Complex<T0> &z0, const T1 &v1) {
|
||||||
|
return Array<expr_t<T0>, 2>(z0) * v1;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Complex<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const T0 &v0, const Complex<T1> &z1) {
|
||||||
|
return v0 * Array<expr_t<T1>, 2>(z1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Complex<Value>>
|
||||||
|
ENOKI_INLINE Result operator/(const Complex<T0> &z0, const Complex<T1> &z1) {
|
||||||
|
return z0 * rcp(z1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Complex<Value>>
|
||||||
|
ENOKI_INLINE Result operator/(const Complex<T0> &z0, const T1 &v1) {
|
||||||
|
return Array<expr_t<T0>, 2>(z0) / v1;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> conj(const Complex<T> &z) {
|
||||||
|
const Complex<expr_t<T>> mask(0.f, -0.f);
|
||||||
|
return z ^ mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE expr_t<T> abs(const Complex<T> &z) {
|
||||||
|
return norm(z);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> exp(const Complex<T> &z) {
|
||||||
|
auto exp_r = exp(real(z));
|
||||||
|
auto [s, c] = sincos(imag(z));
|
||||||
|
return { exp_r * c, exp_r * s };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> log(const Complex<T> &z) {
|
||||||
|
return { .5f * log(squared_norm(z)), arg(z) };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE expr_t<T> arg(const Complex<T> &z) {
|
||||||
|
return atan2(imag(z), real(z));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T1, typename T2, typename Expr = expr_t<T1, T2>> std::pair<Expr, Expr>
|
||||||
|
sincos_arg_diff(const Complex<T1> &z1, const Complex<T2> &z2) {
|
||||||
|
Expr normalization = rsqrt(squared_norm(z1) * squared_norm(z2));
|
||||||
|
Complex<Expr> value = z1 * conj(z2) * normalization;
|
||||||
|
return { imag(value), real(value) };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1>
|
||||||
|
ENOKI_INLINE auto pow(const Complex<T0> &z0, const Complex<T1> &z1) {
|
||||||
|
return exp(log(z0) * z1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> sqrt(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(arg(z) * .5f);
|
||||||
|
auto r = sqrt(abs(z));
|
||||||
|
return Complex<expr_t<T>>(c * r, s * r);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> sqrtz(const T &x) {
|
||||||
|
auto r = sqrt(abs(x)), z = zero<T>();
|
||||||
|
auto is_real = x >= 0;
|
||||||
|
return { select(is_real, r, z), select(is_real, z, r) };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> sin(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(real(z));
|
||||||
|
auto [sh, ch] = sincosh(imag(z));
|
||||||
|
return Complex<expr_t<T>>(s * ch, c * sh);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> ENOKI_INLINE Complex<expr_t<T>> cos(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(real(z));
|
||||||
|
auto [sh, ch] = sincosh(imag(z));
|
||||||
|
return Complex<expr_t<T>>(c * ch, -s * sh);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE std::pair<R, R> sincos(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(real(z));
|
||||||
|
auto [sh, ch] = sincosh(imag(z));
|
||||||
|
return std::make_pair<R, R>(
|
||||||
|
R(s * ch, c * sh),
|
||||||
|
R(c * ch, -s * sh)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> tan(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(z);
|
||||||
|
return s / c;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE R asin(const Complex<T> &z) {
|
||||||
|
auto tmp = log(R(-imag(z), real(z)) + sqrt(1.f - z*z));
|
||||||
|
return R(imag(tmp), -real(tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE R acos(const Complex<T> &z) {
|
||||||
|
auto tmp = sqrt(1.f - z*z);
|
||||||
|
tmp = log(z + R(-imag(tmp), real(tmp)));
|
||||||
|
return R(imag(tmp), -real(tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE R atan(const Complex<T> &z) {
|
||||||
|
const R I(0.f, 1.f);
|
||||||
|
auto tmp = log((I-z) / (I+z));
|
||||||
|
return R(imag(tmp) * .5f, -real(tmp) * .5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> sinh(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(imag(z));
|
||||||
|
auto [sh, ch] = sincosh(real(z));
|
||||||
|
return { sh * c, ch * s };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> cosh(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(imag(z));
|
||||||
|
auto [sh, ch] = sincosh(real(z));
|
||||||
|
return { ch * c, sh * s };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE std::pair<R, R> sincosh(const Complex<T> &z) {
|
||||||
|
auto [s, c] = sincos(imag(z));
|
||||||
|
auto [sh, ch] = sincosh(real(z));
|
||||||
|
return std::make_pair<R, R>(
|
||||||
|
R(sh * c, ch * s),
|
||||||
|
R(ch * c, sh * s)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> tanh(const Complex<T> &z) {
|
||||||
|
auto [sh, ch] = sincosh(z);
|
||||||
|
return sh / ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> asinh(const Complex<T> &z) {
|
||||||
|
return log(z + sqrt(z*z + 1.f));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Complex<expr_t<T>> acosh(const Complex<T> &z) {
|
||||||
|
return log(z + sqrt(z*z - 1.f));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename R = Complex<expr_t<T>>>
|
||||||
|
ENOKI_INLINE R atanh(const Complex<T> &z) {
|
||||||
|
return log((R(1.f) + z) / (R(1.f) - z)) * R(.5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_not_array_t<T> = 0>
|
||||||
|
ENOKI_NOINLINE std::ostream &operator<<(std::ostream &os, const Complex<T> &z) {
|
||||||
|
os << z.x();
|
||||||
|
os << (z.y() < 0 ? " - " : " + ") << abs(z.y()) << "i";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_array_t<T> = 0, enable_if_not_array_t<value_t<T>> = 0>
|
||||||
|
ENOKI_NOINLINE std::ostream &operator<<(std::ostream &os, const Complex<T> &z) {
|
||||||
|
os << "[";
|
||||||
|
size_t size = z.x().size();
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
os << z.x().coeff(i);
|
||||||
|
os << (z.y().coeff(i) < 0 ? " - " : " + ") << abs(z.y().coeff(i)) << "i";
|
||||||
|
if (i + 1 < size)
|
||||||
|
os << ",\n ";
|
||||||
|
}
|
||||||
|
os << "]";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,330 @@
|
||||||
|
/*
|
||||||
|
enoki/fwd.h -- Preprocessor definitions and forward declarations
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# if !defined(_USE_MATH_DEFINES)
|
||||||
|
# define _USE_MATH_DEFINES
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstring>
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# define ENOKI_NOINLINE __declspec(noinline)
|
||||||
|
# define ENOKI_INLINE __forceinline
|
||||||
|
# define ENOKI_INLINE_LAMBDA
|
||||||
|
# define ENOKI_PURE
|
||||||
|
# define ENOKI_MALLOC __declspec(restrict)
|
||||||
|
# define ENOKI_MAY_ALIAS
|
||||||
|
# define ENOKI_ASSUME_ALIGNED(x, s) x
|
||||||
|
# define ENOKI_UNROLL
|
||||||
|
# define ENOKI_NOUNROLL
|
||||||
|
# define ENOKI_IVDEP __pragma(loop(ivdep))
|
||||||
|
# define ENOKI_PACK
|
||||||
|
# define ENOKI_LIKELY(x) x
|
||||||
|
# define ENOKI_UNLIKELY(x) x
|
||||||
|
# define ENOKI_REGCALL
|
||||||
|
# define ENOKI_IMPORT __declspec(dllimport)
|
||||||
|
# define ENOKI_EXPORT __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
# define ENOKI_NOINLINE __attribute__ ((noinline))
|
||||||
|
# define ENOKI_INLINE __attribute__ ((always_inline)) inline
|
||||||
|
# define ENOKI_INLINE_LAMBDA __attribute__ ((always_inline))
|
||||||
|
# define ENOKI_PURE __attribute__ ((const,nothrow))
|
||||||
|
# define ENOKI_MALLOC __attribute__ ((malloc))
|
||||||
|
# define ENOKI_ASSUME_ALIGNED(x, s) __builtin_assume_aligned(x, s)
|
||||||
|
# define ENOKI_LIKELY(x) __builtin_expect(!!(x), 1)
|
||||||
|
# define ENOKI_UNLIKELY(x) __builtin_expect(!!(x), 0)
|
||||||
|
# define ENOKI_PACK __attribute__ ((packed))
|
||||||
|
# if defined(__clang__)
|
||||||
|
# define ENOKI_UNROLL _Pragma("unroll")
|
||||||
|
# define ENOKI_NOUNROLL _Pragma("nounroll")
|
||||||
|
# define ENOKI_IVDEP
|
||||||
|
# define ENOKI_MAY_ALIAS __attribute__ ((may_alias))
|
||||||
|
# define ENOKI_REGCALL __attribute__ ((regcall))
|
||||||
|
# elif defined(__INTEL_COMPILER)
|
||||||
|
# define ENOKI_MAY_ALIAS
|
||||||
|
# define ENOKI_UNROLL _Pragma("unroll")
|
||||||
|
# define ENOKI_NOUNROLL _Pragma("nounroll")
|
||||||
|
# define ENOKI_IVDEP _Pragma("ivdep")
|
||||||
|
# define ENOKI_REGCALL __attribute__ ((regcall))
|
||||||
|
# else
|
||||||
|
# define ENOKI_MAY_ALIAS __attribute__ ((may_alias))
|
||||||
|
# define ENOKI_UNROLL
|
||||||
|
# define ENOKI_NOUNROLL
|
||||||
|
# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))
|
||||||
|
# define ENOKI_IVDEP _Pragma("GCC ivdep")
|
||||||
|
# else
|
||||||
|
# define ENOKI_IVDEP
|
||||||
|
# endif
|
||||||
|
# define ENOKI_REGCALL
|
||||||
|
# endif
|
||||||
|
# define ENOKI_IMPORT
|
||||||
|
# define ENOKI_EXPORT __attribute__ ((visibility("default")))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define ENOKI_MARK_USED(x) (void) x
|
||||||
|
|
||||||
|
#if !defined(NAMESPACE_BEGIN)
|
||||||
|
# define NAMESPACE_BEGIN(name) namespace name {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(NAMESPACE_END)
|
||||||
|
# define NAMESPACE_END(name) }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define ENOKI_VERSION_MAJOR 0
|
||||||
|
#define ENOKI_VERSION_MINOR 1
|
||||||
|
#define ENOKI_VERSION_PATCH 0
|
||||||
|
|
||||||
|
#define ENOKI_STRINGIFY(x) #x
|
||||||
|
#define ENOKI_TOSTRING(x) ENOKI_STRINGIFY(x)
|
||||||
|
#define ENOKI_VERSION \
|
||||||
|
(ENOKI_TOSTRING(ENOKI_VERSION_MAJOR) "." \
|
||||||
|
ENOKI_TOSTRING(ENOKI_VERSION_MINOR) "." \
|
||||||
|
ENOKI_TOSTRING(ENOKI_VERSION_PATCH))
|
||||||
|
|
||||||
|
#if defined(__clang__) && defined(__apple_build_version__)
|
||||||
|
# if __clang_major__ < 10
|
||||||
|
# error Enoki requires a very recent version of AppleClang (XCode >= 10.0)
|
||||||
|
# endif
|
||||||
|
#elif defined(__clang__)
|
||||||
|
# if __clang_major__ < 7 && !defined(EMSCRIPTEN)
|
||||||
|
# error Enoki requires a very recent version of Clang/LLVM (>= 7.0)
|
||||||
|
# endif
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
# if (__GNUC__ < 8) || (__GNUC__ == 8 && __GNUC_MINOR__ < 2)
|
||||||
|
# error Enoki requires a very recent version of GCC (>= 8.2)
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || defined(_M_X64)
|
||||||
|
# define ENOKI_X86_64 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (defined(__i386__) || defined(_M_IX86)) && !defined(ENOKI_X86_64)
|
||||||
|
# define ENOKI_X86_32 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__aarch64__)
|
||||||
|
# define ENOKI_ARM_64 1
|
||||||
|
#elif defined(__arm__)
|
||||||
|
# define ENOKI_ARM_32 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (defined(_MSC_VER) && defined(ENOKI_X86_32)) && !defined(ENOKI_DISABLE_VECTORIZATION)
|
||||||
|
// Enoki does not support vectorization on 32-bit Windows due to various
|
||||||
|
// platform limitations (unaligned stack, calling conventions don't allow
|
||||||
|
// passing vector registers, etc.).
|
||||||
|
# define ENOKI_DISABLE_VECTORIZATION 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
# if !defined(ENOKI_DISABLE_VECTORIZATION)
|
||||||
|
# if defined(__AVX512F__)
|
||||||
|
# define ENOKI_X86_AVX512F 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512CD__)
|
||||||
|
# define ENOKI_X86_AVX512CD 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512DQ__)
|
||||||
|
# define ENOKI_X86_AVX512DQ 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512VL__)
|
||||||
|
# define ENOKI_X86_AVX512VL 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512BW__)
|
||||||
|
# define ENOKI_X86_AVX512BW 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512PF__)
|
||||||
|
# define ENOKI_X86_AVX512PF 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512ER__)
|
||||||
|
# define ENOKI_X86_AVX512ER 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512VBMI__)
|
||||||
|
# define ENOKI_X86_AVX512VBMI 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX512VPOPCNTDQ__)
|
||||||
|
# define ENOKI_X86_AVX512VPOPCNTDQ 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX2__)
|
||||||
|
# define ENOKI_X86_AVX2 1
|
||||||
|
# endif
|
||||||
|
# if defined(__FMA__)
|
||||||
|
# define ENOKI_X86_FMA 1
|
||||||
|
# endif
|
||||||
|
# if defined(__F16C__)
|
||||||
|
# define ENOKI_X86_F16C 1
|
||||||
|
# endif
|
||||||
|
# if defined(__AVX__)
|
||||||
|
# define ENOKI_X86_AVX 1
|
||||||
|
# endif
|
||||||
|
# if defined(__SSE4_2__)
|
||||||
|
# define ENOKI_X86_SSE42 1
|
||||||
|
# endif
|
||||||
|
# if defined(__ARM_NEON)
|
||||||
|
# define ENOKI_ARM_NEON
|
||||||
|
# endif
|
||||||
|
# if defined(__ARM_FEATURE_FMA)
|
||||||
|
# define ENOKI_ARM_FMA
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Fix missing/inconsistent preprocessor flags */
|
||||||
|
#if defined(ENOKI_X86_AVX512F) && !defined(ENOKI_X86_AVX2)
|
||||||
|
# define ENOKI_X86_AVX2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2) && !defined(ENOKI_X86_F16C)
|
||||||
|
# define ENOKI_X86_F16C
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2) && !defined(ENOKI_X86_FMA)
|
||||||
|
# define ENOKI_X86_FMA
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2) && !defined(ENOKI_X86_AVX)
|
||||||
|
# define ENOKI_X86_AVX
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX) && !defined(ENOKI_X86_SSE42)
|
||||||
|
# define ENOKI_X86_SSE42
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The following macro is used by the test suite to detect
|
||||||
|
unimplemented methods in vectorized backends */
|
||||||
|
|
||||||
|
#if !defined(ENOKI_TRACK_SCALAR)
|
||||||
|
# define ENOKI_TRACK_SCALAR(reason)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENOKI_ALLOC_VERBOSE)
|
||||||
|
# define ENOKI_TRACK_ALLOC(ptr, size) \
|
||||||
|
printf("Enoki: %p: alloc(%llu)\n", (ptr), (unsigned long long) (size));
|
||||||
|
# define ENOKI_TRACK_DEALLOC(ptr, size) \
|
||||||
|
printf("Enoki: %p: dealloc(%llu)\n", (ptr), (unsigned long long) (size));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(ENOKI_TRACK_ALLOC)
|
||||||
|
# define ENOKI_TRACK_ALLOC(ptr, size)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(ENOKI_TRACK_DEALLOC)
|
||||||
|
# define ENOKI_TRACK_DEALLOC(ptr, size)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define ENOKI_CHKSCALAR(reason) \
|
||||||
|
if (std::is_arithmetic_v<std::decay_t<Value>>) { \
|
||||||
|
ENOKI_TRACK_SCALAR(reason) \
|
||||||
|
}
|
||||||
|
|
||||||
|
#if !defined(ENOKI_APPROX_DEFAULT)
|
||||||
|
# define ENOKI_APPROX_DEFAULT 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
using ssize_t = std::make_signed_t<size_t>;
|
||||||
|
|
||||||
|
/// Maximum hardware-supported packet size in bytes
|
||||||
|
#if defined(ENOKI_X86_AVX512F)
|
||||||
|
static constexpr size_t max_packet_size = 64;
|
||||||
|
#elif defined(ENOKI_X86_AVX)
|
||||||
|
static constexpr size_t max_packet_size = 32;
|
||||||
|
#elif defined(ENOKI_X86_SSE42) || defined(ENOKI_ARM_NEON)
|
||||||
|
static constexpr size_t max_packet_size = 16;
|
||||||
|
#else
|
||||||
|
static constexpr size_t max_packet_size = 4;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
constexpr size_t array_default_size = max_packet_size / 4;
|
||||||
|
|
||||||
|
/// Base class of all arrays
|
||||||
|
template <typename Value_, typename Derived_> struct ArrayBase;
|
||||||
|
|
||||||
|
/// Base class of all statically sized arrays
|
||||||
|
template <typename Value_, size_t Size_, bool IsMask_, typename Derived_>
|
||||||
|
struct StaticArrayBase;
|
||||||
|
|
||||||
|
/// Generic array class, which broadcasts from the outer to inner dimensions
|
||||||
|
template <typename Value_, size_t Size_ = array_default_size>
|
||||||
|
struct Array;
|
||||||
|
|
||||||
|
/// Generic array class, which broadcasts from the inner to outer dimensions
|
||||||
|
template <typename Value_, size_t Size_ = array_default_size>
|
||||||
|
struct Packet;
|
||||||
|
|
||||||
|
/// Generic mask class, which broadcasts from the outer to inner dimensions
|
||||||
|
template <typename Value_, size_t Size_ = array_default_size>
|
||||||
|
struct Mask;
|
||||||
|
|
||||||
|
/// Generic mask class, which broadcasts from the inner to outer dimensions
|
||||||
|
template <typename Value_, size_t Size_ = array_default_size>
|
||||||
|
struct PacketMask;
|
||||||
|
|
||||||
|
/// Dynamically sized array
|
||||||
|
template <typename Packet_> struct DynamicArray;
|
||||||
|
template <typename Packet_> struct DynamicMask;
|
||||||
|
|
||||||
|
/// Reverse-mode autodiff array
|
||||||
|
template <typename Value> struct DiffArray;
|
||||||
|
|
||||||
|
template <typename Value_, size_t Size_>
|
||||||
|
struct Matrix;
|
||||||
|
|
||||||
|
template <typename Value_>
|
||||||
|
struct Complex;
|
||||||
|
|
||||||
|
template <typename Value_>
|
||||||
|
struct Quaternion;
|
||||||
|
|
||||||
|
/// Helper class for custom data structures
|
||||||
|
template <typename T, typename = int>
|
||||||
|
struct struct_support;
|
||||||
|
|
||||||
|
template <typename Value>
|
||||||
|
struct CUDAArray;
|
||||||
|
|
||||||
|
template <typename T> class cuda_host_allocator;
|
||||||
|
template <typename T> class cuda_managed_allocator;
|
||||||
|
|
||||||
|
extern ENOKI_IMPORT void* cuda_host_malloc(size_t);
|
||||||
|
extern ENOKI_IMPORT void cuda_host_free(void *);
|
||||||
|
|
||||||
|
/// Half-precision floating point value
|
||||||
|
struct half;
|
||||||
|
|
||||||
|
template <typename T> struct MaskBit;
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
struct reinterpret_flag { };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, bool UseIntrinsic = false, typename = int>
|
||||||
|
struct divisor;
|
||||||
|
template <typename T>
|
||||||
|
struct divisor_ext;
|
||||||
|
|
||||||
|
/// Reinterpret the binary representation of a data type
|
||||||
|
/// Bit-level reinterpretation of one type as another of identical size
/// (well-defined alternative to pointer casts / union type punning)
template <typename T, typename U> ENOKI_INLINE T memcpy_cast(const U &val) {
    static_assert(sizeof(T) == sizeof(U), "memcpy_cast: sizes did not match!");
    T out;
    std::memcpy(&out, &val, sizeof(T));
    return out;
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,193 @@
|
||||||
|
/*
|
||||||
|
enoki/half.h -- minimal half precision number type
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array_traits.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
struct half;
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(std)
|
||||||
|
template<> struct is_floating_point<enoki::half> : true_type { };
|
||||||
|
template<> struct is_arithmetic<enoki::half> : true_type { };
|
||||||
|
template<> struct is_signed<enoki::half> : true_type { };
|
||||||
|
NAMESPACE_END(std)
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
/// Minimal IEEE-754 binary16 ("half") wrapper: stores the raw 16-bit pattern
/// and performs all arithmetic by round-tripping through float32.
struct half {
    uint16_t value; // raw binary16 bit pattern

    half()
#if !defined(NDEBUG)
        : value(0x7FFF) /* Initialize with NaN */
#endif
    { }
    // NOTE(review): in release builds (NDEBUG) 'value' is left uninitialized.

    // Enables an overload only for built-in arithmetic types
#define ENOKI_IF_SCALAR template <typename Value, enable_if_t<std::is_arithmetic_v<Value>> = 0>

    /// Construct from any arithmetic scalar via a float32 -> float16 round trip
    ENOKI_IF_SCALAR half(Value val) : value(float32_to_float16(float(val))) { }

    // Arithmetic is performed in single precision, then rounded back to half
    half operator+(half h) const { return half(float(*this) + float(h)); }
    half operator-(half h) const { return half(float(*this) - float(h)); }
    half operator*(half h) const { return half(float(*this) * float(h)); }
    half operator/(half h) const { return half(float(*this) / float(h)); }

    half operator-() const { return half(-float(*this)); }

    // Mixed scalar-op-half overloads: promote the scalar to half first
    ENOKI_IF_SCALAR friend half operator+(Value val, half h) { return half(val) + h; }
    ENOKI_IF_SCALAR friend half operator-(Value val, half h) { return half(val) - h; }
    ENOKI_IF_SCALAR friend half operator*(Value val, half h) { return half(val) * h; }
    ENOKI_IF_SCALAR friend half operator/(Value val, half h) { return half(val) / h; }

    half& operator+=(half h) { return operator=(*this + h); }
    half& operator-=(half h) { return operator=(*this - h); }
    half& operator*=(half h) { return operator=(*this * h); }
    half& operator/=(half h) { return operator=(*this / h); }

    // Comparisons are likewise delegated to float32 semantics
    // (so NaN compares unequal to everything, including itself)
    bool operator==(half h) const { return float(*this) == float(h); }
    bool operator!=(half h) const { return float(*this) != float(h); }
    bool operator<(half h) const { return float(*this) < float(h); }
    bool operator>(half h) const { return float(*this) > float(h); }
    bool operator<=(half h) const { return float(*this) <= float(h); }
    bool operator>=(half h) const { return float(*this) >= float(h); }

    /// Implicit conversion to any arithmetic type (via float32)
    ENOKI_IF_SCALAR operator Value() const { return Value(float16_to_float32(value)); }

    /// Construct a half directly from a raw 16-bit pattern (no conversion)
    static half from_binary(uint16_t value) { half h; h.value = value; return h; }

    /// Stream output prints the float32 value
    friend std::ostream &operator<<(std::ostream &os, const half &h) {
        os << float(h);
        return os;
    }

#undef ENOKI_IF_SCALAR
private:
    /*
       Value float32<->float16 conversion code by Paul A. Tessier (@Phernost)
       Used with permission by the author, who released this code into the public domain
     */

    // Used for bit-level access to a float32.
    // NOTE(review): reading a non-active union member is technically UB in
    // strict C++ (unlike C); compilers targeted here tolerate it, but
    // memcpy_cast would be the fully portable alternative.
    union Bits {
        float f;
        int32_t si;
        uint32_t ui;
    };

    static constexpr int const shift = 13;      // mantissa width difference: 23 - 10
    static constexpr int const shiftSign = 16;  // sign bit position difference: 31 - 15

    static constexpr int32_t const infN = 0x7F800000; // flt32 infinity
    static constexpr int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32
    static constexpr int32_t const minN = 0x38800000; // min flt16 normal as a flt32
    static constexpr int32_t const signN = (int32_t) 0x80000000; // flt32 sign bit

    static constexpr int32_t const infC = infN >> shift;
    static constexpr int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
    static constexpr int32_t const maxC = maxN >> shift;
    static constexpr int32_t const minC = minN >> shift;
    static constexpr int32_t const signC = signN >> shiftSign; // flt16 sign bit

    static constexpr int32_t const mulN = 0x52000000; // (1 << 23) / minN
    static constexpr int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift))

    static constexpr int32_t const subC = 0x003FF; // max flt32 subnormal down shifted
    static constexpr int32_t const norC = 0x00400; // min flt32 normal down shifted

    static constexpr int32_t const maxD = infC - maxC - 1;
    static constexpr int32_t const minD = minC - subC - 1;

public:
    /// Convert a float32 to its nearest binary16 bit pattern.
    /// Uses F16C / NEON hardware conversion where available, otherwise a
    /// branch-free bit manipulation fallback (handles subnormals, inf, NaN).
    static uint16_t float32_to_float16(float value) {
        #if defined(ENOKI_X86_F16C)
            return (uint16_t) _mm_cvtsi128_si32(
                _mm_cvtps_ph(_mm_set_ss(value), _MM_FROUND_CUR_DIRECTION));
        #elif defined(ENOKI_ARM_NEON)
            return memcpy_cast<uint16_t>((__fp16) value);
        #else
            Bits v, s;
            v.f = value;
            uint32_t sign = (uint32_t) (v.si & signN); // extract and clear the sign bit
            v.si ^= sign;
            sign >>= shiftSign; // logical shift
            s.si = mulN;
            s.si = (int32_t) (s.f * v.f); // correct subnormals
            // Branchless selects: -(cond) is all-ones when cond is true
            v.si ^= (s.si ^ v.si) & -(minN > v.si);
            v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
            v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
            v.ui >>= shift; // logical shift
            v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
            v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
            return (uint16_t) (v.ui | sign); // re-attach the sign
        #endif
    }

    /// Convert a binary16 bit pattern to float32 (exact: every half value is
    /// representable as a float32). Hardware conversion where available.
    static float float16_to_float32(uint16_t value) {
        #if defined(ENOKI_X86_F16C)
            return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128((int32_t) value)));
        #elif defined(ENOKI_ARM_NEON)
            return (float) memcpy_cast<__fp16>(value);
        #else
            Bits v;
            v.ui = value;
            int32_t sign = v.si & signC; // extract and clear the sign bit
            v.si ^= sign;
            sign <<= shiftSign;
            v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
            v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
            Bits s;
            s.si = mulC;
            s.f *= float(v.si); // scale subnormals
            int32_t mask = -(norC > v.si);
            v.si <<= shift;
            v.si ^= (s.si ^ v.si) & mask;
            v.si |= sign;
            return v.f;
        #endif
    }
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(std)
|
||||||
|
|
||||||
|
template<> struct numeric_limits<enoki::half> {
|
||||||
|
static constexpr bool is_signed = true;
|
||||||
|
static constexpr bool is_exact = false;
|
||||||
|
static constexpr bool is_modulo = false;
|
||||||
|
static constexpr bool is_iec559 = true;
|
||||||
|
static constexpr bool has_infinity = true;
|
||||||
|
static constexpr bool has_quiet_NaN = true;
|
||||||
|
static constexpr int digits = 11;
|
||||||
|
static constexpr int digits10 = 3;
|
||||||
|
static constexpr int max_digits10 = 5;
|
||||||
|
static constexpr int radix = 2;
|
||||||
|
static constexpr int min_exponent = -13;
|
||||||
|
static constexpr int min_exponent10 = -4;
|
||||||
|
static constexpr int max_exponent = 16;
|
||||||
|
static constexpr int max_exponent10 = 4;
|
||||||
|
static constexpr float_denorm_style has_denorm = denorm_present;
|
||||||
|
static constexpr float_round_style round_style = round_indeterminate;
|
||||||
|
static enoki::half min() noexcept { return enoki::half::from_binary(0x0400); }
|
||||||
|
static enoki::half lowest() noexcept { return enoki::half::from_binary(0xFBFF); }
|
||||||
|
static enoki::half max() noexcept { return enoki::half::from_binary(0x7BFF); }
|
||||||
|
static enoki::half epsilon() noexcept { return enoki::half::from_binary(0x1400); }
|
||||||
|
static enoki::half round_error() noexcept { return enoki::half::from_binary(0x3C00); }
|
||||||
|
static enoki::half infinity() noexcept { return enoki::half::from_binary(0x7C00); }
|
||||||
|
static enoki::half quiet_NaN() noexcept { return enoki::half::from_binary(0x7FFF); }
|
||||||
|
static enoki::half signaling_NaN() noexcept { return enoki::half::from_binary(0x7DFF); }
|
||||||
|
static enoki::half denorm_min() noexcept { return enoki::half::from_binary(0x0001); }
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(std)
|
||||||
|
|
||||||
|
|
@ -0,0 +1,658 @@
|
||||||
|
/*
|
||||||
|
enoki/matrix.h -- Matrix data structure
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Value trait to access the column type of a matrix
|
||||||
|
template <typename T> using column_t = typename std::decay_t<T>::Column;
|
||||||
|
|
||||||
|
/// Value trait to access the entry type of a matrix
|
||||||
|
template <typename T> using entry_t = value_t<column_t<T>>;
|
||||||
|
|
||||||
|
/// SFINAE helper for matrixs
|
||||||
|
template <typename T> using is_matrix_helper = enable_if_t<std::decay_t<T>::IsMatrix>;
|
||||||
|
template <typename T> constexpr bool is_matrix_v = is_detected_v<is_matrix_helper, T>;
|
||||||
|
template <typename T> using enable_if_matrix_t = enable_if_t<is_matrix_v<T>>;
|
||||||
|
template <typename T> using enable_if_not_matrix_t = enable_if_t<!is_matrix_v<T>>;
|
||||||
|
|
||||||
|
/// Dense Size_ x Size_ matrix stored as an array of columns
/// (column-major: coeff(j) is column j, coeff(j, i) is row i of column j).
template <typename Value_, size_t Size_>
struct Matrix : StaticArrayImpl<Array<Value_, Size_>, Size_, false, Matrix<Value_, Size_>> {

    using Entry = Value_;                  // scalar/array entry type
    using Column = Array<Entry, Size_>;    // one column of the matrix

    using Base = StaticArrayImpl<Column, Size_, false, Matrix<Value_, Size_>>;
    using Base::coeff;

    ENOKI_ARRAY_IMPORT_BASIC(Base, Matrix);
    using Base::operator=;

    static constexpr bool IsMatrix = true;
    static constexpr bool IsVector = false;

    using ArrayType = Matrix;
    using MaskType = Mask<mask_t<Column>, Size_>;

    template <typename T> using ReplaceValue = Matrix<value_t<T>, Size_>;

    Matrix() = default;

    /// Initialize from a compatible matrix (same size, different entry type)
    template <typename Value2, size_t Size2, enable_if_t<Size2 == Size_> = 0>
    ENOKI_INLINE Matrix(const Matrix<Value2, Size2> &m)
        : Base(m) { }

    /// Initialize from an incompatible matrix (different size)
    template <size_t Size2, enable_if_t<Size2 != Size_> = 0>
    ENOKI_INLINE Matrix(const Matrix<Value_, Size2> &m) {
        if constexpr (Size2 > Size) {
            /// Other matrix is bigger -- retain the top left part
            for (size_t i = 0; i < Size; ++i)
                coeff(i) = head<Size>(m.coeff(i));
        } else {
            /// Other matrix is smaller -- copy the top left part and set remainder to identity
            using Remainder = Array<Value_, Size - Size2>;
            for (size_t i = 0; i < Size2; ++i)
                coeff(i) = concat(m.coeff(i), zero<Remainder>());
            for (size_t i = Size2; i < Size; ++i) {
                auto col = zero<Column>();
                col.coeff(i) = 1;
                coeff(i) = col;
            }
        }
    }

    /// Broadcast a scalar-like value onto the diagonal (off-diagonal = 0)
    template <typename T, enable_if_t<(array_depth_v<T> <= Base::Depth - 2)> = 0,
              enable_if_not_matrix_t<T> = 0>
    ENOKI_INLINE Matrix(T&& v) {
        for (size_t i = 0; i < Size; ++i) {
            coeff(i) = zero<Column>();
            coeff(i, i) = v;
        }
    }

    /// Forward a full-depth (nested-array) value directly to the base class
    template <typename T, enable_if_t<(array_depth_v<T> == Base::Depth)> = 0,
              enable_if_not_matrix_t<T> = 0>
    ENOKI_INLINE Matrix(T&& v) : Base(std::forward<T>(v)) { }

    /// Initialize the matrix from a list of columns
    template <typename... Args, enable_if_t<sizeof...(Args) == Size_ &&
              std::conjunction_v<std::is_constructible<Column, Args>...>> = 0>
    ENOKI_INLINE Matrix(const Args&... args) : Base(args...) { }

    /// Initialize the matrix from a list of entries in row-major order
    template <typename... Args, enable_if_t<sizeof...(Args) == Size_ * Size_ &&
              std::conjunction_v<std::is_constructible<Entry, Args>...>> = 0>
    ENOKI_INLINE Matrix(const Args&... args) {
        // Stage row-major input, then transpose into column-major storage
        alignas(alignof(Column)) Entry values[sizeof...(Args)] = { Entry(args)... };
        for (size_t j = 0; j < Size; ++j)
            for (size_t i = 0; i < Size; ++i)
                coeff(j, i) = values[i * Size + j];
    }

    /// Build a matrix whose columns are the given arrays
    template <typename... Column>
    ENOKI_INLINE static Matrix from_cols(const Column&... cols) {
        return Matrix(cols...);
    }

    /// Build a matrix whose rows are the given arrays
    template <typename... Row>
    ENOKI_INLINE static Matrix from_rows(const Row&... rows) {
        return transpose(Matrix(rows...));
    }

    /// Direct (reference) access to a column — storage is column-major
    ENOKI_INLINE Column& col(size_t index) { return coeff(index); }
    ENOKI_INLINE const Column& col(size_t index) const { return coeff(index); }

    /// Gather a row (by value — rows are not contiguous in memory)
    ENOKI_INLINE Column row(size_t index) const {
        using Index = Array<uint32_t, Size>;
        return gather<Column>(coeff(0).data() + index,
                              arange<Index>() * uint32_t(Size));
    }

    /// Return a reference to the (i, j) element
    ENOKI_INLINE decltype(auto) operator()(size_t i, size_t j) { return coeff(j, i); }

    /// Return a reference to the (i, j) element (const)
    ENOKI_INLINE decltype(auto) operator()(size_t i, size_t j) const { return coeff(j, i); }

    /// Zero-initialized matrix (column by column)
    static ENOKI_INLINE Derived zero_(size_t size) {
        Derived result;
        for (size_t i = 0; i < Size; ++i)
            result.coeff(i) = zero<Column>(size);
        return result;
    }

    /// Uninitialized matrix (column by column)
    static ENOKI_INLINE Derived empty_(size_t size) {
        Derived result;
        for (size_t i = 0; i < Size; ++i)
            result.coeff(i) = empty<Column>(size);
        return result;
    }

    /// Matrix with every entry set to 'value'
    template <typename T> ENOKI_INLINE static Matrix full_(const T &value, size_t size) {
        return Array<Column, Size>::full_(value, size);
    }
};
|
||||||
|
|
||||||
|
/// Matrix-matrix product: result column j = sum_i m0.col(i) * m1(i, j).
/// Accumulates with fused multiply-adds; statement order is intentional.
template <typename T0, typename T1, size_t Size,
          typename Result = Matrix<expr_t<T0, T1>, Size>,
          typename Column = column_t<Result>>
ENOKI_INLINE Result operator*(const Matrix<T0, Size> &m0,
                              const Matrix<T1, Size> &m1) {
    Result result;
    /* 4x4 case reduced to 4 multiplications, 12 fused multiply-adds,
       and 16 broadcasts (also fused on AVX512VL) */
    for (size_t j = 0; j < Size; ++j) {
        // Broadcast each m1 entry across a full column via full_(..., 1)
        Column sum = m0.coeff(0) * Column::full_(m1(0, j), 1);
        for (size_t i = 1; i < Size; ++i)
            sum = fmadd(m0.coeff(i), Column::full_(m1(i, j), 1), sum);
        result.coeff(j) = sum;
    }

    return result;
}
|
||||||
|
|
||||||
|
/// Right-multiplication of a matrix by a non-matrix operand:
/// - matrix * vector (size matches): matrix-vector product via FMA
/// - matrix * scalar: entry-wise scaling
template <typename T0, typename T1, size_t Size, enable_if_t<!T1::IsMatrix> = 0>
ENOKI_INLINE auto operator*(const Matrix<T0, Size> &m, const T1 &s) {
    if constexpr (array_size_v<T1> == Size && T1::IsVector) {
        // Matrix-vector product: sum_i col(i) * s[i]
        using EValue = expr_t<T0, value_t<T1>>;
        using EVector = Array<EValue, Size>;
        EVector sum = m.coeff(0) * EVector::full_(s.coeff(0), 1);
        for (size_t i = 1; i < Size; ++i)
            sum = fmadd(m.coeff(i), EVector::full_(s.coeff(i), 1), sum);
        return sum;
    } else {
        // Scalar scaling: broadcast s over the whole nested array
        using EValue = expr_t<T0, T1>;
        using EArray = Array<Array<EValue, Size>, Size>;
        using EMatrix = Matrix<EValue, Size>;

        return EMatrix(EArray(m) * EArray::full_(EValue(s), 1));
    }
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1, size_t Size, enable_if_t<!T0::IsMatrix> = 0>
|
||||||
|
ENOKI_INLINE auto operator*(const T0 &s, const Matrix<T1, Size> &m) {
|
||||||
|
using EValue = expr_t<T0, T1>;
|
||||||
|
using EArray = Array<Array<EValue, Size>, Size>;
|
||||||
|
using EMatrix = Matrix<EValue, Size>;
|
||||||
|
|
||||||
|
return EMatrix(EArray::full_(EValue(s), 1) * EArray(m));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Division of a matrix by a scalar: implemented as multiplication by the
/// reciprocal (rcp) — possibly an approximate reciprocal depending on the
/// backend/approximation mode; TODO confirm precision expectations.
template <typename T0, typename T1, size_t Size, enable_if_t<!T1::IsMatrix> = 0>
ENOKI_INLINE auto operator/(const Matrix<T0, Size> &m, const T1 &s) {
    using EValue = expr_t<T0, T1>;
    using EArray = Array<Array<EValue, Size>, Size>;
    using EMatrix = Matrix<EValue, Size>;

    return EMatrix(EArray(m) * EArray::full_(rcp(EValue(s)), 1));
}
|
||||||
|
|
||||||
|
template <typename Value, size_t Size>
|
||||||
|
ENOKI_INLINE expr_t<Value> trace(const Matrix<Value, Size> &m) {
|
||||||
|
expr_t<Value> result = m.coeff(0, 0);
|
||||||
|
for (size_t i = 1; i < Size; ++i)
|
||||||
|
result += m(i, i);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Squared Frobenius norm: the sum of the squares of all entries.
/// (No square root is taken — note sqr/fmadd/hsum below.)
template <typename Value, size_t Size>
ENOKI_INLINE expr_t<Value> frob(const Matrix<Value, Size> &matrix) {
    // Accumulate column-wise squared sums with fused multiply-adds,
    // then horizontally reduce the final column vector.
    expr_t<column_t<Matrix<Value, Size>>> result = sqr(matrix.coeff(0));
    for (size_t i = 1; i < Size; ++i)
        result = fmadd(matrix.coeff(i), matrix.coeff(i), result);
    return hsum(result);
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_matrix_t<T> = 0>
|
||||||
|
ENOKI_INLINE T identity(size_t size = 1) {
|
||||||
|
T result = zero<T>(size);
|
||||||
|
for (size_t i = 0; i < T::Size; ++i)
|
||||||
|
result(i, i) = full<typename T::Entry>(scalar_t<T>(1.f), size);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <typename Matrix, enable_if_matrix_t<Matrix> = 0>
|
||||||
|
ENOKI_INLINE Matrix diag(const column_t<Matrix> &value) {
|
||||||
|
Matrix result = zero<Matrix>();
|
||||||
|
for (size_t i = 0; i < Matrix::Size; ++i)
|
||||||
|
result(i, i) = value.coeff(i);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix, enable_if_matrix_t<Matrix> = 0>
|
||||||
|
ENOKI_INLINE column_t<expr_t<Matrix>> diag(const Matrix &value) {
|
||||||
|
column_t<expr_t<Matrix>> result;
|
||||||
|
for (size_t i = 0; i < Matrix::Size; ++i)
|
||||||
|
result.coeff(i) = value(i, i);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE Matrix<E, 1> inverse(const Matrix<T, 1> &m) {
|
||||||
|
return rcp(m(0, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE Matrix<E, 1>
|
||||||
|
inverse_transpose(const Matrix<T, 1> &m) {
|
||||||
|
return rcp(m(0, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE E det(const Matrix<T, 1> &m) {
|
||||||
|
return m(0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inverse of a 2x2 matrix via the closed-form adjugate / determinant formula.
/// Uses rcp of the determinant — possibly approximate depending on backend.
template <typename T, typename E = expr_t<T>>
ENOKI_INLINE Matrix<E, 2> inverse(const Matrix<T, 2> &m) {
    // inv_det = 1 / (a*d - b*c), via fused multiply-subtract
    E inv_det = rcp(fmsub(m(0, 0), m(1, 1), m(0, 1) * m(1, 0)));
    // Entries listed in row-major order (the Matrix entry-list ctor)
    return Matrix<E, 2>(
        m(1, 1) * inv_det, -m(0, 1) * inv_det,
        -m(1, 0) * inv_det, m(0, 0) * inv_det
    );
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE E det(const Matrix<T, 2> &m) {
|
||||||
|
return fmsub(m(0, 0), m(1, 1), m(0, 1) * m(1, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inverse-transpose of a 2x2 matrix (closed form — note the swapped
/// off-diagonal placement relative to inverse()).
template <typename T, typename E = expr_t<T>>
ENOKI_INLINE Matrix<E, 2>
inverse_transpose(const Matrix<T, 2> &m) {
    // inv_det = 1 / (a*d - b*c)
    E inv_det = rcp(fmsub(m(0, 0), m(1, 1), m(0, 1) * m(1, 0)));
    // Entries listed in row-major order
    return Matrix<E, 2>(
        m(1, 1) * inv_det, -m(1, 0) * inv_det,
        -m(0, 1) * inv_det, m(0, 0) * inv_det
    );
}
|
||||||
|
|
||||||
|
/// Inverse-transpose of a 3x3 matrix using the cross-product form of the
/// adjugate: the rows of the inverse-transpose are the pairwise cross
/// products of the input columns, scaled by 1/det.
template <typename T, typename E = expr_t<T>>
ENOKI_INLINE Matrix<E, 3>
inverse_transpose(const Matrix<T, 3> &m) {
    using Vector = Array<E, 3>;

    Vector col0 = m.coeff(0),
           col1 = m.coeff(1),
           col2 = m.coeff(2);

    // Cofactor rows via cross products
    Vector row0 = cross(col1, col2),
           row1 = cross(col2, col0),
           row2 = cross(col0, col1);

    // det(m) = dot(col0, col1 x col2) — scalar triple product
    Vector inv_det = Vector(rcp(dot(col0, row0)));

    return Matrix<E, 3>(
        row0 * inv_det,
        row1 * inv_det,
        row2 * inv_det
    );
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE Matrix<E, 3> inverse(const Matrix<T, 3> &m) {
|
||||||
|
return transpose(inverse_transpose(m));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE E det(const Matrix<T, 3> &m) {
|
||||||
|
return dot(m.coeff(0), cross(m.coeff(1), m.coeff(2)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inverse transpose of a 4x4 matrix using Cramer's rule. The cofactor rows
/// are accumulated from products of 2x2 sub-determinants using in-register
/// shuffles (the classic SIMD-friendly 4x4 inversion scheme).
template <typename T, typename E = expr_t<T>>
ENOKI_INLINE Matrix<E, 4>
inverse_transpose(const Matrix<T, 4> &m) {
    using Vector = Array<E, 4>;

    Vector col0 = m.coeff(0), col1 = m.coeff(1),
           col2 = m.coeff(2), col3 = m.coeff(3);

    // Pre-rotate two columns by two lanes; subsequent shuffles then pair up
    // the correct elements for the 2x2 sub-determinants
    col1 = shuffle<2, 3, 0, 1>(col1);
    col3 = shuffle<2, 3, 0, 1>(col3);

    Vector tmp, row0, row1, row2, row3;

    // NOTE: the exact interleaving of shuffle / fmsub / fmadd / fnmadd below
    // is order-sensitive; 'tmp' carries pairwise products of sub-determinant
    // factors that are consumed twice (once plainly, once lane-rotated).
    tmp = shuffle<1, 0, 3, 2>(col2 * col3);
    row0 = col1 * tmp;
    row1 = col0 * tmp;
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fmsub(col1, tmp, row0);
    row1 = shuffle<2, 3, 0, 1>(fmsub(col0, tmp, row1));

    tmp = shuffle<1, 0, 3, 2>(col1 * col2);
    row0 = fmadd(col3, tmp, row0);
    row3 = col0 * tmp;
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fnmadd(col3, tmp, row0);
    row3 = shuffle<2, 3, 0, 1>(fmsub(col0, tmp, row3));

    tmp = shuffle<1, 0, 3, 2>(shuffle<2, 3, 0, 1>(col1) * col3);
    col2 = shuffle<2, 3, 0, 1>(col2);
    row0 = fmadd(col2, tmp, row0);
    row2 = col0 * tmp;
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fnmadd(col2, tmp, row0);
    row2 = shuffle<2, 3, 0, 1>(fmsub(col0, tmp, row2));

    tmp = shuffle<1, 0, 3, 2>(col0 * col1);
    row2 = fmadd(col3, tmp, row2);
    row3 = fmsub(col2, tmp, row3);
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row2 = fmsub(col3, tmp, row2);
    row3 = fnmadd(col2, tmp, row3);

    tmp = shuffle<1, 0, 3, 2>(col0 * col3);
    row1 = fnmadd(col2, tmp, row1);
    row2 = fmadd(col1, tmp, row2);
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row1 = fmadd(col2, tmp, row1);
    row2 = fnmadd(col1, tmp, row2);

    tmp = shuffle<1, 0, 3, 2>(col0 * col2);
    row1 = fmadd(col3, tmp, row1);
    row3 = fnmadd(col1, tmp, row3);
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row1 = fnmadd(col3, tmp, row1);
    row3 = fmadd(col1, tmp, row3);

    // dot(col0, row0) equals the determinant (cofactor expansion along col0)
    Vector inv_det = Vector(rcp(dot(col0, row0)));

    return Matrix<E, 4>(
        row0 * inv_det, row1 * inv_det,
        row2 * inv_det, row3 * inv_det
    );
}
|
||||||
|
|
||||||
|
template <typename T, typename E = expr_t<T>>
|
||||||
|
ENOKI_INLINE Matrix<E, 4> inverse(const Matrix<T, 4> &m) {
|
||||||
|
return transpose(inverse_transpose(m));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determinant of a 4x4 matrix. Computes only the first cofactor row of the
/// scheme used in inverse_transpose() and contracts it with the first column.
template <typename T, typename E = expr_t<T>>
ENOKI_INLINE E det(const Matrix<T, 4> &m) {
    using Vector = Array<E, 4>;

    Vector col0 = m.coeff(0), col1 = m.coeff(1),
           col2 = m.coeff(2), col3 = m.coeff(3);

    // Same pre-rotation as in inverse_transpose()
    col1 = shuffle<2, 3, 0, 1>(col1);
    col3 = shuffle<2, 3, 0, 1>(col3);

    Vector tmp, row0;

    // Accumulate the 2x2 sub-determinant products into row0 (order-sensitive)
    tmp = shuffle<1, 0, 3, 2>(col2 * col3);
    row0 = col1 * tmp;
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fmsub(col1, tmp, row0);

    tmp = shuffle<1, 0, 3, 2>(col1 * col2);
    row0 = fmadd(col3, tmp, row0);
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fnmadd(col3, tmp, row0);

    col1 = shuffle<2, 3, 0, 1>(col1);
    col2 = shuffle<2, 3, 0, 1>(col2);
    tmp = shuffle<1, 0, 3, 2>(col1 * col3);
    row0 = fmadd(col2, tmp, row0);
    tmp = shuffle<2, 3, 0, 1>(tmp);
    row0 = fnmadd(col2, tmp, row0);

    // Cofactor expansion along the first column
    return dot(col0, row0);
}
|
||||||
|
|
||||||
|
/// Transpose of a square matrix-like nested array. Dispatches to specialized
/// SIMD shuffles when the column type maps to a native vector register
/// (SSE4.2 float, AVX double, NEON float); otherwise falls back to a scalar
/// element-wise loop.
template <typename Value, size_t Size, bool IsMask_, typename Derived>
ENOKI_INLINE auto transpose(const StaticArrayBase<Value, Size, IsMask_, Derived> &a) {
    static_assert(Value::Size == Size && array_depth<Derived>::value >= 2,
                  "Array must be a square matrix!");
    using Column = value_t<Derived>;

    if constexpr (Column::IsNative) {
#if defined(ENOKI_X86_SSE42)
        if constexpr (std::is_same_v<value_t<Column>, float> && Size == 3) {
            // 3x3 float: standard 4x4 unpack/movelh transpose with the third
            // column duplicated to fill the missing fourth input
            __m128 c0 = a.derived().coeff(0).m,
                   c1 = a.derived().coeff(1).m,
                   c2 = a.derived().coeff(2).m;

            __m128 t0 = _mm_unpacklo_ps(c0, c1);
            __m128 t1 = _mm_unpacklo_ps(c2, c2);
            __m128 t2 = _mm_unpackhi_ps(c0, c1);
            __m128 t3 = _mm_unpackhi_ps(c2, c2);

            return Derived(
                _mm_movelh_ps(t0, t1),
                _mm_movehl_ps(t1, t0),
                _mm_movelh_ps(t2, t3)
            );
        } else if constexpr (std::is_same_v<value_t<Column>, float> && Size == 4) {
            // 4x4 float: classic _MM_TRANSPOSE4_PS-style sequence
            __m128 c0 = a.derived().coeff(0).m, c1 = a.derived().coeff(1).m,
                   c2 = a.derived().coeff(2).m, c3 = a.derived().coeff(3).m;

            __m128 t0 = _mm_unpacklo_ps(c0, c1);
            __m128 t1 = _mm_unpacklo_ps(c2, c3);
            __m128 t2 = _mm_unpackhi_ps(c0, c1);
            __m128 t3 = _mm_unpackhi_ps(c2, c3);

            return Derived(
                _mm_movelh_ps(t0, t1),
                _mm_movehl_ps(t1, t0),
                _mm_movelh_ps(t2, t3),
                _mm_movehl_ps(t3, t2)
            );
        }
#endif

#if defined(ENOKI_X86_AVX)
        if constexpr (std::is_same_v<value_t<Column>, double> && Size == 3) {
            // 3x3 double: in-lane shuffles followed by 128-bit lane permutes;
            // third column duplicated as above
            __m256d c0 = a.derived().coeff(0).m,
                    c1 = a.derived().coeff(1).m,
                    c2 = a.derived().coeff(2).m;

            __m256d t3 = _mm256_shuffle_pd(c2, c2, 0b0000),
                    t2 = _mm256_shuffle_pd(c2, c2, 0b1111),
                    t1 = _mm256_shuffle_pd(c0, c1, 0b0000),
                    t0 = _mm256_shuffle_pd(c0, c1, 0b1111);

            return Derived(
                _mm256_permute2f128_pd(t1, t3, 0b0010'0000),
                _mm256_permute2f128_pd(t0, t2, 0b0010'0000),
                _mm256_permute2f128_pd(t1, t3, 0b0011'0001)
            );
        } else if constexpr (std::is_same_v<value_t<Column>, double> && Size == 4) {
            // 4x4 double: same scheme with all four columns
            __m256d c0 = a.derived().coeff(0).m, c1 = a.derived().coeff(1).m,
                    c2 = a.derived().coeff(2).m, c3 = a.derived().coeff(3).m;

            __m256d t3 = _mm256_shuffle_pd(c2, c3, 0b0000),
                    t2 = _mm256_shuffle_pd(c2, c3, 0b1111),
                    t1 = _mm256_shuffle_pd(c0, c1, 0b0000),
                    t0 = _mm256_shuffle_pd(c0, c1, 0b1111);

            return Derived(
                _mm256_permute2f128_pd(t1, t3, 0b0010'0000),
                _mm256_permute2f128_pd(t0, t2, 0b0010'0000),
                _mm256_permute2f128_pd(t1, t3, 0b0011'0001),
                _mm256_permute2f128_pd(t0, t2, 0b0011'0001)
            );
        }
#endif

#if defined(ENOKI_ARM_NEON)
        if constexpr (std::is_same_v<value_t<Column>, float> && Size == 3) {
            // 3x3 float on NEON: pairwise transpose + low/high recombination
            float32x4x2_t v01 = vtrnq_f32(a.derived().coeff(0).m, a.derived().coeff(1).m);
            float32x4x2_t v23 = vtrnq_f32(a.derived().coeff(2).m, a.derived().coeff(2).m);

            return Derived(
                vcombine_f32(vget_low_f32 (v01.val[0]), vget_low_f32 (v23.val[0])),
                vcombine_f32(vget_low_f32 (v01.val[1]), vget_low_f32 (v23.val[1])),
                vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0]))
            );
        } else if constexpr (std::is_same_v<value_t<Column>, float> && Size == 4) {
            float32x4x2_t v01 = vtrnq_f32(a.derived().coeff(0).m, a.derived().coeff(1).m);
            float32x4x2_t v23 = vtrnq_f32(a.derived().coeff(2).m, a.derived().coeff(3).m);

            return Derived(
                vcombine_f32(vget_low_f32 (v01.val[0]), vget_low_f32 (v23.val[0])),
                vcombine_f32(vget_low_f32 (v01.val[1]), vget_low_f32 (v23.val[1])),
                vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0])),
                vcombine_f32(vget_high_f32(v01.val[1]), vget_high_f32(v23.val[1]))
            );
        }
#endif
    }

    ENOKI_CHKSCALAR("transpose");

    // Generic fallback: element-wise swap of (i, j) and (j, i)
    Derived result;
    for (size_t i = 0; i < Size; ++i)
        for (size_t j = 0; j < Size; ++j)
            result.coeff(i, j) = a.derived().coeff(j, i);
    return result;
}
|
||||||
|
|
||||||
|
/// Polar decomposition A = Q * P via iterated averaging of Q with its inverse
/// transpose (scaled Newton iteration). Returns the pair (Q, transpose(Q)*A).
///
/// @param A   input matrix
/// @param it  number of iterations (default 10)
template <typename T, size_t Size, typename Expr = expr_t<T>,
          typename Matrix = Matrix<Expr, Size>>
std::pair<Matrix, Matrix> ENOKI_INLINE
polar_decomp(const enoki::Matrix<T, Size> &A, size_t it = 10) {
    using Arr = Array<Expr, Size>, Size>;
    Matrix Q = A;
    for (size_t i = 0; i < it; ++i) {
        Matrix Qi = inverse_transpose(Q);
        // Scale factor balancing the norms of Q and Q^-T
        // (NOTE(review): 'frob' presumably evaluates a Frobenius-norm-like
        // quantity -- defined elsewhere in this file; confirm semantics there)
        Expr gamma = sqrt(frob(Qi) / frob(Q));
        // Q <- (gamma * Q + Q^-T / gamma) / 2
        Q = fmadd(Arr(Q), gamma * .5f, Arr(Qi) * (rcp(gamma) * 0.5f));
    }
    // Second factor: transpose(Q) * A
    return std::make_pair(Q, transpose(Q) * A);
}
|
||||||
|
|
||||||
|
// =======================================================================
|
||||||
|
//! @{ \name Enoki accessors for static & dynamic vectorization
|
||||||
|
// =======================================================================
|
||||||
|
|
||||||
|
/// struct_support specialization that teaches Enoki's dynamic-vectorization
/// machinery (slices/packets/gather of wide data) how to handle Matrix<T, Size>
template <typename T, size_t Size>
struct struct_support<Matrix<T, Size>,
                      enable_if_static_array_t<Matrix<T, Size>>> {
    static constexpr bool IsDynamic = enoki::is_dynamic_v<T>;
    using Dynamic = Matrix<enoki::make_dynamic_t<T>, Size>;
    using Value = Matrix<T, Size>;
    using Column = column_t<Value>;

    // Number of dynamic slices, taken from one representative entry
    static ENOKI_INLINE size_t slices(const Value &value) {
        return enoki::slices(value.coeff(0, 0));
    }

    // Number of SIMD packets, taken from one representative entry
    static ENOKI_INLINE size_t packets(const Value &value) {
        return enoki::packets(value.coeff(0, 0));
    }

    // Resize every column to the given slice count
    static ENOKI_INLINE void set_slices(Value &value, size_t size) {
        for (size_t i = 0; i < Size; ++i)
            enoki::set_slices(value.coeff(i), size);
    }

    // The public accessors below forward to the private index-sequence
    // helpers, which apply the corresponding enoki:: operation per column

    template <typename T2>
    static ENOKI_INLINE auto packet(T2&& value, size_t i) {
        return packet(value, i, std::make_index_sequence<Size>());
    }

    template <typename T2>
    static ENOKI_INLINE auto slice(T2&& value, size_t i) {
        return slice(value, i, std::make_index_sequence<Size>());
    }

    template <typename T2>
    static ENOKI_INLINE auto slice_ptr(T2&& value, size_t i) {
        return slice_ptr(value, i, std::make_index_sequence<Size>());
    }

    template <typename T2>
    static ENOKI_INLINE auto ref_wrap(T2&& value) {
        return ref_wrap(value, std::make_index_sequence<Size>());
    }

    template <typename T2>
    static ENOKI_INLINE auto detach(T2&& value) {
        return detach(value, std::make_index_sequence<Size>());
    }

    template <typename T2>
    static ENOKI_INLINE auto gradient(T2&& value) {
        return gradient(value, std::make_index_sequence<Size>());
    }

    static ENOKI_INLINE Value zero(size_t size) {
        return Value::zero_(size);
    }

    static ENOKI_INLINE Value empty(size_t size) {
        return Value::empty_(size);
    }

    // Masked view when the mask shape matches the matrix shape
    template <typename T2, typename Mask,
              enable_if_t<array_size<T2>::value == array_size<Mask>::value> = 0>
    static ENOKI_INLINE auto masked(T2 &value, const Mask &mask) {
        return detail::MaskedArray<T2>{ value, mask_t<T2>(mask) };
    }

    // Otherwise reinterpret the matrix as a nested array and broadcast the mask
    template <typename T2, typename Mask,
              enable_if_t<array_size<T2>::value != array_size<Mask>::value> = 0>
    static ENOKI_INLINE auto masked(T2 &value, const Mask &mask) {
        using Arr = Array<Array<T, Size>, Size>;
        return enoki::masked((Arr&) value, mask_t<Arr>(mask));
    }

private:
    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto packet(T2&& value, size_t i, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::packet(value.coeff(0, 0), i)), Size>(
            enoki::packet(value.coeff(Index), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice(T2&& value, size_t i, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::slice(value.coeff(0, 0), i)), Size>(
            enoki::slice(value.coeff(Index), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice_ptr(T2&& value, size_t i, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::slice_ptr(value.coeff(0, 0), i)), Size>(
            enoki::slice_ptr(value.coeff(Index), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto ref_wrap(T2&& value, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::ref_wrap(value.coeff(0, 0))), Size>(
            enoki::ref_wrap(value.coeff(Index))...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto detach(T2&& value, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::detach(value.coeff(0, 0))), Size>(
            enoki::detach(value.coeff(Index))...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto gradient(T2&& value, std::index_sequence<Index...>) {
        return Matrix<decltype(enoki::gradient(value.coeff(0, 0))), Size>(
            enoki::gradient(value.coeff(Index))...);
    }
};
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// =======================================================================
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,161 @@
|
||||||
|
/*
|
||||||
|
enoki/morton.h -- Morton/Z-order curve encoding and decoding routines
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
Includes contributions by Sebastien Speierer
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# pragma warning (push)
|
||||||
|
# pragma warning (disable: 4310) // cast truncates constant value
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
/// Generate bit masks for the functions \ref scatter_bits() and \ref gather_bits()
///
/// Builds a mask whose set bits are arranged in blocks of 2^(level-1) bits,
/// one block every 'dim' blocks, with at most n_bits/dim bits set in total.
template <typename Value> constexpr Value morton_magic(size_t dim, size_t level) {
    size_t n_bits = sizeof(Value) * 8;
    size_t max_block_size = n_bits / dim;
    // Block size doubles at each level, clamped to the per-dimension budget.
    // NOTE(review): assumes level >= 1 (level - 1 would wrap otherwise);
    // all call sites in this header pass level >= 1.
    size_t block_size = std::min(size_t(1) << (level - 1), max_block_size);
    size_t count = 0;

    Value mask = Value(1) << (n_bits - 1),
          value = Value(0);

    // Construct the mask MSB-first: shift right each step and set the top
    // bit whenever the current position lies inside a retained block
    for (size_t i = 0; i < n_bits; ++i) {
        value >>= 1;

        if (count < max_block_size && (i / block_size) % dim == 0) {
            count++;
            value |= mask;
        }
    }

    return value;
}
|
||||||
|
|
||||||
|
/// Bit scatter function. \c Dimension defines the final distance between two output bits
|
||||||
|
template <size_t, typename Value, size_t Level, enable_if_t<Level == 0> = 0>
|
||||||
|
ENOKI_INLINE Value scatter_bits(Value x) { return x; }
|
||||||
|
|
||||||
|
/// Generic bit scatter: recursively doubles the spacing between bits using
/// shift/or/mask steps. Selected when no BMI2 pdep path is available or the
/// value type is not an integral scalar.
template <size_t Dimension, typename Value,
          size_t Level = clog2i(sizeof(Value) * 8),
          enable_if_t<Level != 0 && (!(has_avx2 && has_x86_64) || !std::is_integral_v<Value>)> = 0>
ENOKI_INLINE Value scatter_bits(Value x) {
    using Scalar = scalar_t<Value>;

    // Mask keeping the bits that belong to this dimension at this level
    constexpr Scalar magic = morton_magic<Scalar>(Dimension, Level);
    // Shift distance for this level; suppressed (0) when it would exceed
    // the scalar width
    constexpr size_t shift_maybe = (1 << (Level - 1)) * (Dimension - 1);
    constexpr size_t shift = (shift_maybe < sizeof(Scalar) * 8) ? shift_maybe : 0;

    if constexpr (shift != 0)
        x |= sl<shift>(x);

    x &= magic;

    // Recurse toward the base case at Level == 0
    return scatter_bits<Dimension, Value, Level - 1>(x);
}
|
||||||
|
|
||||||
|
template <size_t, typename Value, size_t Level,
|
||||||
|
enable_if_t<Level == 0> = 0>
|
||||||
|
ENOKI_INLINE Value gather_bits(Value x) { return x; }
|
||||||
|
|
||||||
|
/// Bit gather function. \c Dimension defines the final distance between two input bits
///
/// Inverse of scatter_bits(): compacts every Dimension-th bit back into a
/// contiguous run using mask/shift/or steps. Levels are traversed in the
/// opposite order (ilevel counts up as Level counts down).
template <size_t Dimension, typename Value,
          size_t Level = clog2i(sizeof(Value) * 8),
          enable_if_t<Level != 0 && (!(has_avx2 && has_x86_64) || !std::is_integral_v<Value>)> = 0>
ENOKI_INLINE Value gather_bits(Value x) {
    using Scalar = scalar_t<Value>;

    // Mirrored level index so gather undoes scatter's steps in reverse
    constexpr size_t ilevel = clog2i(sizeof(Value) * 8) - Level + 1;
    constexpr Scalar magic = morton_magic<Scalar>(Dimension, ilevel);
    constexpr size_t shift_maybe = (1 << (ilevel - 1)) * (Dimension - 1);
    constexpr size_t shift = (shift_maybe < sizeof(Scalar) * 8) ? shift_maybe : 0;

    x &= magic;

    if constexpr (shift != 0)
        x |= sr<shift>(x);

    return gather_bits<Dimension, Value, Level - 1>(x);
}
|
||||||
|
|
||||||
|
#if defined(ENOKI_X86_AVX2) && defined(ENOKI_X86_64)
|
||||||
|
/// Hardware bit scatter for integral scalars: a single BMI2 'pdep'
/// instruction deposits the low bits of x at the positions set in 'magic'
template <size_t Dimension, typename Value,
          enable_if_t<std::is_integral_v<Value>> = 0>
ENOKI_INLINE Value scatter_bits(Value x) {
    constexpr Value magic = morton_magic<Value>(Dimension, 1);
    if constexpr (sizeof(Value) <= 4)
        return Value(_pdep_u32((uint32_t) x, (uint32_t) magic));
    else
        return Value(_pdep_u64((uint64_t) x, (uint64_t) magic));
}
|
||||||
|
|
||||||
|
/// Hardware bit gather for integral scalars: a single BMI2 'pext'
/// instruction extracts the bits selected by 'magic' into the low bits
template <size_t Dimension, typename Value,
          enable_if_t<std::is_integral_v<Value>> = 0>
ENOKI_INLINE Value gather_bits(Value x) {
    constexpr Value magic = morton_magic<Value>(Dimension, 1);
    if constexpr (sizeof(Value) <= 4)
        return Value(_pext_u32((uint32_t) x, (uint32_t) magic));
    else
        return Value(_pext_u64((uint64_t) x, (uint64_t) magic));
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Recursion base case of morton_decode(): extract dimension 0 (no pre-shift)
template <typename Array, size_t Index,
          enable_if_t<Index == 0> = 0>
ENOKI_INLINE void morton_decode_helper(value_t<Array> value, Array &out) {
    out.coeff(0) = gather_bits<Array::Size>(value);
}
|
||||||
|
|
||||||
|
/// Recursive step of morton_decode(): shift dimension 'Index' into position,
/// compact its bits, then recurse toward dimension 0
template <typename Array, size_t Index = array_size_v<Array> - 1,
          enable_if_t<Index != 0> = 0>
ENOKI_INLINE void morton_decode_helper(value_t<Array> value, Array &out) {
    out.coeff(Index) = gather_bits<Array::Size>(sr<Index>(value));
    morton_decode_helper<Array, Index - 1>(value, out);
}
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
|
||||||
|
/// Convert a N-dimensional integer array into the Morton/Z-order curve encoding
/// (recursion base case: dimension 0 contributes its scattered bits unshifted)
template <typename Array, size_t Index, typename Return = value_t<Array>,
          enable_if_t<Index == 0> = 0>
ENOKI_INLINE Return morton_encode(Array a) {
    return detail::scatter_bits<Array::Size>(a.coeff(0));
}
|
||||||
|
|
||||||
|
/// Convert a N-dimensional integer array into the Morton/Z-order curve encoding
/// (recursive step: interleave dimension 'Index' shifted by its offset, OR'd
/// with the encoding of the remaining dimensions)
template <typename Array, size_t Index = array_size_v<Array> - 1,
          typename Return = value_t<Array>, enable_if_t<Index != 0> = 0>
ENOKI_INLINE Return morton_encode(Array a) {
    static_assert(std::is_unsigned_v<scalar_t<Array>>, "morton_encode() requires unsigned arguments");
    return sl<Index>(detail::scatter_bits<Array::Size>(a.coeff(Index))) |
           morton_encode<Array, Index - 1>(a);
}
|
||||||
|
|
||||||
|
/// Convert Morton/Z-order curve encoding into a N-dimensional integer array
|
||||||
|
template <typename Array, typename Value = value_t<Array>>
|
||||||
|
ENOKI_INLINE Array morton_decode(Value value) {
|
||||||
|
static_assert(std::is_unsigned_v<scalar_t<Array>>, "morton_decode() requires unsigned arguments");
|
||||||
|
Array result;
|
||||||
|
detail::morton_decode_helper(value, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
# pragma warning (pop)
|
||||||
|
#endif
|
||||||
|
|
@ -0,0 +1,229 @@
|
||||||
|
/*
|
||||||
|
enoki/python.h -- pybind11 support for Enoki types
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
    Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/complex.h>
|
||||||
|
#include <pybind11/numpy.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(pybind11)
|
||||||
|
NAMESPACE_BEGIN(detail)
|
||||||
|
|
||||||
|
/// Builds the "shape=(...)" portion of the numpy type signature shown in
/// pybind11 docstrings. Base case: non-array types contribute nothing.
template <typename T, typename = void> struct array_shape_descr {
    static constexpr auto name() { return _(""); }
    // Variant used when more dimensions follow (would append a separator)
    static constexpr auto name_cont() { return _(""); }
};
|
||||||
|
|
||||||
|
/// Static arrays contribute their compile-time size to the shape signature
template <typename T>
struct array_shape_descr<T, std::enable_if_t<enoki::is_static_array_v<T>>> {
    static constexpr auto name() {
        return array_shape_descr<enoki::value_t<T>>::name_cont() + _<T::Size>();
    }
    // Same, followed by ", " when further dimensions are appended
    static constexpr auto name_cont() {
        return array_shape_descr<enoki::value_t<T>>::name_cont() + _<T::Size>() + _(", ");
    }
};
|
||||||
|
|
||||||
|
/// Dynamic arrays contribute a symbolic "n" to the shape signature
template <typename T>
struct array_shape_descr<T, std::enable_if_t<enoki::is_dynamic_array_v<T>>> {
    static constexpr auto name() {
        return array_shape_descr<enoki::value_t<T>>::name_cont() + _("n");
    }
    // Same, followed by ", " when further dimensions are appended
    static constexpr auto name_cont() {
        return array_shape_descr<enoki::value_t<T>>::name_cont() + _("n, ");
    }
};
|
||||||
|
|
||||||
|
/// pybind11 type caster that converts between Enoki arrays (excluding CUDA
/// arrays) and numpy ndarrays. Handles masks (as bool), complex-valued
/// arrays (via numpy's c8/c16 dtypes), and pointer/enum arrays (via their
/// integer reinterpretation).
template <typename Value>
struct type_caster<Value, std::enable_if_t<enoki::is_array_v<Value> &&
                                           !enoki::is_cuda_array_v<Value>>> {
    // Masks are exposed to Python as boolean arrays
    using Scalar = std::conditional_t<Value::IsMask, bool, enoki::scalar_t<Value>>;
    static constexpr bool IsComplex = Value::IsComplex;

    /// Python -> C++ conversion. Returns true on success; 'convert' permits
    /// implicit conversions (dtype casts, scalar broadcast).
    bool load(handle src, bool convert) {
        // 'None' is accepted and remembered; the Value* operator yields nullptr
        if (src.is_none()) {
            is_none = true;
            return true;
        }

        if constexpr (std::is_pointer_v<Scalar> || std::is_enum_v<Scalar>) {
            /// Convert special array types (pointer, enum) to integer arrays
            using UInt = enoki::uint_array_t<Value, false>;
            type_caster<UInt> caster;
            bool result = caster.load(src, convert);
            value = caster.operator UInt &();
            return result;
        }

        if (!isinstance<array_t<Scalar>>(src)) {
            if (!convert)
                return false;

            /// Don't cast enoki CUDA/autodiff types
            if (strncmp(((PyTypeObject *) src.get_type().ptr())->tp_name, "enoki.", 6) == 0)
                return false;
        }

        constexpr size_t ndim = enoki::array_depth_v<Value>;

        array arr = reinterpret_borrow<array>(src);
        if constexpr (IsComplex) {
            // View the complex input as a real array with a trailing
            // dimension of size 2 (Fortran order to match Enoki's layout)
            auto np = module::import("numpy");
            try {
                arr = np.attr("asarray")(arr, sizeof(Scalar) == 4 ? "c8" : "c16", "F");
                arr = np.attr("expand_dims")(arr, -1).attr("view")(
                    sizeof(Scalar) == 4 ? "f4" : "f8");
            } catch (const error_already_set &) {
                return false;
            }
        }

        // Force Fortran ordering and the expected dtype
        arr = array_t<Scalar, array::f_style | array::forcecast>::ensure(arr);
        if (!arr)
            return false;

        // Dimension mismatch is only tolerated for scalar broadcast (ndim 0,
        // or ndim 1 for complex) when implicit conversion is enabled
        if (ndim != arr.ndim() && !((arr.ndim() == 0 || (arr.ndim() == 1 && IsComplex)) && convert))
            return false;

        // Reversed shape (numpy outermost-first vs. Enoki innermost-first);
        // missing leading dimensions are padded with 1 for broadcasting
        std::array<size_t, ndim> shape;
        std::fill(shape.begin(), shape.end(), (size_t) 1);
        std::reverse_copy(arr.shape(), arr.shape() + arr.ndim(), shape.begin());

        try {
            enoki::set_shape(value, shape);
        } catch (const std::length_error &) {
            return false;
        }

        const Scalar *buf = static_cast<const Scalar *>(arr.data());
        read_buffer(buf, value);

        return true;
    }

    /// C++ -> Python conversion for a potentially-null pointer
    static handle cast(const Value *src, return_value_policy policy, handle parent) {
        if (!src)
            return pybind11::none();
        return cast(*src, policy, parent);
    }

    /// C++ -> Python conversion: wraps the Enoki array in a numpy ndarray
    static handle cast(const Value &src, return_value_policy policy, handle parent) {
        /// Convert special array types (pointer, enum) to integer arrays
        if constexpr (std::is_pointer_v<Scalar> || std::is_enum_v<Scalar>) {
            using UInt = enoki::uint_array_t<Value, false>;
            return type_caster<UInt>::cast(src, policy, parent);
        }
        (void) policy; (void) parent;

        if (enoki::ragged(src))
            throw type_error("Ragged arrays are not supported!");

        // Reversed shape + column-major strides to mirror Enoki's layout
        auto shape = enoki::shape(src);
        std::reverse(shape.begin(), shape.end());
        decltype(shape) stride;

        stride[0] = sizeof(Scalar);
        for (size_t i = 1; i < shape.size(); ++i)
            stride[i] = shape[i - 1] * stride[i - 1];

        array arr(pybind11::dtype::of<Scalar>(),
                  std::vector<ssize_t>(shape.begin(), shape.end()),
                  std::vector<ssize_t>(stride.begin(), stride.end()));

        Scalar *buf = static_cast<Scalar *>(arr.mutable_data());
        write_buffer(buf, src);

        if constexpr (IsComplex) {
            // Collapse the trailing real/imaginary axis into a complex dtype
            auto np = module::import("numpy");
            arr = np.attr("ascontiguousarray")(arr).attr("view")(
                sizeof(Scalar) == 4 ? "c8" : "c16").attr("squeeze")(-1);
        }

        return arr.release();
    }

    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;

    // Docstring signatures ("numpy.ndarray[dtype=..., shape=(...)]")
    static constexpr auto name_default =
        _("numpy.ndarray[dtype=") +
        npy_format_descriptor<Scalar>::name + _(", shape=(") +
        array_shape_descr<Value>::name() + _(")]");

    static constexpr auto name_complex =
        _("numpy.ndarray[dtype=Complex[") +
        npy_format_descriptor<Scalar>::name + _("], shape=(") +
        array_shape_descr<enoki::value_t<Value>>::name() + _(")]");

    static constexpr auto name = _<IsComplex>(name_complex, name_default);

    operator Value*() { if (is_none) return nullptr; else return &value; }
    operator Value&() {
#if !defined(NDEBUG)
        if (is_none)
            throw pybind11::cast_error("Cannot cast None or nullptr to an"
                                       " Enoki array.");
#endif
        return value;
    }

private:
    /// Recursively copy an Enoki array into a flat numpy buffer
    /// (advances 'buf' as it writes; masks are copied element-wise)
    template <typename T> static ENOKI_INLINE void write_buffer(Scalar *&buf, const T &value) {
        if constexpr (!enoki::is_array_v<enoki::value_t<T>>) {
            if constexpr (!enoki::is_mask_v<T>) {
                memcpy(buf, value.data(), sizeof(enoki::value_t<T>) * value.size());
                buf += value.size();
            } else {
                for (size_t i = 0, size = value.size(); i < size; ++i)
                    *buf++ = value.coeff(i);
            }
        } else {
            for (size_t i = 0, size = value.size(); i < size; ++i)
                write_buffer(buf, value.coeff(i));
        }
    }

    /// Recursively fill an Enoki array from a flat numpy buffer
    /// (mask packets are rebuilt from bools via reinterpret_array)
    template <typename T>
    static ENOKI_INLINE void read_buffer(const Scalar *&buf, T &value) {
        if constexpr (!enoki::is_array_v<enoki::value_t<T>>) {
            if constexpr (!enoki::is_mask_v<T>) {
                memcpy(value.data(), buf, sizeof(enoki::value_t<T>) * value.size());
                buf += value.size();
            } else {
                if constexpr (!enoki::is_dynamic_array_v<T>) {
                    enoki::Array<bool, T::Size> value2 = false;
                    for (size_t i = 0, size = value2.size(); i < size; ++i)
                        value2.coeff(i) = *buf++;
                    value = enoki::reinterpret_array<T>(value2);
                } else {
                    // Dynamic masks: convert one SIMD packet at a time,
                    // guarding against reading past the end of the buffer
                    const Scalar *end = buf + value.size();
                    for (size_t i = 0; i < enoki::packets(value); ++i) {
                        enoki::Array<bool, T::Packet::Size> value2 = false;
                        for (size_t j = 0; j < T::Packet::Size && buf != end; ++j)
                            value2.coeff(j) = *buf++;
                        enoki::packet(value, i) = enoki::reinterpret_array<typename T::Packet>(value2);
                    }
                }
            }
        } else {
            for (size_t i = 0, size = value.size(); i < size; ++i)
                read_buffer(buf, value.coeff(i));
        }
    }

private:
    Value value;        // storage for the converted array
    bool is_none = false;  // set when the Python object was 'None'
};
|
||||||
|
|
||||||
|
NAMESPACE_END(detail)
|
||||||
|
NAMESPACE_END(pybind11)
|
||||||
|
|
@ -0,0 +1,361 @@
|
||||||
|
/*
|
||||||
|
enoki/quaternion.h -- Quaternion data structure
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/complex.h>
|
||||||
|
#include <enoki/matrix.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// SFINAE helper for quaternions: detects the presence of a truthy
/// 'IsQuaternion' member and derives the usual enable_if shorthands
template <typename T> using is_quaternion_helper = enable_if_t<std::decay_t<T>::IsQuaternion>;
template <typename T> constexpr bool is_quaternion_v = is_detected_v<is_quaternion_helper, T>;
template <typename T> using enable_if_quaternion_t = enable_if_t<is_quaternion_v<T>>;
template <typename T> using enable_if_not_quaternion_t = enable_if_t<!is_quaternion_v<T>>;
|
||||||
|
|
||||||
|
/// Quaternion data structure stored as a 4-component array with layout
/// (i, j, k, r) -- the real part occupies the last component
template <typename Value_>
struct Quaternion : StaticArrayImpl<Value_, 4, false, Quaternion<Value_>> {
    using Base = StaticArrayImpl<Value_, 4, false, Quaternion<Value_>>;
    ENOKI_ARRAY_IMPORT_BASIC(Base, Quaternion);
    using Base::operator=;

    static constexpr bool IsQuaternion = true;
    static constexpr bool IsVector = false;

    using ArrayType = Quaternion;
    using MaskType = Mask<Value_, 4>;

    template <typename T> using ReplaceValue = Quaternion<T>;

    Quaternion() = default;

    /// Converting copy constructor from a quaternion over another value type
    template <typename Value2>
    ENOKI_INLINE Quaternion(const Quaternion<Value2> &z) : Base(z) { }

    /// Scalar/broadcast constructor: a lower-depth value becomes the real
    /// part; the imaginary components are zero-initialized
    template <typename T, enable_if_t<(array_depth_v<T> < Base::Depth && (is_scalar_v<T> || is_array_v<T>))> = 0,
              enable_if_not_quaternion_t<T> = 0>
    ENOKI_INLINE Quaternion(T &&v) : Base(zero<Value_>(), zero<Value_>(), zero<Value_>(), v) { }

    /// Forwarding constructor for same-depth arrays / non-array values
    template <typename T, enable_if_t<(array_depth_v<T> == Base::Depth || !(is_scalar_v<T> || is_array_v<T>))> = 0,
              enable_if_not_quaternion_t<T> = 0>
    ENOKI_INLINE Quaternion(T &&v) : Base(std::forward<T>(v)) { }

    /// Component-wise constructor: imaginary parts (i, j, k) then real part r
    ENOKI_INLINE Quaternion(const Value_ &vi, const Value_ &vj,
                            const Value_ &vk, const Value_ &vr)
        : Base(vi, vj, vk, vr) { }

    /// Construct from a 3D imaginary vector and a real part
    template <typename Im, typename Re, enable_if_t<array_size_v<Im> == 3> = 0>
    ENOKI_INLINE Quaternion(const Im &im, const Re &re)
        : Base(im.x(), im.y(), im.z(), re) { }

    /// Construct from sub-arrays
    template <typename T1, typename T2, typename T = Quaternion, enable_if_t<
              array_depth_v<T1> == array_depth_v<T> && array_size_v<T1> == 2 &&
              array_depth_v<T2> == array_depth_v<T> && array_size_v<T2> == 2> = 0>
    Quaternion(const T1 &a1, const T2 &a2)
        : Base(a1, a2) { }

    /// Broadcast a value across all four components (dynamic-size aware)
    template <typename T> ENOKI_INLINE static Quaternion full_(const T &value, size_t size) {
        return Array<Value, 4>::full_(value, size);
    }
};
|
||||||
|
|
||||||
|
template <typename T, enable_if_quaternion_t<T> = 0>
|
||||||
|
ENOKI_INLINE T identity(size_t size = 1) {
|
||||||
|
using Value = value_t<T>;
|
||||||
|
Value z = zero<Value>(size),
|
||||||
|
o = full<Value>(1.f, size);
|
||||||
|
return T(z, z, z, o);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Real (scalar) part of a quaternion — the last component
template <typename T> ENOKI_INLINE expr_t<T> real(const Quaternion<T> &q) { return q.w(); }
/// Imaginary (vector) part of a quaternion — the first three components
template <typename T> ENOKI_INLINE auto imag(const Quaternion<T> &q) { return head<3>(q); }
template <typename T0, typename T1, typename T = expr_t<T0, T1>>
|
||||||
|
ENOKI_INLINE T dot(const Quaternion<T0> &q0, const Quaternion<T1> &q1) {
|
||||||
|
using Base = Array<T, 4>;
|
||||||
|
return dot(Base(q0), Base(q1));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Quaternion<expr_t<T>> conj(const Quaternion<T> &q) {
|
||||||
|
const Quaternion<expr_t<T>> mask(-0.f, -0.f, -0.f, 0.f);
|
||||||
|
return q ^ mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE expr_t<T> squared_norm(const Quaternion<T> &q) {
|
||||||
|
return enoki::squared_norm(Array<expr_t<T>, 4>(q));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE expr_t<T> norm(const Quaternion<T> &q) {
|
||||||
|
return enoki::norm(Array<expr_t<T>, 4>(q));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Quaternion<expr_t<T>> normalize(const Quaternion<T> &q) {
|
||||||
|
return enoki::normalize(Array<expr_t<T>, 4>(q));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Quaternion<expr_t<T>> rcp(const Quaternion<T> &q) {
|
||||||
|
return conj(q) * (1 / squared_norm(q));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const Quaternion<T0> &q0, const Quaternion<T1> &q1) {
|
||||||
|
using Base = Array<Value, 4>;
|
||||||
|
const Base sign_mask(0.f, 0.f, 0.f, -0.f);
|
||||||
|
Base q0_xyzx = shuffle<0, 1, 2, 0>(q0);
|
||||||
|
Base q0_yzxy = shuffle<1, 2, 0, 1>(q0);
|
||||||
|
Base q1_wwwx = shuffle<3, 3, 3, 0>(q1);
|
||||||
|
Base q1_zxyy = shuffle<2, 0, 1, 1>(q1);
|
||||||
|
Base t1 = fmadd(q0_xyzx, q1_wwwx, q0_yzxy * q1_zxyy) ^ sign_mask;
|
||||||
|
|
||||||
|
Base q0_zxyz = shuffle<2, 0, 1, 2>(q0);
|
||||||
|
Base q1_yzxz = shuffle<1, 2, 0, 2>(q1);
|
||||||
|
Base q0_wwww = shuffle<3, 3, 3, 3>(q0);
|
||||||
|
Base t2 = fmsub(q0_wwww, q1, q0_zxyz * q1_yzxz);
|
||||||
|
return t1 + t2;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const Quaternion<T0> &q0, const T1 &v1) {
|
||||||
|
return Array<expr_t<T0>, 4>(q0) * v1;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Result operator*(const T0 &v0, const Quaternion<T1> &q1) {
|
||||||
|
return v0 * Array<expr_t<T0>, 4>(q1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Result operator/(const Quaternion<T0> &q0, const Quaternion<T1> &q1) {
|
||||||
|
return q0 * rcp(q1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1,
|
||||||
|
typename Value = expr_t<T0, T1>, typename Result = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Result operator/(const Quaternion<T0> &z0, const T1 &v1) {
|
||||||
|
return Array<expr_t<T0>, 4>(z0) / v1;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE expr_t<T> abs(const Quaternion<T> &z) {
|
||||||
|
return norm(z);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Quaternion<expr_t<T>> exp(const Quaternion<T> &q) {
|
||||||
|
auto qi = imag(q);
|
||||||
|
auto ri = norm(qi);
|
||||||
|
auto exp_w = exp(real(q));
|
||||||
|
auto [s, c] = sincos(ri);
|
||||||
|
|
||||||
|
return { qi * (s * exp_w / ri), c * exp_w };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
ENOKI_INLINE Quaternion<expr_t<T>> log(const Quaternion<T> &q) {
|
||||||
|
auto qi_n = normalize(imag(q));
|
||||||
|
auto rq = norm(q);
|
||||||
|
auto acos_rq = acos(real(q) / rq);
|
||||||
|
auto log_rq = log(rq);
|
||||||
|
|
||||||
|
return { qi_n * acos_rq, log_rq };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1>
|
||||||
|
ENOKI_INLINE auto pow(const Quaternion<T0> &q0, const Quaternion<T1> &q1) {
|
||||||
|
return exp(log(q0) * q1);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
Quaternion<expr_t<T>> sqrt(const Quaternion<T> &q) {
|
||||||
|
auto ri = norm(imag(q));
|
||||||
|
auto cs = sqrt(Complex<expr_t<T>>(real(q), ri));
|
||||||
|
return { imag(q) * (rcp(ri) * imag(cs)), real(cs) };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Vector, typename T, typename Expr = expr_t<T>>
|
||||||
|
ENOKI_INLINE Vector quat_to_euler(const Quaternion<T> &q) {
|
||||||
|
|
||||||
|
// https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles#Quaternion_to_Euler_Angles_Conversion
|
||||||
|
// roll (x-axis rotation)
|
||||||
|
|
||||||
|
Expr q_y_2 = sqr(q.y());
|
||||||
|
|
||||||
|
Expr sinr_cosp = 2 * fmadd(q.w(), q.x(), q.y() * q.z());
|
||||||
|
Expr cosr_cosp = fnmadd(2, fmadd(q.x(), q.x(), q_y_2), 1);
|
||||||
|
Expr roll = atan2(sinr_cosp, cosr_cosp);
|
||||||
|
|
||||||
|
// pitch (y-axis rotation)
|
||||||
|
Expr sinp = 2 * fmsub(q.w(), q.y(), q.z() * q.x());
|
||||||
|
Expr pitch;
|
||||||
|
if (abs(sinp) >= 1)
|
||||||
|
pitch = copysign(Expr(M_PI / 2), sinp); // use 90 degrees if out of range
|
||||||
|
else
|
||||||
|
pitch = asin(sinp);
|
||||||
|
|
||||||
|
// yaw (z-axis rotation)
|
||||||
|
Expr siny_cosp = 2 * fmadd(q.w(), q.z(), q.x() * q.y());
|
||||||
|
Expr cosy_cosp = fnmadd(2, fmadd(q.z(), q.z(), q_y_2), 1);
|
||||||
|
Expr yaw = atan2(siny_cosp, cosy_cosp);
|
||||||
|
|
||||||
|
return Vector(roll, pitch, yaw);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix, typename T, typename Expr = expr_t<T>,
|
||||||
|
enable_if_t<Matrix::Size == 4> = 0>
|
||||||
|
ENOKI_INLINE Matrix quat_to_matrix(const Quaternion<T> &q_) {
|
||||||
|
auto q = q_ * scalar_t<T>(M_SQRT2);
|
||||||
|
|
||||||
|
Expr xx = q.x() * q.x(), yy = q.y() * q.y(), zz = q.z() * q.z();
|
||||||
|
Expr xy = q.x() * q.y(), xz = q.x() * q.z(), yz = q.y() * q.z();
|
||||||
|
Expr xw = q.x() * q.w(), yw = q.y() * q.w(), zw = q.z() * q.w();
|
||||||
|
|
||||||
|
return Matrix(
|
||||||
|
1.f - (yy + zz), xy - zw, xz + yw, 0.f,
|
||||||
|
xy + zw, 1.f - (xx + zz), yz - xw, 0.f,
|
||||||
|
xz - yw, yz + xw, 1.f - (xx + yy), 0.f,
|
||||||
|
0.f, 0.f, 0.f, 1.f
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix, typename T, typename Expr = expr_t<T>,
|
||||||
|
enable_if_t<Matrix::Size == 3> = 0>
|
||||||
|
ENOKI_INLINE Matrix quat_to_matrix(const Quaternion<T> &q_) {
|
||||||
|
auto q = q_ * scalar_t<T>(M_SQRT2);
|
||||||
|
|
||||||
|
Expr xx = q.x() * q.x(), yy = q.y() * q.y(), zz = q.z() * q.z();
|
||||||
|
Expr xy = q.x() * q.y(), xz = q.x() * q.z(), yz = q.y() * q.z();
|
||||||
|
Expr xw = q.x() * q.w(), yw = q.y() * q.w(), zw = q.z() * q.w();
|
||||||
|
|
||||||
|
return Matrix(
|
||||||
|
1.f - (yy + zz), xy - zw, xz + yw,
|
||||||
|
xy + zw, 1.f - (xx + zz), yz - xw,
|
||||||
|
xz - yw, yz + xw, 1.f - (xx + yy)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, size_t Size,
|
||||||
|
typename Expr = expr_t<T>,
|
||||||
|
typename Quat = Quaternion<Expr>,
|
||||||
|
enable_if_t<Size == 3 || Size == 4> = 0>
|
||||||
|
ENOKI_INLINE Quat matrix_to_quat(const Matrix<T, Size> &mat) {
|
||||||
|
const Expr c0(0), c1(1), ch(0.5f);
|
||||||
|
|
||||||
|
// Converting a Rotation Matrix to a Quaternion
|
||||||
|
// Mike Day, Insomniac Games
|
||||||
|
Expr t0(c1 + mat(0, 0) - mat(1, 1) - mat(2, 2));
|
||||||
|
Quat q0(t0,
|
||||||
|
mat(1, 0) + mat(0, 1),
|
||||||
|
mat(0, 2) + mat(2, 0),
|
||||||
|
mat(2, 1) - mat(1, 2));
|
||||||
|
|
||||||
|
Expr t1(c1 - mat(0, 0) + mat(1, 1) - mat(2, 2));
|
||||||
|
Quat q1(mat(1, 0) + mat(0, 1),
|
||||||
|
t1,
|
||||||
|
mat(2, 1) + mat(1, 2),
|
||||||
|
mat(0, 2) - mat(2, 0));
|
||||||
|
|
||||||
|
Expr t2(c1 - mat(0, 0) - mat(1, 1) + mat(2, 2));
|
||||||
|
Quat q2(mat(0, 2) + mat(2, 0),
|
||||||
|
mat(2, 1) + mat(1, 2),
|
||||||
|
t2,
|
||||||
|
mat(1, 0) - mat(0, 1));
|
||||||
|
|
||||||
|
Expr t3(c1 + mat(0, 0) + mat(1, 1) + mat(2, 2));
|
||||||
|
Quat q3(mat(2, 1) - mat(1, 2),
|
||||||
|
mat(0, 2) - mat(2, 0),
|
||||||
|
mat(1, 0) - mat(0, 1),
|
||||||
|
t3);
|
||||||
|
|
||||||
|
auto mask0 = mat(0, 0) > mat(1, 1);
|
||||||
|
Expr t01 = select(mask0, t0, t1);
|
||||||
|
Quat q01 = select(mask0, q0, q1);
|
||||||
|
|
||||||
|
auto mask1 = mat(0, 0) < -mat(1, 1);
|
||||||
|
Expr t23 = select(mask1, t2, t3);
|
||||||
|
Quat q23 = select(mask1, q2, q3);
|
||||||
|
|
||||||
|
auto mask2 = mat(2, 2) < c0;
|
||||||
|
Expr t0123 = select(mask2, t01, t23);
|
||||||
|
Quat q0123 = select(mask2, q01, q23);
|
||||||
|
|
||||||
|
return q0123 * (rsqrt(t0123) * ch);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T0, typename T1, typename T2,
|
||||||
|
typename Value = expr_t<T0, T1, T2>,
|
||||||
|
typename Return = Quaternion<Value>>
|
||||||
|
ENOKI_INLINE Return slerp(const Quaternion<T0> &q0,
|
||||||
|
const Quaternion<T1> &q1_, const T2 &t) {
|
||||||
|
using Base = Array<Value, 4>;
|
||||||
|
|
||||||
|
Value cos_theta = dot(q0, q1_);
|
||||||
|
Return q1 = mulsign(Base(q1_), cos_theta);
|
||||||
|
cos_theta = mulsign(cos_theta, cos_theta);
|
||||||
|
|
||||||
|
Value theta = acos(cos_theta);
|
||||||
|
auto [s, c] = sincos(theta * t);
|
||||||
|
auto close_mask = cos_theta > 0.9995f;
|
||||||
|
|
||||||
|
Return qperp = normalize(q1 - q0 * cos_theta),
|
||||||
|
result = q0 * c + qperp * s;
|
||||||
|
|
||||||
|
if (ENOKI_UNLIKELY(any_nested(close_mask)))
|
||||||
|
result[mask_t<Base>(close_mask)] =
|
||||||
|
Base(normalize(q0 * (1.f - t) + q1 * t));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Quat, typename Vector3, enable_if_t<Quat::IsQuaternion> = 0>
|
||||||
|
ENOKI_INLINE Quat rotate(const Vector3 &axis, const value_t<Quat> &angle) {
|
||||||
|
auto [s, c] = sincos(angle * .5f);
|
||||||
|
return concat(axis * s, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_not_array_t<T> = 0>
|
||||||
|
ENOKI_NOINLINE std::ostream &operator<<(std::ostream &os, const Quaternion<T> &q) {
|
||||||
|
os << q.w();
|
||||||
|
os << (q.x() < 0 ? " - " : " + ") << abs(q.x()) << "i";
|
||||||
|
os << (q.y() < 0 ? " - " : " + ") << abs(q.y()) << "j";
|
||||||
|
os << (q.z() < 0 ? " - " : " + ") << abs(q.z()) << "k";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, enable_if_array_t<T> = 0>
|
||||||
|
ENOKI_NOINLINE std::ostream &operator<<(std::ostream &os, const Quaternion<T> &q) {
|
||||||
|
os << "[";
|
||||||
|
size_t size = q.x().size();
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
os << q.w().coeff(i);
|
||||||
|
os << (q.x().coeff(i) < 0 ? " - " : " + ") << abs(q.x().coeff(i)) << "i";
|
||||||
|
os << (q.y().coeff(i) < 0 ? " - " : " + ") << abs(q.y().coeff(i)) << "j";
|
||||||
|
os << (q.z().coeff(i) < 0 ? " - " : " + ") << abs(q.z().coeff(i)) << "k";
|
||||||
|
if (i + 1 < size)
|
||||||
|
os << ",\n ";
|
||||||
|
}
|
||||||
|
os << "]";
|
||||||
|
return os;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,333 @@
|
||||||
|
/*
|
||||||
|
* Tiny self-contained version of the PCG Random Number Generation for C++,
|
||||||
|
* put together from pieces of the much larger C/C++ codebase with
|
||||||
|
* vectorization using Enoki.
|
||||||
|
*
|
||||||
|
* Wenzel Jakob, February 2019
|
||||||
|
*
|
||||||
|
* The PCG random number generator was developed by Melissa O'Neill
|
||||||
|
* <oneill@pcg-random.org>
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*
|
||||||
|
* For additional information about the PCG random number generation scheme,
|
||||||
|
* including its license and other licensing options, visit
|
||||||
|
*
|
||||||
|
* http://www.pcg-random.org
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
#define PCG32_DEFAULT_STATE 0x853c49e6748fea9bULL
|
||||||
|
#define PCG32_DEFAULT_STREAM 0xda3e39cb94b95bdbULL
|
||||||
|
#define PCG32_MULT 0x5851f42d4c957f2dULL
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// PCG32 pseudorandom number generator proposed by Melissa O'Neill
|
||||||
|
template <typename T, size_t Size = array_size_v<T>> struct PCG32 {
|
||||||
|
/* Some convenient type aliases for vectorization */
|
||||||
|
using Int64 = int64_array_t<T>;
|
||||||
|
using UInt64 = uint64_array_t<T>;
|
||||||
|
using UInt32 = uint32_array_t<T>;
|
||||||
|
using Float64 = float64_array_t<T>;
|
||||||
|
using Float32 = float32_array_t<T>;
|
||||||
|
using UInt32Mask = mask_t<UInt32>;
|
||||||
|
using UInt64Mask = mask_t<UInt64>;
|
||||||
|
|
||||||
|
/// Initialize the pseudorandom number generator with the \ref seed() function
|
||||||
|
PCG32(const UInt64 &initstate = PCG32_DEFAULT_STATE,
|
||||||
|
const UInt64 &initseq = arange<UInt64>(Size) + PCG32_DEFAULT_STREAM) {
|
||||||
|
seed(initstate, initseq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Seed the pseudorandom number generator
|
||||||
|
*
|
||||||
|
* Specified in two parts: a state initializer and a sequence selection
|
||||||
|
* constant (a.k.a. stream id)
|
||||||
|
*/
|
||||||
|
void seed(const UInt64 &initstate, const UInt64 &initseq) {
|
||||||
|
state = zero<UInt64>();
|
||||||
|
inc = sl<1>(initseq) | 1u;
|
||||||
|
next_uint32();
|
||||||
|
state += initstate;
|
||||||
|
next_uint32();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a uniformly distributed unsigned 32-bit random number
|
||||||
|
ENOKI_INLINE UInt32 next_uint32() {
|
||||||
|
UInt64 oldstate = state;
|
||||||
|
state = oldstate * uint64_t(PCG32_MULT) + inc;
|
||||||
|
UInt32 xorshifted = UInt32(sr<27>(sr<18>(oldstate) ^ oldstate));
|
||||||
|
UInt32 rot_offset = UInt32(sr<59>(oldstate));
|
||||||
|
return ror(xorshifted, rot_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Masked version of \ref next_uint32
|
||||||
|
ENOKI_INLINE UInt32 next_uint32(const UInt64Mask &mask) {
|
||||||
|
UInt64 oldstate = state;
|
||||||
|
masked(state, mask) = oldstate * uint64_t(PCG32_MULT) + inc;
|
||||||
|
UInt32 xorshifted = UInt32(sr<27>(sr<18>(oldstate) ^ oldstate));
|
||||||
|
UInt32 rot_offset = UInt32(sr<59>(oldstate));
|
||||||
|
return ror(xorshifted, rot_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a uniformly distributed unsigned 64-bit random number
|
||||||
|
ENOKI_INLINE UInt64 next_uint64() {
|
||||||
|
return UInt64(next_uint32()) | sl<32>(UInt64(next_uint32()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Masked version of \ref next_uint64
|
||||||
|
ENOKI_INLINE UInt64 next_uint64(const UInt64Mask &mask) {
|
||||||
|
return UInt64(next_uint32(mask)) | sl<32>(UInt64(next_uint32(mask)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward \ref next_uint call to the correct method based given type size
|
||||||
|
template <typename Value, enable_if_std_int_v<scalar_t<Value>> = 0>
|
||||||
|
ENOKI_INLINE Value next_uint() {
|
||||||
|
if constexpr (is_int64_v<scalar_t<Value>>)
|
||||||
|
return next_uint64();
|
||||||
|
else
|
||||||
|
return next_uint32();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward \ref next_uint call to the correct method based given type size (masked version)
|
||||||
|
template <typename Value, enable_if_std_int_v<scalar_t<Value>> = 0>
|
||||||
|
ENOKI_INLINE Value next_uint(const mask_t<Value> &mask) {
|
||||||
|
if constexpr (is_int64_v<scalar_t<Value>>)
|
||||||
|
return next_uint64(mask);
|
||||||
|
else
|
||||||
|
return next_uint32(mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a single precision floating point value on the interval [0, 1)
|
||||||
|
ENOKI_INLINE Float32 next_float32() {
|
||||||
|
return reinterpret_array<Float32>(sr<9>(next_uint32()) | 0x3f800000u) - 1.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Masked version of \ref next_float32
|
||||||
|
ENOKI_INLINE Float32 next_float32(const UInt64Mask &mask) {
|
||||||
|
return reinterpret_array<Float32>(sr<9>(next_uint32(mask)) | 0x3f800000u) - 1.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Generate a double precision floating point value on the interval [0, 1)
|
||||||
|
*
|
||||||
|
* \remark Since the underlying random number generator produces 32 bit output,
|
||||||
|
* only the first 32 mantissa bits will be filled (however, the resolution is still
|
||||||
|
* finer than in \ref next_float(), which only uses 23 mantissa bits)
|
||||||
|
*/
|
||||||
|
ENOKI_INLINE Float64 next_float64() {
|
||||||
|
/* Trick from MTGP: generate an uniformly distributed
|
||||||
|
double precision number in [1,2) and subtract 1. */
|
||||||
|
return reinterpret_array<Float64>(sl<20>(UInt64(next_uint32())) |
|
||||||
|
0x3ff0000000000000ull) - 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Masked version of next_float64
|
||||||
|
ENOKI_INLINE Float64 next_float64(const UInt64Mask &mask) {
|
||||||
|
return reinterpret_array<Float64>(sl<20>(UInt64(next_uint32(mask))) |
|
||||||
|
0x3ff0000000000000ull) - 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward \ref next_float call to the correct method based given type size
|
||||||
|
template <typename Value, enable_if_std_float_v<scalar_t<Value>> = 0>
|
||||||
|
ENOKI_INLINE Value next_float() {
|
||||||
|
if constexpr (is_double_v<scalar_t<Value>>)
|
||||||
|
return next_float64();
|
||||||
|
else
|
||||||
|
return next_float32();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward \ref next_float call to the correct method based given type size (masked version)
|
||||||
|
template <typename Value, enable_if_std_float_v<scalar_t<Value>> = 0>
|
||||||
|
ENOKI_INLINE Value next_float(const mask_t<Value> &mask) {
|
||||||
|
if constexpr (is_double_v<scalar_t<Value>>)
|
||||||
|
return next_float64(mask);
|
||||||
|
else
|
||||||
|
return next_float32(mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a uniformly distributed integer r, where 0 <= r < bound
|
||||||
|
UInt32 next_uint32_bounded(uint32_t bound, UInt64Mask mask = true) {
|
||||||
|
if constexpr (is_scalar_v<T>) {
|
||||||
|
ENOKI_MARK_USED(mask);
|
||||||
|
|
||||||
|
/* To avoid bias, we need to make the range of the RNG a multiple of
|
||||||
|
bound, which we do by dropping output less than a threshold.
|
||||||
|
A naive scheme to calculate the threshold would be to do
|
||||||
|
|
||||||
|
UInt32 threshold = 0x1'0000'0000ull % bound;
|
||||||
|
|
||||||
|
but 64-bit div/mod is slower than 32-bit div/mod (especially on
|
||||||
|
32-bit platforms). In essence, we do
|
||||||
|
|
||||||
|
UInt32 threshold = (0x1'0000'0000ull-bound) % bound;
|
||||||
|
|
||||||
|
because this version will calculate the same modulus, but the LHS
|
||||||
|
value is less than 2^32.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const UInt32 threshold = (~bound + 1u) % bound;
|
||||||
|
|
||||||
|
/* Uniformity guarantees that this loop will terminate. In practice, it
|
||||||
|
should usually terminate quickly; on average (assuming all bounds are
|
||||||
|
equally likely), 82.25% of the time, we can expect it to require just
|
||||||
|
one iteration. In the worst case, someone passes a bound of 2^31 + 1
|
||||||
|
(i.e., 2147483649), which invalidates almost 50% of the range. In
|
||||||
|
practice, bounds are typically small and only a tiny amount of the range
|
||||||
|
is eliminated.
|
||||||
|
*/
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
UInt32 result = next_uint32();
|
||||||
|
|
||||||
|
if (all(result >= threshold))
|
||||||
|
return result % bound;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const divisor_ext<uint32_t> div(bound);
|
||||||
|
const UInt32 threshold = (~bound + 1u) % div;
|
||||||
|
|
||||||
|
UInt32 result = zero<UInt32>();
|
||||||
|
do {
|
||||||
|
result[mask] = next_uint32(mask);
|
||||||
|
|
||||||
|
/* Keep track of which SIMD lanes have already
|
||||||
|
finished and stops advancing the associated PRNGs */
|
||||||
|
mask &= result < threshold;
|
||||||
|
} while (any(mask));
|
||||||
|
|
||||||
|
return result % div;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a uniformly distributed integer r, where 0 <= r < bound
|
||||||
|
UInt64 next_uint64_bounded(uint64_t bound, UInt64Mask mask = true) {
|
||||||
|
if constexpr (is_scalar_v<T>) {
|
||||||
|
ENOKI_MARK_USED(mask);
|
||||||
|
|
||||||
|
const uint64_t threshold = (~bound + (uint64_t) 1) % bound;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
uint64_t result = next_uint64();
|
||||||
|
|
||||||
|
if (all(result >= threshold))
|
||||||
|
return result % bound;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const divisor_ext<uint64_t> div(bound);
|
||||||
|
const UInt64 threshold = (~bound + (uint64_t) 1) % div;
|
||||||
|
|
||||||
|
UInt64 result = zero<UInt64>();
|
||||||
|
do {
|
||||||
|
result[mask] = next_uint64(mask);
|
||||||
|
|
||||||
|
/* Keep track of which SIMD lanes have already
|
||||||
|
finished and stops advancing the associated PRNGs */
|
||||||
|
mask &= result < threshold;
|
||||||
|
} while (any(mask));
|
||||||
|
|
||||||
|
return result % div;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward \ref next_uint_bounded call to the correct method based given type size
|
||||||
|
template <typename Value, enable_if_std_int_v<scalar_t<Value>> = 0>
|
||||||
|
ENOKI_INLINE Value next_uint_bounded(scalar_t<Value> bound,
|
||||||
|
const mask_t<Value> &mask = true) {
|
||||||
|
if constexpr (is_int64_v<scalar_t<Value>>)
|
||||||
|
return next_uint64_bounded(bound, mask);
|
||||||
|
else
|
||||||
|
return next_uint32_bounded(bound, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Multi-step advance function (jump-ahead, jump-back)
|
||||||
|
*
|
||||||
|
* The method used here is based on Brown, "Random Number Generation with
|
||||||
|
* Arbitrary Stride", Transactions of the American Nuclear Society (Nov.
|
||||||
|
* 1994). The algorithm is very similar to fast exponentiation.
|
||||||
|
*/
|
||||||
|
void advance(const Int64 &delta_) {
|
||||||
|
UInt64 cur_mult = PCG32_MULT,
|
||||||
|
cur_plus = inc,
|
||||||
|
acc_mult = 1ull,
|
||||||
|
acc_plus = 0ull;
|
||||||
|
|
||||||
|
/* Even though delta is an unsigned integer, we can pass a signed
|
||||||
|
integer to go backwards, it just goes "the long way round". */
|
||||||
|
UInt64 delta(delta_);
|
||||||
|
|
||||||
|
while (delta != zero<UInt64>()) {
|
||||||
|
auto mask = neq(delta & UInt64(1), zero<UInt64>());
|
||||||
|
acc_mult = select(mask, acc_mult * cur_mult, acc_mult);
|
||||||
|
acc_plus = select(mask, acc_plus * cur_mult + cur_plus, acc_plus);
|
||||||
|
cur_plus = (cur_mult + UInt64(1)) * cur_plus;
|
||||||
|
cur_mult *= cur_mult;
|
||||||
|
delta = sr<1>(delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
state = acc_mult * state + acc_plus;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute the distance between two PCG32 pseudorandom number generators
|
||||||
|
Int64 operator-(const PCG32 &other) const {
|
||||||
|
assert(inc == other.inc);
|
||||||
|
|
||||||
|
UInt64 cur_mult = PCG32_MULT,
|
||||||
|
cur_plus = inc,
|
||||||
|
cur_state = other.state,
|
||||||
|
the_bit = 1ull,
|
||||||
|
distance = 0ull;
|
||||||
|
|
||||||
|
while (state != cur_state) {
|
||||||
|
auto mask = neq(state & the_bit, cur_state & the_bit);
|
||||||
|
cur_state = select(mask, cur_state * cur_mult + cur_plus, cur_state);
|
||||||
|
distance = select(mask, distance | the_bit, distance);
|
||||||
|
assert((state & the_bit) == (cur_state & the_bit));
|
||||||
|
the_bit = sl<1>(the_bit);
|
||||||
|
cur_plus = (cur_mult + UInt64(1)) * cur_plus;
|
||||||
|
cur_mult *= cur_mult;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Int64(distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Draw uniformly distributed permutation and permute the
|
||||||
|
* given container
|
||||||
|
*
|
||||||
|
* From: Knuth, TAoCP Vol. 2 (3rd 3d), Section 3.4.2
|
||||||
|
*/
|
||||||
|
template <typename Iterator, typename T2 = T,
|
||||||
|
enable_if_t<is_scalar_v<T2>> = 0>
|
||||||
|
void shuffle(Iterator begin, Iterator end) {
|
||||||
|
for (Iterator it = end - 1; it > begin; --it)
|
||||||
|
std::iter_swap(it, begin + next_uint32_bounded((uint32_t) (it - begin + 1)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Equality operator
|
||||||
|
bool operator==(const PCG32 &other) const { return state == other.state && inc == other.inc; }
|
||||||
|
|
||||||
|
/// Inequality operator
|
||||||
|
bool operator!=(const PCG32 &other) const { return state != other.state || inc != other.inc; }
|
||||||
|
|
||||||
|
UInt64 state; // RNG state. All values are possible.
|
||||||
|
UInt64 inc; // Controls which RNG sequence (stream) is selected. Must *always* be odd.
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,843 @@
|
||||||
|
/*
|
||||||
|
enoki/matrix.h -- Real spherical harmonics evaluation routines
|
||||||
|
|
||||||
|
The generated code is based on the paper `Efficient Spherical Harmonic
|
||||||
|
Evaluation, Journal of Computer Graphics Techniques (JCGT), vol. 2, no. 2,
|
||||||
|
84-90, 2013 by Peter-Pike Sloan
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "array.h"
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
template <typename Array>
|
||||||
|
void sh_eval(const Array &d, size_t order, value_t<expr_t<Array>> *out) {
|
||||||
|
switch (order) {
|
||||||
|
case 0: sh_eval_0(d, out); break;
|
||||||
|
case 1: sh_eval_1(d, out); break;
|
||||||
|
case 2: sh_eval_2(d, out); break;
|
||||||
|
case 3: sh_eval_3(d, out); break;
|
||||||
|
case 4: sh_eval_4(d, out); break;
|
||||||
|
case 5: sh_eval_5(d, out); break;
|
||||||
|
case 6: sh_eval_6(d, out); break;
|
||||||
|
case 7: sh_eval_7(d, out); break;
|
||||||
|
case 8: sh_eval_8(d, out); break;
|
||||||
|
case 9: sh_eval_9(d, out); break;
|
||||||
|
default: throw std::runtime_error("sh_eval(): order too high!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Array>
|
||||||
|
void sh_eval_0(const Array &, value_t<expr_t<Array>> *out) {
|
||||||
|
static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");
|
||||||
|
|
||||||
|
using Value = value_t<expr_t<Array>>;
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
|
||||||
|
store(out + 0, Value(Scalar(0.28209479177387814)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Array>
|
||||||
|
void sh_eval_1(const Array &d, value_t<expr_t<Array>> *out) {
|
||||||
|
static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");
|
||||||
|
|
||||||
|
using Value = value_t<expr_t<Array>>;
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
|
||||||
|
Value x = d.x(), y = d.y(), z = d.z();
|
||||||
|
Value c0, s0, tmp_a;
|
||||||
|
|
||||||
|
store(out + 0, Value(Scalar(0.28209479177387814)));
|
||||||
|
store(out + 2, z * Scalar(0.488602511902919923));
|
||||||
|
c0 = x;
|
||||||
|
s0 = y;
|
||||||
|
|
||||||
|
tmp_a = Scalar(-0.488602511902919978);
|
||||||
|
store(out + 3, tmp_a * c0);
|
||||||
|
store(out + 1, tmp_a * s0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Array>
|
||||||
|
void sh_eval_2(const Array &d, value_t<expr_t<Array>> *out) {
|
||||||
|
static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");
|
||||||
|
|
||||||
|
using Value = value_t<expr_t<Array>>;
|
||||||
|
using Scalar = scalar_t<Value>;
|
||||||
|
|
||||||
|
Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
|
||||||
|
Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;
|
||||||
|
|
||||||
|
store(out + 0, Value(Scalar(0.28209479177387814)));
|
||||||
|
store(out + 2, z * Scalar(0.488602511902919923));
|
||||||
|
store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
|
||||||
|
c0 = x;
|
||||||
|
s0 = y;
|
||||||
|
|
||||||
|
tmp_a = Scalar(-0.488602511902919978);
|
||||||
|
store(out + 3, tmp_a * c0);
|
||||||
|
store(out + 1, tmp_a * s0);
|
||||||
|
tmp_b = z * Scalar(-1.09254843059207896);
|
||||||
|
store(out + 7, tmp_b * c0);
|
||||||
|
store(out + 5, tmp_b * s0);
|
||||||
|
c1 = fmsub(x, c0, y * s0);
|
||||||
|
s1 = fmadd(x, s0, y * c0);
|
||||||
|
|
||||||
|
tmp_c = Scalar(0.546274215296039478);
|
||||||
|
store(out + 8, tmp_c * c1);
|
||||||
|
store(out + 4, tmp_c * s1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 3 at direction 'd', writing the 16 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms.
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_3(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..3, stored at out[l*(l+1)].
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..3.
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..3.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for band l = 3.
    tmp_c = Scalar(-0.590043589926643519);
    store(out + 15, tmp_c * c0);
    store(out + 9, tmp_c * s0);
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 4 at direction 'd', writing the 25 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms. 'out' must be writable: the zonal recurrence
/// reads back previously stored entries (load of out+6 / out+12 below).
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_4(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..4, stored at out[l*(l+1)].
    // l = 4 is obtained by a three-term recurrence on the l = 2, 3 entries.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..4.
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..4.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for bands l = 3..4.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    // Advance to m = 4.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 4 terms for band l = 4.
    tmp_c = Scalar(0.625835735449176256);
    store(out + 24, tmp_c * c1);
    store(out + 16, tmp_c * s1);
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 5 at direction 'd', writing the 36 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms. 'out' must be writable: the zonal recurrence
/// reads back previously stored entries.
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_5(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..5, stored at out[l*(l+1)];
    // l >= 4 entries come from a three-term recurrence on the two previous bands.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    store(out + 30, fmadd(z * Scalar(1.98997487421323993), load<Value>(out + 20), load<Value>(out + 12) * Scalar(-1.00285307284481395)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..5 (higher l via the same recurrence
    // carried in tmp_a/tmp_b/tmp_c).
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.03100960115899021), tmp_a, tmp_c * Scalar(-0.991031208965114985));
    store(out + 31, tmp_b * c0);
    store(out + 29, tmp_b * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..5.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(7.19030517745998665), Scalar(-2.39676839248666207));
    store(out + 32, tmp_a * c1);
    store(out + 28, tmp_a * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for bands l = 3..5.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-4.40314469491725369), Scalar(0.48923829943525049));
    store(out + 33, tmp_c * c0);
    store(out + 27, tmp_c * s0);
    // Advance to m = 4.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 4 terms for bands l = 4..5.
    tmp_a = Scalar(0.625835735449176256);
    store(out + 24, tmp_a * c1);
    store(out + 16, tmp_a * s1);
    tmp_b = z * Scalar(2.07566231488104114);
    store(out + 34, tmp_b * c1);
    store(out + 26, tmp_b * s1);
    // Advance to m = 5.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 5 terms for band l = 5.
    tmp_c = Scalar(-0.656382056840170258);
    store(out + 35, tmp_c * c0);
    store(out + 25, tmp_c * s0);
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 6 at direction 'd', writing the 49 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms. 'out' must be writable: the zonal recurrence
/// reads back previously stored entries.
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_6(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..6, stored at out[l*(l+1)];
    // l >= 4 entries come from a three-term recurrence on the two previous bands.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    store(out + 30, fmadd(z * Scalar(1.98997487421323993), load<Value>(out + 20), load<Value>(out + 12) * Scalar(-1.00285307284481395)));
    store(out + 42, fmadd(z * Scalar(1.99304345718356646), load<Value>(out + 30), load<Value>(out + 20) * Scalar(-1.00154202096221923)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..6 (higher l via the recurrence
    // carried in tmp_a/tmp_b/tmp_c).
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.03100960115899021), tmp_a, tmp_c * Scalar(-0.991031208965114985));
    store(out + 31, tmp_b * c0);
    store(out + 29, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.02131498923702768), tmp_b, tmp_a * Scalar(-0.995226703056238504));
    store(out + 43, tmp_c * c0);
    store(out + 41, tmp_c * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..6.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(7.19030517745998665), Scalar(-2.39676839248666207));
    store(out + 32, tmp_a * c1);
    store(out + 28, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.11394181566096995), tmp_a, tmp_c * Scalar(-0.973610120462326756));
    store(out + 44, tmp_b * c1);
    store(out + 40, tmp_b * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for bands l = 3..6.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-4.40314469491725369), Scalar(0.48923829943525049));
    store(out + 33, tmp_c * c0);
    store(out + 27, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-10.1332578546641603), Scalar(2.76361577854477058));
    store(out + 45, tmp_a * c0);
    store(out + 39, tmp_a * s0);
    // Advance to m = 4.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 4 terms for bands l = 4..6.
    tmp_a = Scalar(0.625835735449176256);
    store(out + 24, tmp_a * c1);
    store(out + 16, tmp_a * s1);
    tmp_b = z * Scalar(2.07566231488104114);
    store(out + 34, tmp_b * c1);
    store(out + 26, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(5.55021390801596581), Scalar(-0.504564900728724064));
    store(out + 46, tmp_c * c1);
    store(out + 38, tmp_c * s1);
    // Advance to m = 5.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 5 terms for bands l = 5..6.
    tmp_a = Scalar(-0.656382056840170258);
    store(out + 35, tmp_a * c0);
    store(out + 25, tmp_a * s0);
    tmp_b = z * Scalar(-2.3666191622317525);
    store(out + 47, tmp_b * c0);
    store(out + 37, tmp_b * s0);
    // Advance to m = 6.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 6 terms for band l = 6.
    tmp_c = Scalar(0.683184105191914415);
    store(out + 48, tmp_c * c1);
    store(out + 36, tmp_c * s1);
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 7 at direction 'd', writing the 64 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms. 'out' must be writable: the zonal recurrence
/// reads back previously stored entries.
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_7(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..7, stored at out[l*(l+1)];
    // l >= 4 entries come from a three-term recurrence on the two previous bands.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    store(out + 30, fmadd(z * Scalar(1.98997487421323993), load<Value>(out + 20), load<Value>(out + 12) * Scalar(-1.00285307284481395)));
    store(out + 42, fmadd(z * Scalar(1.99304345718356646), load<Value>(out + 30), load<Value>(out + 20) * Scalar(-1.00154202096221923)));
    store(out + 56, fmadd(z * Scalar(1.99489143482413467), load<Value>(out + 42), load<Value>(out + 30) * Scalar(-1.00092721392195827)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..7 (higher l via the recurrence
    // carried in tmp_a/tmp_b/tmp_c).
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.03100960115899021), tmp_a, tmp_c * Scalar(-0.991031208965114985));
    store(out + 31, tmp_b * c0);
    store(out + 29, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.02131498923702768), tmp_b, tmp_a * Scalar(-0.995226703056238504));
    store(out + 43, tmp_c * c0);
    store(out + 41, tmp_c * s0);
    tmp_a = fmadd(z * Scalar(2.01556443707463773), tmp_c, tmp_b * Scalar(-0.99715504402183186));
    store(out + 57, tmp_a * c0);
    store(out + 55, tmp_a * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..7.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(7.19030517745998665), Scalar(-2.39676839248666207));
    store(out + 32, tmp_a * c1);
    store(out + 28, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.11394181566096995), tmp_a, tmp_c * Scalar(-0.973610120462326756));
    store(out + 44, tmp_b * c1);
    store(out + 40, tmp_b * s1);
    tmp_c = fmadd(z * Scalar(2.08166599946613307), tmp_b, tmp_a * Scalar(-0.984731927834661791));
    store(out + 58, tmp_c * c1);
    store(out + 54, tmp_c * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for bands l = 3..7.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-4.40314469491725369), Scalar(0.48923829943525049));
    store(out + 33, tmp_c * c0);
    store(out + 27, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-10.1332578546641603), Scalar(2.76361577854477058));
    store(out + 45, tmp_a * c0);
    store(out + 39, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.20794021658196149), tmp_a, tmp_c * Scalar(-0.95940322360024699));
    store(out + 59, tmp_b * c0);
    store(out + 53, tmp_b * s0);
    // Advance to m = 4.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 4 terms for bands l = 4..7.
    tmp_a = Scalar(0.625835735449176256);
    store(out + 24, tmp_a * c1);
    store(out + 16, tmp_a * s1);
    tmp_b = z * Scalar(2.07566231488104114);
    store(out + 34, tmp_b * c1);
    store(out + 26, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(5.55021390801596581), Scalar(-0.504564900728724064));
    store(out + 46, tmp_c * c1);
    store(out + 38, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(13.4918050467267694), Scalar(-3.11349347232156193));
    store(out + 60, tmp_a * c1);
    store(out + 52, tmp_a * s1);
    // Advance to m = 5.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 5 terms for bands l = 5..7.
    tmp_a = Scalar(-0.656382056840170258);
    store(out + 35, tmp_a * c0);
    store(out + 25, tmp_a * s0);
    tmp_b = z * Scalar(-2.3666191622317525);
    store(out + 47, tmp_b * c0);
    store(out + 37, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-6.7459025233633847), Scalar(0.518915578720260395));
    store(out + 61, tmp_c * c0);
    store(out + 51, tmp_c * s0);
    // Advance to m = 6.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 6 terms for bands l = 6..7.
    tmp_a = Scalar(0.683184105191914415);
    store(out + 48, tmp_a * c1);
    store(out + 36, tmp_a * s1);
    tmp_b = z * Scalar(2.64596066180190048);
    store(out + 62, tmp_b * c1);
    store(out + 50, tmp_b * s1);
    // Advance to m = 7.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 7 terms for band l = 7.
    tmp_c = Scalar(-0.707162732524596271);
    store(out + 63, tmp_c * c0);
    store(out + 49, tmp_c * s0);
}
|
||||||
|
|
||||||
|
/// Evaluates the real spherical harmonics basis up to and including band
/// l = 8 at direction 'd', writing the 81 coefficients to 'out' in the layout
/// out[l*(l+1) + m] (m = -l..l); positive m receive the cosine terms,
/// negative m the sine terms. 'out' must be writable: the zonal recurrence
/// reads back previously stored entries.
/// NOTE(review): 'd' is presumably expected to be a unit vector — not checked here.
template <typename Array>
void sh_eval_8(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    // (c0, s0) / (c1, s1) ping-pong the azimuthal cos/sin recurrence;
    // tmp_a/b/c hold the z-dependent associated-Legendre factors.
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal terms (m == 0) for bands l = 0..8, stored at out[l*(l+1)];
    // l >= 4 entries come from a three-term recurrence on the two previous bands.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    store(out + 30, fmadd(z * Scalar(1.98997487421323993), load<Value>(out + 20), load<Value>(out + 12) * Scalar(-1.00285307284481395)));
    store(out + 42, fmadd(z * Scalar(1.99304345718356646), load<Value>(out + 30), load<Value>(out + 20) * Scalar(-1.00154202096221923)));
    store(out + 56, fmadd(z * Scalar(1.99489143482413467), load<Value>(out + 42), load<Value>(out + 30) * Scalar(-1.00092721392195827)));
    store(out + 72, fmadd(z * Scalar(1.9960899278339137), load<Value>(out + 56), load<Value>(out + 42) * Scalar(-1.00060078106951478)));
    // Seed the azimuthal recurrence at |m| == 1.
    c0 = x;
    s0 = y;

    // |m| == 1 terms for bands l = 1..8 (higher l via the recurrence
    // carried in tmp_a/tmp_b/tmp_c).
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.03100960115899021), tmp_a, tmp_c * Scalar(-0.991031208965114985));
    store(out + 31, tmp_b * c0);
    store(out + 29, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.02131498923702768), tmp_b, tmp_a * Scalar(-0.995226703056238504));
    store(out + 43, tmp_c * c0);
    store(out + 41, tmp_c * s0);
    tmp_a = fmadd(z * Scalar(2.01556443707463773), tmp_c, tmp_b * Scalar(-0.99715504402183186));
    store(out + 57, tmp_a * c0);
    store(out + 55, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.01186954040739119), tmp_a, tmp_c * Scalar(-0.998166817890174474));
    store(out + 73, tmp_b * c0);
    store(out + 71, tmp_b * s0);
    // Advance cos/sin(m*phi) from m = 1 to m = 2 (angle-addition identities).
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 2 terms for bands l = 2..8.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(7.19030517745998665), Scalar(-2.39676839248666207));
    store(out + 32, tmp_a * c1);
    store(out + 28, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.11394181566096995), tmp_a, tmp_c * Scalar(-0.973610120462326756));
    store(out + 44, tmp_b * c1);
    store(out + 40, tmp_b * s1);
    tmp_c = fmadd(z * Scalar(2.08166599946613307), tmp_b, tmp_a * Scalar(-0.984731927834661791));
    store(out + 58, tmp_c * c1);
    store(out + 54, tmp_c * s1);
    tmp_a = fmadd(z * Scalar(2.06155281280883029), tmp_c, tmp_b * Scalar(-0.990337937660287326));
    store(out + 74, tmp_a * c1);
    store(out + 70, tmp_a * s1);
    // Advance to m = 3.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 3 terms for bands l = 3..8.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-4.40314469491725369), Scalar(0.48923829943525049));
    store(out + 33, tmp_c * c0);
    store(out + 27, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-10.1332578546641603), Scalar(2.76361577854477058));
    store(out + 45, tmp_a * c0);
    store(out + 39, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.20794021658196149), tmp_a, tmp_c * Scalar(-0.95940322360024699));
    store(out + 59, tmp_b * c0);
    store(out + 53, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.15322168769582012), tmp_b, tmp_a * Scalar(-0.975217386560017774));
    store(out + 75, tmp_c * c0);
    store(out + 69, tmp_c * s0);
    // Advance to m = 4.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 4 terms for bands l = 4..8.
    tmp_a = Scalar(0.625835735449176256);
    store(out + 24, tmp_a * c1);
    store(out + 16, tmp_a * s1);
    tmp_b = z * Scalar(2.07566231488104114);
    store(out + 34, tmp_b * c1);
    store(out + 26, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(5.55021390801596581), Scalar(-0.504564900728724064));
    store(out + 46, tmp_c * c1);
    store(out + 38, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(13.4918050467267694), Scalar(-3.11349347232156193));
    store(out + 60, tmp_a * c1);
    store(out + 52, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.30488611432322132), tmp_a, tmp_c * Scalar(-0.948176387355465389));
    store(out + 76, tmp_b * c1);
    store(out + 68, tmp_b * s1);
    // Advance to m = 5.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 5 terms for bands l = 5..8.
    tmp_a = Scalar(-0.656382056840170258);
    store(out + 35, tmp_a * c0);
    store(out + 25, tmp_a * s0);
    tmp_b = z * Scalar(-2.3666191622317525);
    store(out + 47, tmp_b * c0);
    store(out + 37, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-6.7459025233633847), Scalar(0.518915578720260395));
    store(out + 61, tmp_c * c0);
    store(out + 51, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-17.2495531104905417), Scalar(3.44991062209810817));
    store(out + 77, tmp_a * c0);
    store(out + 67, tmp_a * s0);
    // Advance to m = 6.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 6 terms for bands l = 6..8.
    tmp_a = Scalar(0.683184105191914415);
    store(out + 48, tmp_a * c1);
    store(out + 36, tmp_a * s1);
    tmp_b = z * Scalar(2.64596066180190048);
    store(out + 62, tmp_b * c1);
    store(out + 50, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(7.98499149089313942), Scalar(-0.532332766059542606));
    store(out + 78, tmp_c * c1);
    store(out + 66, tmp_c * s1);
    // Advance to m = 7.
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| == 7 terms for bands l = 7..8.
    tmp_a = Scalar(-0.707162732524596271);
    store(out + 63, tmp_a * c0);
    store(out + 49, tmp_a * s0);
    tmp_b = z * Scalar(-2.91570664069931995);
    store(out + 79, tmp_b * c0);
    store(out + 65, tmp_b * s0);
    // Advance to m = 8.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| == 8 terms for band l = 8.
    tmp_c = Scalar(0.728926660174829988);
    store(out + 80, tmp_c * c1);
    store(out + 64, tmp_c * s1);
}
|
||||||
|
|
||||||
|
/// Evaluates all real spherical-harmonics bands l = 0..9 for the direction
/// 'd' and writes the (9+1)^2 = 100 coefficients to 'out', using the linear
/// index i = l*(l+1) + m (so out[l*(l+1)] is the zonal m = 0 term of band l).
///
/// The body appears machine-generated: the zonal terms are produced by an
/// upward recurrence in z, and the m != 0 terms reuse the multiple-angle
/// recurrences c_{m+1} = x*c_m - y*s_m, s_{m+1} = x*s_m + y*c_m starting
/// from (c, s) = (x, y), alternating between the (c0, s0) and (c1, s1)
/// register pairs.
///
/// NOTE(review): presumably assumes 'd' is a unit vector -- TODO confirm
/// against callers; nothing in this function normalizes it.
template <typename Array>
void sh_eval_9(const Array &d, value_t<expr_t<Array>> *out) {
    static_assert(array_size_v<Array> == 3, "The parameter 'd' should be a 3D vector.");

    using Value = value_t<expr_t<Array>>;
    using Scalar = scalar_t<Value>;

    Value x = d.x(), y = d.y(), z = d.z(), z2 = z * z;
    Value c0, c1, s0, s1, tmp_a, tmp_b, tmp_c;

    // Zonal (m = 0) terms for l = 0..9: the first four are explicit
    // polynomials in z, the rest use a two-term upward recurrence that
    // reads the previously stored l-1 and l-2 zonal values back.
    store(out + 0, Value(Scalar(0.28209479177387814)));
    store(out + 2, z * Scalar(0.488602511902919923));
    store(out + 6, fmadd(z2, Scalar(0.94617469575756008), Scalar(-0.315391565252520045)));
    store(out + 12, z * fmadd(z2, Scalar(1.865881662950577), Scalar(-1.1195289977703462)));
    store(out + 20, fmadd(z * Scalar(1.98431348329844304), load<Value>(out + 12), load<Value>(out + 6) * Scalar(-1.00623058987490532)));
    store(out + 30, fmadd(z * Scalar(1.98997487421323993), load<Value>(out + 20), load<Value>(out + 12) * Scalar(-1.00285307284481395)));
    store(out + 42, fmadd(z * Scalar(1.99304345718356646), load<Value>(out + 30), load<Value>(out + 20) * Scalar(-1.00154202096221923)));
    store(out + 56, fmadd(z * Scalar(1.99489143482413467), load<Value>(out + 42), load<Value>(out + 30) * Scalar(-1.00092721392195827)));
    store(out + 72, fmadd(z * Scalar(1.9960899278339137), load<Value>(out + 56), load<Value>(out + 42) * Scalar(-1.00060078106951478)));
    store(out + 90, fmadd(z * Scalar(1.99691119506793657), load<Value>(out + 72), load<Value>(out + 56) * Scalar(-1.0004114379931337)));
    c0 = x;
    s0 = y;

    // |m| = 1 terms for l = 1..9 (tmp_* hold the z-dependent factors,
    // which multiply cos/sin of the azimuthal angle held in c0/s0).
    tmp_a = Scalar(-0.488602511902919978);
    store(out + 3, tmp_a * c0);
    store(out + 1, tmp_a * s0);
    tmp_b = z * Scalar(-1.09254843059207896);
    store(out + 7, tmp_b * c0);
    store(out + 5, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-2.28522899732232876), Scalar(0.457045799464465774));
    store(out + 13, tmp_c * c0);
    store(out + 11, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-4.6833258049010249), Scalar(2.00713963067186763));
    store(out + 21, tmp_a * c0);
    store(out + 19, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.03100960115899021), tmp_a, tmp_c * Scalar(-0.991031208965114985));
    store(out + 31, tmp_b * c0);
    store(out + 29, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.02131498923702768), tmp_b, tmp_a * Scalar(-0.995226703056238504));
    store(out + 43, tmp_c * c0);
    store(out + 41, tmp_c * s0);
    tmp_a = fmadd(z * Scalar(2.01556443707463773), tmp_c, tmp_b * Scalar(-0.99715504402183186));
    store(out + 57, tmp_a * c0);
    store(out + 55, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.01186954040739119), tmp_a, tmp_c * Scalar(-0.998166817890174474));
    store(out + 73, tmp_b * c0);
    store(out + 71, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.00935312974101166), tmp_b, tmp_a * Scalar(-0.998749217771908837));
    store(out + 91, tmp_c * c0);
    store(out + 89, tmp_c * s0);
    // Advance azimuthal recurrence: (c1, s1) = cos/sin of (m+1) * azimuth.
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| = 2 terms for l = 2..9.
    tmp_a = Scalar(0.546274215296039478);
    store(out + 8, tmp_a * c1);
    store(out + 4, tmp_a * s1);
    tmp_b = z * Scalar(1.44530572132027735);
    store(out + 14, tmp_b * c1);
    store(out + 10, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(3.31161143515146028), Scalar(-0.473087347878779985));
    store(out + 22, tmp_c * c1);
    store(out + 18, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(7.19030517745998665), Scalar(-2.39676839248666207));
    store(out + 32, tmp_a * c1);
    store(out + 28, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.11394181566096995), tmp_a, tmp_c * Scalar(-0.973610120462326756));
    store(out + 44, tmp_b * c1);
    store(out + 40, tmp_b * s1);
    tmp_c = fmadd(z * Scalar(2.08166599946613307), tmp_b, tmp_a * Scalar(-0.984731927834661791));
    store(out + 58, tmp_c * c1);
    store(out + 54, tmp_c * s1);
    tmp_a = fmadd(z * Scalar(2.06155281280883029), tmp_c, tmp_b * Scalar(-0.990337937660287326));
    store(out + 74, tmp_a * c1);
    store(out + 70, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.04812235835781919), tmp_a, tmp_c * Scalar(-0.993485272670404207));
    store(out + 92, tmp_b * c1);
    store(out + 88, tmp_b * s1);
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| = 3 terms for l = 3..9.
    tmp_a = Scalar(-0.590043589926643519);
    store(out + 15, tmp_a * c0);
    store(out + 9, tmp_a * s0);
    tmp_b = z * Scalar(-1.77013076977993067);
    store(out + 23, tmp_b * c0);
    store(out + 17, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-4.40314469491725369), Scalar(0.48923829943525049));
    store(out + 33, tmp_c * c0);
    store(out + 27, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-10.1332578546641603), Scalar(2.76361577854477058));
    store(out + 45, tmp_a * c0);
    store(out + 39, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.20794021658196149), tmp_a, tmp_c * Scalar(-0.95940322360024699));
    store(out + 59, tmp_b * c0);
    store(out + 53, tmp_b * s0);
    tmp_c = fmadd(z * Scalar(2.15322168769582012), tmp_b, tmp_a * Scalar(-0.975217386560017774));
    store(out + 75, tmp_c * c0);
    store(out + 69, tmp_c * s0);
    tmp_a = fmadd(z * Scalar(2.11804417118980526), tmp_c, tmp_b * Scalar(-0.983662844979209416));
    store(out + 93, tmp_a * c0);
    store(out + 87, tmp_a * s0);
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| = 4 terms for l = 4..9.
    tmp_a = Scalar(0.625835735449176256);
    store(out + 24, tmp_a * c1);
    store(out + 16, tmp_a * s1);
    tmp_b = z * Scalar(2.07566231488104114);
    store(out + 34, tmp_b * c1);
    store(out + 26, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(5.55021390801596581), Scalar(-0.504564900728724064));
    store(out + 46, tmp_c * c1);
    store(out + 38, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(13.4918050467267694), Scalar(-3.11349347232156193));
    store(out + 60, tmp_a * c1);
    store(out + 52, tmp_a * s1);
    tmp_b = fmadd(z * Scalar(2.30488611432322132), tmp_a, tmp_c * Scalar(-0.948176387355465389));
    store(out + 76, tmp_b * c1);
    store(out + 68, tmp_b * s1);
    tmp_c = fmadd(z * Scalar(2.22917715070623501), tmp_b, tmp_a * Scalar(-0.967152839723182112));
    store(out + 94, tmp_c * c1);
    store(out + 86, tmp_c * s1);
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| = 5 terms for l = 5..9.
    tmp_a = Scalar(-0.656382056840170258);
    store(out + 35, tmp_a * c0);
    store(out + 25, tmp_a * s0);
    tmp_b = z * Scalar(-2.3666191622317525);
    store(out + 47, tmp_b * c0);
    store(out + 37, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-6.7459025233633847), Scalar(0.518915578720260395));
    store(out + 61, tmp_c * c0);
    store(out + 51, tmp_c * s0);
    tmp_a = z * fmadd(z2, Scalar(-17.2495531104905417), Scalar(3.44991062209810817));
    store(out + 77, tmp_a * c0);
    store(out + 67, tmp_a * s0);
    tmp_b = fmadd(z * Scalar(2.40163634692206163), tmp_a, tmp_c * Scalar(-0.939224604204370817));
    store(out + 95, tmp_b * c0);
    store(out + 85, tmp_b * s0);
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| = 6 terms for l = 6..9.
    tmp_a = Scalar(0.683184105191914415);
    store(out + 48, tmp_a * c1);
    store(out + 36, tmp_a * s1);
    tmp_b = z * Scalar(2.64596066180190048);
    store(out + 62, tmp_b * c1);
    store(out + 50, tmp_b * s1);
    tmp_c = fmadd(z2, Scalar(7.98499149089313942), Scalar(-0.532332766059542606));
    store(out + 78, tmp_c * c1);
    store(out + 66, tmp_c * s1);
    tmp_a = z * fmadd(z2, Scalar(21.3928901909086377), Scalar(-3.77521591604270101));
    store(out + 96, tmp_a * c1);
    store(out + 84, tmp_a * s1);
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| = 7 terms for l = 7..9.
    tmp_a = Scalar(-0.707162732524596271);
    store(out + 63, tmp_a * c0);
    store(out + 49, tmp_a * s0);
    tmp_b = z * Scalar(-2.91570664069931995);
    store(out + 79, tmp_b * c0);
    store(out + 65, tmp_b * s0);
    tmp_c = fmadd(z2, Scalar(-9.26339318284890467), Scalar(0.544905481344053255));
    store(out + 97, tmp_c * c0);
    store(out + 83, tmp_c * s0);
    c1 = fmsub(x, c0, y * s0);
    s1 = fmadd(x, s0, y * c0);

    // |m| = 8 terms for l = 8..9.
    tmp_a = Scalar(0.728926660174829988);
    store(out + 80, tmp_a * c1);
    store(out + 64, tmp_a * s1);
    tmp_b = z * Scalar(3.17731764895469793);
    store(out + 98, tmp_b * c1);
    store(out + 82, tmp_b * s1);
    c0 = fmsub(x, c1, y * s1);
    s0 = fmadd(x, s1, y * c1);

    // |m| = 9 terms (l = 9 only).
    tmp_c = Scalar(-0.74890095185318839);
    store(out + 99, tmp_c * c0);
    store(out + 81, tmp_c * s0);
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,675 @@
|
||||||
|
/*
|
||||||
|
enoki/special.h -- Special functions: Bessel functions, Elliptic
|
||||||
|
and exponential integrals, etc. (still incomplete)
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Evaluates a series of Chebyshev polynomials at argument x/2.
|
||||||
|
template <typename T, typename T2, size_t Size,
|
||||||
|
typename Expr = expr_t<T>> Expr chbevl(const T &x, T2 (&coeffs)[Size]) {
|
||||||
|
using Scalar = scalar_t<Expr>;
|
||||||
|
|
||||||
|
Expr b0 = Scalar(coeffs[0]);
|
||||||
|
Expr b1 = Scalar(0);
|
||||||
|
Expr b2;
|
||||||
|
|
||||||
|
ENOKI_UNROLL for (size_t i = 0; i < Size; ++i) {
|
||||||
|
b2 = b1;
|
||||||
|
b1 = b0;
|
||||||
|
b0 = fmsub(x, b1, b2 - Scalar(coeffs[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return (b0 - b2) * Scalar(0.5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Error function, scalar fallback: defers to the C++ standard library
/// for non-array argument types.
template <typename T, enable_if_not_array_t<T> = 0> T erf(const T &x) {
    return std::erf(x);
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Complementary error function, scalar fallback: defers to the C++
/// standard library for non-array argument types.
template <typename T, enable_if_not_array_t<T> = 0> T erfc(const T &x) {
    return std::erfc(x);
}
|
||||||
|
|
||||||
|
// Forward declarations of the vectorized erf()/erfc() overloads defined
// below. The 'Recurse' flag lets each function fall back to the other
// exactly once (for the argument range where the sibling expansion is
// more accurate) without entering infinite mutual recursion.
template <typename T, bool Recurse = true, typename Expr = expr_t<T>,
          enable_if_array_t<T> = 0>
Expr erfc(const T &x);

template <typename T, bool Recurse = true, typename Expr = expr_t<T>,
          enable_if_array_t<T> = 0>
Expr erf(const T &x);
|
||||||
|
|
||||||
|
/// Vectorized complementary error function. The polynomial/rational
/// coefficients follow the Cephes math library conventions (single- and
/// double-precision fits selected at compile time).
///
/// For lanes with |x| < 1 the result is recomputed as 1 - erf(x) when
/// 'Recurse' is set, since the erf() series is more accurate there;
/// erf() symmetrically falls back to this function with Recurse = false.
template <typename T, bool Recurse, typename Expr, enable_if_array_t<T>>
Expr erfc(const T &x) {
    constexpr bool Single = std::is_same_v<scalar_t<T>, float>;
    using Scalar = scalar_t<T>;

    Expr r;
    Expr xa = abs(x),
         z = exp(-x*x);

    // Split between the "small" and "large" argument fits at |x| = 2
    // (float) / 8 (double); erf_mask lanes are patched up at the end.
    auto erf_mask = xa < Scalar(1),
         large_mask = xa > Scalar(Single ? 2 : 8);

    ENOKI_MARK_USED(erf_mask);

    if constexpr (Single) {
        // Single precision: polynomials in y = 1/x^2, scaled by exp(-x^2)/x.
        // Each branch is evaluated only if some lane needs it (CUDA arrays
        // always evaluate both sides of a select).
        Expr q = rcp(xa),
             y = q*q, p_small, p_large;

        if (is_cuda_array_v<Expr> || !all_nested(large_mask))
            p_small = poly8(y, 5.638259427386472e-1, -2.741127028184656e-1,
                               3.404879937665872e-1, -4.944515323274145e-1,
                               6.210004621745983e-1, -5.824733027278666e-1,
                               3.687424674597105e-1, -1.387039388740657e-1,
                               2.326819970068386e-2);

        if (is_cuda_array_v<Expr> || any_nested(large_mask))
            p_large = poly7(y, 5.641895067754075e-1, -2.820767439740514e-1,
                               4.218463358204948e-1, -1.015265279202700e+0,
                               2.921019019210786e+0, -7.495518717768503e+0,
                               1.297719955372516e+1, -1.047766399936249e+1);
        r = z * q * select(large_mask, p_large, p_small);
    } else {
        // Double precision: rational fits P(|x|)/Q(|x|) per region,
        // scaled by exp(-x^2).
        Expr p_small, p_large, q_small, q_large;

        if (is_cuda_array_v<Expr> || !all_nested(large_mask)) {
            p_small = poly8(xa, 5.57535335369399327526e2, 1.02755188689515710272e3,
                                9.34528527171957607540e2, 5.26445194995477358631e2,
                                1.96520832956077098242e2, 4.86371970985681366614e1,
                                7.46321056442269912687e0, 5.64189564831068821977e-1,
                                2.46196981473530512524e-10);

            q_small = poly8(xa, 5.57535340817727675546e2, 1.65666309194161350182e3,
                                2.24633760818710981792e3, 1.82390916687909736289e3,
                                9.75708501743205489753e2, 3.54937778887819891062e2,
                                8.67072140885989742329e1, 1.32281951154744992508e1,
                                1.00000000000000000000e0);
        }

        if (is_cuda_array_v<Expr> || any_nested(large_mask)) {
            p_large = poly5(xa, 2.97886665372100240670e0, 7.40974269950448939160e0,
                                6.16021097993053585195e0, 5.01905042251180477414e0,
                                1.27536670759978104416e0, 5.64189583547755073984e-1);

            q_large = poly6(xa, 3.36907645100081516050e0, 9.60896809063285878198e0,
                                1.70814450747565897222e1, 1.20489539808096656605e1,
                                9.39603524938001434673e0, 2.26052863220117276590e0,
                                1.00000000000000000000e0);
        }

        r = (z * select(large_mask, p_large, p_small)) /
            select(large_mask, q_large, q_small);

        // If exp(-x^2) underflowed to zero, force the result to 0
        // instead of propagating 0/0 = NaN.
        r &= neq(z, zero<Expr>());
    }

    // Reflection formula: erfc(-x) = 2 - erfc(x).
    r[x < Scalar(0)] = Scalar(2) - r;

    if constexpr (Recurse) {
        if (ENOKI_UNLIKELY(is_cuda_array_v<Expr> || any_nested(erf_mask)))
            r[erf_mask] = Scalar(1) - erf<T, false>(x);
    }
    return r;
}
|
||||||
|
|
||||||
|
/// Vectorized error function. Accurate for |x| <= 1; lanes with |x| > 1
/// are recomputed as 1 - erfc(x) when 'Recurse' is set (erfc() passes
/// Recurse = false back here to avoid infinite mutual recursion).
/// Coefficients follow the Cephes math library fits.
template <typename T, bool Recurse, typename Expr, enable_if_array_t<T>>
Expr erf(const T &x) {
    using Scalar = scalar_t<T>;

    Expr r;
    auto erfc_mask = abs(x) > Scalar(1);
    ENOKI_MARK_USED(erfc_mask);

    // Both fits are functions of x^2, multiplied by x at the end so the
    // result is odd, matching erf(-x) = -erf(x).
    Expr z = x * x;

    constexpr bool Single = std::is_same_v<scalar_t<T>, float>;
    if constexpr (Single) {
        r = poly6(z, 1.128379165726710e+0, -3.761262582423300e-1,
                     1.128358514861418e-1, -2.685381193529856e-2,
                     5.188327685732524e-3, -8.010193625184903e-4,
                     7.853861353153693e-5);
    } else {
        // Double precision: rational fit P(x^2)/Q(x^2).
        r = poly4(z, 5.55923013010394962768e4, 7.00332514112805075473e3,
                     2.23200534594684319226e3, 9.00260197203842689217e1,
                     9.60497373987051638749e0) /
            poly5(z, 4.92673942608635921086e4, 2.26290000613890934246e4,
                     4.59432382970980127987e3, 5.21357949780152679795e2,
                     3.35617141647503099647e1, 1.00000000000000000000e0);
    }

    r *= x;

    if constexpr (Recurse) {
        if (ENOKI_UNLIKELY(is_cuda_array_v<Expr> || any_nested(erfc_mask)))
            r[erfc_mask] = Scalar(1) - erfc<T, false>(x);
    }

    return r;
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Modified Bessel function of the first kind, order zero (exponentially
/// scaled): i0e(x) = exp(-|x|) * I0(x). Uses the Cephes Chebyshev fits,
/// evaluated via chbevl() above, split at |x| = 8.
template <typename T, typename Expr = expr_t<T>> Expr i0e(const T &x_) {
    using Scalar = scalar_t<T>;

    /* Chebyshev coefficients for exp(-x) I0(x)
     * in the interval [0,8].
     *
     * lim(x->0) { exp(-x) I0(x) } = 1.
     */

    static Scalar A[] = {
        Scalar(-1.30002500998624804212E-8), Scalar(6.04699502254191894932E-8),
        Scalar(-2.67079385394061173391E-7), Scalar(1.11738753912010371815E-6),
        Scalar(-4.41673835845875056359E-6), Scalar(1.64484480707288970893E-5),
        Scalar(-5.75419501008210370398E-5), Scalar(1.88502885095841655729E-4),
        Scalar(-5.76375574538582365885E-4), Scalar(1.63947561694133579842E-3),
        Scalar(-4.32430999505057594430E-3), Scalar(1.05464603945949983183E-2),
        Scalar(-2.37374148058994688156E-2), Scalar(4.93052842396707084878E-2),
        Scalar(-9.49010970480476444210E-2), Scalar(1.71620901522208775349E-1),
        Scalar(-3.04682672343198398683E-1), Scalar(6.76795274409476084995E-1)
    };

    /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
     * in the inverted interval [8,infinity].
     *
     * lim(x->inf) { exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
     */

    static Scalar B[] = {
        Scalar(3.39623202570838634515E-9), Scalar(2.26666899049817806459E-8),
        Scalar(2.04891858946906374183E-7), Scalar(2.89137052083475648297E-6),
        Scalar(6.88975834691682398426E-5), Scalar(3.36911647825569408990E-3),
        Scalar(8.04490411014108831608E-1)
    };

    // I0 is even, so evaluate at |x|.
    Expr x = abs(x_);

    auto mask_big = x > Scalar(8);

    Expr r_big, r_small;

    // Evaluate each branch only if some lane needs it.
    // NOTE(review): unlike erfc()/erf() above, there is no is_cuda_array_v
    // guard here -- confirm whether i0e is expected to support CUDA arrays.
    if (!all_nested(mask_big))
        r_small = chbevl(fmsub(x, Expr(Scalar(0.5)), Expr(Scalar(2))), A);

    if (any_nested(mask_big))
        r_big = chbevl(fmsub(Expr(Scalar(32)), rcp(x), Expr(Scalar(2))), B) *
                rsqrt(x);

    return select(mask_big, r_big, r_small);
}
|
||||||
|
|
||||||
|
// Inverse real error function approximation based on "Approximating the
// erfinv function" by Mark Giles
template <typename T, typename Expr = expr_t<T>> Expr erfinv(const T &x_) {
    using Scalar = scalar_t<T>;

    Expr x(x_);
    // w = -log(1 - x^2), written in factored form for accuracy near |x| = 1.
    Expr w = -log((Expr(Scalar(1)) - x) * (Expr(Scalar(1)) + x));

    // Centered arguments for the two polynomial fits (central / tail region).
    Expr w1 = w - Scalar(2.5);
    Expr w2 = sqrt(w) - Scalar(3);

    // Coefficients are listed from highest to lowest degree (poly8 order).
    Expr p1 = poly8(w1,
                    1.50140941, 0.246640727,
                    -0.00417768164, -0.00125372503,
                    0.00021858087, -4.39150654e-06,
                    -3.5233877e-06, 3.43273939e-07,
                    2.81022636e-08);

    Expr p2 = poly8(w2,
                    2.83297682, 1.00167406,
                    0.00943887047, -0.0076224613,
                    0.00573950773, -0.00367342844,
                    0.00134934322, 0.000100950558,
                    -0.000200214257);

    // Central fit for w < 5 (i.e. |x| not too close to 1), tail fit otherwise;
    // the final multiplication by x restores the odd symmetry of erfinv.
    return select(w < Scalar(5), p1, p2) * x;
}
|
||||||
|
|
||||||
|
/// Evaluates Dawson's integral (e^(-x^2) \int_0^x e^(y^2) dy)
template <typename T, typename Expr = expr_t<T>> Expr dawson(const T &x) {
    // Rational minimax approximation to Dawson's integral with relative
    // error < 1e-6 on the real number line. July 2017, Wenzel Jakob

    // Numerator and denominator are polynomials in x^2; the trailing
    // multiplication by x makes the result odd, matching D(-x) = -D(x).
    Expr x2 = x*x;
    Expr num = poly6(x2, 1.00000080272429, 9.18170212243285e-2,
                         4.25835373536124e-2, 6.0536496345054e-3,
                         9.88555033724111e-4, 3.64943550840577e-5,
                         1.55942290996993e-5);

    Expr denom = poly7(x2, 1.0, 7.58517175815194e-1,
                           2.81364355593059e-1, 6.81783097841267e-2,
                           1.13586116798019e-2, 1.92020805811771e-3,
                           5.74217664074868e-5, 3.11884331363595e-5);

    return num / denom * x;
}
|
||||||
|
|
||||||
|
/// Imaginary component of the error function
|
||||||
|
template <typename T, typename Expr = expr_t<T>> Expr erfi(const T &x) {
|
||||||
|
using Scalar = scalar_t<T>;
|
||||||
|
|
||||||
|
return Scalar(M_2_SQRTPI) * dawson(x) * exp(x * x);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Natural logarithm of the Gamma function, computed via the Lanczos
/// approximation; arguments below 0.5 are handled through the reflection
/// formula, with +infinity returned at the poles (non-positive integers).
template <typename Value> Value lgamma(Value x_) {
    using Mask = mask_t<Value>;
    using Scalar = scalar_t<Value>;

    // 'g' and 'n' parameters of the Lanczos approximation
    // See mrob.com/pub/ries/lanczos-gamma.html
    const int n = 6;
    const Scalar g = 5.0f;
    const Scalar log_sqrt2pi = Scalar(0.91893853320467274178);  // log(sqrt(2*pi))
    const Scalar coeff[n + 1] = { (Scalar) 1.000000000190015, (Scalar) 76.18009172947146,
                                  (Scalar) -86.50532032941677, (Scalar) 24.01409824083091,
                                  (Scalar) -1.231739572450155, (Scalar) 0.1208650973866179e-2,
                                  (Scalar) -0.5395239384953e-5 };

    // potentially reflect using gamma(x) = pi / (sin(pi*x) * gamma(1-x))
    Mask reflect = x_ < .5f;

    Value x = select(reflect, -x_, x_ - 1.f),
          b = x + g + .5f; // base

    // Lanczos partial-fraction sum, accumulated smallest-terms-first.
    Value sum = 0;
    for (int i = n; i >= 1; --i)
        sum += coeff[i] / (x + Scalar(i));
    sum += coeff[0];

    // gamma(x) = sqrt(2*pi) * sum * b^(x + .5) / exp(b)
    Value result = ((log_sqrt2pi + log(sum)) - b) + log(b) * (x + .5f);

    if (is_cuda_array_v<Value> || any_nested(reflect)) {
        // Reflection: lgamma(x) = log|pi / sin(pi*x)| - lgamma(1-x);
        // the pole lanes (x_ a non-positive integer) are forced to +inf.
        masked(result, reflect) = log(abs(Scalar(M_PI) / sin(Scalar(M_PI) * x_))) - result;
        masked(result, reflect && eq(x_, round(x_))) = std::numeric_limits<Scalar>::infinity();
    }

    return result;
}
|
||||||
|
|
||||||
|
/// Gamma function, computed as exp(lgamma(x)); accuracy is therefore
/// inherited from the Lanczos-based lgamma() implementation above.
template <typename Value> Value tgamma(Value x) {
    Value log_gamma = lgamma(x);
    return exp(log_gamma);
}
|
||||||
|
|
||||||
|
/**
 * Computes a Carlson integral of the form
 *
 *   R_F(X, Y, Z) = 1/2 * \int_{0}^\infty ((t + x) (t + y) (t + z))^(-1/2) dt
 *
 * Based on
 *
 *   Computing elliptic integrals by duplication
 *   B. C. Carlson
 *   Numerische Mathematik, March 1979, Volume 33, Issue 1
 */
template <typename Vector3,
          typename Value = value_t<Vector3>,
          typename Scalar = scalar_t<Vector3>>
Value carlson_rf(Vector3 xyz) {
    static_assert(
        Vector3::Size == 3,
        "carlson_rf(): Expected a three-dimensional input vector (x, y, z)");
    assert(all_nested(xyz.x() >= Scalar(0) && xyz.y() > Scalar(0) && xyz.z() > Scalar(0)));

    Vector3 XYZ;
    Value mu_inv;
    mask_t<Value> active = true;
    int iterations = 0;

    // Duplication iteration: repeatedly replace (x,y,z) by ((x,y,z)+lambda)/4
    // until the arguments are close enough to their mean for the series
    // expansion below (per-lane convergence tracked in 'active', with a
    // hard cap of 10 iterations).
    while (true) {
        Vector3 sqrt_xyz = sqrt(xyz);
        // lambda = sqrt(x)sqrt(y) + sqrt(y)sqrt(z) + sqrt(z)sqrt(x)
        Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz);
        Value mu = hsum(xyz) * Scalar(1.0 / 3.0);
        mu_inv = rcp(mu);
        // Relative deviations of the arguments from their mean.
        XYZ = fnmadd(xyz, mu_inv, Scalar(1));
        Value eps = hmax(abs(XYZ));
        active &= eps > Scalar(std::is_same_v<Scalar, double>
                                   ? 0.0024608
                                   : 0.070154); // eps ^ (1/6)

        if (none(active) || ++iterations == 10)
            break;

        xyz[mask_t<Vector3>(active)] = (xyz + lambda) * Scalar(0.25);
    }

    /* Use recurrences for cheaper polynomial evaluation. Based
       on Numerical Recipes (3rd ed) by Press, Teukolsky,
       Vetterling, and Flannery */

    Value e2 = XYZ.x() * XYZ.y() - XYZ.z() * XYZ.z(),
          e3 = hprod(XYZ),
          er = (Scalar(1.0 / 24.0) * e2 - Scalar(1.0 / 10.0) -
                Scalar(3.0 / 44.0) * e3) * e2 + Scalar(1.0 / 14.0) * e3;

    return sqrt(mu_inv) * (Scalar(1) + er);
}
|
||||||
|
|
||||||
|
/**
 * Computes a Carlson integral of the form
 *
 *   R_D(x, y, z) = 3/2 * \int_{0}^\infty (t + x)^(-1/2) (t + y)^(-1/2) (t + z)^(-3/2) dt
 *
 * Based on
 *
 *   Computing elliptic integrals by duplication
 *   B. C. Carlson
 *   Numerische Mathematik, March 1979, Volume 33, Issue 1
 */
template <typename Vector3,
          typename Value = value_t<Vector3>,
          typename Scalar = scalar_t<Vector3>>
Value carlson_rd(Vector3 xyz) {
    static_assert(
        Vector3::Size == 3,
        "carlson_rd(): Expected a three-dimensional input vector (x, y, z)");
    assert(all_nested(xyz.x() >= Scalar(0) && xyz.y() > Scalar(0) && xyz.z() > Scalar(0)));

    Vector3 XYZ;
    Value mu_inv;
    mask_t<Value> active = true;
    int iterations = 0;
    // Duplication accumulator: R_D picks up an explicit sum term at each
    // halving step ('num' is the 4^-k scale of step k).
    Value sum = 0;
    Value num = 1;
    // Weighted mean: mu = (x + y + 3z) / 5.
    const Vector3 W(Scalar(1.0 / 5.0), Scalar(1.0 / 5.0), Scalar(3.0 / 5.0));

    while (true) {
        Vector3 sqrt_xyz = sqrt(xyz);
        Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz);
        Value mu = hsum(xyz * W);
        mu_inv = rcp(mu);
        XYZ = fnmadd(xyz, mu_inv, Scalar(1));
        Value eps = hmax(abs(XYZ));
        active &= eps > Scalar(std::is_same_v<Scalar, double>
                                   ? (0.0024608 * 0.6)
                                   : (0.070154 * 0.6)); // eps ^ (1/6) * 0.6

        if (none(active) || ++iterations == 10)
            break;

        masked(sum, active) += num / (sqrt(xyz.z()) * (xyz.z() + lambda));
        masked(num, active) *= Scalar(0.25f);
        masked(xyz, mask_t<Vector3>(active)) = (xyz + lambda) * Scalar(0.25f);
    }

    /* Use recurrences for cheaper polynomial evaluation. Based
       on Numerical Recipes (3rd ed) by Press, Teukolsky,
       Vetterling, and Flannery */

    Value z = XYZ.z(),
          ea = XYZ.x() * XYZ.y(),
          eb = z * z,
          ec = ea - eb,
          ed = fnmadd(Scalar(6), eb, ea),
          ee = fmadd(ec, Scalar(2), ed);

    Value p = ed * (-Scalar(3.0 / 14.0) + Scalar(9.0 / 88.0) * ed -
                    Scalar(1.0 / 4.0) * z * ee) +
              z * (Scalar(1.0 / 6.0) * ee + z *
                   (-Scalar(9.0 / 22.0) * ec + z * Scalar(3.0 / 26.0) * ea));

    return Scalar(3) * sum + num * mu_inv * sqrt(mu_inv) * (Scalar(1.0) + p);
}
|
||||||
|
|
||||||
|
/**
 * Computes a Carlson integral of the form
 *
 *   R_C(x, y) = 1/2 * \int_{0}^\infty (t + x)^(-1/2) (t + y)^-1 dt
 *
 * Based on
 *
 *   Computing elliptic integrals by duplication
 *   B. C. Carlson
 *   Numerische Mathematik, March 1979, Volume 33, Issue 1
 */
template <typename Vector2,
          typename Value = value_t<Vector2>,
          typename Scalar = scalar_t<Vector2>>
Value carlson_rc(Vector2 xy) {
    static_assert(
        Vector2::Size == 2,
        "carlson_rc(): Expected a two-dimensional input vector (x, y)");
    assert(all(xy.x() >= Scalar(0) && xy.y() > Scalar(0)));

    mask_t<Value> active = true;
    Value inv_mu, s;
    int iterations = 0;

    // Duplication iteration: halve (x, y) towards their weighted mean until
    // the deviation 's' is small enough for the series below (per-lane
    // convergence in 'active', capped at 10 iterations).
    while (true) {
        // lambda = 2 sqrt(x) sqrt(y) + y
        Value lambda = hprod(sqrt(xy));
        lambda += lambda + xy.y();
        Value mu = fmadd(xy.x(), Scalar(1.0 / 3.0), xy.y() * Scalar(2.0 / 3.0));
        inv_mu = rcp(mu);
        s = (xy.y() - mu) * inv_mu;

        active &= abs(s) > Scalar(std::is_same_v<Scalar, double>
                                      ? (0.0024608 * 0.48)
                                      : (0.070154 * 0.48)); // eps ^ (1/6) * 0.48

        if (none(active) || ++iterations == 10)
            break;

        masked(xy, mask_t<Vector2>(active)) = (xy + lambda) * Scalar(0.25f);
    }

    /* Use recurrences for cheaper polynomial evaluation. Based
       on Numerical Recipes (3rd ed) by Press, Teukolsky,
       Vetterling, and Flannery */

    return sqrt(inv_mu) * (Scalar(1) + s * s *
           (Scalar(0.3) + s * (Scalar(1.0 / 7.0) +
            s * (Scalar(0.375) + s * Scalar(9.0 / 22.0)))));
}
|
||||||
|
|
||||||
|
/**
 * Computes a Carlson integral of the form
 *
 *   R_J(x, y, z, rho) = 3/2 * \int_{0}^\infty ((t + x) (t + y) (t + z))^(-1/2) (t+rho)^(-1) dt
 *
 * Based on
 *
 *   Computing elliptic integrals by duplication
 *   B. C. Carlson
 *   Numerische Mathematik, March 1979, Volume 33, Issue 1
 */
template <typename Vector4,
          typename Value = value_t<Vector4>,
          typename Vector2 = Array<Value, 2>,
          typename Scalar = scalar_t<Vector4>>
Value carlson_rj(Vector4 xyzr) {
    static_assert(
        Vector4::Size == 4,
        "carlson_rj(): Expected a four-dimensional input vector (x, y, z, rho)");
    assert(all(xyzr.x() >= Scalar(0) && xyzr.y() > Scalar(0) && xyzr.z() > Scalar(0) && xyzr.w() > Scalar(0)));

    Vector4 XYZR;
    Value mu_inv;
    mask_t<Value> active = true;
    int iterations = 0;
    // Duplication accumulator: R_J picks up an R_C correction term at each
    // halving step ('num' is the 4^-k scale of step k).
    Value sum = 0;
    Value num = 1;

    while (true) {
        auto xyz = head<3>(xyzr);
        auto rho = xyzr.w();
        auto sqrt_xyz = sqrt(xyz);
        Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz);

        // Weighted mean mu = (x + y + z + 2*rho) / 5 (rho counted once in
        // hsum(xyzr) and once explicitly).
        Value mu = (hsum(xyzr) + rho) * Scalar(1.0 / 5.0);
        mu_inv = rcp(mu);
        XYZR = fnmadd(xyzr, mu_inv, Scalar(1));
        Value eps = hmax(abs(XYZR));
        active &= eps > Scalar(std::is_same_v<Scalar, double>
                                   ? (0.0024608 * 0.6)
                                   : (0.070154 * 0.6)); // eps ^ (1/6) * 0.6

        // Arguments of the per-step R_C correction term.
        // NOTE(review): alpha/beta are also computed on the final (break)
        // iteration where they go unused -- presumably harmless, but this
        // could be hoisted below the break; confirm before reordering.
        Value alpha = rho * hsum(sqrt(xyz)) + sqrt(hprod(xyz));
        alpha *= alpha;
        Value beta = rho * (rho + lambda) * (rho + lambda);

        if (none(active) || ++iterations == 10)
            break;

        masked(sum, active) += num * carlson_rc(Vector2(alpha, beta));
        masked(num, active) *= Scalar(0.25f);
        masked(xyzr, mask_t<Vector4>(active)) = (xyzr + lambda) * Scalar(0.25f);
    }

    /* Use recurrences for cheaper polynomial evaluation. Based
       on Numerical Recipes (3rd ed) by Press, Teukolsky,
       Vetterling, and Flannery */

    Value ea = XYZR.x() * (XYZR.y() + XYZR.z()) + XYZR.y() * XYZR.z(),
          eb = XYZR.x() * XYZR.y() * XYZR.z(),
          R = XYZR.w(),
          ec = R * R,
          ed = ea - Scalar(3) * ec,
          ee = eb + Scalar(2) * R * (ea - ec);

    return Scalar(3) * sum +
           num * mu_inv * sqrt(mu_inv) *
               (Scalar(1) +
                ed * (-Scalar(3.0 / 14.0) + Scalar(9.0 / 88.0) * ed -
                      Scalar(9.0 / 52.0) * ee) +
                eb * (Scalar(1.0 / 6.0) +
                      R * (-Scalar(3.0 / 11.0) + R * Scalar(3.0 / 26.0))) +
                R * ea * (Scalar(1.0 / 3.0) - R * Scalar(3.0 / 22.0)) -
                Scalar(1.0 / 3.0) * R * ec);
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
//! @{ \name Complete and incomplete elliptic integrals
|
||||||
|
//! Caution: the 'k' factor is squared in the elliptic integral, which
|
||||||
|
//! differs from the convention of Mathematica's EllipticK etc.
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Complete elliptic integral of the first kind
|
||||||
|
template <typename K, typename Value = expr_t<K>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector3 = Array<Value, 3>>
|
||||||
|
Value comp_ellint_1(K k) {
|
||||||
|
return carlson_rf(Vector3(Scalar(0), Scalar(1) - k * k, Scalar(1)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Incomplete elliptic integral of the first kind
|
||||||
|
template <typename Phi, typename K,
|
||||||
|
typename Value = expr_t<Phi, K>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector3 = Array<Value, 3>>
|
||||||
|
Value ellint_1(Phi phi_, K k) {
|
||||||
|
Value phi = phi_,
|
||||||
|
n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))),
|
||||||
|
result = 0;
|
||||||
|
|
||||||
|
if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) {
|
||||||
|
result = comp_ellint_1(k) * n * Scalar(2);
|
||||||
|
phi = fnmadd(n, Scalar(M_PI), phi);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto [sin_phi, cos_phi] = sincos(phi);
|
||||||
|
Vector3 xyz(cos_phi * cos_phi, Scalar(1) - k * k * sin_phi * sin_phi,
|
||||||
|
Scalar(1));
|
||||||
|
result += sin_phi * carlson_rf(xyz);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Complete elliptic integral of the second kind
|
||||||
|
template <typename K, typename Value = expr_t<K>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector3 = Array<Value, 3>>
|
||||||
|
Value comp_ellint_2(K k) {
|
||||||
|
auto k2 = k*k;
|
||||||
|
Vector3 xyz(Scalar(0), Scalar(1) - k2, Scalar(1));
|
||||||
|
return carlson_rf(xyz) - Scalar(1.0 / 3.0) * k2 * carlson_rd(xyz);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Incomplete elliptic integral of the second kind
|
||||||
|
template <typename Phi, typename K,
|
||||||
|
typename Value = expr_t<Phi, K>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector3 = Array<Value, 3>>
|
||||||
|
Value ellint_2(Phi phi_, K k) {
|
||||||
|
Value phi = phi_,
|
||||||
|
k2 = k*k,
|
||||||
|
n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))),
|
||||||
|
result = 0;
|
||||||
|
|
||||||
|
if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) {
|
||||||
|
result = comp_ellint_2(k) * n * Scalar(2);
|
||||||
|
phi = fnmadd(n, Scalar(M_PI), phi);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto [sin_phi, cos_phi] = sincos(phi);
|
||||||
|
auto sin_phi_k_2 = sin_phi * sin_phi * k2;
|
||||||
|
Vector3 xyz(cos_phi * cos_phi, Scalar(1) - sin_phi_k_2, Scalar(1));
|
||||||
|
result += sin_phi * (carlson_rf(xyz) -
|
||||||
|
Scalar(1.0 / 3.0) * sin_phi_k_2 * carlson_rd(xyz));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Complete elliptic integral of the third kind
|
||||||
|
template <typename K, typename Nu,
|
||||||
|
typename Value = expr_t<K, Nu>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector4 = Array<Value, 4>>
|
||||||
|
Value comp_ellint_3(K k, Nu nu) {
|
||||||
|
auto k2 = k*k;
|
||||||
|
Vector4 xyzr(Scalar(0), Scalar(1) - k2, Scalar(1), Scalar(1) + nu);
|
||||||
|
return carlson_rf(head<3>(xyzr)) -
|
||||||
|
Scalar(1.0 / 3.0) * nu * carlson_rj(xyzr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Incomplete elliptic integral of the third kind
|
||||||
|
template <typename Phi, typename K, typename Nu,
|
||||||
|
typename Value = expr_t<Phi, K, Nu>,
|
||||||
|
typename Scalar = scalar_t<Value>,
|
||||||
|
typename Vector4 = Array<Value, 4>>
|
||||||
|
Value ellint_3(Phi phi_, K k, Nu nu) {
|
||||||
|
Value phi = phi_,
|
||||||
|
k2 = k*k,
|
||||||
|
n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))),
|
||||||
|
result = 0;
|
||||||
|
|
||||||
|
if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) {
|
||||||
|
result = comp_ellint_3(k, nu) * n * Scalar(2);
|
||||||
|
phi = fnmadd(n, Scalar(M_PI), phi);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
auto [sin_phi, cos_phi] = sincos(phi);
|
||||||
|
auto sin_phi_2 = sin_phi * sin_phi;
|
||||||
|
Vector4 xyzr(cos_phi * cos_phi, Scalar(1) - k2 * sin_phi_2, Scalar(1),
|
||||||
|
Scalar(1) + nu * sin_phi_2);
|
||||||
|
result += sin_phi * (carlson_rf(head<3>(xyzr)) -
|
||||||
|
Scalar(1.0 / 3.0) * nu * sin_phi_2 * carlson_rj(xyzr));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
//! @}
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,323 @@
|
||||||
|
/*
|
||||||
|
enoki/stl.h -- vectorization support for STL pairs, tuples, and arrays
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/array.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Vectorization support for std::pair<Arg0, Arg1>: every operation is
/// forwarded component-wise to the two pair members.
template <typename Arg0, typename Arg1> struct struct_support<std::pair<Arg0, Arg1>> {
    // The pair is dynamic if either member is dynamic
    static constexpr bool IsDynamic =
        enoki::is_dynamic_v<Arg0> || enoki::is_dynamic_v<Arg1>;
    using Dynamic = std::pair<enoki::make_dynamic_t<Arg0>, enoki::make_dynamic_t<Arg1>>;
    using Value = std::pair<Arg0, Arg1>;

    // Slice count queried from the first member only — assumes both
    // members are kept at the same size (set_slices resizes both)
    static ENOKI_INLINE size_t slices(const Value &value) {
        return enoki::slices(value.first);
    }

    // Packet count, likewise taken from the first member
    static ENOKI_INLINE size_t packets(const Value &value) {
        return enoki::packets(value.first);
    }

    // Resize both members to 'size' slices
    static ENOKI_INLINE void set_slices(Value &value, size_t size) {
        enoki::set_slices(value.first, size);
        enoki::set_slices(value.second, size);
    }

    // Pair holding the i-th packet of each member; the explicit decltype
    // pair type preserves any reference/proxy types enoki::packet returns
    template <typename T2>
    static ENOKI_INLINE auto packet(T2 &&value, size_t i) {
        return std::pair<decltype(enoki::packet(value.first, i)),
                         decltype(enoki::packet(value.second, i))>(
            enoki::packet(value.first, i), enoki::packet(value.second, i));
    }

    // Pair holding the i-th slice of each member
    template <typename T2>
    static ENOKI_INLINE auto slice(T2 &&value, size_t i) {
        return std::pair<decltype(enoki::slice(value.first, i)),
                         decltype(enoki::slice(value.second, i))>(
            enoki::slice(value.first, i), enoki::slice(value.second, i));
    }

    // Pair holding pointers to the i-th slice of each member
    template <typename T2>
    static ENOKI_INLINE auto slice_ptr(T2 &&value, size_t i) {
        return std::pair<decltype(enoki::slice_ptr(value.first, i)),
                         decltype(enoki::slice_ptr(value.second, i))>(
            enoki::slice_ptr(value.first, i), enoki::slice_ptr(value.second, i));
    }

    // Pair of reference wrappers around the two members
    template <typename T2>
    static ENOKI_INLINE auto ref_wrap(T2 &&value) {
        return std::pair<decltype(enoki::ref_wrap(value.first)),
                         decltype(enoki::ref_wrap(value.second))>(
            enoki::ref_wrap(value.first), enoki::ref_wrap(value.second));
    }

    // Pair of masked views of the two members
    template <typename T2, typename Mask>
    static ENOKI_INLINE auto masked(T2 &&value, const Mask &mask) {
        return std::pair<decltype(enoki::masked(value.first, mask)),
                         decltype(enoki::masked(value.second, mask))>(
            enoki::masked(value.first, mask), enoki::masked(value.second, mask));
    }

    // Component-wise masked scatter of 'value' into 'dst' at 'index'
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE void scatter(T2 &dst, const Value &value, const Index &index, const Mask &mask) {
        enoki::scatter(dst.first, value.first, index, mask);
        enoki::scatter(dst.second, value.second, index, mask);
    }

    // Component-wise masked gather from 'src' at 'index'
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE Value gather(const T2 &src, const Index &index, const Mask &mask) {
        return Value(
            enoki::gather<Arg0>(src.first, index, mask),
            enoki::gather<Arg1>(src.second, index, mask)
        );
    }

    // Zero-initialized pair with 'size' slices per member
    static ENOKI_INLINE Value zero(size_t size) {
        return Value(enoki::zero<Arg0>(size), enoki::zero<Arg1>(size));
    }

    // Uninitialized pair with 'size' slices per member
    static ENOKI_INLINE Value empty(size_t size) {
        return Value(enoki::empty<Arg0>(size), enoki::empty<Arg1>(size));
    }
};
|
||||||
|
|
||||||
|
/// Vectorization support for std::tuple<Args...>: operations are forwarded
/// element-wise via private index-sequence helpers that expand over the pack.
template <typename... Args> struct struct_support<std::tuple<Args...>> {
    // The tuple is dynamic if any element is dynamic
    static constexpr bool IsDynamic = std::disjunction_v<enoki::is_dynamic<Args>...>;
    using Dynamic = std::tuple<enoki::make_dynamic_t<Args>...>;
    using Value = std::tuple<Args...>;

    // Slice count queried from element 0 only — assumes all elements are
    // kept at the same size (set_slices resizes every element)
    static ENOKI_INLINE size_t slices(const Value &value) {
        return enoki::slices(std::get<0>(value));
    }

    // Packet count, likewise taken from element 0
    static ENOKI_INLINE size_t packets(const Value &value) {
        return enoki::packets(std::get<0>(value));
    }

    // Resize every element to 'size' slices (dispatch to pack helper)
    static ENOKI_INLINE void set_slices(Value &value, size_t size) {
        set_slices(value, size, std::make_index_sequence<sizeof...(Args)>());
    }

    // Tuple of the i-th packets of all elements
    template <typename T2>
    static ENOKI_INLINE auto packet(T2 &&value, size_t i) {
        return packet(std::forward<T2>(value), i, std::make_index_sequence<sizeof...(Args)>());
    }

    // Tuple of the i-th slices of all elements
    template <typename T2>
    static ENOKI_INLINE auto slice(T2 &&value, size_t i) {
        return slice(std::forward<T2>(value), i, std::make_index_sequence<sizeof...(Args)>());
    }

    // Tuple of pointers to the i-th slices of all elements
    template <typename T2>
    static ENOKI_INLINE auto slice_ptr(T2 &&value, size_t i) {
        return slice_ptr(std::forward<T2>(value), i, std::make_index_sequence<sizeof...(Args)>());
    }

    // Tuple of reference wrappers around all elements
    template <typename T2>
    static ENOKI_INLINE auto ref_wrap(T2 &&value) {
        return ref_wrap(std::forward<T2>(value), std::make_index_sequence<sizeof...(Args)>());
    }

    // Tuple of masked views of all elements
    template <typename T2, typename Mask>
    static ENOKI_INLINE auto masked(T2 &&value, const Mask &mask) {
        return masked(value, mask, std::make_index_sequence<sizeof...(Args)>());
    }

    // Zero-initialized tuple with 'size' slices per element
    static ENOKI_INLINE Value zero(size_t size) {
        return Value(enoki::zero<Args>(size)...);
    }

    // Uninitialized tuple with 'size' slices per element
    static ENOKI_INLINE Value empty(size_t size) {
        return Value(enoki::empty<Args>(size)...);
    }

    // Element-wise masked scatter (dispatch to pack helper)
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE void scatter(T2 &dst, const Value &value, const Index &index, const Mask &mask) {
        scatter(dst, value, index, mask, std::make_index_sequence<sizeof...(Args)>());
    }

    // Element-wise masked gather (dispatch to pack helper)
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE Value gather(const T2 &src, const Index &index, const Mask &mask) {
        return gather(src, index, mask, std::make_index_sequence<sizeof...(Args)>());
    }
private:
    // Pack expansion via a dummy array initializer — evaluates the
    // comma-expression once per tuple element, in order
    template <size_t... Index>
    static ENOKI_INLINE void set_slices(Value &value, size_t i, std::index_sequence<Index...>) {
        bool unused[] = { (enoki::set_slices(std::get<Index>(value), i), false)..., false };
        (void) unused;
    }

    // The explicit tuple-of-decltype result types below preserve any
    // reference/proxy types returned by the per-element operations
    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto packet(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::tuple<decltype(enoki::packet(std::get<Index>(value), i))...>(
            enoki::packet(std::get<Index>(value), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::tuple<decltype(enoki::slice(std::get<Index>(value), i))...>(
            enoki::slice(std::get<Index>(value), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice_ptr(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::tuple<decltype(enoki::slice_ptr(std::get<Index>(value), i))...>(
            enoki::slice_ptr(std::get<Index>(value), i)...);
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto ref_wrap(T2 &&value, std::index_sequence<Index...>) {
        return std::tuple<decltype(enoki::ref_wrap(std::get<Index>(value)))...>(
            enoki::ref_wrap(std::get<Index>(value))...);
    }

    template <typename T2, typename Mask, size_t... Index>
    static ENOKI_INLINE auto masked(T2 &&value, const Mask &mask, std::index_sequence<Index...>) {
        return std::tuple<decltype(enoki::masked(std::get<Index>(value), mask))...>(
            enoki::masked(std::get<Index>(value), mask)...);
    }

    template <typename T2, typename Index, typename Mask, size_t... Is>
    static ENOKI_INLINE void scatter(T2 &dst, const Value &value, const Index &index, const Mask &mask, std::index_sequence<Is...>) {
        bool unused[] = { (enoki::scatter(std::get<Is>(dst),
                                          std::get<Is>(value), index, mask), false)..., false };
        ENOKI_MARK_USED(unused);
    }

    template <typename T2, typename Index, typename Mask, size_t... Is>
    static ENOKI_INLINE Value gather(const T2 &src, const Index &index, const Mask &mask, std::index_sequence<Is...>) {
        return Value(
            enoki::gather<std::tuple_element_t<Is, Value>>(std::get<Is>(src), index, mask)...
        );
    }
};
|
||||||
|
|
||||||
|
/// Vectorization support for std::array<T, Size>: operations are forwarded
/// element-wise via private index-sequence helpers.
template <typename T, size_t Size> struct struct_support<std::array<T, Size>> {
    // The array is dynamic iff its element type is dynamic
    static constexpr bool IsDynamic = enoki::is_dynamic_v<T>;
    using Dynamic = std::array<enoki::make_dynamic_t<T>, Size>;
    using Value = std::array<T, Size>;

    // Slice count queried from element 0 only — assumes all elements are
    // kept at the same size (set_slices resizes every element)
    static ENOKI_INLINE size_t slices(const Value &value) {
        return enoki::slices(value[0]);
    }

    // Packet count, likewise taken from element 0
    static ENOKI_INLINE size_t packets(const Value &value) {
        return enoki::packets(value[0]);
    }

    // Resize every element to 'size' slices
    static ENOKI_INLINE void set_slices(Value &value, size_t size) {
        for (size_t i = 0; i < Size; ++i)
            enoki::set_slices(value[i], size);
    }

    // Array of the i-th packets of all elements (dispatch to pack helper)
    template <typename T2>
    static ENOKI_INLINE auto packet(T2 &&value, size_t i) {
        return packet(std::forward<T2>(value), i, std::make_index_sequence<Size>());
    }

    // Array of the i-th slices of all elements
    template <typename T2>
    static ENOKI_INLINE auto slice(T2 &&value, size_t i) {
        return slice(std::forward<T2>(value), i, std::make_index_sequence<Size>());
    }

    // Array of pointers to the i-th slices of all elements
    template <typename T2>
    static ENOKI_INLINE auto slice_ptr(T2 &&value, size_t i) {
        return slice_ptr(std::forward<T2>(value), i, std::make_index_sequence<Size>());
    }

    // Array of reference wrappers around all elements
    template <typename T2>
    static ENOKI_INLINE auto ref_wrap(T2 &&value) {
        return ref_wrap(std::forward<T2>(value), std::make_index_sequence<Size>());
    }

    // Array of masked views of all elements (note: lvalue-reference
    // parameter here, unlike the forwarding references above)
    template <typename T2, typename Mask>
    static ENOKI_INLINE auto masked(T2 &value, const Mask &mask) {
        return masked(value, mask, std::make_index_sequence<Size>());
    }

    // Element-wise masked scatter (dispatch to pack helper)
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE void scatter(T2 &dst, const Value &value, const Index &index, const Mask &mask) {
        scatter(dst, value, index, mask, std::make_index_sequence<Size>());
    }

    // Element-wise masked gather (dispatch to pack helper)
    template <typename T2, typename Index, typename Mask>
    static ENOKI_INLINE Value gather(const T2 &src, const Index &index, const Mask &mask) {
        return gather(src, index, mask, std::make_index_sequence<Size>());
    }

    // Zero-initialized array with 'size' slices per element
    static ENOKI_INLINE auto zero(size_t size) {
        return zero(size, std::make_index_sequence<Size>());
    }

    // Uninitialized array with 'size' slices per element
    static ENOKI_INLINE auto empty(size_t size) {
        return empty(size, std::make_index_sequence<Size>());
    }
private:
    // The explicit array-of-decltype result types preserve any
    // reference/proxy types returned by the per-element operations
    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto packet(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::array<decltype(enoki::packet(value[0], i)), Size>{{
            enoki::packet(value[Index], i)...}};
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::array<decltype(enoki::slice(value[0], i)), Size>{{
            enoki::slice(value[Index], i)...}};
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto slice_ptr(T2 &&value, size_t i, std::index_sequence<Index...>) {
        return std::array<decltype(enoki::slice_ptr(value[0], i)), Size>{{
            enoki::slice_ptr(value[Index], i)...}};
    }

    template <typename T2, size_t... Index>
    static ENOKI_INLINE auto ref_wrap(T2 &&value, std::index_sequence<Index...>) {
        return std::array<decltype(enoki::ref_wrap(value[0])), Size>{{
            enoki::ref_wrap(value[Index])...}};
    }

    template <typename T2, typename Mask, size_t... Index>
    static ENOKI_INLINE auto masked(T2 &value, const Mask &mask, std::index_sequence<Index...>) {
        return std::array<decltype(enoki::masked(value[0], mask)), Size>{{
            enoki::masked(value[Index], mask)...}};
    }

    // NOTE(review): these call zero<T>/empty<T> with (Index, size) — this
    // relies on an enoki overload accepting both arguments (Index may only
    // serve to drive the pack expansion); confirm against enoki's zero/empty
    // API before modifying.
    template <size_t... Index>
    static ENOKI_INLINE auto zero(size_t size, std::index_sequence<Index...>) {
        return Value{{ zero<T>(Index, size)... }};
    }

    template <size_t... Index>
    static ENOKI_INLINE auto empty(size_t size, std::index_sequence<Index...>) {
        return Value{{ empty<T>(Index, size)... }};
    }

    template <typename T2, typename Index, typename Mask, size_t... Is>
    static ENOKI_INLINE void scatter(T2 &dst, const Value &value, const Index &index, const Mask &mask, std::index_sequence<Is...>) {
        bool unused[] = { (enoki::scatter(dst[Is], value[Is], index, mask), false)..., false };
        ENOKI_MARK_USED(unused);
    }

    template <typename T2, typename Index, typename Mask, size_t... Is>
    static ENOKI_INLINE Value gather(const T2 &src, const Index &index, const Mask &mask, std::index_sequence<Is...>) {
        return Value{
            enoki::gather<T>(src[Is], index, mask)...
        };
    }
};
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
|
|
@ -0,0 +1,202 @@
|
||||||
|
/*
|
||||||
|
enoki/transform.h -- 3D homogeneous coordinate transformations
|
||||||
|
|
||||||
|
Enoki is a C++ template library that enables transparent vectorization
|
||||||
|
of numerical kernels using SIMD instruction sets available on current
|
||||||
|
processor architectures.
|
||||||
|
|
||||||
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
||||||
|
|
||||||
|
All rights reserved. Use of this source code is governed by a BSD-style
|
||||||
|
license that can be found in the LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <enoki/quaternion.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(enoki)
|
||||||
|
|
||||||
|
/// Homogeneous translation matrix for the offset vector 'v'
template <typename Matrix, typename Vector> ENOKI_INLINE Matrix translate(const Vector &v) {
    Matrix result = identity<Matrix>();
    /* Write (v, 1) into the last column — the homogeneous translation part */
    result.coeff(Matrix::Size - 1) = concat(v, scalar_t<Matrix>(1));
    return result;
}
|
||||||
|
|
||||||
|
/// Homogeneous (possibly nonuniform) scaling matrix with factors 'v'
template <typename Matrix, typename Vector> ENOKI_INLINE Matrix scale(const Vector &v) {
    /* Diagonal matrix with entries (v, 1) */
    auto diag_entries = concat(v, scalar_t<Matrix>(1));
    return diag<Matrix>(diag_entries);
}
|
||||||
|
|
||||||
|
template <typename Matrix, enable_if_t<Matrix::IsMatrix && Matrix::Size == 3> = 0>
|
||||||
|
ENOKI_INLINE Matrix rotate(const entry_t<Matrix> &angle) {
|
||||||
|
entry_t<Matrix> z(0.f), o(1.f);
|
||||||
|
auto [s, c] = sincos(angle);
|
||||||
|
return Matrix(c, -s, z, s, c, z, z, z, o);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix, typename Vector3, enable_if_t<Matrix::IsMatrix && Matrix::Size == 4> = 0>
|
||||||
|
ENOKI_INLINE Matrix rotate(const Vector3 &axis, const entry_t<Matrix> &angle) {
|
||||||
|
using Value = entry_t<Matrix>;
|
||||||
|
using Vector4 = column_t<Matrix>;
|
||||||
|
|
||||||
|
auto [sin_theta, cos_theta] = sincos(angle);
|
||||||
|
Value cos_theta_m = 1.f - cos_theta;
|
||||||
|
|
||||||
|
auto shuf1 = shuffle<1, 2, 0>(axis),
|
||||||
|
shuf2 = shuffle<2, 0, 1>(axis),
|
||||||
|
tmp0 = fmadd(axis * axis, cos_theta_m, cos_theta),
|
||||||
|
tmp1 = fmadd(axis * shuf1, cos_theta_m, shuf2 * sin_theta),
|
||||||
|
tmp2 = fmsub(axis * shuf2, cos_theta_m, shuf1 * sin_theta);
|
||||||
|
|
||||||
|
return Matrix(
|
||||||
|
Vector4(tmp0.x(), tmp1.x(), tmp2.x(), 0.f),
|
||||||
|
Vector4(tmp2.y(), tmp0.y(), tmp1.y(), 0.f),
|
||||||
|
Vector4(tmp1.z(), tmp2.z(), tmp0.z(), 0.f),
|
||||||
|
Vector4(0.f, 0.f, 0.f, 1.f)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix>
|
||||||
|
ENOKI_INLINE Matrix perspective(const entry_t<Matrix> &fov,
|
||||||
|
const entry_t<Matrix> &near_,
|
||||||
|
const entry_t<Matrix> &far_,
|
||||||
|
const entry_t<Matrix> &aspect = 1.f) {
|
||||||
|
static_assert(Matrix::Size == 4, "Matrix::perspective(): implementation assumes 4x4 matrix output");
|
||||||
|
|
||||||
|
auto recip = rcp(near_ - far_);
|
||||||
|
auto c = cot(.5f * fov);
|
||||||
|
|
||||||
|
Matrix trafo = diag<Matrix>(
|
||||||
|
column_t<Matrix>(c / aspect, c, (near_ + far_) * recip, 0.f));
|
||||||
|
|
||||||
|
trafo(2, 3) = 2.f * near_ * far_ * recip;
|
||||||
|
trafo(3, 2) = -1.f;
|
||||||
|
|
||||||
|
return trafo;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix>
|
||||||
|
ENOKI_INLINE Matrix frustum(const entry_t<Matrix> &left,
|
||||||
|
const entry_t<Matrix> &right,
|
||||||
|
const entry_t<Matrix> &bottom,
|
||||||
|
const entry_t<Matrix> &top,
|
||||||
|
const entry_t<Matrix> &near_,
|
||||||
|
const entry_t<Matrix> &far_) {
|
||||||
|
static_assert(Matrix::Size == 4, "Matrix::frustum(): implementation assumes 4x4 matrix output");
|
||||||
|
|
||||||
|
auto rl = rcp(right - left),
|
||||||
|
tb = rcp(top - bottom),
|
||||||
|
fn = rcp(far_ - near_);
|
||||||
|
|
||||||
|
Matrix trafo = zero<Matrix>();
|
||||||
|
trafo(0, 0) = (2.f * near_) * rl;
|
||||||
|
trafo(1, 1) = (2.f * near_) * tb;
|
||||||
|
trafo(0, 2) = (right + left) * rl;
|
||||||
|
trafo(1, 2) = (top + bottom) * tb;
|
||||||
|
trafo(2, 2) = -(far_ + near_) * fn;
|
||||||
|
trafo(3, 2) = -1.f;
|
||||||
|
trafo(2, 3) = -2.f * far_ * near_ * fn;
|
||||||
|
|
||||||
|
return trafo;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix>
|
||||||
|
ENOKI_INLINE Matrix ortho(const entry_t<Matrix> &left,
|
||||||
|
const entry_t<Matrix> &right,
|
||||||
|
const entry_t<Matrix> &bottom,
|
||||||
|
const entry_t<Matrix> &top,
|
||||||
|
const entry_t<Matrix> &near_,
|
||||||
|
const entry_t<Matrix> &far_) {
|
||||||
|
static_assert(Matrix::Size == 4, "Matrix::ortho(): implementation assumes 4x4 matrix output");
|
||||||
|
|
||||||
|
auto rl = rcp(right - left),
|
||||||
|
tb = rcp(top - bottom),
|
||||||
|
fn = rcp(far_ - near_);
|
||||||
|
|
||||||
|
Matrix trafo = zero<Matrix>();
|
||||||
|
|
||||||
|
trafo(0, 0) = 2.f * rl;
|
||||||
|
trafo(1, 1) = 2.f * tb;
|
||||||
|
trafo(2, 2) = -2.f * fn;
|
||||||
|
trafo(3, 3) = 1.f;
|
||||||
|
trafo(0, 3) = -(right + left) * rl;
|
||||||
|
trafo(1, 3) = -(top + bottom) * tb;
|
||||||
|
trafo(2, 3) = -(far_ + near_) * fn;
|
||||||
|
|
||||||
|
return trafo;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Matrix, typename Point, typename Vector>
|
||||||
|
Matrix look_at(const Point &origin, const Point &target, const Vector &up) {
|
||||||
|
static_assert(Matrix::Size == 4, "Matrix::look_at(): implementation "
|
||||||
|
"assumes 4x4 matrix output");
|
||||||
|
|
||||||
|
auto dir = normalize(target - origin);
|
||||||
|
auto left = normalize(cross(dir, up));
|
||||||
|
auto new_up = cross(left, dir);
|
||||||
|
using Scalar = scalar_t<Matrix>;
|
||||||
|
|
||||||
|
return Matrix(
|
||||||
|
concat(left, Scalar(0)),
|
||||||
|
concat(new_up, Scalar(0)),
|
||||||
|
concat(-dir, Scalar(0)),
|
||||||
|
column_t<Matrix>(
|
||||||
|
-dot(left, origin),
|
||||||
|
-dot(new_up, origin),
|
||||||
|
dot(dir, origin),
|
||||||
|
1.f
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T,
|
||||||
|
typename E = expr_t<T>,
|
||||||
|
typename Matrix3 = Matrix<E, 3>,
|
||||||
|
typename Vector3 = Array<E, 3>,
|
||||||
|
typename Quat = Quaternion<E>>
|
||||||
|
std::tuple<Matrix3, Quat, Vector3> transform_decompose(const Matrix<T, 4> &A, size_t it = 10) {
|
||||||
|
Matrix3 A_sub(A), Q, P;
|
||||||
|
std::tie(Q, P) = polar_decomp(A_sub, it);
|
||||||
|
|
||||||
|
if (ENOKI_UNLIKELY(any(enoki::isnan(Q(0, 0)))))
|
||||||
|
Q = identity<Matrix3>();
|
||||||
|
|
||||||
|
auto sign_q = det(Q);
|
||||||
|
Q = mulsign(Array<Vector3, 3>(Q), sign_q);
|
||||||
|
P = mulsign(Array<Vector3, 3>(P), sign_q);
|
||||||
|
|
||||||
|
return std::make_tuple(
|
||||||
|
P,
|
||||||
|
matrix_to_quat(Q),
|
||||||
|
head<3>(A.col(3))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T,
|
||||||
|
typename E = expr_t<T>,
|
||||||
|
typename Matrix3 = Matrix<E, 3>,
|
||||||
|
typename Matrix4 = Matrix<E, 4>,
|
||||||
|
typename Vector3>
|
||||||
|
Matrix4 transform_compose(const Matrix<T, 3> &S,
|
||||||
|
const Quaternion<T> &q,
|
||||||
|
const Vector3 &t) {
|
||||||
|
Matrix4 result = Matrix4(quat_to_matrix<Matrix3>(q) * S);
|
||||||
|
result.coeff(3) = concat(t, scalar_t<Matrix4>(1));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T,
|
||||||
|
typename E = expr_t<T>,
|
||||||
|
typename Matrix3 = Matrix<E, 3>,
|
||||||
|
typename Matrix4 = Matrix<E, 4>,
|
||||||
|
typename Vector3>
|
||||||
|
Matrix4 transform_compose_inverse(const Matrix<T, 3> &S,
|
||||||
|
const Quaternion<T> &q,
|
||||||
|
const Vector3 &t) {
|
||||||
|
auto inv_m = inverse(quat_to_matrix<Matrix3>(q) * S);
|
||||||
|
Matrix4 result = Matrix4(inv_m);
|
||||||
|
result.coeff(3) = concat(inv_m * -t, scalar_t<Matrix4>(1));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(enoki)
|
||||||
Loading…
Reference in New Issue