1506 lines
57 KiB
C++
1506 lines
57 KiB
C++
/*
|
|
enoki/array_math.h -- Mathematical support library
|
|
|
|
Enoki is a C++ template library that enables transparent vectorization
|
|
of numerical kernels using ENOKI instruction sets available on current
|
|
processor architectures.
|
|
|
|
Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>
|
|
|
|
All rights reserved. Use of this source code is governed by a BSD-style
|
|
license that can be found in the LICENSE file.
|
|
*/
|
|
|
|
#include <enoki/array_generic.h>
|
|
|
|
#pragma once
|
|
|
|
NAMESPACE_BEGIN(enoki)
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name Polynomial evaluation with short dependency chains and
//           fused multiply-adds based on Estrin's scheme
|
|
// -----------------------------------------------------------------------
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>, typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly2(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2) {
|
|
T x2 = x * x;
|
|
return fmadd(x2, S(c2), fmadd(x, S(c1), S(c0)));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly3(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3) {
|
|
T x2 = x * x;
|
|
return fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly4(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4) {
|
|
T x2 = x * x, x4 = x2 * x2;
|
|
return fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)) + S(c4) * x4);
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly5(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5) {
|
|
T x2 = x * x, x4 = x2 * x2;
|
|
return fmadd(x2, fmadd(x, S(c3), S(c2)),
|
|
fmadd(x4, fmadd(x, S(c5), S(c4)), fmadd(x, S(c1), S(c0))));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly6(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6) {
|
|
T x2 = x * x, x4 = x2 * x2;
|
|
return fmadd(x4, fmadd(x2, S(c6), fmadd(x, S(c5), S(c4))),
|
|
fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0))));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly7(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
|
|
const T2 &c7) {
|
|
T x2 = x * x, x4 = x2 * x2;
|
|
return fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
|
|
fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0))));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly8(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
|
|
const T2 &c7, const T2 &c8) {
|
|
T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
|
|
return fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
|
|
fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)) + S(c8) * x8));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly9(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
|
|
const T2 &c7, const T2 &c8, const T2 &c9) {
|
|
T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
|
|
return fmadd(x8, fmadd(x, S(c9), S(c8)),
|
|
fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
|
|
fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)))));
|
|
}
|
|
|
|
template <typename T1, typename T2, typename T = expr_t<T1>,
|
|
typename S = scalar_t<T1>>
|
|
ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
|
|
const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
|
|
const T2 &c7, const T2 &c8, const T2 &c9, const T2 &c10) {
|
|
T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
|
|
return fmadd(x8, fmadd(x2, S(c10), fmadd(x, S(c9), S(c8))),
|
|
fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
|
|
fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)))));
|
|
}
|
|
|
|
//! @}
|
|
// -----------------------------------------------------------------------
|
|
|
|
/*
 * Declares the public unary function `name(x)` and opens the definition of
 * its kernel `detail::name_impl`; the braces that follow the macro
 * invocation form the kernel body, which can use `x`, `Value`, `Scalar`,
 * `Mask` and `Single`.
 *
 * `name(x)` dispatches at compile time, in this order:
 *   1. the expression type has a `name_()` member      -> call it
 *   2. recursive array     -> apply to the low/high halves and recombine
 *   3. dynamic array (neither diff nor CUDA)            -> packet by packet
 *   4. array nesting depth > 1                          -> per coefficient
 *   5. any other array                                  -> detail::name_impl
 *   6. everything else      -> `expr` (may refer to the argument as `x`)
 */
#define ENOKI_UNARY_OPERATION(name, expr)                                      \
    namespace detail {                                                         \
        template <typename Arg>                                                \
        using has_##name = decltype(std::declval<Arg>().name##_());            \
        template <typename Arg>                                                \
        constexpr bool has_##name##_v = is_detected_v<has_##name, Arg>;        \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        Value name##_impl(const Value &);                                      \
    }                                                                          \
    template <typename T> auto name(const T &x) {                              \
        using E = expr_t<T>;                                                   \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_();                                  \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            return E(name(low(x)), name(high(x)));                             \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                             !(is_diff_array_v<E> ||                           \
                               is_cuda_array_v<E>)) {                          \
            E result = empty<E>(x.size());                                     \
            auto dst = result.packet_ptr();                                    \
            auto src = x.packet_ptr();                                         \
            const size_t count = result.packets();                             \
            for (size_t k = 0; k < count; ++k, ++dst, ++src)                   \
                *dst = name(*src);                                             \
            return result;                                                     \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            E result;                                                          \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t k = 0; k < x.size(); ++k)                              \
                result.coeff(k) = name(x.coeff(k));                            \
            return result;                                                     \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x);                         \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE Value enoki::detail::name##_impl(const Value &x)
|
|
|
|
/*
 * Variant of the unary-operation macro for functions that produce two
 * results. `name(x)` returns a std::pair, and the braces that follow the
 * macro invocation form the body of `detail::name_impl`, which returns
 * std::pair<Value, Value> and can use `x`, `Value`, `Scalar`, `Mask` and
 * `Single`.
 *
 * Compile-time dispatch order of `name(x)`:
 *   1. the expression type has a `name_()` member      -> call it
 *   2. recursive array  -> apply to both halves, recombine each component
 *   3. dynamic array (neither CUDA nor diff)            -> packet by packet
 *   4. array nesting depth > 1                          -> per coefficient
 *   5. any other array                                  -> detail::name_impl
 *   6. everything else      -> `expr` (may refer to the argument as `x`)
 */
#define ENOKI_UNARY_OPERATION_PAIR(name, expr)                                 \
    namespace detail {                                                         \
        template <typename Arg>                                                \
        using has_##name = decltype(std::declval<Arg>().name##_());            \
        template <typename Arg>                                                \
        constexpr bool has_##name##_v = is_detected_v<has_##name, Arg>;        \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        std::pair<Value, Value> name##_impl(const Value &);                    \
    }                                                                          \
    template <typename T> auto name(const T &x) {                              \
        using E = expr_t<T>;                                                   \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_();                                  \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            auto lo_part = name(low(x));                                       \
            auto hi_part = name(high(x));                                      \
            return std::pair<E, E>(E(lo_part.first, hi_part.first),            \
                                   E(lo_part.second, hi_part.second));         \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                             !(is_cuda_array_v<E> ||                           \
                               is_diff_array_v<E>)) {                          \
            std::pair<E, E> result(empty<E>(x.size()), empty<E>(x.size()));    \
            auto dst0 = result.first.packet_ptr();                             \
            auto dst1 = result.second.packet_ptr();                            \
            auto src = x.packet_ptr();                                         \
            const size_t count = x.packets();                                  \
            for (size_t k = 0; k < count; ++k, ++dst0, ++dst1, ++src)          \
                std::tie(*dst0, *dst1) = name(*src);                           \
            return result;                                                     \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            std::pair<E, E> result;                                            \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t k = 0; k < x.size(); ++k)                              \
                std::tie(result.first.coeff(k),                                \
                         result.second.coeff(k)) = name(x.coeff(k));           \
            return result;                                                     \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x);                         \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE std::pair<Value, Value> enoki::detail::name##_impl(const Value &x)
|
|
|
|
|
|
/*
 * Declares the public binary function `name(x, y)` and opens the definition
 * of its kernel `detail::name_impl`; the braces that follow the macro
 * invocation form the kernel body, which can use `x`, `y`, `Value`,
 * `Scalar`, `Mask` and `Single`.
 *
 * Compile-time dispatch order of `name(x, y)`, with E = expr_t<T1, T2>:
 *   1. E has a `name_(E)` member                        -> call it
 *   2. recursive array   -> apply to the low/high halves and recombine
 *   3. an argument is not of type E -> cast both to E and dispatch again
 *   4. dynamic array (neither CUDA nor diff) -> packet by packet; an
 *      operand of size 1 is not advanced, i.e. its packet is reused
 *   5. array nesting depth > 1                          -> per coefficient
 *   6. any other array                                  -> detail::name_impl
 *   7. everything else -> `expr` (may refer to the arguments as `x`, `y`)
 */
#define ENOKI_BINARY_OPERATION(name, expr)                                     \
    namespace detail {                                                         \
        template <typename Arg>                                                \
        using has_##name = decltype(std::declval<Arg>()                        \
                                        .name##_(std::declval<Arg>()));        \
        template <typename Arg>                                                \
        constexpr bool has_##name##_v = is_detected_v<has_##name, Arg>;        \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        Value name##_impl(const Value &, const Value &);                       \
    }                                                                          \
    template <typename T1, typename T2> auto name(const T1 &x, const T2 &y) {  \
        using E = expr_t<T1, T2>;                                              \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_((const E &) y);                     \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            return E(name(low(x), low(y)), name(high(x), high(y)));            \
        } else if constexpr (!(std::is_same_v<T1, E> &&                        \
                               std::is_same_v<T2, E>)) {                       \
            return name((const E &) x, (const E &) y);                         \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                             !(is_cuda_array_v<E> ||                           \
                               is_diff_array_v<E>)) {                          \
            E result;                                                          \
            result.resize_like(x, y);                                          \
            const size_t x_step = x.size() == 1 ? 0 : 1;                       \
            const size_t y_step = y.size() == 1 ? 0 : 1;                       \
            auto dst = result.packet_ptr();                                    \
            auto src_x = x.packet_ptr();                                       \
            auto src_y = y.packet_ptr();                                       \
            const size_t count = result.packets();                             \
            for (size_t k = 0; k < count;                                      \
                 ++k, dst += 1, src_x += x_step, src_y += y_step)              \
                *dst = name(*src_x, *src_y);                                   \
            return result;                                                     \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            assert(x.size() == y.size());                                      \
            E result;                                                          \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t k = 0; k < x.size(); ++k)                              \
                result.coeff(k) = name(x.coeff(k), y.coeff(k));                \
            return result;                                                     \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x, (const E &) y);          \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE Value enoki::detail::name##_impl(const Value &x, const Value &y)
|
|
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name Trigonometric functions and their inverses
|
|
// -----------------------------------------------------------------------
|
|
|
|
namespace detail {
|
|
template <bool Sin, bool Cos, typename Value>
|
|
ENOKI_INLINE void sincos_approx(const Value &x, Value &s_out, Value &c_out) {
|
|
using Scalar = scalar_t<Value>;
|
|
constexpr bool Single = std::is_same_v<Scalar, float>;
|
|
using IntArray = int_array_t<Value>;
|
|
using Int = scalar_t<IntArray>;
|
|
using Mask = mask_t<Value>;
|
|
ENOKI_MARK_USED(s_out);
|
|
ENOKI_MARK_USED(c_out);
|
|
|
|
/* Joint sine & cosine function approximation based on CEPHES.
|
|
Excellent accuracy in the domain |x| < 8192
|
|
|
|
Redistributed under a BSD license with permission of the author, see
|
|
https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt
|
|
|
|
- sin (in [-8192, 8192]):
|
|
* avg abs. err = 6.61896e-09
|
|
* avg rel. err = 1.37888e-08
|
|
-> in ULPs = 0.166492
|
|
* max abs. err = 5.96046e-08
|
|
(at x=-8191.31)
|
|
* max rel. err = 1.76826e-06
|
|
-> in ULPs = 19
|
|
(at x=-6374.29)
|
|
|
|
- cos (in [-8192, 8192]):
|
|
* avg abs. err = 6.59965e-09
|
|
* avg rel. err = 1.37432e-08
|
|
-> in ULPs = 0.166141
|
|
* max abs. err = 5.96046e-08
|
|
(at x=-8191.05)
|
|
* max rel. err = 3.13993e-06
|
|
-> in ULPs = 47
|
|
(at x=-6199.93)
|
|
*/
|
|
|
|
Value xa = abs(x);
|
|
|
|
/* Scale by 4/Pi and get the integer part */
|
|
IntArray j(xa * Scalar(1.2732395447351626862));
|
|
|
|
/* Map zeros to origin; if (j & 1) j += 1 */
|
|
j = (j + Int(1)) & Int(~1u);
|
|
|
|
/* Cast back to a floating point value */
|
|
Value y(j);
|
|
|
|
/* Determine sign of result */
|
|
Value sign_sin, sign_cos;
|
|
constexpr size_t Shift = sizeof(Scalar) * 8 - 3;
|
|
|
|
if constexpr (Sin)
|
|
sign_sin = reinterpret_array<Value>(sl<Shift>(j)) ^ x;
|
|
|
|
if constexpr (Cos)
|
|
sign_cos = reinterpret_array<Value>(sl<Shift>(~(j - Int(2))));
|
|
|
|
/* Extended precision modular arithmetic */
|
|
if constexpr (Single) {
|
|
y = xa - y * Scalar(0.78515625)
|
|
- y * Scalar(2.4187564849853515625e-4)
|
|
- y * Scalar(3.77489497744594108e-8);
|
|
} else {
|
|
y = xa - y * Scalar(7.85398125648498535156e-1)
|
|
- y * Scalar(3.77489470793079817668e-8)
|
|
- y * Scalar(2.69515142907905952645e-15);
|
|
}
|
|
|
|
Value z = y * y, s, c;
|
|
z |= eq(xa, std::numeric_limits<Scalar>::infinity());
|
|
|
|
if constexpr (Single) {
|
|
s = poly2(z, -1.6666654611e-1,
|
|
8.3321608736e-3,
|
|
-1.9515295891e-4) * z;
|
|
|
|
c = poly2(z, 4.166664568298827e-2,
|
|
-1.388731625493765e-3,
|
|
2.443315711809948e-5) * z;
|
|
} else {
|
|
s = poly5(z, -1.66666666666666307295e-1,
|
|
8.33333333332211858878e-3,
|
|
-1.98412698295895385996e-4,
|
|
2.75573136213857245213e-6,
|
|
-2.50507477628578072866e-8,
|
|
1.58962301576546568060e-10) * z;
|
|
|
|
c = poly5(z, 4.16666666666665929218e-2,
|
|
-1.38888888888730564116e-3,
|
|
2.48015872888517045348e-5,
|
|
-2.75573141792967388112e-7,
|
|
2.08757008419747316778e-9,
|
|
-1.13585365213876817300e-11) * z;
|
|
}
|
|
|
|
s = fmadd(s, y, y);
|
|
c = fmadd(c, z, fmadd(z, Scalar(-0.5), Scalar(1)));
|
|
|
|
Mask polymask(eq(j & Int(2), zero<IntArray>()));
|
|
|
|
if constexpr (Sin)
|
|
s_out = mulsign(select(polymask, s, c), sign_sin);
|
|
|
|
if constexpr (Cos)
|
|
c_out = mulsign(select(polymask, c, s), sign_cos);
|
|
}
|
|
|
|
template <bool Tan, typename Value>
|
|
ENOKI_INLINE auto tancot_approx(const Value &x) {
|
|
using Scalar = scalar_t<Value>;
|
|
constexpr bool Single = std::is_same_v<Scalar, float>;
|
|
using IntArray = int_array_t<Value>;
|
|
using Int = scalar_t<IntArray>;
|
|
|
|
/*
|
|
- tan (in [-8192, 8192]):
|
|
* avg abs. err = 4.63693e-06
|
|
* avg rel. err = 3.60191e-08
|
|
-> in ULPs = 0.435442
|
|
* max abs. err = 0.8125
|
|
(at x=-6199.93)
|
|
* max rel. err = 3.12284e-06
|
|
-> in ULPs = 30
|
|
(at x=-7406.3)
|
|
*/
|
|
|
|
Value xa = abs(x);
|
|
|
|
/* Scale by 4/Pi and get the integer part */
|
|
IntArray j(xa * Scalar(1.2732395447351626862));
|
|
|
|
/* Map zeros to origin; if (j & 1) j += 1 */
|
|
j = (j + Int(1)) & Int(~1u);
|
|
|
|
/* Cast back to a floating point value */
|
|
Value y(j);
|
|
|
|
/* Extended precision modular arithmetic */
|
|
if constexpr (Single) {
|
|
y = xa - y * Scalar(0.78515625)
|
|
- y * Scalar(2.4187564849853515625e-4)
|
|
- y * Scalar(3.77489497744594108e-8);
|
|
} else {
|
|
y = xa - y * Scalar(7.85398125648498535156e-1)
|
|
- y * Scalar(3.77489470793079817668e-8)
|
|
- y * Scalar(2.69515142907905952645e-15);
|
|
}
|
|
|
|
Value z = y * y;
|
|
z |= eq(xa, std::numeric_limits<Scalar>::infinity());
|
|
|
|
Value r;
|
|
if constexpr (Single) {
|
|
r = poly5(z, 3.33331568548e-1,
|
|
1.33387994085e-1,
|
|
5.34112807005e-2,
|
|
2.44301354525e-2,
|
|
3.11992232697e-3,
|
|
9.38540185543e-3);
|
|
} else {
|
|
r = poly2(z, -1.79565251976484877988e7,
|
|
1.15351664838587416140e6,
|
|
-1.30936939181383777646e4) /
|
|
poly4(z, -5.38695755929454629881e7,
|
|
2.50083801823357915839e7,
|
|
-1.32089234440210967447e6,
|
|
1.36812963470692954678e4,
|
|
1.00000000000000000000e0);
|
|
}
|
|
|
|
r = fmadd(r, z * y, y);
|
|
|
|
auto recip_mask = Tan ? neq(j & Int(2), Int(0)) :
|
|
eq(j & Int(2), Int(0));
|
|
r[xa < Scalar(1e-4)] = y;
|
|
r[recip_mask] = rcp(r);
|
|
|
|
Value sign = reinterpret_array<Value>(sl<sizeof(Scalar) * 8 - 2>(j)) ^ x;
|
|
|
|
return mulsign(r, sign);
|
|
}
|
|
}
|
|
|
|
/// Sine. Array arguments are evaluated by detail::sincos_approx with only
/// the sine output enabled; the non-array fallback is std::sin.
ENOKI_UNARY_OPERATION(sin, std::sin(x)) {
    Value result;
    // The cosine slot is never written for <true, false>, so the same
    // variable can be passed for both outputs
    detail::sincos_approx<true, false>(x, result, result);
    return result;
}
|
|
|
|
/// Cosine. Array arguments are evaluated by detail::sincos_approx with only
/// the cosine output enabled; the non-array fallback is std::cos.
ENOKI_UNARY_OPERATION(cos, std::cos(x)) {
    Value result;
    // The sine slot is never written for <false, true>, so the same
    // variable can be passed for both outputs
    detail::sincos_approx<false, true>(x, result, result);
    return result;
}
|
|
|
|
/// Simultaneous sine and cosine, returned as a (sin, cos) pair. Array
/// arguments share a single pass through detail::sincos_approx.
ENOKI_UNARY_OPERATION_PAIR(sincos, std::make_pair(std::sin(x), std::cos(x))) {
    Value sine, cosine;
    detail::sincos_approx<true, true>(x, sine, cosine);
    return std::pair<Value, Value>(sine, cosine);
}
|
|
|
|
/// Cosecant, computed as the reciprocal of the sine
template <typename T> auto csc(const T &a) {
    const auto sine = sin(a);
    return rcp(sine);
}
|
|
/// Secant, computed as the reciprocal of the cosine
template <typename T> auto sec(const T &a) {
    const auto cosine = cos(a);
    return rcp(cosine);
}
|
|
|
|
/// Tangent. Array arguments are evaluated by detail::tancot_approx; the
/// non-array fallback is std::tan.
ENOKI_UNARY_OPERATION(tan, std::tan(x)) {
    constexpr bool ComputeTan = true;
    return detail::tancot_approx<ComputeTan>(x);
}
|
|
|
|
/// Cotangent. Array arguments are evaluated by detail::tancot_approx; the
/// non-array fallback is 1 / std::tan.
ENOKI_UNARY_OPERATION(cot, 1 / std::tan(x)) {
    constexpr bool ComputeTan = false;
    return detail::tancot_approx<ComputeTan>(x);
}
|
|
|
|
/**
 * Arc sine function approximation based on CEPHES.
 *
 *  - asin (in [-1, 1]):
 *     * avg abs. err = 2.25422e-08
 *     * avg rel. err = 2.85777e-08  -> in ULPs = 0.331032
 *     * max abs. err = 1.19209e-07  (at x=-0.999998)
 *     * max rel. err = 2.27663e-07  -> in ULPs = 2  (at x=-0.841416)
 *
 * The approximation is evaluated on |x|; the sign of the input is copied
 * onto the result at the end.
 */
ENOKI_UNARY_OPERATION(asin, std::asin(x)) {
    Value x_abs = abs(x),
          x_sq  = sqr(x),
          result;

    if constexpr (Single) {
        Mask is_big = x_abs > Scalar(0.5);

        // Large |x|: polynomial in (1 - |x|) / 2, scaled by its square root.
        // Small |x|: polynomial in x^2, scaled by |x|.
        Value half_compl = Scalar(0.5) * (Scalar(1) - x_abs);
        Value poly_arg   = select(is_big, half_compl, x_sq);
        Value factor     = select(is_big, sqrt(half_compl), x_abs);

        Value approx = poly4(poly_arg, 1.6666752422e-1f,
                                       7.4953002686e-2f,
                                       4.5470025998e-2f,
                                       2.4181311049e-2f,
                                       4.2163199048e-2f);

        approx = fmadd(approx, poly_arg*factor, factor);

        result = select(is_big, Scalar(M_PI_2) - (approx + approx), approx);
    } else {
        constexpr bool IsCuda = is_cuda_array_v<Value>;
        Mask is_big = x_abs > Scalar(0.625);

        // Each range is only evaluated when some entry needs it
        // (both are always evaluated for CUDA arrays)
        if (IsCuda || any_nested(is_big)) {
            const Scalar pio4 = Scalar(0.78539816339744830962);
            const Scalar more_bits = Scalar(6.123233995736765886130e-17);

            /* arcsin(1-x) = pi/2 - sqrt(2x)(1+R(x)) */
            Value zz = Scalar(1) - x_abs;
            Value p = poly4(zz, 2.853665548261061424989e1,
                               -2.556901049652824852289e1,
                                6.968710824104713396794e0,
                               -5.634242780008963776856e-1,
                                2.967721961301243206100e-3) /
                      poly4(zz, 3.424398657913078477438e2,
                               -3.838770957603691357202e2,
                                1.470656354026814941758e2,
                               -2.194779531642920639778e1,
                                1.000000000000000000000e0) * zz;
            zz = sqrt(zz + zz);
            Value z = pio4 - zz;
            result[is_big] = z - fmsub(zz, p, more_bits) + pio4;
        }

        if (IsCuda || !all_nested(is_big)) {
            Value z = poly5(x_sq, -8.198089802484824371615e0,
                                   1.956261983317594739197e1,
                                  -1.626247967210700244449e1,
                                   5.444622390564711410273e0,
                                  -6.019598008014123785661e-1,
                                   4.253011369004428248960e-3) /
                      poly5(x_sq, -4.918853881490881290097e1,
                                   1.395105614657485689735e2,
                                  -1.471791292232726029859e2,
                                   7.049610280856842141659e1,
                                  -1.474091372988853791896e1,
                                   1.000000000000000000000e0) * x_sq;
            z = fmadd(x_abs, z, x_abs);

            // For tiny arguments, |x| itself is returned
            z = select(x_abs < Scalar(1e-8), x_abs, z);
            result[~is_big] = z;
        }
    }

    return copysign(result, x);
}
|
|
|
|
/**
 * Arc cosine function approximation based on CEPHES.
 *
 *  - acos (in [-1, 1]):
 *     * avg abs. err = 4.72002e-08
 *     * avg rel. err = 2.85612e-08  -> in ULPs = 0.33034
 *     * max abs. err = 2.38419e-07  (at x=-0.99999)
 *     * max rel. err = 1.19209e-07  -> in ULPs = 1  (at x=-0.99999)
 */
ENOKI_UNARY_OPERATION(acos, std::acos(x)) {
    if constexpr (Single) {
        Value x_abs = abs(x), x_sq = sqr(x);

        Mask is_big = x_abs > Scalar(0.5);

        // Large |x|: polynomial in (1 - |x|) / 2, scaled by its square root.
        // Small |x|: polynomial in x^2, scaled by |x|.
        Value half_compl = Scalar(0.5) * (Scalar(1) - x_abs);
        Value poly_arg   = select(is_big, half_compl, x_sq);
        Value factor     = select(is_big, sqrt(half_compl), x_abs);

        Value approx = poly4(poly_arg, 1.666675242e-1f,
                                       7.4953002686e-2f,
                                       4.5470025998e-2f,
                                       2.4181311049e-2f,
                                       4.2163199048e-2f);

        approx = fmadd(approx, poly_arg * factor, factor);

        // Result for |x| > 0.5: twice the approximation, reflected about
        // pi for negative inputs
        Value big_result = approx + approx;
        big_result = select(x < Scalar(0), Scalar(M_PI) - big_result, big_result);

        // Result for |x| <= 0.5: pi/2 minus the signed approximation
        Value small_result = Scalar(M_PI_2) - copysign(approx, x);
        return select(is_big, big_result, small_result);
    } else {
        const Scalar pio4 = Scalar(0.78539816339744830962);
        const Scalar more_bits = Scalar(6.123233995736765886130e-17);
        const Scalar half = Scalar(0.5);

        Mask above_half = x > half;

        // x > 0.5:   2 * asin(sqrt((1 - x) / 2))
        // otherwise: pi/4 - asin(x) + more_bits + pi/4
        Value y = asin(select(above_half, sqrt(fnmadd(half, x, half)), x));
        return select(above_half, y + y, pio4 - y + more_bits + pio4);
    }
}
|
|
|
|
/**
 * Two-argument arc tangent; as with std::atan2(x, y), the first argument is
 * the ordinate and the second the abscissa.
 *
 * MiniMax fit by Wenzel Jakob, May 2016
 *
 *  - atan2() tested via atan() (in [-1, 1]):
 *     * avg abs. err = 1.81543e-07
 *     * avg rel. err = 4.15224e-07  -> in ULPs = 4.9197
 *     * max abs. err = 5.96046e-07  (at x=-0.976062)
 *     * max rel. err = 7.73931e-07  -> in ULPs = 12  (at x=-0.015445)
 */
ENOKI_BINARY_OPERATION(atan2, std::atan2(x, y)) {
    Value horiz    = y,
          vert     = x,
          abs_h    = abs(horiz),
          abs_v    = abs(vert),
          smaller  = min(abs_v, abs_h),
          larger   = max(abs_h, abs_v),
          scale    = Scalar(1) / larger,
          ratio    = smaller * scale,
          ratio_sq = ratio * ratio;

    // How to find these:
    // f[x_] = MiniMaxApproximation[ArcTan[Sqrt[x]]/Sqrt[x],
    //         {x, {1/10000, 1}, 6, 0}, WorkingPrecision->20][[2, 1]]

    Value angle;
    if constexpr (Single) {
        angle = poly6(ratio_sq, 0.99999934166683966009,
                               -0.33326497518773606976,
                               +0.19881342388439013552,
                               -0.13486708938456973185,
                               +0.083863120428809689910,
                               -0.037006525670417265220,
                                0.0078613793713198150252);
    } else {
        angle = poly6(ratio_sq, 9.9999999999999999419e-1,
                                2.50554429737833465113e0,
                                2.28289058385464073556e0,
                                9.20960512187107069075e-1,
                                1.59189681028889623410e-1,
                                9.35911604785115940726e-3,
                                8.07005540507283419124e-5) /
                poly6(ratio_sq, 1.00000000000000000000e0,
                                2.83887763071166519407e0,
                                3.02918312742541450749e0,
                                1.50576983803701596773e0,
                                3.49719171130492192607e-1,
                                3.29968942624402204199e-2,
                                8.26619391703564168942e-4);
    }

    angle = angle * ratio;

    // Undo the reduction to a ratio in [0, 1]: first the swap of the two
    // magnitudes, then the sign of the abscissa, then the sign of the
    // ordinate
    angle = select(abs_v > abs_h, Scalar(M_PI_2) - angle, angle);
    angle = select(horiz < zero<Value>(), Scalar(M_PI) - angle, angle);
    Value result = select(vert < zero<Value>(), -angle, angle);

    // Clear the result where both inputs are zero
    result &= neq(larger, Scalar(0));
    return result;
}
|
|
|
|
/// Arc tangent, evaluated as atan2(x, 1); the non-array fallback is
/// std::atan.
ENOKI_UNARY_OPERATION(atan, std::atan(x)) {
    const Scalar one(1);
    return atan2(x, one);
}
|
|
|
|
//! @}
|
|
// -----------------------------------------------------------------------
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name Exponential function, logarithm, power
|
|
// -----------------------------------------------------------------------
|
|
|
|
/// Multiplies \c x by two raised to the power \c y. The factor is assembled
/// directly as a bit pattern: \c y is converted to an integer, the exponent
/// bias is added, and the sum is shifted into the exponent field.
ENOKI_BINARY_OPERATION(ldexp, detail::ldexp_scalar(x, y)) {
    using IntArray = int_array_t<Value>;
    constexpr size_t MantissaBits = Single ? 23 : 52;

    return x * reinterpret_array<Value>(
        sl<MantissaBits>(IntArray(y) + (Single ? 0x7f : 0x3ff)));
}
|
|
|
|
/**
 * Splits \c x into a (mantissa, exponent) pair using bit manipulation.
 *
 * For inputs that are neither zero nor inf/NaN, the mantissa is \c x with
 * its exponent field replaced by that of 0.5, and the exponent is the
 * unbiased value of the original exponent field. No +1 is applied, so
 * x == mantissa * 2^(exponent + 1).
 *
 * For zero, infinite and NaN inputs, the mantissa is \c x itself and the
 * exponent is zero.
 */
ENOKI_UNARY_OPERATION_PAIR(frexp, detail::frexp_scalar(x)) {
    using IntArray = int_array_t<Value>;
    using Int      = scalar_t<IntArray>;
    using IntMask  = mask_t<IntArray>;

    const IntArray
        exp_field(Int(Single ? 0x7f800000ull : 0x7ff0000000000000ull)),
        non_exp_field(Int(Single ? ~0x7f800000ull : ~0x7ff0000000000000ull)),
        exp_bias(Int(Single ? 0x7f : 0x3ff));

    IntArray bits = reinterpret_array<IntArray>(x);
    IntArray exp_bits = bits & exp_field;

    /* Detect zero/inf/NaN */
    IntMask regular =
        IntMask(neq(x, zero<Value>())) &
        neq(exp_bits, exp_field);

    // Unbiased exponent
    IntArray exp_value = (sr<Single ? 23 : 52>(exp_bits)) - exp_bias;

    // Keep sign and mantissa bits, substitute the exponent field of 0.5
    IntArray mant_bits = (bits & non_exp_field) |
                         IntArray(memcpy_cast<Int>(Scalar(.5f)));

    return std::make_pair(
        reinterpret_array<Value>(select(regular, mant_bits, bits)),
        Value(exp_value & regular)
    );
}
|
|
|
|
/**
 * Exponential function approximation based on CEPHES
 *
 * Redistributed under a BSD license with permission of the author, see
 * https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt
 *
 *  - exp (in [-20, 30]):
 *     * avg abs. err = 7155.01
 *     * avg rel. err = 2.35929e-08  -> in ULPs = 0.273524
 *     * max abs. err = 1.04858e+06  (at x=29.8057)
 *     * max rel. err = 1.192e-07    -> in ULPs = 1  (at x=-19.9999)
 *
 * Inputs above the supported range produce infinity and inputs below it
 * produce zero.
 */
ENOKI_UNARY_OPERATION(exp, std::exp(x)) {
    const Scalar inf = std::numeric_limits<Scalar>::infinity();
    const Scalar max_range = Scalar(Single ? +88.3762588501 : +7.0943613930310391424428e2);
    const Scalar min_range = Scalar(Single ? -88.3762588501 : -7.0943613930310391424428e2);

    Mask too_large = x > max_range,
         too_small = x < min_range;

    /* Express e^x = e^g 2^n
                   = e^g e^(n loge(2))
                   = e^(g + n loge(2))
    */
    Value n = floor(fmadd(Scalar(1.4426950408889634073599), x, Scalar(0.5)));

    // g = x - n * loge(2), with the constant split into two parts
    Value g = x;
    if constexpr (Single) {
        g = fnmadd(n, Scalar(0.693359375), g);
        g = fnmadd(n, Scalar(-2.12194440e-4), g);
    } else {
        g = fnmadd(n, Scalar(6.93145751953125e-1), g);
        g = fnmadd(n, Scalar(1.42860682030941723212e-6), g);
    }

    Value z = sqr(g);

    if constexpr (Single) {
        // e^g = 1 + g + g^2 * P(g)
        z = poly5(g, 5.0000001201e-1, 1.6666665459e-1,
                     4.1665795894e-2, 8.3334519073e-3,
                     1.3981999507e-3, 1.9875691500e-4);
        z = fmadd(z, g * g, g + Scalar(1));
    } else {
        /* Rational approximation for exponential
           of the fractional part:
              e^x = 1 + 2x P(x^2) / (Q(x^2) - P(x^2))
         */
        Value p = poly2(z, 9.99999999999999999910e-1,
                           3.02994407707441961300e-2,
                           1.26177193074810590878e-4) * g;

        Value q = poly3(z, 2.00000000000000000009e0,
                           2.27265548208155028766e-1,
                           2.52448340349684104192e-3,
                           3.00198505138664455042e-6);

        Value pq = p / (q-p);
        z = pq + pq + Scalar(1);
    }

    // Scale by 2^n, then apply the range clamps
    return select(too_large, inf,
                  select(too_small, zero<Value>(), ldexp(z, n)));
}
|
|
|
|
/**
 * Logarithm function approximation based on CEPHES
 *
 * Redistributed under a BSD license with permission of the author, see
 * https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt
 *
 *  - log (in [1e-20, 1000]):
 *     * avg abs. err = 8.8672e-09
 *     * avg rel. err = 1.57541e-09  -> in ULPs = 0.020038
 *     * max abs. err = 4.76837e-07  (at x=54.7661)
 *     * max rel. err = 1.19194e-07  -> in ULPs = 1  (at x=0.021)
 *
 * Special cases: +inf maps to +inf, zero maps to -inf, and every entry
 * that fails the test x >= 0 (negative values, NaN) has all bits of its
 * result set.
 */
ENOKI_UNARY_OPERATION(log, std::log(x)) {
    using UInt = scalar_t<int_array_t<Value>>;

    /* Catch negative and NaN values */
    Mask valid_mask = x >= Scalar(0);
    Value input = x, xm;

    /* The frexp in array_base.h does not handle denormalized numbers,
       cut them off. The AVX512 backend does support them, however. */
    if constexpr (!has_avx512f) {
        // Bit pattern of the smallest normalized value
        Scalar limit = memcpy_cast<Scalar>(
            UInt(Single ? 0x00800000u : 0x0010000000000000ull));
        xm = max(x, limit);
    } else {
        xm = x;
    }

    // Decompose the *clamped* value. Passing 'x' here instead would
    // discard the denormal cutoff computed above.
    Value e;
    std::tie(xm, e) = frexp(xm);

    const Scalar sqrt_half = Scalar(0.70710678118654752440);
    Mask mask_e_big = abs(e) > Scalar(2);
    Mask mask_ge_inv_sqrt2 = xm >= sqrt_half;
    ENOKI_MARK_USED(mask_e_big); // only read by the double precision branch

    e[mask_ge_inv_sqrt2] += Scalar(1);

    Value r;
    if constexpr (Single) {
        // Mantissas below sqrt(1/2) are doubled; then 1 is subtracted
        xm += (xm & ~mask_ge_inv_sqrt2) - Scalar(1);

        Value z = xm * xm;
        Value y = poly8(xm, 3.3333331174e-1, -2.4999993993e-1,
                            2.0000714765e-1, -1.6668057665e-1,
                            1.4249322787e-1, -1.2420140846e-1,
                            1.1676998740e-1, -1.1514610310e-1,
                            7.0376836292e-2);

        y *= xm * z;

        // Add e * loge(2), with the constant split into two parts
        y = fmadd(e, Scalar(-2.12194440e-4), y);
        z = fmadd(z, Scalar(-0.5), xm + y);
        r = fmadd(e, Scalar(0.693359375), z);
    } else {
        constexpr bool IsCuda = is_cuda_array_v<Value>;
        const Scalar half = Scalar(0.5);
        Value r_big, r_small;

        // Each variant is only evaluated when some entry needs it
        // (both are always evaluated for CUDA arrays)
        if (IsCuda || any_nested(mask_e_big)) {
            /* logarithm using log(x) = z + z**3 P(z)/Q(z), where z = 2(x-1)/(x+1) */
            Value z = xm - half;

            z[mask_ge_inv_sqrt2] -= half;

            Value y = half * select(mask_ge_inv_sqrt2, xm, z) + half;
            Value x2 = z / y;

            z = x2 * x2;
            z = x2 * (z * poly2(z, -6.41409952958715622951e1,
                                    1.63866645699558079767e1,
                                   -7.89580278884799154124e-1) /
                          poly3(z, -7.69691943550460008604e2,
                                    3.12093766372244180303e2,
                                   -3.56722798256324312549e1,
                                    1.00000000000000000000e0));

            r_big = fnmadd(e, Scalar(2.121944400546905827679e-4), z) + x2;
        }

        if (IsCuda || !all_nested(mask_e_big)) {
            /* logarithm using log(1+x) = x - .5x**2 + x**3 P(x)/Q(x) */
            Value x2 = select(mask_ge_inv_sqrt2, xm, xm + xm) - Scalar(1);

            Value z = x2*x2;
            Value y = x2 * (z * poly5(x2, 7.70838733755885391666e0,
                                          1.79368678507819816313e1,
                                          1.44989225341610930846e1,
                                          4.70579119878881725854e0,
                                          4.97494994976747001425e-1,
                                          1.01875663804580931796e-4) /
                                poly5(x2, 2.31251620126765340583e1,
                                          7.11544750618563894466e1,
                                          8.29875266912776603211e1,
                                          4.52279145837532221105e1,
                                          1.12873587189167450590e1,
                                          1.00000000000000000000e0));

            y = fnmadd(e, Scalar(2.121944400546905827679e-4), y);

            r_small = x2 + fnmadd(half, z, y);
        }

        r = select(mask_e_big, r_big, r_small);
        r = fmadd(e, Scalar(0.693359375), r);
    }

    /* Handle a few special cases */
    const Scalar n_inf(-std::numeric_limits<Scalar>::infinity());
    const Scalar p_inf(std::numeric_limits<Scalar>::infinity());

    r[eq(input, p_inf)] = p_inf;
    r[eq(input, Scalar(0))] = n_inf;

    return r | ~valid_mask;
}
|
|
|
|
/**
 * Cubic root approximation based on CEPHES
 *
 * Redistributed under a BSD license with permission of the author, see
 * https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt
 *
 *  - cbrt (in [-10, 10]):
 *     * avg abs. err = 2.91027e-17
 *     * avg rel. err = 1.79292e-17  -> in ULPs = 0.118351
 *     * max abs. err = 4.44089e-16  (at x=-9.99994)
 *     * max rel. err = 2.22044e-16  -> in ULPs = 1  (at x=-9.99994)
 *
 * Non-finite inputs are returned unchanged.
 */
ENOKI_UNARY_OPERATION(cbrt, std::cbrt(x)) {
    const Scalar CBRT2 = Scalar(1.25992104989487316477),
                 CBRT4 = Scalar(1.58740105196819947475),
                 THIRD = Scalar(1.0 / 3.0);

    Value x_abs = abs(x);

    // Mantissa / exponent decomposition of |x|; the exponent is
    // incremented by one before it is used
    auto [mant, expo] = frexp(x_abs);
    expo += Scalar(1);

    // Split |exponent| into a multiple of three plus a remainder
    Value expo_abs   = abs(expo),
          expo_third = floor(expo_abs * THIRD),
          rem        = fnmadd(expo_third, Scalar(3), expo_abs);

    /* Approximate cube root of number between .5 and 1,
       peak relative error = 9.2e-6 */
    mant = poly4(mant, 0.40238979564544752126924,
                       1.1399983354717293273738,
                      -0.95438224771509446525043,
                       0.54664601366395524503440,
                      -0.13466110473359520655053);

    // Correction factor for the exponent remainder: cbrt(2) or cbrt(4),
    // inverted when the exponent is negative
    Value f1 = select(expo >= Scalar(0), Value(CBRT2), Value(Scalar(1) / CBRT2)),
          f2 = select(expo >= Scalar(0), Value(CBRT4), Value(Scalar(1) / CBRT4)),
          f  = select(eq(rem, 1.f), f1, f2);

    mant[neq(rem, 0.f)] *= f;

    // Re-attach a third of the exponent, then the sign of the input
    Value root = ldexp(mant, mulsign(expo_third, expo));
    root = mulsign(root, x);

    // Newton iteration
    root -= (root - (x / sqr(root))) * THIRD;

    // A second iteration for anything other than single precision
    if constexpr (!Single)
        root -= (root - (x / sqr(root))) * THIRD;

    return select(isfinite(x), root, x);
}
|
|
|
|
/// General power function, evaluated as exp(log(x) * y); the non-array
/// fallback is std::pow.
ENOKI_BINARY_OPERATION(pow, std::pow(x, y)) {
    const auto log_x = log(x);
    return exp(log_x * y);
}
|
|
|
|
template <typename T, typename E = expr_t<T>>
|
|
ENOKI_INLINE E pow(const T &x_, const int &y) {
|
|
int n = std::abs(y);
|
|
E result(1.f), x(x_);
|
|
|
|
while (n > 0) {
|
|
if (n & 1)
|
|
result *= x;
|
|
x *= x;
|
|
n /= 2;
|
|
}
|
|
|
|
return (y >= 0) ? result : rcp(result);
|
|
}
|
|
|
|
template <typename T, typename E = expr_t<T, float>,
|
|
enable_if_t<is_array_v<T>> = 0>
|
|
ENOKI_INLINE E pow(const T &x, const float &y) {
|
|
if (enoki::round(y) == y)
|
|
return enoki::pow(E(x), (int) y);
|
|
else
|
|
return enoki::pow(E(x), E(y));
|
|
}
|
|
|
|
template <typename T, typename E = expr_t<T, double>,
|
|
enable_if_t<is_array_v<T>> = 0>
|
|
ENOKI_INLINE E pow(const T &x, const double &y) {
|
|
if (enoki::round(y) == y)
|
|
return enoki::pow(E(x), (int) y);
|
|
else
|
|
return enoki::pow(E(x), E(y));
|
|
}
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name Hyperbolic and inverse hyperbolic functions
|
|
// -----------------------------------------------------------------------
|
|
|
|
ENOKI_UNARY_OPERATION(sinh, std::sinh(x)) {
    /*
       Reported accuracy of this implementation:

       - sinh (in [-10, 10]):
         * avg abs. err = 2.92524e-05
         * avg rel. err = 2.80831e-08
            -> in ULPs  = 0.336485
         * max abs. err = 0.00195312
           (at x=-9.99894)
         * max rel. err = 2.36862e-07
           -> in ULPs   = 3
           (at x=-9.69866)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value result_large, result_small;

    // Entries with |x| > 1 use the exponential form, the others a polynomial
    Mask is_large = abs(x) > Scalar(1);

    // Each branch is skipped when no entry needs it (never skipped for CUDA arrays)
    if (IsCuda || any_nested(is_large)) {
        // sinh(x) = (e^x - e^-x) / 2, with e^-x obtained as rcp(e^x)
        Value e_pos = exp(x);
        Value e_neg = rcp(e_pos);

        result_large = (e_pos - e_neg) * Scalar(0.5);
    }

    if (IsCuda || !all_nested(is_large)) {
        Value x_sq = x * x;
        auto x_cu = x_sq * x;

        if constexpr (Single) {
            // x + x^3 * P(x^2)
            auto p = poly2(x_sq, 1.66667160211e-1,
                                 8.33028376239e-3,
                                 2.03721912945e-4);

            result_small = fmadd(p, x_cu, x);
        } else {
            // x + x^3 * P(x^2) / Q(x^2)
            auto num = poly3(x_sq, -3.51754964808151394800e5,
                                   -1.15614435765005216044e4,
                                   -1.63725857525983828727e2,
                                   -7.89474443963537015605e-1);

            auto den = poly3(x_sq, -2.11052978884890840399e6,
                                    3.61578279834431989373e4,
                                   -2.77711081420602794433e2,
                                    1.00000000000000000000e0);

            result_small = fmadd(num / den, x_cu, x);
        }
    }

    return select(is_large, result_large, result_small);
}
|
|
|
|
ENOKI_UNARY_OPERATION(cosh, std::cosh(x)) {
    /*
       Reported accuracy of this implementation:

       - cosh (in [-10, 10]):
         * avg abs. err = 4.17738e-05
         * avg rel. err = 3.15608e-08
            -> in ULPs  = 0.376252
         * max abs. err = 0.00195312
           (at x=-9.99894)
         * max rel. err = 2.38001e-07
           -> in ULPs   = 3
           (at x=-9.70164)
    */

    // cosh(x) = (e^x + e^-x) / 2, with e^-x obtained as rcp(e^x)
    Value e_pos = exp(x);
    Value e_neg = rcp(e_pos);

    return (e_pos + e_neg) * Scalar(.5f);
}
|
|
|
|
ENOKI_UNARY_OPERATION_PAIR(sincosh, std::make_pair(std::sinh(x), std::cosh(x))) {
    /*
       Joint evaluation of sinh(x) and cosh(x) sharing a single exp() call.
       Reported accuracy:

       - sinh (in [-10, 10]):
         * avg abs. err = 2.92524e-05
         * avg rel. err = 2.80831e-08
            -> in ULPs  = 0.336485
         * max abs. err = 0.00195312
           (at x=-9.99894)
         * max rel. err = 2.36862e-07
           -> in ULPs   = 3
           (at x=-9.69866)

       - cosh (in [-10, 10]):
         * avg abs. err = 4.17738e-05
         * avg rel. err = 3.15608e-08
            -> in ULPs  = 0.376252
         * max abs. err = 0.00195312
           (at x=-9.99894)
         * max rel. err = 2.38001e-07
           -> in ULPs   = 3
           (at x=-9.70164)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    const Scalar half = Scalar(0.5);

    // The exponentials are always needed (cosh uses them for every entry)
    Value e_pos = exp(x);
    Value e_neg = rcp(e_pos);

    // sinh for |x| > 1: (e^x - e^-x) / 2
    Value sinh_large = (e_pos - e_neg) * half;
    Value sinh_small;

    Mask is_large = abs(x) > Scalar(1);

    // Polynomial sinh is only evaluated when some entry has |x| <= 1
    // (always evaluated for CUDA arrays)
    if (IsCuda || !all_nested(is_large)) {
        Value x_sq = x * x;
        auto x_cu = x_sq * x;

        if constexpr (Single) {
            // x + x^3 * P(x^2)
            auto p = poly2(x_sq, 1.66667160211e-1,
                                 8.33028376239e-3,
                                 2.03721912945e-4);

            sinh_small = fmadd(p, x_cu, x);
        } else {
            // x + x^3 * P(x^2) / Q(x^2)
            auto num = poly3(x_sq, -3.51754964808151394800e5,
                                   -1.15614435765005216044e4,
                                   -1.63725857525983828727e2,
                                   -7.89474443963537015605e-1);

            auto den = poly3(x_sq, -2.11052978884890840399e6,
                                    3.61578279834431989373e4,
                                   -2.77711081420602794433e2,
                                    1.00000000000000000000e0);

            sinh_small = fmadd(num / den, x_cu, x);
        }
    }

    return std::make_pair(
        select(is_large, sinh_large, sinh_small),
        half * (e_pos + e_neg)
    );
}
|
|
|
|
ENOKI_UNARY_OPERATION(tanh, std::tanh(x)) {
    /*
       Hyperbolic tangent function approximation based on CEPHES.

       - tanh (in [-10, 10]):
         * avg abs. err = 4.44655e-08
         * avg rel. err = 4.58074e-08
            -> in ULPs  = 0.698044
         * max abs. err = 3.57628e-07
           (at x=-2.12867)
         * max rel. err = 4.1006e-07
           -> in ULPs   = 6
           (at x=-2.12867)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value result_large, result_small;

    // Entries with |x| >= 0.625 use the exponential form
    Mask is_large = abs(x) >= Scalar(0.625);

    // Each branch is skipped when no entry needs it (never skipped for CUDA arrays)
    if (IsCuda || !all_nested(is_large)) {
        Value x_sq = x * x;
        Value p;

        if constexpr (Single) {
            p = poly4(x_sq, -3.33332819422e-1,
                             1.33314422036e-1,
                            -5.37397155531e-2,
                             2.06390887954e-2,
                            -5.70498872745e-3);
        } else {
            p = poly2(x_sq, -1.61468768441708447952e3,
                            -9.92877231001918586564e1,
                            -9.64399179425052238628e-1) /
                poly3(x_sq,  4.84406305325125486048e3,
                             2.23548839060100448583e3,
                             1.12811678491632931402e2,
                             1.00000000000000000000e0);
        }

        // x + x^3 * p
        result_small = fmadd(p, x_sq * x, x);
    }

    if (IsCuda || any_nested(is_large)) {
        // tanh(x) = 1 - 2 / (e^(2x) + 1)
        Value e_2x = exp(x + x);
        Value inv  = rcp(e_2x + Scalar(1));

        result_large = Scalar(1) - (inv + inv);
    }

    return select(is_large, result_large, result_small);
}
|
|
|
|
/// Hyperbolic cosecant, evaluated as rcp(sinh(a))
template <typename T> auto csch(const T &a) {
    auto sinh_a = sinh(a);
    return rcp(sinh_a);
}
|
|
/// Hyperbolic secant, evaluated as rcp(cosh(a))
template <typename T> auto sech(const T &a) {
    auto cosh_a = cosh(a);
    return rcp(cosh_a);
}
|
|
/// Hyperbolic cotangent, evaluated as rcp(tanh(a))
template <typename T> auto coth(const T &a) {
    auto tanh_a = tanh(a);
    return rcp(tanh_a);
}
|
|
|
|
ENOKI_UNARY_OPERATION(asinh, std::asinh(x)) {
    /*
       Hyperbolic arc sine function approximation based on CEPHES.

       - asinh (in [-10, 10]):
         * avg abs. err = 2.75626e-08
         * avg rel. err = 1.51762e-08
            -> in ULPs  = 0.178341
         * max abs. err = 2.38419e-07
           (at x=-10)
         * max rel. err = 1.71857e-07
           -> in ULPs   = 2
           (at x=-1.17457)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value x_sq  = x * x;
    Value abs_x = abs(x);
    Value result_large, result_small;

    // Thresholds separating the polynomial, logarithmic and "huge" regimes
    Mask is_large = abs_x >= Scalar(Single ? 0.51 : 0.533);
    Mask is_huge  = abs_x >= Scalar(Single ? 1e10 : 1e20);

    // Each branch is skipped when no entry needs it (never skipped for CUDA arrays)
    if (IsCuda || !all_nested(is_large)) {
        Value p;

        if constexpr (Single) {
            p = poly3(x_sq, -1.6666288134e-1,
                             7.4847586088e-2,
                            -4.2699340972e-2,
                             2.0122003309e-2);
        } else {
            p = poly4(x_sq, -5.56682227230859640450e0,
                            -9.09030533308377316566e0,
                            -4.37390226194356683570e0,
                            -5.91750212056387121207e-1,
                            -4.33231683752342103572e-3) /
                poly4(x_sq,  3.34009336338516356383e1,
                             6.95722521337257608734e1,
                             4.86042483805291788324e1,
                             1.28757002067426453537e1,
                             1.00000000000000000000e0);
        }

        // x + x^3 * p
        result_small = fmadd(p, x_sq * x, x);
    }

    if (IsCuda || any_nested(is_large)) {
        /* log(|x| + sqrt(x^2 + 1)). For huge entries the square root term
           is masked out and log(2) is added instead, i.e. log(2 |x|) is
           computed as log(|x|) + log(2). */
        auto root = sqrt(x_sq + Scalar(1)) & ~is_huge;

        result_large = log(abs_x + root);
        result_large[is_huge] += Scalar(M_LN2);

        // Restore the sign of the argument
        result_large = copysign(result_large, x);
    }

    return select(is_large, result_large, result_small);
}
|
|
|
|
ENOKI_UNARY_OPERATION(acosh, std::acosh(x)) {
    /*
       Hyperbolic arc cosine function approximation based on CEPHES.

       - acosh (in [-10, 10]):
         * avg abs. err = 2.8897e-08
         * avg rel. err = 1.49658e-08
            -> in ULPs  = 0.175817
         * max abs. err = 2.38419e-07
           (at x=3.76221)
         * max rel. err = 2.35024e-07
           -> in ULPs   = 3
           (at x=1.02974)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    // The small-argument approximation is expressed in terms of x - 1
    Value x_m1 = x - Scalar(1);
    Value result_large, result_small;

    Mask is_large = x_m1 >= Scalar(0.49);
    Mask is_huge  = x_m1 >= Scalar(1e10);

    // Each branch is skipped when no entry needs it (never skipped for CUDA arrays)
    if (IsCuda || !all_nested(is_large)) {
        if constexpr (Single) {
            result_small = poly4(x_m1,  1.4142135263e+0,
                                       -1.1784741703e-1,
                                        2.6454905019e-2,
                                       -7.5272886713e-3,
                                        1.7596881071e-3);
        } else {
            result_small = poly4(x_m1, 1.10855947270161294369E5,
                                       1.08102874834699867335E5,
                                       3.43989375926195455866E4,
                                       3.94726656571334401102E3,
                                       1.18801130533544501356E2) /
                           poly5(x_m1, 7.83869920495893927727E4,
                                       8.29725251988426222434E4,
                                       2.97683430363289370382E4,
                                       4.15352677227719831579E3,
                                       1.86145380837903397292E2,
                                       1.00000000000000000000E0);
        }

        result_small *= sqrt(x_m1);

        // OR the (x < 1) mask into the result; presumably this turns
        // out-of-domain entries into NaN (all bits set)
        result_small |= x_m1 < zero<Value>();
    }

    if (IsCuda || any_nested(is_large)) {
        /* log(x + sqrt(x^2 - 1)). For huge entries the square root term is
           masked out and log(2) is added instead, i.e. log(2 x) is computed
           as log(x) + log(2). */
        auto root = sqrt(fmsub(x, x, Scalar(1))) & ~is_huge;

        result_large = log(x + root);
        result_large[is_huge] += Scalar(M_LN2);
    }

    return select(is_large, result_large, result_small);
}
|
|
|
|
ENOKI_UNARY_OPERATION(atanh, std::atanh(x)) {
    /*
       Hyperbolic arc tangent function approximation based on CEPHES.

       - atanh (this header previously read "acosh (in [-10, 10])", which
         looks like a copy/paste slip; the sampled interval is not
         restated here -- TODO confirm):
         * avg abs. err = 9.87529e-09
         * avg rel. err = 1.52741e-08
            -> in ULPs  = 0.183879
         * max abs. err = 2.38419e-07
           (at x=-0.998962)
         * max rel. err = 1.19209e-07
           -> in ULPs   = 1
           (at x=-0.998962)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value abs_x = abs(x);
    Value result_large, result_small;

    // Entries with |x| >= 0.5 use the logarithmic form
    Mask is_large = abs_x >= Scalar(0.5);

    // Each branch is skipped when no entry needs it (never skipped for CUDA arrays)
    if (IsCuda || !all_nested(is_large)) {
        Value x_sq = x * x;
        Value p;

        if constexpr (Single) {
            p = poly4(x_sq, 3.33337300303e-1,
                            1.99782164500e-1,
                            1.46691431730e-1,
                            8.24370301058e-2,
                            1.81740078349e-1);
        } else {
            p = poly4(x_sq, -3.09092539379866942570e1,
                             6.54566728676544377376e1,
                            -4.61252884198732692637e1,
                             1.20426861384072379242e1,
                            -8.54074331929669305196e-1) /
                poly5(x_sq, -9.27277618139601130017e1,
                             2.52006675691344555838e2,
                            -2.49839401325893582852e2,
                             1.08938092147140262656e2,
                            -1.95638849376911654834e1,
                             1.00000000000000000000e0);
        }

        // x + x^3 * p
        result_small = fmadd(p, x_sq * x, x);
    }

    if (IsCuda || any_nested(is_large)) {
        // atanh(|x|) = log((1 + |x|) / (1 - |x|)) / 2, then restore the sign
        Value ratio = (Scalar(1) + abs_x) / (Scalar(1) - abs_x);

        result_large = log(ratio) * Scalar(0.5);
        result_large = copysign(result_large, x);
    }

    return select(is_large, result_large, result_small);
}
|
|
|
|
/// Linear interpolation between 'a' and 'b' with weight 't',
/// evaluated as (a - a*t) + b*t using two fused operations
template <typename Value1, typename Value2, typename Value3>
auto lerp(const Value1 &a, const Value2 &b, const Value3 &t) {
    // Contribution of 'a': a - a*t
    auto a_term = fnmadd(a, t, a);

    // Add the contribution of 'b': b*t
    return fmadd(b, t, a_term);
}
|
|
|
|
/// Clamp the value 'value' to the range [min, max]
|
|
template <typename Value1, typename Value2, typename Value3>
|
|
auto clamp(const Value1 &value, const Value2 &min, const Value3 &max) {
|
|
return enoki::max(enoki::min(value, max), min);
|
|
}
|
|
|
|
/// Compute the hypotenuse of 'a' and 'b', while avoiding under/overflow
|
|
template <typename T1, typename T2>
|
|
ENOKI_INLINE auto hypot(const T1 &a, const T2 &b) {
|
|
auto abs_a = abs(a);
|
|
auto abs_b = abs(b);
|
|
auto maxval = max(abs_a, abs_b),
|
|
minval = min(abs_a, abs_b),
|
|
ratio = minval / maxval;
|
|
|
|
using Scalar = scalar_t<decltype(ratio)>;
|
|
const Scalar inf = std::numeric_limits<Scalar>::infinity();
|
|
|
|
return select(
|
|
(abs_a < inf) && (abs_b < inf) && (ratio < inf),
|
|
maxval * sqrt(Scalar(1) + sqr(ratio)),
|
|
abs_a + abs_b
|
|
);
|
|
}
|
|
|
|
ENOKI_BINARY_OPERATION(fmod, std::fmod(x, y)) {
    // Remainder x - trunc(x / y) * y, with the final step fused
    auto whole_part = trunc(x / y);
    return fnmadd(whole_part, y, x);
}
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name "Safe" functions that avoid domain errors due to rounding
|
|
// -----------------------------------------------------------------------
|
|
|
|
/// Square root of max(a, 0): negative inputs are raised to zero first
template <typename T> ENOKI_INLINE auto safe_sqrt(const T &a) {
    auto non_negative = max(a, zero<T>());
    return sqrt(non_negative);
}
|
|
|
|
/// rsqrt() of max(a, 0): negative inputs are raised to zero first
template <typename T> ENOKI_INLINE auto safe_rsqrt(const T &a) {
    auto non_negative = max(a, zero<T>());
    return rsqrt(non_negative);
}
|
|
|
|
/// Arc sine of the input after restricting it to [-1, 1]
template <typename T> ENOKI_INLINE auto safe_asin(const T &a) {
    auto bounded_below = max(T(-1), a);
    auto bounded       = min(T(1), bounded_below);
    return asin(bounded);
}
|
|
|
|
/// Arc cosine of the input after restricting it to [-1, 1]
template <typename T> ENOKI_INLINE auto safe_acos(const T &a) {
    auto bounded_below = max(T(-1), a);
    auto bounded       = min(T(1), bounded_below);
    return acos(bounded);
}
|
|
|
|
/**
|
|
* \brief Numerically well-behaved routine for computing the angle
|
|
* between two unit direction vectors
|
|
*
|
|
* This should be used wherever one is tempted to compute the
|
|
* arc cosine of a dot product.
|
|
*
|
|
* By Don Hatch at http://www.plunk.org/~hatch/rightway.php
|
|
*/
|
|
template <typename T, typename Expr = expr_t<value_t<T>>>
|
|
Expr unit_angle(const T &a, const T &b) {
|
|
Expr dot_uv = dot(a, b),
|
|
temp = 2.f * asin(.5f * norm(b - mulsign(a, dot_uv)));
|
|
return select(dot_uv >= 0, temp, scalar_t<Expr>(M_PI) - temp);
|
|
}
|
|
|
|
/**
|
|
* \brief Numerically well-behaved routine for computing the angle
|
|
* between the unit direction vector 'v' and the z-axis
|
|
*
|
|
* This should be used wherever one is tempted to compute
|
|
* std::acos(v.z())
|
|
*
|
|
* By Don Hatch at http://www.plunk.org/~hatch/rightway.php
|
|
*/
|
|
template <typename T, typename Expr = expr_t<value_t<T>>>
|
|
Expr unit_angle_z(const T &v) {
|
|
static_assert(T::Size == 3, "unit_angle_z(): input is not a 3D vector");
|
|
Expr temp = 2.f * asin(.5f * sqrt(sqr(v.x()) + sqr(v.y()) +
|
|
sqr(v.z() - copysign(Expr(1.f), v.z()))));
|
|
return select(v.z() >= 0, temp, scalar_t<Expr>(M_PI) - temp);
|
|
}
|
|
|
|
//! @}
|
|
// -----------------------------------------------------------------------
|
|
|
|
// -----------------------------------------------------------------------
|
|
//! @{ \name Floating point manipulation routines
|
|
// -----------------------------------------------------------------------
|
|
|
|
template <typename Value, typename Expr = expr_t<Value>>
|
|
ENOKI_INLINE Expr prev_float(const Value &value) {
|
|
using Int = int_array_t<Expr>;
|
|
using IntScalar = scalar_t<Int>;
|
|
|
|
const Int exponent_mask = sizeof(IntScalar) == 4
|
|
? IntScalar(0x7f800000)
|
|
: IntScalar(0x7ff0000000000000ll);
|
|
|
|
const Int pos_denorm = sizeof(IntScalar) == 4
|
|
? IntScalar(0x80000001)
|
|
: IntScalar(0x8000000000000001ll);
|
|
|
|
Int i = reinterpret_array<Int>(value);
|
|
|
|
auto is_nan_inf = eq(i & exponent_mask, exponent_mask);
|
|
auto is_pos_0 = eq(i, 0);
|
|
auto is_gt_0 = i >= 0;
|
|
auto is_special = is_nan_inf | is_pos_0;
|
|
|
|
Int j1 = i + select(is_gt_0, Int(-1), Int(1));
|
|
Int j2 = select(is_pos_0, pos_denorm, i);
|
|
|
|
return reinterpret_array<Expr>(select(is_special, j2, j1));
|
|
}
|
|
|
|
template <typename Value, typename Expr = expr_t<Value>>
|
|
ENOKI_INLINE Expr next_float(const Value &value) {
|
|
using Int = int_array_t<Expr>;
|
|
using IntScalar = scalar_t<Int>;
|
|
|
|
const Int exponent_mask = sizeof(IntScalar) == 4
|
|
? IntScalar(0x7f800000)
|
|
: IntScalar(0x7ff0000000000000ll);
|
|
|
|
const Int sign_mask = sizeof(IntScalar) == 4
|
|
? IntScalar(0x80000000)
|
|
: IntScalar(0x8000000000000000ll);
|
|
|
|
Int i = reinterpret_array<Int>(value);
|
|
|
|
auto is_nan_inf = eq(i & exponent_mask, exponent_mask);
|
|
auto is_neg_0 = eq(i, sign_mask);
|
|
auto is_gt_0 = i >= 0;
|
|
auto is_special = is_nan_inf | is_neg_0;
|
|
|
|
Int j1 = i + select(is_gt_0, Int(1), Int(-1));
|
|
Int j2 = select(is_neg_0, Int(1), i);
|
|
|
|
return reinterpret_array<Expr>(select(is_special, j2, j1));
|
|
}
|
|
|
|
template <typename Arg> auto isdenormal(const Arg &a) {
|
|
return abs(a) < std::numeric_limits<scalar_t<Arg>>::min() &&
|
|
neq(a, zero<Arg>());
|
|
}
|
|
|
|
//! @}
|
|
// -----------------------------------------------------------------------
|
|
|
|
NAMESPACE_END(enoki)
|