cocos-engine-external/sources/enoki/array_math.h

/*
    enoki/array_math.h -- Mathematical support library

    Enoki is a C++ template library that enables transparent vectorization
    of numerical kernels using ENOKI instruction sets available on current
    processor architectures.

    Copyright (c) 2019 Wenzel Jakob <wenzel.jakob@epfl.ch>

    All rights reserved. Use of this source code is governed by a BSD-style
    license that can be found in the LICENSE file.
*/

#include <enoki/array_generic.h>

#pragma once

NAMESPACE_BEGIN(enoki)

// -----------------------------------------------------------------------
//! @{ \name Polynomial evaluation with short dependency chains and
//           fused multply-adds based on Estrin's scheme
// -----------------------------------------------------------------------

template <typename T1, typename T2, typename T = expr_t<T1>, typename S = scalar_t<T1>>
ENOKI_INLINE T poly2(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2) {
    T x2 = x * x;
    return fmadd(x2, S(c2), fmadd(x, S(c1), S(c0)));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly3(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3) {
    T x2 = x * x;
    return fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly4(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4) {
    T x2 = x * x, x4 = x2 * x2;
    return fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)) + S(c4) * x4);
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly5(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4, const T2 &c5) {
    T x2 = x * x, x4 = x2 * x2;
    return fmadd(x2, fmadd(x, S(c3), S(c2)),
                     fmadd(x4, fmadd(x, S(c5), S(c4)), fmadd(x, S(c1), S(c0))));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly6(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6) {
    T x2 = x * x, x4 = x2 * x2;
    return fmadd(x4, fmadd(x2, S(c6), fmadd(x, S(c5), S(c4))),
                     fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0))));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly7(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
                     const T2 &c7) {
    T x2 = x * x, x4 = x2 * x2;
    return fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
                     fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0))));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly8(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
                     const T2 &c7, const T2 &c8) {
    T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
    return fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
                     fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)) + S(c8) * x8));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly9(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                     const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
                     const T2 &c7, const T2 &c8, const T2 &c9) {
    T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
    return fmadd(x8, fmadd(x, S(c9), S(c8)),
                     fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
                               fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)))));
}

template <typename T1, typename T2, typename T = expr_t<T1>,
          typename S = scalar_t<T1>>
ENOKI_INLINE T poly10(const T1 &x, const T2 &c0, const T2 &c1, const T2 &c2,
                      const T2 &c3, const T2 &c4, const T2 &c5, const T2 &c6,
                      const T2 &c7, const T2 &c8, const T2 &c9, const T2 &c10) {
    T x2 = x * x, x4 = x2 * x2, x8 = x4 * x4;
    return fmadd(x8, fmadd(x2, S(c10), fmadd(x, S(c9), S(c8))),
                     fmadd(x4, fmadd(x2, fmadd(x, S(c7), S(c6)), fmadd(x, S(c5), S(c4))),
                               fmadd(x2, fmadd(x, S(c3), S(c2)), fmadd(x, S(c1), S(c0)))));
}

//! @}
// -----------------------------------------------------------------------

#define ENOKI_UNARY_OPERATION(name, expr)                                      \
    namespace detail {                                                         \
        template <typename T>                                                  \
        using has_##name = decltype(std::declval<T>().name##_());              \
        template <typename T>                                                  \
        constexpr bool has_##name##_v = is_detected_v<has_##name, T>;          \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        Value name##_impl(const Value &);                                      \
    }                                                                          \
    template <typename T> auto name(const T &x) {                              \
        using E = expr_t<T>;                                                   \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_();                                  \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            return E(name(low(x)), name(high(x)));                             \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                            !is_diff_array_v<E> &&                             \
                            !is_cuda_array_v<E>) {                             \
            E r = empty<E>(x.size());                                          \
            auto pr = r.packet_ptr();                                          \
            auto px = x.packet_ptr();                                          \
            for (size_t i = 0, n = r.packets(); i < n; ++i, ++pr, ++px)        \
                *pr = name(*px);                                               \
            return r;                                                          \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            E r;                                                               \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t i = 0; i < x.size(); ++i)                              \
                r.coeff(i) = name(x.coeff(i));                                 \
            return r;                                                          \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x);                         \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE Value enoki::detail::name##_impl(const Value &x)

#define ENOKI_UNARY_OPERATION_PAIR(name, expr)                                 \
    namespace detail {                                                         \
        template <typename T>                                                  \
        using has_##name = decltype(std::declval<T>().name##_());              \
        template <typename T>                                                  \
        constexpr bool has_##name##_v = is_detected_v<has_##name, T>;          \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        std::pair<Value, Value> name##_impl(const Value &);                    \
    }                                                                          \
    template <typename T> auto name(const T &x) {                              \
        using E = expr_t<T>;                                                   \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_();                                  \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            auto l = name(low(x));                                             \
            auto h = name(high(x));                                            \
            return std::pair<E, E>(E(l.first, h.first),                        \
                                   E(l.second, h.second));                     \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                            !is_cuda_array_v<E> &&                             \
                            !is_diff_array_v<E>) {                             \
            std::pair<E, E> r(empty<E>(x.size()), empty<E>(x.size()));         \
            auto pr0 = r.first.packet_ptr(),                                   \
                 pr1 = r.second.packet_ptr();                                  \
            auto px = x.packet_ptr();                                          \
            for (size_t i = 0, n = x.packets();                                \
                 i < n; ++i, ++pr0, ++pr1, ++px)                               \
                std::tie(*pr0, *pr1) = name(*px);                              \
            return r;                                                          \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            std::pair<E, E> r;                                                 \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t i = 0; i < x.size(); ++i)                              \
                std::tie(r.first.coeff(i),                                     \
                         r.second.coeff(i)) = name(x.coeff(i));                \
            return r;                                                          \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x);                         \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
                                                                               \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE std::pair<Value, Value> enoki::detail::name##_impl(const Value &x)


#define ENOKI_BINARY_OPERATION(name, expr)                                     \
    namespace detail {                                                         \
        template <typename T>                                                  \
        using has_##name = decltype(std::declval<T>()                          \
                                        .name##_(std::declval<T>()));          \
        template <typename T>                                                  \
        constexpr bool has_##name##_v = is_detected_v<has_##name, T>;          \
        template <typename Value, typename Scalar = scalar_t<Value>,           \
                  typename Mask = mask_t<Value>,                               \
                  bool Single = std::is_same_v<float, Scalar>>                 \
        Value name##_impl(const Value &, const Value &);                       \
    }                                                                          \
    template <typename T1, typename T2> auto name(const T1 &x, const T2 &y) {  \
        using E = expr_t<T1, T2>;                                              \
        using Value = value_t<E>;                                              \
        if constexpr (detail::has_##name##_v<E>) {                             \
            return ((const E &) x).name##_((const E &) y);                     \
        } else if constexpr (is_recursive_array_v<E>) {                        \
            return E(name(low(x), low(y)), name(high(x), high(y)));            \
        } else if constexpr (!std::is_same_v<T1, E> ||                         \
                             !std::is_same_v<T2, E>) {                         \
            return name((const E& ) x, (const E &) y);                         \
        } else if constexpr (is_dynamic_array_v<E> &&                          \
                            !is_cuda_array_v<E> &&                             \
                            !is_diff_array_v<E>) {                             \
            E r;                                                               \
            r.resize_like(x, y);                                               \
            size_t xs = x.size() == 1 ? 0 : 1,                                 \
                   ys = y.size() == 1 ? 0 : 1;                                 \
            auto pr = r.packet_ptr();                                          \
            auto px = x.packet_ptr();                                          \
            auto py = y.packet_ptr();                                          \
            for (size_t i = 0, n = r.packets(); i < n;                         \
                 ++i, pr += 1, px += xs, py += ys)                             \
                *pr = name(*px, *py);                                          \
            return r;                                                          \
        } else if constexpr (array_depth_v<E> > 1) {                           \
            assert(x.size() == y.size());                                      \
            E r;                                                               \
            ENOKI_CHKSCALAR(#name);                                            \
            for (size_t i = 0; i < x.size(); ++i)                              \
                r.coeff(i) = name(x.coeff(i), y.coeff(i));                     \
            return r;                                                          \
        } else if constexpr (is_array_v<E>) {                                  \
            return detail::name##_impl((const E &) x, (const E &) y);          \
        } else {                                                               \
            return expr;                                                       \
        }                                                                      \
                                                                               \
    }                                                                          \
    template <typename Value, typename Scalar, typename Mask, bool Single>     \
    ENOKI_INLINE Value enoki::detail::name##_impl(const Value &x, const Value &y)


// -----------------------------------------------------------------------
//! @{ \name Trigonometric functions and their inverses
// -----------------------------------------------------------------------

namespace detail {
    template <bool Sin, bool Cos, typename Value>
    ENOKI_INLINE void sincos_approx(const Value &x, Value &s_out, Value &c_out) {
        using Scalar = scalar_t<Value>;
        constexpr bool Single = std::is_same_v<Scalar, float>;
        using IntArray = int_array_t<Value>;
        using Int = scalar_t<IntArray>;
        using Mask = mask_t<Value>;
        ENOKI_MARK_USED(s_out);
        ENOKI_MARK_USED(c_out);

        /* Joint sine & cosine function approximation based on CEPHES.
           Excellent accuracy in the domain |x| < 8192

           Redistributed under a BSD license with permission of the author, see
           https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt

         - sin (in [-8192, 8192]):
           * avg abs. err = 6.61896e-09
           * avg rel. err = 1.37888e-08
              -> in ULPs  = 0.166492
           * max abs. err = 5.96046e-08
             (at x=-8191.31)
           * max rel. err = 1.76826e-06
             -> in ULPs   = 19
             (at x=-6374.29)

         - cos (in [-8192, 8192]):
           * avg abs. err = 6.59965e-09
           * avg rel. err = 1.37432e-08
              -> in ULPs  = 0.166141
           * max abs. err = 5.96046e-08
             (at x=-8191.05)
           * max rel. err = 3.13993e-06
             -> in ULPs   = 47
             (at x=-6199.93)
        */

        Value xa = abs(x);

        /* Scale by 4/Pi and get the integer part */
        IntArray j(xa * Scalar(1.2732395447351626862));

        /* Map zeros to origin; if (j & 1) j += 1 */
        j = (j + Int(1)) & Int(~1u);

        /* Cast back to a floating point value */
        Value y(j);

        /* Determine sign of result */
        Value sign_sin, sign_cos;
        constexpr size_t Shift = sizeof(Scalar) * 8 - 3;

        if constexpr (Sin)
            sign_sin = reinterpret_array<Value>(sl<Shift>(j)) ^ x;

        if constexpr (Cos)
            sign_cos = reinterpret_array<Value>(sl<Shift>(~(j - Int(2))));

        /* Extended precision modular arithmetic */
        if constexpr (Single) {
            y = xa - y * Scalar(0.78515625)
                   - y * Scalar(2.4187564849853515625e-4)
                   - y * Scalar(3.77489497744594108e-8);
        } else {
            y = xa - y * Scalar(7.85398125648498535156e-1)
                   - y * Scalar(3.77489470793079817668e-8)
                   - y * Scalar(2.69515142907905952645e-15);
        }

        Value z = y * y, s, c;
        z |= eq(xa, std::numeric_limits<Scalar>::infinity());

        if constexpr (Single) {
            s = poly2(z, -1.6666654611e-1,
                          8.3321608736e-3,
                         -1.9515295891e-4) * z;

            c = poly2(z,  4.166664568298827e-2,
                         -1.388731625493765e-3,
                          2.443315711809948e-5) * z;
        } else {
            s = poly5(z, -1.66666666666666307295e-1,
                          8.33333333332211858878e-3,
                         -1.98412698295895385996e-4,
                          2.75573136213857245213e-6,
                         -2.50507477628578072866e-8,
                          1.58962301576546568060e-10) * z;

            c = poly5(z,  4.16666666666665929218e-2,
                         -1.38888888888730564116e-3,
                          2.48015872888517045348e-5,
                         -2.75573141792967388112e-7,
                          2.08757008419747316778e-9,
                         -1.13585365213876817300e-11) * z;
        }

        s = fmadd(s, y, y);
        c = fmadd(c, z, fmadd(z, Scalar(-0.5), Scalar(1)));

        Mask polymask(eq(j & Int(2), zero<IntArray>()));

        if constexpr (Sin)
            s_out = mulsign(select(polymask, s, c), sign_sin);

        if constexpr (Cos)
            c_out = mulsign(select(polymask, c, s), sign_cos);
    }

    template <bool Tan, typename Value>
    ENOKI_INLINE auto tancot_approx(const Value &x) {
        using Scalar = scalar_t<Value>;
        constexpr bool Single = std::is_same_v<Scalar, float>;
        using IntArray = int_array_t<Value>;
        using Int = scalar_t<IntArray>;

        /*
         - tan (in [-8192, 8192]):
           * avg abs. err = 4.63693e-06
           * avg rel. err = 3.60191e-08
              -> in ULPs  = 0.435442
           * max abs. err = 0.8125
             (at x=-6199.93)
           * max rel. err = 3.12284e-06
             -> in ULPs   = 30
             (at x=-7406.3)
        */

        Value xa = abs(x);

        /* Scale by 4/Pi and get the integer part */
        IntArray j(xa * Scalar(1.2732395447351626862));

        /* Map zeros to origin; if (j & 1) j += 1 */
        j = (j + Int(1)) & Int(~1u);

        /* Cast back to a floating point value */
        Value y(j);

        /* Extended precision modular arithmetic */
        if constexpr (Single) {
            y = xa - y * Scalar(0.78515625)
                   - y * Scalar(2.4187564849853515625e-4)
                   - y * Scalar(3.77489497744594108e-8);
        } else {
            y = xa - y * Scalar(7.85398125648498535156e-1)
                   - y * Scalar(3.77489470793079817668e-8)
                   - y * Scalar(2.69515142907905952645e-15);
        }

        Value z = y * y;
        z |= eq(xa, std::numeric_limits<Scalar>::infinity());

        Value r;
        if constexpr (Single) {
            r = poly5(z, 3.33331568548e-1,
                         1.33387994085e-1,
                         5.34112807005e-2,
                         2.44301354525e-2,
                         3.11992232697e-3,
                         9.38540185543e-3);
        } else {
            r = poly2(z, -1.79565251976484877988e7,
                          1.15351664838587416140e6,
                         -1.30936939181383777646e4) /
                poly4(z, -5.38695755929454629881e7,
                          2.50083801823357915839e7,
                         -1.32089234440210967447e6,
                          1.36812963470692954678e4,
                          1.00000000000000000000e0);
        }

        r = fmadd(r, z * y, y);

        auto recip_mask = Tan ? neq(j & Int(2), Int(0)) :
                                 eq(j & Int(2), Int(0));
        r[xa < Scalar(1e-4)] = y;
        r[recip_mask] = rcp(r);

        Value sign = reinterpret_array<Value>(sl<sizeof(Scalar) * 8 - 2>(j)) ^ x;

        return mulsign(r, sign);
    }
}

ENOKI_UNARY_OPERATION(sin, std::sin(x)) {
    Value r;
    detail::sincos_approx<true, false>(x, r, r);
    return r;
}

ENOKI_UNARY_OPERATION(cos, std::cos(x)) {
    Value r;
    detail::sincos_approx<false, true>(x, r, r);
    return r;
}

ENOKI_UNARY_OPERATION_PAIR(sincos, std::make_pair(std::sin(x), std::cos(x))) {
    Value s, c;
    detail::sincos_approx<true, true>(x, s, c);
    return std::make_pair(s, c);
}

template <typename T> auto csc(const T &a) { return rcp(sin(a)); }
template <typename T> auto sec(const T &a) { return rcp(cos(a)); }

ENOKI_UNARY_OPERATION(tan, std::tan(x)) {
    return detail::tancot_approx<true>(x);
}

ENOKI_UNARY_OPERATION(cot, 1 / std::tan(x)) {
    return detail::tancot_approx<false>(x);
}

ENOKI_UNARY_OPERATION(asin, std::asin(x)) {
    /*
       Arc sine function approximation based on CEPHES.

     - asin (in [-1, 1]):
       * avg abs. err = 2.25422e-08
       * avg rel. err = 2.85777e-08
          -> in ULPs  = 0.331032
       * max abs. err = 1.19209e-07
         (at x=-0.999998)
       * max rel. err = 2.27663e-07
         -> in ULPs   = 2
         (at x=-0.841416)
    */

    Value xa          = abs(x),
          x2          = sqr(x),
          r;

    if constexpr (Single) {
        Mask mask_big = xa > Scalar(0.5);

        Value x1 = Scalar(0.5) * (Scalar(1) - xa);
        Value x3 = select(mask_big, x1, x2);
        Value x4 = select(mask_big, sqrt(x1), xa);

        Value z1 = poly4(x3, 1.6666752422e-1f,
                            7.4953002686e-2f,
                            4.5470025998e-2f,
                            2.4181311049e-2f,
                            4.2163199048e-2f);

        z1 = fmadd(z1, x3*x4, x4);

        r = select(mask_big, Scalar(M_PI_2) - (z1 + z1), z1);
    } else {
        constexpr bool IsCuda = is_cuda_array_v<Value>;
        Mask mask_big = xa > Scalar(0.625);

        if (IsCuda || any_nested(mask_big)) {
            const Scalar pio4 = Scalar(0.78539816339744830962);
            const Scalar more_bits = Scalar(6.123233995736765886130e-17);

            /* arcsin(1-x) = pi/2 - sqrt(2x)(1+R(x))  */
            Value zz = Scalar(1) - xa;
            Value p = poly4(zz, 2.853665548261061424989e1,
                               -2.556901049652824852289e1,
                                6.968710824104713396794e0,
                               -5.634242780008963776856e-1,
                                2.967721961301243206100e-3) /
                      poly4(zz, 3.424398657913078477438e2,
                               -3.838770957603691357202e2,
                                1.470656354026814941758e2,
                               -2.194779531642920639778e1,
                                1.000000000000000000000e0) * zz;
            zz = sqrt(zz + zz);
            Value z = pio4 - zz;
            r[mask_big] = z - fmsub(zz, p, more_bits) + pio4;
        }

        if (IsCuda || !all_nested(mask_big)) {
            Value z = poly5(x2, -8.198089802484824371615e0,
                                 1.956261983317594739197e1,
                                -1.626247967210700244449e1,
                                 5.444622390564711410273e0,
                                -6.019598008014123785661e-1,
                                 4.253011369004428248960e-3) /
                      poly5(x2, -4.918853881490881290097e1,
                                 1.395105614657485689735e2,
                                -1.471791292232726029859e2,
                                 7.049610280856842141659e1,
                                -1.474091372988853791896e1,
                                 1.000000000000000000000e0) * x2;
            z = fmadd(xa, z, xa);
            z = select(xa < Scalar(1e-8), xa, z);
            r[~mask_big] = z;
        }
    }
    return copysign(r, x);
}

ENOKI_UNARY_OPERATION(acos, std::acos(x)) {
    /*
       Arc cosine function approximation based on CEPHES.

     - acos (in [-1, 1]):
       * avg abs. err = 4.72002e-08
       * avg rel. err = 2.85612e-08
          -> in ULPs  = 0.33034
       * max abs. err = 2.38419e-07
         (at x=-0.99999)
       * max rel. err = 1.19209e-07
         -> in ULPs   = 1
         (at x=-0.99999)
    */

    if constexpr (Single) {
        Value xa = abs(x), x2 = sqr(x);

        Mask mask_big = xa > Scalar(0.5);

        Value x1 = Scalar(0.5) * (Scalar(1) - xa);
        Value x3 = select(mask_big, x1, x2);
        Value x4 = select(mask_big, sqrt(x1), xa);

        Value z1 = poly4(x3, 1.666675242e-1f,
                             7.4953002686e-2f,
                             4.5470025998e-2f,
                             2.4181311049e-2f,
                             4.2163199048e-2f);

        z1 = fmadd(z1, x3 * x4, x4);
        Value z2 = z1 + z1;
        z2 = select(x < Scalar(0), Scalar(M_PI) - z2, z2);

        Value z3 = Scalar(M_PI_2) - copysign(z1, x);
        return select(mask_big, z2, z3);
    } else {
        const Scalar pio4 = Scalar(0.78539816339744830962);
        const Scalar more_bits = Scalar(6.123233995736765886130e-17);
        const Scalar h = Scalar(0.5);

        Mask mask = x > h;

        Value y = asin(select(mask, sqrt(fnmadd(h, x, h)), x));
        return select(mask, y + y, pio4 - y + more_bits + pio4);
    }
}

ENOKI_BINARY_OPERATION(atan2, std::atan2(x, y)) {
    /*
       MiniMax fit by Wenzel Jakob, May 2016

     - atan2() tested via atan() (in [-1, 1]):
       * avg abs. err = 1.81543e-07
       * avg rel. err = 4.15224e-07
          -> in ULPs  = 4.9197
       * max abs. err = 5.96046e-07
         (at x=-0.976062)
       * max rel. err = 7.73931e-07
         -> in ULPs   = 12
         (at x=-0.015445)
    */
    Value x_ = y,
          y_ = x,
          abs_x      = abs(x_),
          abs_y      = abs(y_),
          min_val    = min(abs_y, abs_x),
          max_val    = max(abs_x, abs_y),
          scale      = Scalar(1) / max_val,
          scaled_min = min_val * scale,
          z          = scaled_min * scaled_min;

    // How to find these:
    // f[x_] = MiniMaxApproximation[ArcTan[Sqrt[x]]/Sqrt[x],
    //         {x, {1/10000, 1}, 6, 0}, WorkingPrecision->20][[2, 1]]

    Value t;
    if constexpr (Single) {
        t = poly6(z, 0.99999934166683966009,
                    -0.33326497518773606976,
                    +0.19881342388439013552,
                    -0.13486708938456973185,
                    +0.083863120428809689910,
                    -0.037006525670417265220,
                     0.0078613793713198150252);
    } else {
        t = poly6(z, 9.9999999999999999419e-1,
                     2.50554429737833465113e0,
                     2.28289058385464073556e0,
                     9.20960512187107069075e-1,
                     1.59189681028889623410e-1,
                     9.35911604785115940726e-3,
                     8.07005540507283419124e-5) /
            poly6(z, 1.00000000000000000000e0,
                     2.83887763071166519407e0,
                     3.02918312742541450749e0,
                     1.50576983803701596773e0,
                     3.49719171130492192607e-1,
                     3.29968942624402204199e-2,
                     8.26619391703564168942e-4);
    }

    t = t * scaled_min;

    t = select(abs_y > abs_x, Scalar(M_PI_2) - t, t);
    t = select(x_ < zero<Value>(), Scalar(M_PI) - t, t);
    Value r = select(y_ < zero<Value>(), -t, t);
    r &= neq(max_val, Scalar(0));
    return r;
}

ENOKI_UNARY_OPERATION(atan, std::atan(x)) {
    return atan2(x, Scalar(1));
}

//! @}
// -----------------------------------------------------------------------

// -----------------------------------------------------------------------
//! @{ \name Exponential function, logarithm, power
// -----------------------------------------------------------------------

ENOKI_BINARY_OPERATION(ldexp, detail::ldexp_scalar(x, y)) {
    return x * reinterpret_array<Value>(
        sl<Single ? 23 : 52>(int_array_t<Value>(y) + (Single ? 0x7f : 0x3ff)));
}

ENOKI_UNARY_OPERATION_PAIR(frexp, detail::frexp_scalar(x)) {
    using IntArray = int_array_t<Value>;
    using Int = scalar_t<IntArray>;
    using IntMask = mask_t<IntArray>;

    const IntArray
        exponent_mask(Int(Single ? 0x7f800000ull : 0x7ff0000000000000ull)),
        mantissa_sign_mask(Int(Single ? ~0x7f800000ull : ~0x7ff0000000000000ull)),
        bias(Int(Single ? 0x7f : 0x3ff));

    IntArray xi = reinterpret_array<IntArray>(x);
    IntArray exponent_bits = xi & exponent_mask;

    /* Detect zero/inf/NaN */
    IntMask is_normal =
        IntMask(neq(x, zero<Value>())) &
        neq(exponent_bits, exponent_mask);

    IntArray exponent_i = (sr<Single ? 23 : 52>(exponent_bits)) - bias;

    IntArray mantissa = (xi & mantissa_sign_mask) |
                         IntArray(memcpy_cast<Int>(Scalar(.5f)));

    return std::make_pair(
        reinterpret_array<Value>(select(is_normal, mantissa, xi)),
        Value(exponent_i & is_normal)
    );
}

ENOKI_UNARY_OPERATION(exp, std::exp(x)) {
    /* Exponential function approximation based on CEPHES

       Redistributed under a BSD license with permission of the author, see
       https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt

     - exp (in [-20, 30]):
       * avg abs. err = 7155.01
       * avg rel. err = 2.35929e-08
          -> in ULPs  = 0.273524
       * max abs. err = 1.04858e+06
         (at x=29.8057)
       * max rel. err = 1.192e-07
         -> in ULPs   = 1
         (at x=-19.9999)
    */

    const Scalar inf = std::numeric_limits<Scalar>::infinity();
    const Scalar max_range = Scalar(Single ? +88.3762588501 : +7.0943613930310391424428e2);
    const Scalar min_range = Scalar(Single ? -88.3762588501 : -7.0943613930310391424428e2);

    Mask mask_overflow  = x > max_range,
         mask_underflow = x < min_range;

    /* Valueess e^x = e^g 2^n
         = e^g e^(n loge(2))
         = e^(g + n loge(2))
    */
    Value n = floor(fmadd(Scalar(1.4426950408889634073599), x, Scalar(0.5)));
    Value xr = x;
    if constexpr (Single) {
        xr = fnmadd(n, Scalar(0.693359375), xr);
        xr = fnmadd(n, Scalar(-2.12194440e-4), xr);
    } else {
        xr = fnmadd(n, Scalar(6.93145751953125e-1), xr);
        xr = fnmadd(n, Scalar(1.42860682030941723212e-6), xr);
    }

    Value z = sqr(xr);

    if constexpr (Single) {
        z = poly5(xr, 5.0000001201e-1, 1.6666665459e-1,
                      4.1665795894e-2, 8.3334519073e-3,
                      1.3981999507e-3, 1.9875691500e-4);
        z = fmadd(z, xr * xr, xr + Scalar(1));
    } else {
        /* Rational approximation for exponential
           of the fractional part:
              e^x = 1 + 2x P(x^2) / (Q(x^2) - P(x^2))
         */
        Value p = poly2(z, 9.99999999999999999910e-1,
                           3.02994407707441961300e-2,
                           1.26177193074810590878e-4) * xr;

        Value q = poly3(z, 2.00000000000000000009e0,
                           2.27265548208155028766e-1,
                           2.52448340349684104192e-3,
                           3.00198505138664455042e-6);

        Value pq = p / (q-p);
        z = pq + pq + Scalar(1);
    }

    return select(mask_overflow, inf,
                  select(mask_underflow, zero<Value>(), ldexp(z, n)));
}

ENOKI_UNARY_OPERATION(log, std::log(x)) {
    /* Logarithm function approximation based on CEPHES

       Redistributed under a BSD license with permission of the author, see
       https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt

     - log (in [1e-20, 1000]):
       * avg abs. err = 8.8672e-09
       * avg rel. err = 1.57541e-09
          -> in ULPs  = 0.020038
       * max abs. err = 4.76837e-07
         (at x=54.7661)
       * max rel. err = 1.19194e-07
         -> in ULPs   = 1
         (at x=0.021)
    */

    using UInt = scalar_t<int_array_t<Value>>;

    /* Catch negative and NaN values */
    Mask valid_mask = x >= Scalar(0);
    Value input = x, xm;

    /* The frexp in array_base.h does not handle denormalized numbers,
       cut them off. The AVX512 backend does support them, however. */
    if constexpr (!has_avx512f) {
        Scalar limit = memcpy_cast<Scalar>(
            UInt(Single ? 0x00800000u : 0x0010000000000000ull));
        xm = max(x, limit);
    } else {
        xm = x;
    }

    Value e;
    std::tie(xm, e) = frexp(x);

    const Scalar sqrt_half = Scalar(0.70710678118654752440);
    Mask mask_e_big = abs(e) > Scalar(2);
    Mask mask_ge_inv_sqrt2 = xm >= sqrt_half;
    ENOKI_MARK_USED(mask_e_big);

    e[mask_ge_inv_sqrt2] += Scalar(1);

    Value r;
    if constexpr (Single) {
        xm += (xm & ~mask_ge_inv_sqrt2) - Scalar(1);

        Value z = xm * xm;
        Value y = poly8(xm, 3.3333331174e-1, -2.4999993993e-1,
                            2.0000714765e-1, -1.6668057665e-1,
                            1.4249322787e-1, -1.2420140846e-1,
                            1.1676998740e-1, -1.1514610310e-1,
                            7.0376836292e-2);

        y *= xm * z;

        y = fmadd(e, Scalar(-2.12194440e-4), y);
        z = fmadd(z, Scalar(-0.5), xm + y);
        r = fmadd(e, Scalar(0.693359375), z);
    } else {
        constexpr bool IsCuda = is_cuda_array_v<Value>;
        const Scalar half = Scalar(0.5);
        Value r_big, r_small;

        if (IsCuda || any_nested(mask_e_big)) {
            /* logarithm using log(x) = z + z**3 P(z)/Q(z), where z = 2(x-1)/x+1) */
            Value z = xm - half;

            z[mask_ge_inv_sqrt2] -= half;

            Value y = half * select(mask_ge_inv_sqrt2, xm, z) + half;
            Value x2 = z / y;

            z = x2 * x2;
            z = x2 * (z * poly2(z, -6.41409952958715622951e1,
                                    1.63866645699558079767e1,
                                   -7.89580278884799154124e-1) /
                          poly3(z, -7.69691943550460008604e2,
                                    3.12093766372244180303e2,
                                   -3.56722798256324312549e1,
                                    1.00000000000000000000e0));

            r_big = fnmadd(e, Scalar(2.121944400546905827679e-4), z) + x2;
        }

        if (IsCuda || !all_nested(mask_e_big)) {
            /* logarithm using log(1+x) = x - .5x**2 + x**3 P(x)/Q(x) */
            Value x2 = select(mask_ge_inv_sqrt2, xm, xm + xm) - Scalar(1);

            Value z = x2*x2;
            Value y = x2 * (z * poly5(x2, 7.70838733755885391666e0,
                                          1.79368678507819816313e1,
                                          1.44989225341610930846e1,
                                          4.70579119878881725854e0,
                                          4.97494994976747001425e-1,
                                          1.01875663804580931796e-4) /
                                poly5(x2, 2.31251620126765340583e1,
                                          7.11544750618563894466e1,
                                          8.29875266912776603211e1,
                                          4.52279145837532221105e1,
                                          1.12873587189167450590e1,
                                          1.00000000000000000000e0));

            y = fnmadd(e, Scalar(2.121944400546905827679e-4), y);

            r_small = x2 + fnmadd(half, z, y);
        }

        r = select(mask_e_big, r_big, r_small);
        r = fmadd(e, Scalar(0.693359375), r);
    }

    /* Handle a few special cases */
    const Scalar n_inf(-std::numeric_limits<Scalar>::infinity());
    const Scalar p_inf(std::numeric_limits<Scalar>::infinity());

    r[eq(input, p_inf)] = p_inf;
    r[eq(input, Scalar(0))] = n_inf;

    return r | ~valid_mask;
}

ENOKI_UNARY_OPERATION(cbrt, std::cbrt(x)) {
    /* Cubic root approximation based on CEPHES

       Redistributed under a BSD license with permission of the author, see
       https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt

     - cbrt (in [-10, 10]):
       * avg abs. err = 2.91027e-17
       * avg rel. err = 1.79292e-17
          -> in ULPs  = 0.118351
       * max abs. err = 4.44089e-16
         (at x=-9.99994)
       * max rel. err = 2.22044e-16
         -> in ULPs   = 1
         (at x=-9.99994)
    */

    const Scalar CBRT2 = Scalar(1.25992104989487316477),
                 CBRT4 = Scalar(1.58740105196819947475),
                 THIRD = Scalar(1.0 / 3.0);

    Value xa = abs(x);

    auto [xm, xe] = frexp(xa);
    xe += Scalar(1);

    Value xea = abs(xe),
          xea1 = floor(xea * THIRD),
          rem = fnmadd(xea1, Scalar(3), xea);

    /* Approximate cube root of number between .5 and 1,
       peak relative error = 9.2e-6 */
    xm = poly4(xm, 0.40238979564544752126924,
                   1.1399983354717293273738,
                  -0.95438224771509446525043,
                   0.54664601366395524503440,
                  -0.13466110473359520655053);

    Value f1 = select(xe >= Scalar(0), Value(CBRT2), Value(Scalar(1) / CBRT2)),
          f2 = select(xe >= Scalar(0), Value(CBRT4), Value(Scalar(1) / CBRT4)),
          f  = select(eq(rem, 1.f), f1, f2);

    xm[neq(rem, 0.f)] *= f;

    Value r = ldexp(xm, mulsign(xea1, xe));
    r = mulsign(r, x);

    // Newton iteration
    r -= (r - (x / sqr(r))) * THIRD;

    if constexpr (!Single)
        r -= (r - (x / sqr(r))) * THIRD;

    return select(isfinite(x), r, x);
}

ENOKI_BINARY_OPERATION(pow, std::pow(x, y)) {
    return exp(log(x) * y);
}

template <typename T, typename E = expr_t<T>>
ENOKI_INLINE E pow(const T &x_, const int &y) {
    int n = std::abs(y);
    E result(1.f), x(x_);

    while (n > 0) {
        if (n & 1)
            result *= x;
        x *= x;
        n /= 2;
    }

    return (y >= 0) ? result : rcp(result);
}

template <typename T, typename E = expr_t<T, float>,
          enable_if_t<is_array_v<T>> = 0>
ENOKI_INLINE E pow(const T &x, const float &y) {
    if (enoki::round(y) == y)
        return enoki::pow(E(x), (int) y);
    else
        return enoki::pow(E(x), E(y));
}

template <typename T, typename E = expr_t<T, double>,
          enable_if_t<is_array_v<T>> = 0>
ENOKI_INLINE E pow(const T &x, const double &y) {
    if (enoki::round(y) == y)
        return enoki::pow(E(x), (int) y);
    else
        return enoki::pow(E(x), E(y));
}

// -----------------------------------------------------------------------
//! @{ \name Hyperbolic and inverse hyperbolic functions
// -----------------------------------------------------------------------

ENOKI_UNARY_OPERATION(sinh, std::sinh(x)) {
    /*
     - sinh (in [-10, 10]):
       * avg abs. err = 2.92524e-05
       * avg rel. err = 2.80831e-08
          -> in ULPs  = 0.336485
       * max abs. err = 0.00195312
         (at x=-9.99894)
       * max rel. err = 2.36862e-07
         -> in ULPs   = 3
         (at x=-9.69866)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value xa = abs(x),
          r_small, r_big;

    Mask mask_big = xa > Scalar(1);

    if (IsCuda || any_nested(mask_big)) {
        Value exp0 = exp(x),
              exp1 = rcp(exp0);

        r_big = (exp0 - exp1) * Scalar(0.5);
    }

    if (IsCuda || !all_nested(mask_big)) {
        Value x2 = x * x;

        if constexpr (Single) {
            r_small = fmadd(poly2(x2, 1.66667160211e-1,
                                      8.33028376239e-3,
                                      2.03721912945e-4),
                            x2 * x, x);
        } else {
            r_small = fmadd(poly3(x2, -3.51754964808151394800e5,
                                      -1.15614435765005216044e4,
                                      -1.63725857525983828727e2,
                                      -7.89474443963537015605e-1) /
                            poly3(x2, -2.11052978884890840399e6,
                                       3.61578279834431989373e4,
                                      -2.77711081420602794433e2,
                                       1.00000000000000000000e0),
                            x2 * x, x);
        }
    }

    return select(mask_big, r_big, r_small);
}

ENOKI_UNARY_OPERATION(cosh, std::cosh(x)) {
    /*
     - cosh (in [-10, 10]):
       * avg abs. err = 4.17738e-05
       * avg rel. err = 3.15608e-08
          -> in ULPs  = 0.376252
       * max abs. err = 0.00195312
         (at x=-9.99894)
       * max rel. err = 2.38001e-07
         -> in ULPs   = 3
         (at x=-9.70164)
    */

    Value exp0 = exp(x),
          exp1 = rcp(exp0);

    return (exp0 + exp1) * Scalar(.5f);
}

ENOKI_UNARY_OPERATION_PAIR(sincosh, std::make_pair(std::sinh(x), std::cosh(x))) {
    /*
     - sinh (in [-10, 10]):
       * avg abs. err = 2.92524e-05
       * avg rel. err = 2.80831e-08
          -> in ULPs  = 0.336485
       * max abs. err = 0.00195312
         (at x=-9.99894)
       * max rel. err = 2.36862e-07
         -> in ULPs   = 3
         (at x=-9.69866)

     - cosh (in [-10, 10]):
       * avg abs. err = 4.17738e-05
       * avg rel. err = 3.15608e-08
          -> in ULPs  = 0.376252
       * max abs. err = 0.00195312
         (at x=-9.99894)
       * max rel. err = 2.38001e-07
         -> in ULPs   = 3
         (at x=-9.70164)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    const Scalar half = Scalar(0.5);

    Value xa    = abs(x),
          exp0  = exp(x),
          exp1  = rcp(exp0),
          r_big = (exp0 - exp1) * half,
          r_small;

    Mask mask_big = xa > Scalar(1);

    if (IsCuda || !all_nested(mask_big)) {
        Value x2 = x * x;

        if constexpr (Single) {
            r_small = fmadd(poly2(x2, 1.66667160211e-1,
                                      8.33028376239e-3,
                                      2.03721912945e-4),
                            x2 * x, x);
        } else {
            r_small = fmadd(poly3(x2, -3.51754964808151394800e5,
                                      -1.15614435765005216044e4,
                                      -1.63725857525983828727e2,
                                      -7.89474443963537015605e-1) /
                            poly3(x2, -2.11052978884890840399e6,
                                       3.61578279834431989373e4,
                                      -2.77711081420602794433e2,
                                       1.00000000000000000000e0),
                            x2 * x, x);
        }
    }

    return std::make_pair(
        select(mask_big, r_big, r_small),
        half * (exp0 + exp1)
    );
}

ENOKI_UNARY_OPERATION(tanh, std::tanh(x)) {
    /*
       Hyperbolic tangent function approximation based on CEPHES.

     - tanh (in [-10, 10]):
       * avg abs. err = 4.44655e-08
       * avg rel. err = 4.58074e-08
          -> in ULPs  = 0.698044
       * max abs. err = 3.57628e-07
         (at x=-2.12867)
       * max rel. err = 4.1006e-07
         -> in ULPs   = 6
         (at x=-2.12867)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value r_big, r_small;

    Mask mask_big = abs(x) >= Scalar(0.625);

    if (IsCuda || !all_nested(mask_big)) {
        Value x2 = x*x;

        if constexpr (Single) {
            r_small = poly4(x2, -3.33332819422e-1,
                                 1.33314422036e-1,
                                -5.37397155531e-2,
                                 2.06390887954e-2,
                                -5.70498872745e-3);
        } else {
            r_small = poly2(x2, -1.61468768441708447952e3,
                                -9.92877231001918586564e1,
                                -9.64399179425052238628e-1) /
                      poly3(x2,  4.84406305325125486048e3,
                                 2.23548839060100448583e3,
                                 1.12811678491632931402e2,
                                 1.00000000000000000000e0);
        }

        r_small = fmadd(r_small, x2 * x, x);
    }

    if (IsCuda || any_nested(mask_big)) {
        Value e  = exp(x + x),
              e2 = rcp(e + Scalar(1));
        r_big = Scalar(1) - (e2 + e2);
    }

    return select(mask_big, r_big, r_small);
}

template <typename T> auto csch(const T &a) { return rcp(sinh(a)); }
template <typename T> auto sech(const T &a) { return rcp(cosh(a)); }
template <typename T> auto coth(const T &a) { return rcp(tanh(a)); }

ENOKI_UNARY_OPERATION(asinh, std::asinh(x)) {
    /*
       Hyperbolic arc sine function approximation based on CEPHES.

     - asinh (in [-10, 10]):
       * avg abs. err = 2.75626e-08
       * avg rel. err = 1.51762e-08
          -> in ULPs  = 0.178341
       * max abs. err = 2.38419e-07
         (at x=-10)
       * max rel. err = 1.71857e-07
         -> in ULPs   = 2
         (at x=-1.17457)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value x2 = x*x,
          xa = abs(x),
          r_big, r_small;

    Mask mask_big  = xa >= Scalar(Single ? 0.51 : 0.533),
         mask_huge = xa >= Scalar(Single ? 1e10 : 1e20);

    if (IsCuda || !all_nested(mask_big)) {
        if constexpr (Single) {
            r_small = poly3(x2, -1.6666288134e-1,
                                 7.4847586088e-2,
                                -4.2699340972e-2,
                                 2.0122003309e-2);
        } else {
            r_small = poly4(x2, -5.56682227230859640450e0,
                                -9.09030533308377316566e0,
                                -4.37390226194356683570e0,
                                -5.91750212056387121207e-1,
                                -4.33231683752342103572e-3) /
                      poly4(x2, 3.34009336338516356383e1,
                                6.95722521337257608734e1,
                                4.86042483805291788324e1,
                                1.28757002067426453537e1,
                                1.00000000000000000000e0);
        }
        r_small = fmadd(r_small, x2 * x, x);
    }

    if (IsCuda || any_nested(mask_big)) {
        r_big = log(xa + (sqrt(x2 + Scalar(1)) & ~mask_huge));
        r_big[mask_huge] += Scalar(M_LN2);
        r_big = copysign(r_big, x);
    }

    return select(mask_big, r_big, r_small);
}

ENOKI_UNARY_OPERATION(acosh, std::acosh(x)) {
    /*
       Hyperbolic arc cosine function approximation based on CEPHES.

     - acosh (in [-10, 10]):
       * avg abs. err = 2.8897e-08
       * avg rel. err = 1.49658e-08
          -> in ULPs  = 0.175817
       * max abs. err = 2.38419e-07
         (at x=3.76221)
       * max rel. err = 2.35024e-07
         -> in ULPs   = 3
         (at x=1.02974)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value x1 = x - Scalar(1),
         r_big, r_small;

    Mask mask_big  = x1 >= Scalar(0.49),
         mask_huge = x1 >= Scalar(1e10);

    if (IsCuda || !all_nested(mask_big)) {
        if constexpr (Single) {
            r_small = poly4(x1,  1.4142135263e+0,
                                -1.1784741703e-1,
                                 2.6454905019e-2,
                                -7.5272886713e-3,
                                 1.7596881071e-3);
        } else {
            r_small = poly4(x1, 1.10855947270161294369E5,
                                1.08102874834699867335E5,
                                3.43989375926195455866E4,
                                3.94726656571334401102E3,
                                1.18801130533544501356E2) /
                      poly5(x1, 7.83869920495893927727E4,
                                8.29725251988426222434E4,
                                2.97683430363289370382E4,
                                4.15352677227719831579E3,
                                1.86145380837903397292E2,
                                1.00000000000000000000E0);
        }

        r_small *= sqrt(x1);
        r_small |= x1 < zero<Value>();
    }

    if (IsCuda || any_nested(mask_big)) {
        r_big = log(x + (sqrt(fmsub(x, x, Scalar(1))) & ~mask_huge));
        r_big[mask_huge] += Scalar(M_LN2);
    }

    return select(mask_big, r_big, r_small);
}

ENOKI_UNARY_OPERATION(atanh, std::atanh(x)) {
    /*
       Hyperbolic arc tangent function approximation based on CEPHES.


     - acosh (in [-10, 10]):
       * avg abs. err = 9.87529e-09
       * avg rel. err = 1.52741e-08
          -> in ULPs  = 0.183879
       * max abs. err = 2.38419e-07
         (at x=-0.998962)
       * max rel. err = 1.19209e-07
         -> in ULPs   = 1
         (at x=-0.998962)
    */

    constexpr bool IsCuda = is_cuda_array_v<Value>;

    Value xa = abs(x),
          r_big, r_small;

    Mask mask_big  = xa >= Scalar(0.5);

    if (IsCuda || !all_nested(mask_big)) {
        Value x2 = x*x;
        if constexpr (Single) {
            r_small = poly4(x2, 3.33337300303e-1,
                                1.99782164500e-1,
                                1.46691431730e-1,
                                8.24370301058e-2,
                                1.81740078349e-1);
        } else {
            r_small = poly4(x2, -3.09092539379866942570e1,
                                 6.54566728676544377376e1,
                                -4.61252884198732692637e1,
                                 1.20426861384072379242e1,
                                -8.54074331929669305196e-1) /
                      poly5(x2, -9.27277618139601130017e1,
                                 2.52006675691344555838e2,
                                -2.49839401325893582852e2,
                                 1.08938092147140262656e2,
                                -1.95638849376911654834e1,
                                 1.00000000000000000000e0);
        }
        r_small = fmadd(r_small, x2*x, x);
    }

    if (IsCuda || any_nested(mask_big)) {
        r_big = log((Scalar(1) + xa) / (Scalar(1) - xa)) * Scalar(0.5);
        r_big = copysign(r_big, x);
    }

    return select(mask_big, r_big, r_small);
}

/// Linearly interpolate between 'a' and 'b', using 't'
template <typename Value1, typename Value2, typename Value3>
auto lerp(const Value1 &a, const Value2 &b, const Value3 &t) {
    return fmadd(b, t, fnmadd(a, t, a));
}

/// Clamp the value 'value' to the range [min, max]
template <typename Value1, typename Value2, typename Value3>
auto clamp(const Value1 &value, const Value2 &min, const Value3 &max) {
    return enoki::max(enoki::min(value, max), min);
}

/// Compute the hypotenuse of 'a' and 'b', while avoiding under/overflow
template <typename T1, typename T2>
ENOKI_INLINE auto hypot(const T1 &a, const T2 &b) {
    auto abs_a  = abs(a);
    auto abs_b  = abs(b);
    auto maxval = max(abs_a, abs_b),
         minval = min(abs_a, abs_b),
         ratio  = minval / maxval;

    using Scalar = scalar_t<decltype(ratio)>;
    const Scalar inf = std::numeric_limits<Scalar>::infinity();

    return select(
        (abs_a < inf) && (abs_b < inf) && (ratio < inf),
        maxval * sqrt(Scalar(1) + sqr(ratio)),
        abs_a + abs_b
    );
}

ENOKI_BINARY_OPERATION(fmod, std::fmod(x, y)) {
    return fnmadd(trunc(x / y), y, x);
}

// -----------------------------------------------------------------------
//! @{ \name "Safe" functions that avoid domain errors due to rounding
// -----------------------------------------------------------------------

template <typename T> ENOKI_INLINE auto safe_sqrt(const T &a) {
    return sqrt(max(a, zero<T>()));
}

template <typename T> ENOKI_INLINE auto safe_rsqrt(const T &a) {
    return rsqrt(max(a, zero<T>()));
}

template <typename T> ENOKI_INLINE auto safe_asin(const T &a) {
    return asin(min(T(1), max(T(-1), a)));
}

template <typename T> ENOKI_INLINE auto safe_acos(const T &a) {
    return acos(min(T(1), max(T(-1), a)));
}

/**
 * \brief Numerically well-behaved routine for computing the angle
 * between two unit direction vectors
 *
 * This should be used wherever one is tempted to compute the
 * arc cosine of a dot product.
 *
 * By Don Hatch at http://www.plunk.org/~hatch/rightway.php
 */
template <typename T, typename Expr = expr_t<value_t<T>>>
Expr unit_angle(const T &a, const T &b) {
    Expr dot_uv = dot(a, b),
         temp = 2.f * asin(.5f * norm(b - mulsign(a, dot_uv)));
    return select(dot_uv >= 0, temp, scalar_t<Expr>(M_PI) - temp);
}

/**
 * \brief Numerically well-behaved routine for computing the angle
 * between the unit direction vector 'v' and the z-axis
 *
 * This should be used wherever one is tempted to compute
 * std::acos(v.z())
 *
 * By Don Hatch at http://www.plunk.org/~hatch/rightway.php
 */
template <typename T, typename Expr = expr_t<value_t<T>>>
Expr unit_angle_z(const T &v) {
    static_assert(T::Size == 3, "unit_angle_z(): input is not a 3D vector");
    Expr temp = 2.f * asin(.5f * sqrt(sqr(v.x()) + sqr(v.y()) +
                                      sqr(v.z() - copysign(Expr(1.f), v.z()))));
    return select(v.z() >= 0, temp, scalar_t<Expr>(M_PI) - temp);
}

//! @}
// -----------------------------------------------------------------------

// -----------------------------------------------------------------------
//! @{ \name Floating point manipulation routines
// -----------------------------------------------------------------------

template <typename Value, typename Expr = expr_t<Value>>
ENOKI_INLINE Expr prev_float(const Value &value) {
    using Int = int_array_t<Expr>;
    using IntScalar = scalar_t<Int>;

    const Int exponent_mask = sizeof(IntScalar) == 4
                                  ? IntScalar(0x7f800000)
                                  : IntScalar(0x7ff0000000000000ll);

    const Int pos_denorm = sizeof(IntScalar) == 4
                              ? IntScalar(0x80000001)
                              : IntScalar(0x8000000000000001ll);

    Int i = reinterpret_array<Int>(value);

    auto is_nan_inf = eq(i & exponent_mask, exponent_mask);
    auto is_pos_0   = eq(i, 0);
    auto is_gt_0    = i >= 0;
    auto is_special = is_nan_inf | is_pos_0;

    Int j1 = i + select(is_gt_0, Int(-1), Int(1));
    Int j2 = select(is_pos_0, pos_denorm, i);

    return reinterpret_array<Expr>(select(is_special, j2, j1));
}

template <typename Value, typename Expr = expr_t<Value>>
ENOKI_INLINE Expr next_float(const Value &value) {
    using Int = int_array_t<Expr>;
    using IntScalar = scalar_t<Int>;

    const Int exponent_mask = sizeof(IntScalar) == 4
                                  ? IntScalar(0x7f800000)
                                  : IntScalar(0x7ff0000000000000ll);

    const Int sign_mask = sizeof(IntScalar) == 4
                              ? IntScalar(0x80000000)
                              : IntScalar(0x8000000000000000ll);

    Int i = reinterpret_array<Int>(value);

    auto is_nan_inf = eq(i & exponent_mask, exponent_mask);
    auto is_neg_0   = eq(i, sign_mask);
    auto is_gt_0    = i >= 0;
    auto is_special = is_nan_inf | is_neg_0;

    Int j1 = i + select(is_gt_0, Int(1), Int(-1));
    Int j2 = select(is_neg_0, Int(1), i);

    return reinterpret_array<Expr>(select(is_special, j2, j1));
}

template <typename Arg> auto isdenormal(const Arg &a) {
    return abs(a) < std::numeric_limits<scalar_t<Arg>>::min() &&
           neq(a, zero<Arg>());
}

//! @}
// -----------------------------------------------------------------------

NAMESPACE_END(enoki)