cocos-engine-external/sources/enoki/array_idiv.h

/*
    enoki/array_idiv.h -- fast precomputed integer division by constants based
    on libdivide (https://github.com/ridiculousfish/libdivide)

    Copyright (C) 2010 ridiculous_fish

    This software is provided 'as-is', without any express or implied
    warranty.  In no event will the authors be held liable for any damages
    arising from the use of this software.

    Permission is granted to anyone to use this software for any purpose,
    including commercial applications, and to alter it and redistribute it
    freely, subject to the following restrictions:

    1. The origin of this software must not be misrepresented; you must not
       claim that you wrote the original software. If you use this software
       in a product, an acknowledgment in the product documentation would be
       appreciated but is not required.
    2. Altered source versions must be plainly marked as such, and must not be
       misrepresented as being the original software.
    3. This notice may not be removed or altered from any source distribution.

    libdivide@ridiculousfish.com

*/

#pragma once

#include <enoki/array_generic.h>

NAMESPACE_BEGIN(enoki)

NAMESPACE_BEGIN(detail)

// -----------------------------------------------------------------------
//! @{ \name Precomputation for division by integer constants
// -----------------------------------------------------------------------

/**
 * Divide the 64-bit quantity formed by the words (u1:u0) by the 32-bit
 * divisor \c v.
 *
 * \param u1 Upper 32 bits of the dividend
 * \param u0 Lower 32 bits of the dividend
 * \param v  Divisor
 * \return   A pair {quotient, remainder}, each narrowed to 32 bits
 *
 * When \c UseIntrinsic is set and the build targets x86 with a GNU-style
 * compiler, a single \c divl instruction is emitted; otherwise the division
 * is carried out with ordinary 64-bit arithmetic. The portable path silently
 * truncates a quotient that does not fit into 32 bits.
 */
template <bool UseIntrinsic = false>
std::pair<uint32_t, uint32_t> div_wide(uint32_t u1, uint32_t u0, uint32_t v) {
#if defined(__GNUC__) && (defined(ENOKI_X86_32) || defined(ENOKI_X86_64))
    if constexpr (UseIntrinsic) {
        /* Dividend goes in via the "d" (high) and "a" (low) registers,
           quotient and remainder come back through "a" and "d" */
        uint32_t quot_reg, rem_reg;
        __asm__("divl %[v]"
                : "=a"(quot_reg), "=d"(rem_reg)
                : [v] "r"(v), "a"(u0), "d"(u1));
        return { quot_reg, rem_reg };
    }
#endif

    /* Portable path: glue both halves together and divide in 64 bits */
    const uint64_t dividend = (uint64_t(u1) << 32) | u0;
    const uint64_t quotient  = dividend / v;
    const uint64_t remainder = dividend % v;

    return { (uint32_t) quotient, (uint32_t) remainder };
}

/**
 * Divide the 128-bit quantity formed by the words (u1:u0) by the 64-bit
 * divisor \c d.
 *
 * \param u1 Upper 64 bits of the dividend
 * \param u0 Lower 64 bits of the dividend
 * \param d  Divisor
 * \return   A pair {quotient, remainder}
 *
 * Three implementations are selected at compile time:
 *  - a single \c divq instruction (x86-64, GNU-style compiler,
 *    \c UseIntrinsic set),
 *  - native 128-bit arithmetic when \c __SIZEOF_INT128__ is defined
 *    (the quotient is truncated to 64 bits),
 *  - otherwise the schoolbook algorithm from Hacker's Delight, which
 *    returns {-1, -1} when the quotient would not fit (u1 >= d).
 */
template <bool UseIntrinsic = false>
std::pair<uint64_t, uint64_t> div_wide(uint64_t u1, uint64_t u0, uint64_t d) {
#if defined(__GNUC__) && defined(ENOKI_X86_64)
    if constexpr (UseIntrinsic) {
        uint64_t res, rem;
        __asm__("divq %[v]"
                : "=a"(res), "=d"(rem)
                : [v]"r"(d), "a"(u0), "d"(u1));
        return { res, rem };
    }
#endif

#if defined(__SIZEOF_INT128__)
    __uint128_t n = (((__uint128_t) u1) << 64) | u0;
    return {
        (uint64_t) (n / d),
        (uint64_t) (n % d)
    };
#else
    // Code taken from Hacker's Delight:
    // http://www.hackersdelight.org/HDcode/divlu.c.
    // License permits inclusion here per:
    // http://www.hackersdelight.org/permissions.htm

    const uint64_t b = (1ULL << 32); // Number base (32 bits).
    uint64_t un1, un0,  // Norm. dividend LSD's.
    vn1, vn0,           // Norm. divisor digits.
    q1, q0,             // Quotient digits.
    un64, un21, un10,   // Dividend digit pairs.
    rhat;               // A remainder.
    int s;              // Shift amount for norm.

    if (u1 >= d) // overflow
        return { (uint64_t) -1, (uint64_t) -1 };

    // count leading zeros
    s = (int) (63 - log2i(d)); // 0 <= s <= 63.
    if (s > 0) {
        d = d << s;         // Normalize divisor.
        // 1 <= s <= 63 here, so (64 - s) is a valid shift amount and no
        // sign-derived mask is needed.
        un64 = (u1 << s) | (u0 >> (64 - s));
        un10 = u0 << s;     // Shift dividend left.
    } else {
        // s == 0: the divisor is already normalized. Shifting u0 by 64 bits
        // would be undefined behavior, and no low-word bits carry over into
        // the high word, which therefore remains exactly u1.
        un64 = u1;
        un10 = u0;
    }

    vn1 = d >> 32;            // Break divisor up into
    vn0 = d & 0xFFFFFFFF;     // two 32-bit digits.

    un1 = un10 >> 32;         // Break right half of
    un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.

    q1 = un64/vn1;            // Compute the first
    rhat = un64 - q1*vn1;     // quotient digit, q1.

again1:
    // Correct the estimate of q1 (it can be too large by at most 2).
    if (q1 >= b || q1*vn0 > b*rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat < b)
            goto again1;
    }

    un21 = un64*b + un1 - q1*d;  // Multiply and subtract.

    q0 = un21/vn1;            // Compute the second
    rhat = un21 - q0*vn1;     // quotient digit, q0.

again2:
    // Same correction step for the second quotient digit.
    if (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat < b)
            goto again2;
    }

    return {
        q1*b + q0,
        (un21*b + un0 - q0*d) >> s   // Undo the normalization shift.
    };
#endif
}

//! @}
// -----------------------------------------------------------------------

NAMESPACE_END(detail)

#if defined(_MSC_VER)
#  pragma pack(push)
#  pragma pack(1)
#endif

/**
 * Precomputed divisor for unsigned integer types.
 *
 * The constructor derives a (multiplier, shift) pair from the divisor so that
 * a later division only needs a high multiplication, a subtraction, an
 * addition and two shifts (see operator()). Powers of two are encoded with a
 * zero multiplier. Division by one is not representable in this scheme.
 */
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_unsigned_v<T>>> {
    T multiplier;   ///< Magic multiplier (zero for power-of-two divisors)
    uint8_t shift;  ///< Final right shift applied by operator()

    divisor() = default;

    ENOKI_INLINE divisor(T d) {
        /* A divisor of one cannot be expressed by the precomputation */
        assert(d != 1);

        const uint8_t log2_d = (uint8_t) log2i(d);

        if ((d & (d - 1)) != 0) {
            /* General case: start from 2^(bits + log2_d) / d */
            const auto qr =
                detail::div_wide<UseIntrinsic>(T(1) << log2_d, T(0), d);
            const auto quotient  = qr.first;
            const auto remainder = qr.second;
            assert(remainder > 0 && remainder < d);

            /* Double the quotient and add one; bump it once more when the
               doubled remainder reaches d (or wrapped around) */
            multiplier = quotient * 2 + 1;

            const T twice_rem = remainder * 2;
            if (twice_rem >= d || twice_rem < remainder)
                multiplier += 1;

            shift = log2_d;
        } else {
            /* Power of two: operator() already shifts once by one bit,
               so only log2(d) - 1 further bits remain */
            multiplier = 0;
            shift = uint8_t(log2_d - 1);
        }
    }

    /// Divide \c value by the divisor passed to the constructor
    template <typename T2>
    ENOKI_INLINE auto operator()(const T2 &value) const {
        using Expr = decltype(value + value);
        auto hi   = mulhi(Expr(multiplier), value);
        auto mean = sr<1>(value - hi) + hi;
        return mean >> shift;
    }
} ENOKI_PACK;

/**
 * Precomputed divisor for signed integer types.
 *
 * The constructor derives a (multiplier, shift) pair from the absolute value
 * of the divisor. The low bits of \c shift hold the shift amount, and bit 7
 * (0x80) is set when the divisor is negative. Powers of two are encoded with
 * a zero multiplier. Division by +1 or -1 is not representable in this
 * scheme.
 */
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_signed_v<T>>> {
    using U = std::make_unsigned_t<T>;

    T multiplier;   ///< Magic multiplier (zero for power-of-two divisors)
    uint8_t shift;  ///< Shift amount (low bits) + negative-divisor flag (0x80)

    divisor() = default;

    ENOKI_INLINE divisor(T d) {
        /* Division by +/-1 is not supported by the
           precomputation-based approach */
        assert(d != 1 && d != -1);

        /* Take the absolute value in the unsigned domain: negating the
           most negative value of T as a signed quantity would overflow */
        U ud = (U) d;
        U ad = d < 0 ? U(U(0) - ud) : ud;
        shift = (uint8_t) log2i(ad);

        if ((ad & (ad - 1)) == 0) {
            /* Power of two */
            multiplier = 0;
        } else {
            /* General case: start from 2^(bits + shift - 1) / |d| */
            auto [m, rem] =
                detail::div_wide<UseIntrinsic>(U(1) << (shift - 1), U(0), ad);
            multiplier = T(m * 2 + 1);

            /* Bump the multiplier when the doubled remainder reaches |d|
               (or wrapped around) */
            U rem2 = rem * 2;
            if (rem2 >= ad || rem2 < rem)
                multiplier += 1;
        }

        /* Remember the sign of the divisor in the top bit */
        if (d < 0)
            shift |= 0x80;
    }

    /// Divide \c value by the divisor passed to the constructor
    template <typename T2>
    ENOKI_INLINE auto operator()(const T2 &value) const {
        using Expr = decltype(value + value);
        uint8_t shift_ = shift & 0x3f;

        /* All ones when the divisor was negative, zero otherwise */
        Expr sign(int8_t(shift) >> 7);

        auto q = mulhi(Expr(multiplier), value) + value;
        auto q_sign = sr<sizeof(T) * 8 - 1>(q);

        /* For negative q, add 2^shift (or 2^shift - 1 for powers of two)
           before shifting. The constant is computed in the unsigned domain
           so that shift_ == bits - 1 does not overflow; the result always
           fits into T. */
        q += q_sign & T((U(1) << shift_) - U(multiplier == 0 ? 1 : 0));

        /* Conditionally negate the quotient: (x ^ -1) - (-1) == -x */
        return ((q >> shift_) ^ sign) - sign;
    }
} ENOKI_PACK;

/// Precomputed divisor that additionally remembers the original divisor
/// value next to the magic number inherited from \ref divisor
template <typename T> struct divisor_ext : divisor<T> {
    /// The divisor this object was constructed from
    T value;

    ENOKI_INLINE divisor_ext(T d) : divisor<T>(d), value(d) { }
} ENOKI_PACK;

#if defined(_MSC_VER)
#  pragma pack(pop)
#endif

/// Division of a value with integral scalar type by a precomputed divisor;
/// simply forwards to the divisor's call operator
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator/(const T &numerator,
                            const divisor<scalar_t<T>> &denom) {
    return denom(numerator);
}

/// Remainder of a value with integral scalar type with respect to a
/// precomputed divisor, obtained as numerator - quotient * divisor
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator%(const T &numerator,
                            const divisor_ext<scalar_t<T>> &denom) {
    auto quotient = denom(numerator);
    return numerator - quotient * denom.value;
}

// -----------------------------------------------------------------------
//! @{ \name Arithmetic operations for pointer arrays
// -----------------------------------------------------------------------

/**
 * Subtraction involving arrays with pointer-valued scalars.
 *
 * Both operands are converted to arrays of pointer-sized signed integers.
 * When both scalar types are pointers, the byte difference is divided by the
 * pointee size and an integer array is returned. Otherwise the right-hand
 * operand is scaled by the pointee size before the subtraction and the result
 * has the type expr_t<T1, T2>. Power-of-two pointee sizes use the
 * sr_/sl_ members (presumably compile-time shifts) instead of a
 * division/multiplication.
 */
template <typename T1, typename T2,
          typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
          enable_if_t<std::is_pointer_v<S1> || std::is_pointer_v<S2>> = 0,
          enable_if_array_any_t<T1, T2> = 0>
ENOKI_INLINE auto operator-(const T1 &a1_, const T2 &a2_) {
    using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
    using T1i = replace_scalar_t<T1, Int, false>;
    using T2i = replace_scalar_t<T2, Int, false>;
    using Ti  = expr_t<T1i, T2i>;
    using T   = expr_t<T1, T2>;

    /* Pointer minus pointer yields a count, anything else a pointer */
    constexpr bool PointerDiff = std::is_pointer_v<S1> &&
                                 std::is_pointer_v<S2>;
    using Ret = std::conditional_t<PointerDiff, Ti, T>;

    constexpr Int InstanceSize    = sizeof(std::remove_pointer_t<scalar_t<T1>>),
                  LogInstanceSize = detail::clog2i(InstanceSize);

    Ti lhs = Ti((T1i) a1_),
       rhs = Ti((T2i) a2_);

    if constexpr (PointerDiff) {
        /* Byte distance -> number of elements */
        if constexpr (InstanceSize == 1)
            return Ret(lhs.sub_(rhs));
        else if constexpr ((1 << LogInstanceSize) == InstanceSize)
            return Ret(lhs.sub_(rhs).template sr_<LogInstanceSize>());
        else
            return Ret(lhs.sub_(rhs) / InstanceSize);
    } else {
        /* Element offset -> byte offset, then subtract */
        if constexpr (InstanceSize == 1)
            return Ret(lhs.sub_(rhs));
        else if constexpr ((1 << LogInstanceSize) == InstanceSize)
            return Ret(lhs.sub_(rhs.template sl_<LogInstanceSize>()));
        else
            return Ret(lhs.sub_(rhs * InstanceSize));
    }
}


/**
 * Addition of an offset to an array with pointer-valued scalars.
 *
 * Both operands are converted to arrays of pointer-sized signed integers.
 * The right-hand operand is scaled by the pointee size of the left-hand
 * operand before being added, and the sum is converted to expr_t<T1, T2>.
 * Power-of-two pointee sizes use the sl_ member (presumably a compile-time
 * left shift) instead of a multiplication.
 */
template <typename T1, typename T2,
          typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
          enable_if_t<std::is_pointer_v<S1> && !std::is_pointer_v<S2>> = 0,
          enable_if_array_any_t<T1, T2> = 0>
ENOKI_INLINE auto operator+(const T1 &a1_, const T2 &a2_) {
    using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
    using T1i = replace_scalar_t<T1, Int, false>;
    using T2i = replace_scalar_t<T2, Int, false>;
    using Ti  = expr_t<T1i, T2i>;
    using Ret = expr_t<T1, T2>;

    constexpr Int InstanceSize    = sizeof(std::remove_pointer_t<scalar_t<T1>>),
                  LogInstanceSize = detail::clog2i(InstanceSize);

    Ti a1 = Ti((T1i) a1_),
       a2 = Ti((T2i) a2_);

    /* Exactly one branch is instantiated (same chain as operator-); without
       the 'else', the shift variant was additionally compiled as dead code
       after the return when InstanceSize == 1 */
    if constexpr (InstanceSize == 1)
        return Ret(a1.add_(a2));
    else if constexpr ((1 << LogInstanceSize) == InstanceSize)
        return Ret(a1.add_(a2.template sl_<LogInstanceSize>()));
    else
        return Ret(a1.add_(a2 * InstanceSize));
}

//! @}
// -----------------------------------------------------------------------

NAMESPACE_END(enoki)