cocos-engine-external/sources/enoki/array_idiv.h

328 lines
10 KiB
C++

/*
enoki/array_idiv.h -- fast precomputed integer division by constants based
on libdivide (https://github.com/ridiculousfish/libdivide)
Copyright (C) 2010 ridiculous_fish
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
libdivide@ridiculousfish.com
*/
#pragma once
#include <enoki/array_generic.h>
NAMESPACE_BEGIN(enoki)
NAMESPACE_BEGIN(detail)
// -----------------------------------------------------------------------
//! @{ \name Precomputation for division by integer constants
// -----------------------------------------------------------------------
template <bool UseIntrinsic = false>
std::pair<uint32_t, uint32_t> div_wide(uint32_t u1, uint32_t u0, uint32_t v) {
#if defined(__GNUC__) && (defined(ENOKI_X86_32) || defined(ENOKI_X86_64))
if constexpr (UseIntrinsic) {
uint32_t res, rem;
__asm__("divl %[v]"
: "=a"(res), "=d"(rem)
: [v] "r"(v), "a"(u0), "d"(u1));
return { res, rem };
}
#endif
uint64_t u = (((uint64_t) u1) << 32) | u0;
return { (uint32_t) (u / v),
(uint32_t) (u % v) };
}
template <bool UseIntrinsic = false>
std::pair<uint64_t, uint64_t> div_wide(uint64_t u1, uint64_t u0, uint64_t d) {
#if defined(__GNUC__) && defined(ENOKI_X86_64)
if constexpr (UseIntrinsic) {
uint64_t res, rem;
__asm__("divq %[v]"
: "=a"(res), "=d"(rem)
: [v]"r"(d), "a"(u0), "d"(u1));
return { res, rem };
}
#endif
#if defined(__SIZEOF_INT128__)
__uint128_t n = (((__uint128_t) u1) << 64) | u0;
return {
(uint64_t) (n / d),
(uint64_t) (n % d)
};
#else
// Code taken from Hacker's Delight:
// http://www.hackersdelight.org/HDcode/divlu.c.
// License permits inclusion here per:
// http://www.hackersdelight.org/permissions.htm
const uint64_t b = (1ULL << 32); // Number base (16 bits).
uint64_t un1, un0, // Norm. dividend LSD's.
vn1, vn0, // Norm. divisor digits.
q1, q0, // Quotient digits.
un64, un21, un10, // Dividend digit pairs.
rhat; // A remainder.
int s; // Shift amount for norm.
if (u1 >= d) // overflow
return { (uint64_t) -1, (uint64_t) -1 };
// count leading zeros
s = (int) (63 - log2i(d)); // 0 <= s <= 63.
if (s > 0) {
d = d << s; // Normalize divisor.
un64 = (u1 << s) | ((u0 >> (64 - s)) & uint64_t(-s >> 31));
un10 = u0 << s; // Shift dividend left.
} else {
// Avoid undefined behavior.
un64 = u1 | u0;
un10 = u0;
}
vn1 = d >> 32; // Break divisor up into
vn0 = d & 0xFFFFFFFF; // two 32-bit digits.
un1 = un10 >> 32; // Break right half of
un0 = un10 & 0xFFFFFFFF; // dividend into two digits.
q1 = un64/vn1; // Compute the first
rhat = un64 - q1*vn1; // quotient digit, q1.
again1:
if (q1 >= b || q1*vn0 > b*rhat + un1) {
q1 = q1 - 1;
rhat = rhat + vn1;
if (rhat < b)
goto again1;
}
un21 = un64*b + un1 - q1*d; // Multiply and subtract.
q0 = un21/vn1; // Compute the second
rhat = un21 - q0*vn1; // quotient digit, q0.
again2:
if (q0 >= b || q0 * vn0 > b * rhat + un0) {
q0 = q0 - 1;
rhat = rhat + vn1;
if (rhat < b)
goto again2;
}
return {
q1*b + q0,
(un21*b + un0 - q0*d) >> s
};
#endif
}
//! @}
// -----------------------------------------------------------------------
NAMESPACE_END(detail)
#if defined(_MSC_VER)
# pragma pack(push)
# pragma pack(1)
#endif
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_unsigned_v<T>>> {
T multiplier;
uint8_t shift;
divisor() = default;
ENOKI_INLINE divisor(T d) {
/* Division by +/-1 is not supported by the
precomputation-based approach */
assert(d != 1);
shift = (uint8_t) log2i(d);
if ((d & (d - 1)) == 0) {
/* Power of two */
multiplier = 0;
shift--;
} else {
/* General case */
auto [m, rem] =
detail::div_wide<UseIntrinsic>(T(1) << shift, T(0), d);
multiplier = m * 2 + 1;
assert(rem > 0 && rem < d);
T rem2 = rem * 2;
if (rem2 >= d || rem2 < rem)
multiplier += 1;
}
}
template <typename T2>
ENOKI_INLINE auto operator()(const T2 &value) const {
using Expr = decltype(value + value);
auto q = mulhi(Expr(multiplier), value);
auto t = sr<1>(value - q) + q;
return t >> shift;
}
} ENOKI_PACK;
template <typename T, bool UseIntrinsic>
struct divisor<T, UseIntrinsic, enable_if_t<std::is_signed_v<T>>> {
using U = std::make_unsigned_t<T>;
T multiplier;
uint8_t shift;
divisor() = default;
ENOKI_INLINE divisor(T d) {
/* Division by +/-1 is not supported by the
precomputation-based approach */
assert(d != 1 && d != -1);
U ad = d < 0 ? (U) -d : (U) d;
shift = (uint8_t) log2i(ad);
if ((ad & (ad - 1)) == 0) {
/* Power of two */
multiplier = 0;
} else {
/* General case */
auto [m, rem] =
detail::div_wide<UseIntrinsic>(U(1) << (shift - 1), U(0), ad);
multiplier = T(m * 2 + 1);
U rem2 = rem * 2;
if (rem2 >= ad || rem2 < rem)
multiplier += 1;
}
if (d < 0)
shift |= 0x80;
}
template <typename T2>
ENOKI_INLINE auto operator()(const T2 &value) const {
using Expr = decltype(value + value);
uint8_t shift_ = shift & 0x3f;
Expr sign(int8_t(shift) >> 7);
auto q = mulhi(Expr(multiplier), value) + value;
auto q_sign = sr<sizeof(T) * 8 - 1>(q);
q += q_sign & ((T(1) << shift_) - (multiplier == 0 ? 1 : 0));
return ((q >> shift_) ^ sign) - sign;
}
} ENOKI_PACK;
/// Stores *both* the original divisor + magic number
template <typename T> struct divisor_ext : divisor<T> {
T value;
ENOKI_INLINE divisor_ext(T value) : divisor<T>(value), value(value) { }
} ENOKI_PACK;
#if defined(_MSC_VER)
# pragma pack(pop)
#endif
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator/(const T &a, const divisor<scalar_t<T>> &div) {
return div(a);
}
template <typename T, enable_if_t<std::is_integral_v<scalar_t<T>>> = 0>
ENOKI_INLINE auto operator%(const T &a, const divisor_ext<scalar_t<T>> &div) {
return a - div(a) * div.value;
}
// -----------------------------------------------------------------------
//! @{ \name Arithmetic operations for pointer arrays
// -----------------------------------------------------------------------
template <typename T1, typename T2,
typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
enable_if_t<std::is_pointer_v<S1> || std::is_pointer_v<S2>> = 0,
enable_if_array_any_t<T1, T2> = 0>
ENOKI_INLINE auto operator-(const T1 &a1_, const T2 &a2_) {
using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
using T1i = replace_scalar_t<T1, Int, false>;
using T2i = replace_scalar_t<T2, Int, false>;
using Ti = expr_t<T1i, T2i>;
using T = expr_t<T1, T2>;
constexpr Int InstanceSize = sizeof(std::remove_pointer_t<scalar_t<T1>>),
LogInstanceSize = detail::clog2i(InstanceSize);
constexpr bool PointerDiff = std::is_pointer_v<S1> &&
std::is_pointer_v<S2>;
using Ret = std::conditional_t<PointerDiff, Ti, T>;
Ti a1 = Ti((T1i) a1_),
a2 = Ti((T2i) a2_);
if constexpr (InstanceSize == 1) {
return Ret(a1.sub_(a2));
} else if constexpr ((1 << LogInstanceSize) == InstanceSize) {
if constexpr (PointerDiff)
return Ret(a1.sub_(a2).template sr_<LogInstanceSize>());
else
return Ret(a1.sub_(a2.template sl_<LogInstanceSize>()));
} else {
if constexpr (PointerDiff)
return Ret(a1.sub_(a2) / InstanceSize);
else
return Ret(a1.sub_(a2 * InstanceSize));
}
}
template <typename T1, typename T2,
typename S1 = scalar_t<T1>, typename S2 = scalar_t<T2>,
enable_if_t<std::is_pointer_v<S1> && !std::is_pointer_v<S2>> = 0,
enable_if_array_any_t<T1, T2> = 0>
ENOKI_INLINE auto operator+(const T1 &a1_, const T2 &a2_) {
using Int = std::conditional_t<sizeof(void *) == 8, int64_t, int32_t>;
using T1i = replace_scalar_t<T1, Int, false>;
using T2i = replace_scalar_t<T2, Int, false>;
using Ti = expr_t<T1i, T2i>;
using Ret = expr_t<T1, T2>;
constexpr Int InstanceSize = sizeof(std::remove_pointer_t<scalar_t<T1>>),
LogInstanceSize = detail::clog2i(InstanceSize);
Ti a1 = Ti((T1i) a1_),
a2 = Ti((T2i) a2_);
if constexpr (InstanceSize == 1)
return Ret(a1.add_(a2));
if constexpr ((1 << LogInstanceSize) == InstanceSize)
return Ret(a1.add_(a2.template sl_<LogInstanceSize>()));
else
return Ret(a1.add_(a2 * InstanceSize));
}
//! @}
// -----------------------------------------------------------------------
NAMESPACE_END(enoki)