/* enoki/array_idiv.h -- fast precomputed integer division by constants based on libdivide (https://github.com/ridiculousfish/libdivide) Copyright (C) 2010 ridiculous_fish This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. libdivide@ridiculousfish.com */ #pragma once #include NAMESPACE_BEGIN(enoki) NAMESPACE_BEGIN(detail) // ----------------------------------------------------------------------- //! @{ \name Precomputation for division by integer constants // ----------------------------------------------------------------------- template std::pair div_wide(uint32_t u1, uint32_t u0, uint32_t v) { #if defined(__GNUC__) && (defined(ENOKI_X86_32) || defined(ENOKI_X86_64)) if constexpr (UseIntrinsic) { uint32_t res, rem; __asm__("divl %[v]" : "=a"(res), "=d"(rem) : [v] "r"(v), "a"(u0), "d"(u1)); return { res, rem }; } #endif uint64_t u = (((uint64_t) u1) << 32) | u0; return { (uint32_t) (u / v), (uint32_t) (u % v) }; } template std::pair div_wide(uint64_t u1, uint64_t u0, uint64_t d) { #if defined(__GNUC__) && defined(ENOKI_X86_64) if constexpr (UseIntrinsic) { uint64_t res, rem; __asm__("divq %[v]" : "=a"(res), "=d"(rem) : [v]"r"(d), "a"(u0), "d"(u1)); return { res, rem }; } #endif #if defined(__SIZEOF_INT128__) __uint128_t n = (((__uint128_t) u1) << 64) | u0; return { (uint64_t) (n / d), (uint64_t) (n % d) }; #else // Code taken from Hacker's Delight: // http://www.hackersdelight.org/HDcode/divlu.c. // License permits inclusion here per: // http://www.hackersdelight.org/permissions.htm const uint64_t b = (1ULL << 32); // Number base (16 bits). uint64_t un1, un0, // Norm. dividend LSD's. vn1, vn0, // Norm. divisor digits. q1, q0, // Quotient digits. un64, un21, un10, // Dividend digit pairs. rhat; // A remainder. int s; // Shift amount for norm. if (u1 >= d) // overflow return { (uint64_t) -1, (uint64_t) -1 }; // count leading zeros s = (int) (63 - log2i(d)); // 0 <= s <= 63. if (s > 0) { d = d << s; // Normalize divisor. un64 = (u1 << s) | ((u0 >> (64 - s)) & uint64_t(-s >> 31)); un10 = u0 << s; // Shift dividend left. } else { // Avoid undefined behavior. un64 = u1 | u0; un10 = u0; } vn1 = d >> 32; // Break divisor up into vn0 = d & 0xFFFFFFFF; // two 32-bit digits. un1 = un10 >> 32; // Break right half of un0 = un10 & 0xFFFFFFFF; // dividend into two digits. q1 = un64/vn1; // Compute the first rhat = un64 - q1*vn1; // quotient digit, q1. again1: if (q1 >= b || q1*vn0 > b*rhat + un1) { q1 = q1 - 1; rhat = rhat + vn1; if (rhat < b) goto again1; } un21 = un64*b + un1 - q1*d; // Multiply and subtract. q0 = un21/vn1; // Compute the second rhat = un21 - q0*vn1; // quotient digit, q0. again2: if (q0 >= b || q0 * vn0 > b * rhat + un0) { q0 = q0 - 1; rhat = rhat + vn1; if (rhat < b) goto again2; } return { q1*b + q0, (un21*b + un0 - q0*d) >> s }; #endif } //! @} // ----------------------------------------------------------------------- NAMESPACE_END(detail) #if defined(_MSC_VER) # pragma pack(push) # pragma pack(1) #endif template struct divisor>> { T multiplier; uint8_t shift; divisor() = default; ENOKI_INLINE divisor(T d) { /* Division by +/-1 is not supported by the precomputation-based approach */ assert(d != 1); shift = (uint8_t) log2i(d); if ((d & (d - 1)) == 0) { /* Power of two */ multiplier = 0; shift--; } else { /* General case */ auto [m, rem] = detail::div_wide(T(1) << shift, T(0), d); multiplier = m * 2 + 1; assert(rem > 0 && rem < d); T rem2 = rem * 2; if (rem2 >= d || rem2 < rem) multiplier += 1; } } template ENOKI_INLINE auto operator()(const T2 &value) const { using Expr = decltype(value + value); auto q = mulhi(Expr(multiplier), value); auto t = sr<1>(value - q) + q; return t >> shift; } } ENOKI_PACK; template struct divisor>> { using U = std::make_unsigned_t; T multiplier; uint8_t shift; divisor() = default; ENOKI_INLINE divisor(T d) { /* Division by +/-1 is not supported by the precomputation-based approach */ assert(d != 1 && d != -1); U ad = d < 0 ? (U) -d : (U) d; shift = (uint8_t) log2i(ad); if ((ad & (ad - 1)) == 0) { /* Power of two */ multiplier = 0; } else { /* General case */ auto [m, rem] = detail::div_wide(U(1) << (shift - 1), U(0), ad); multiplier = T(m * 2 + 1); U rem2 = rem * 2; if (rem2 >= ad || rem2 < rem) multiplier += 1; } if (d < 0) shift |= 0x80; } template ENOKI_INLINE auto operator()(const T2 &value) const { using Expr = decltype(value + value); uint8_t shift_ = shift & 0x3f; Expr sign(int8_t(shift) >> 7); auto q = mulhi(Expr(multiplier), value) + value; auto q_sign = sr(q); q += q_sign & ((T(1) << shift_) - (multiplier == 0 ? 1 : 0)); return ((q >> shift_) ^ sign) - sign; } } ENOKI_PACK; /// Stores *both* the original divisor + magic number template struct divisor_ext : divisor { T value; ENOKI_INLINE divisor_ext(T value) : divisor(value), value(value) { } } ENOKI_PACK; #if defined(_MSC_VER) # pragma pack(pop) #endif template >> = 0> ENOKI_INLINE auto operator/(const T &a, const divisor> &div) { return div(a); } template >> = 0> ENOKI_INLINE auto operator%(const T &a, const divisor_ext> &div) { return a - div(a) * div.value; } // ----------------------------------------------------------------------- //! @{ \name Arithmetic operations for pointer arrays // ----------------------------------------------------------------------- template , typename S2 = scalar_t, enable_if_t || std::is_pointer_v> = 0, enable_if_array_any_t = 0> ENOKI_INLINE auto operator-(const T1 &a1_, const T2 &a2_) { using Int = std::conditional_t; using T1i = replace_scalar_t; using T2i = replace_scalar_t; using Ti = expr_t; using T = expr_t; constexpr Int InstanceSize = sizeof(std::remove_pointer_t>), LogInstanceSize = detail::clog2i(InstanceSize); constexpr bool PointerDiff = std::is_pointer_v && std::is_pointer_v; using Ret = std::conditional_t; Ti a1 = Ti((T1i) a1_), a2 = Ti((T2i) a2_); if constexpr (InstanceSize == 1) { return Ret(a1.sub_(a2)); } else if constexpr ((1 << LogInstanceSize) == InstanceSize) { if constexpr (PointerDiff) return Ret(a1.sub_(a2).template sr_()); else return Ret(a1.sub_(a2.template sl_())); } else { if constexpr (PointerDiff) return Ret(a1.sub_(a2) / InstanceSize); else return Ret(a1.sub_(a2 * InstanceSize)); } } template , typename S2 = scalar_t, enable_if_t && !std::is_pointer_v> = 0, enable_if_array_any_t = 0> ENOKI_INLINE auto operator+(const T1 &a1_, const T2 &a2_) { using Int = std::conditional_t; using T1i = replace_scalar_t; using T2i = replace_scalar_t; using Ti = expr_t; using Ret = expr_t; constexpr Int InstanceSize = sizeof(std::remove_pointer_t>), LogInstanceSize = detail::clog2i(InstanceSize); Ti a1 = Ti((T1i) a1_), a2 = Ti((T2i) a2_); if constexpr (InstanceSize == 1) return Ret(a1.add_(a2)); if constexpr ((1 << LogInstanceSize) == InstanceSize) return Ret(a1.add_(a2.template sl_())); else return Ret(a1.add_(a2 * InstanceSize)); } //! @} // ----------------------------------------------------------------------- NAMESPACE_END(enoki)