/* enoki/special.h -- Special functions: Bessel functions, Elliptic and exponential integrals, etc. (still incomplete) Enoki is a C++ template library that enables transparent vectorization of numerical kernels using SIMD instruction sets available on current processor architectures. Copyright (c) 2019 Wenzel Jakob All rights reserved. Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. */ #include #pragma once NAMESPACE_BEGIN(enoki) /// Evaluates a series of Chebyshev polynomials at argument x/2. template > Expr chbevl(const T &x, T2 (&coeffs)[Size]) { using Scalar = scalar_t; Expr b0 = Scalar(coeffs[0]); Expr b1 = Scalar(0); Expr b2; ENOKI_UNROLL for (size_t i = 0; i < Size; ++i) { b2 = b1; b1 = b0; b0 = fmsub(x, b1, b2 - Scalar(coeffs[i])); } return (b0 - b2) * Scalar(0.5f); } template = 0> T erf(const T &x) { return std::erf(x); } template = 0> T erfc(const T &x) { return std::erfc(x); } template , enable_if_array_t = 0> Expr erfc(const T &x); template , enable_if_array_t = 0> Expr erf(const T &x); template > Expr erfc(const T &x) { constexpr bool Single = std::is_same_v, float>; using Scalar = scalar_t; Expr r; Expr xa = abs(x), z = exp(-x*x); auto erf_mask = xa < Scalar(1), large_mask = xa > Scalar(Single ? 2 : 8); ENOKI_MARK_USED(erf_mask); if constexpr (Single) { Expr q = rcp(xa), y = q*q, p_small, p_large; if (is_cuda_array_v || !all_nested(large_mask)) p_small = poly8(y, 5.638259427386472e-1, -2.741127028184656e-1, 3.404879937665872e-1, -4.944515323274145e-1, 6.210004621745983e-1, -5.824733027278666e-1, 3.687424674597105e-1, -1.387039388740657e-1, 2.326819970068386e-2); if (is_cuda_array_v || any_nested(large_mask)) p_large = poly7(y, 5.641895067754075e-1, -2.820767439740514e-1, 4.218463358204948e-1, -1.015265279202700e+0, 2.921019019210786e+0, -7.495518717768503e+0, 1.297719955372516e+1, -1.047766399936249e+1); r = z * q * select(large_mask, p_large, p_small); } else { Expr p_small, p_large, q_small, q_large; if (is_cuda_array_v || !all_nested(large_mask)) { p_small = poly8(xa, 5.57535335369399327526e2, 1.02755188689515710272e3, 9.34528527171957607540e2, 5.26445194995477358631e2, 1.96520832956077098242e2, 4.86371970985681366614e1, 7.46321056442269912687e0, 5.64189564831068821977e-1, 2.46196981473530512524e-10); q_small = poly8(xa, 5.57535340817727675546e2, 1.65666309194161350182e3, 2.24633760818710981792e3, 1.82390916687909736289e3, 9.75708501743205489753e2, 3.54937778887819891062e2, 8.67072140885989742329e1, 1.32281951154744992508e1, 1.00000000000000000000e0); } if (is_cuda_array_v || any_nested(large_mask)) { p_large = poly5(xa, 2.97886665372100240670e0, 7.40974269950448939160e0, 6.16021097993053585195e0, 5.01905042251180477414e0, 1.27536670759978104416e0, 5.64189583547755073984e-1); q_large = poly6(xa, 3.36907645100081516050e0, 9.60896809063285878198e0, 1.70814450747565897222e1, 1.20489539808096656605e1, 9.39603524938001434673e0, 2.26052863220117276590e0, 1.00000000000000000000e0); } r = (z * select(large_mask, p_large, p_small)) / select(large_mask, q_large, q_small); r &= neq(z, zero()); } r[x < Scalar(0)] = Scalar(2) - r; if constexpr (Recurse) { if (ENOKI_UNLIKELY(is_cuda_array_v || any_nested(erf_mask))) r[erf_mask] = Scalar(1) - erf(x); } return r; } template > Expr erf(const T &x) { using Scalar = scalar_t; Expr r; auto erfc_mask = abs(x) > Scalar(1); ENOKI_MARK_USED(erfc_mask); Expr z = x * x; constexpr bool Single = std::is_same_v, float>; if constexpr (Single) { r = poly6(z, 1.128379165726710e+0, -3.761262582423300e-1, 1.128358514861418e-1, -2.685381193529856e-2, 5.188327685732524e-3, -8.010193625184903e-4, 7.853861353153693e-5); } else { r = poly4(z, 5.55923013010394962768e4, 7.00332514112805075473e3, 2.23200534594684319226e3, 9.00260197203842689217e1, 9.60497373987051638749e0) / poly5(z, 4.92673942608635921086e4, 2.26290000613890934246e4, 4.59432382970980127987e3, 5.21357949780152679795e2, 3.35617141647503099647e1, 1.00000000000000000000e0); } r *= x; if constexpr (Recurse) { if (ENOKI_UNLIKELY(is_cuda_array_v || any_nested(erfc_mask))) r[erfc_mask] = Scalar(1) - erfc(x); } return r; } /// Modified Bessel function of the first kind, order zero (exponentially scaled) template > Expr i0e(const T &x_) { using Scalar = scalar_t; /* Chebyshev coefficients for exp(-x) I0(x) * in the interval [0,8]. * * lim(x->0) { exp(-x) I0(x) } = 1. */ static Scalar A[] = { Scalar(-1.30002500998624804212E-8), Scalar(6.04699502254191894932E-8), Scalar(-2.67079385394061173391E-7), Scalar(1.11738753912010371815E-6), Scalar(-4.41673835845875056359E-6), Scalar(1.64484480707288970893E-5), Scalar(-5.75419501008210370398E-5), Scalar(1.88502885095841655729E-4), Scalar(-5.76375574538582365885E-4), Scalar(1.63947561694133579842E-3), Scalar(-4.32430999505057594430E-3), Scalar(1.05464603945949983183E-2), Scalar(-2.37374148058994688156E-2), Scalar(4.93052842396707084878E-2), Scalar(-9.49010970480476444210E-2), Scalar(1.71620901522208775349E-1), Scalar(-3.04682672343198398683E-1), Scalar(6.76795274409476084995E-1) }; /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) * in the inverted interval [8,infinity]. * * lim(x->inf) { exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). */ static Scalar B[] = { Scalar(3.39623202570838634515E-9), Scalar(2.26666899049817806459E-8), Scalar(2.04891858946906374183E-7), Scalar(2.89137052083475648297E-6), Scalar(6.88975834691682398426E-5), Scalar(3.36911647825569408990E-3), Scalar(8.04490411014108831608E-1) }; Expr x = abs(x_); auto mask_big = x > Scalar(8); Expr r_big, r_small; if (!all_nested(mask_big)) r_small = chbevl(fmsub(x, Expr(Scalar(0.5)), Expr(Scalar(2))), A); if (any_nested(mask_big)) r_big = chbevl(fmsub(Expr(Scalar(32)), rcp(x), Expr(Scalar(2))), B) * rsqrt(x); return select(mask_big, r_big, r_small); } // Inverse real error function approximation based on on "Approximating the // erfinv function" by Mark Giles template > Expr erfinv(const T &x_) { using Scalar = scalar_t; Expr x(x_); Expr w = -log((Expr(Scalar(1)) - x) * (Expr(Scalar(1)) + x)); Expr w1 = w - Scalar(2.5); Expr w2 = sqrt(w) - Scalar(3); Expr p1 = poly8(w1, 1.50140941, 0.246640727, -0.00417768164, -0.00125372503, 0.00021858087, -4.39150654e-06, -3.5233877e-06, 3.43273939e-07, 2.81022636e-08); Expr p2 = poly8(w2, 2.83297682, 1.00167406, 0.00943887047, -0.0076224613, 0.00573950773, -0.00367342844, 0.00134934322, 0.000100950558, -0.000200214257); return select(w < Scalar(5), p1, p2) * x; } /// Evaluates Dawson's integral (e^(-x^2) \int_0^x e^(y^2) dy) template > Expr dawson(const T &x) { // Rational minimax approximation to Dawson's integral with relative // error < 1e-6 on the real number line. July 2017, Wenzel Jakob Expr x2 = x*x; Expr num = poly6(x2, 1.00000080272429,9.18170212243285e-2, 4.25835373536124e-2, 6.0536496345054e-3, 9.88555033724111e-4, 3.64943550840577e-5, 1.55942290996993e-5); Expr denom = poly7(x2, 1.0, 7.58517175815194e-1, 2.81364355593059e-1, 6.81783097841267e-2, 1.13586116798019e-2, 1.92020805811771e-3, 5.74217664074868e-5, 3.11884331363595e-5); return num / denom * x; } /// Imaginary component of the error function template > Expr erfi(const T &x) { using Scalar = scalar_t; return Scalar(M_2_SQRTPI) * dawson(x) * exp(x * x); } /// Natural logarithm of the Gamma function template Value lgamma(Value x_) { using Mask = mask_t; using Scalar = scalar_t; // 'g' and 'n' parameters of the Lanczos approximation // See mrob.com/pub/ries/lanczos-gamma.html const int n = 6; const Scalar g = 5.0f; const Scalar log_sqrt2pi = Scalar(0.91893853320467274178); const Scalar coeff[n + 1] = { (Scalar) 1.000000000190015, (Scalar) 76.18009172947146, (Scalar) -86.50532032941677, (Scalar) 24.01409824083091, (Scalar) -1.231739572450155, (Scalar) 0.1208650973866179e-2, (Scalar) -0.5395239384953e-5 }; // potentially reflect using gamma(x) = pi / (sin(pi*x) * gamma(1-x)) Mask reflect = x_ < .5f; Value x = select(reflect, -x_, x_ - 1.f), b = x + g + .5f; // base Value sum = 0; for (int i = n; i >= 1; --i) sum += coeff[i] / (x + Scalar(i)); sum += coeff[0]; // gamma(x) = sqrt(2*pi) * sum * b^(x + .5) / exp(b) Value result = ((log_sqrt2pi + log(sum)) - b) + log(b) * (x + .5f); if (is_cuda_array_v || any_nested(reflect)) { masked(result, reflect) = log(abs(Scalar(M_PI) / sin(Scalar(M_PI) * x_))) - result; masked(result, reflect && eq(x_, round(x_))) = std::numeric_limits::infinity(); } return result; } /// Gamma function template Value tgamma(Value x) { return exp(lgamma(x)); } /** * Computes a Carlson integral of the form * * R_F(X, Y, Z) = 1/2 * \int_{0}^\infty ((t + x) (t + y) (t + z))^(-1/2) dt * * Based on * * Computing elliptic integrals by duplication * B. C. Carlson * Numerische Mathematik, March 1979, Volume 33, Issue 1 */ template , typename Scalar = scalar_t> Value carlson_rf(Vector3 xyz) { static_assert( Vector3::Size == 3, "carlson_rf(): Expected a three-dimensional input vector (x, y, z)"); assert(all_nested(xyz.x() >= Scalar(0) && xyz.y() > Scalar(0) && xyz.z() > Scalar(0))); Vector3 XYZ; Value mu_inv; mask_t active = true; int iterations = 0; while (true) { Vector3 sqrt_xyz = sqrt(xyz); Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz); Value mu = hsum(xyz) * Scalar(1.0 / 3.0); mu_inv = rcp(mu); XYZ = fnmadd(xyz, mu_inv, Scalar(1)); Value eps = hmax(abs(XYZ)); active &= eps > Scalar(std::is_same_v ? 0.0024608 : 0.070154); // eps ^ (1/6) if (none(active) || ++iterations == 10) break; xyz[mask_t(active)] = (xyz + lambda) * Scalar(0.25); } /* Use recurrences for cheaper polynomial evaluation. Based on Numerical Recipes (3rd ed) by Press, Teukolsky, Vetterling, and Flannery */ Value e2 = XYZ.x() * XYZ.y() - XYZ.z() * XYZ.z(), e3 = hprod(XYZ), er = (Scalar(1.0 / 24.0) * e2 - Scalar(1.0 / 10.0) - Scalar(3.0 / 44.0) * e3) * e2 + Scalar(1.0 / 14.0) * e3; return sqrt(mu_inv) * (Scalar(1) + er); } /** * Computes a Carlson integral of the form * * R_D(x, y, z) = 3/2 * \int_{0}^\infty (t + x)^(-1/2) (t + y)^(-1/2) (t + z)^(-3/2) dt * * Based on * * Computing elliptic integrals by duplication * B. C. Carlson * Numerische Mathematik, March 1979, Volume 33, Issue 1 */ template , typename Scalar = scalar_t> Value carlson_rd(Vector3 xyz) { static_assert( Vector3::Size == 3, "carlson_rd(): Expected a three-dimensional input vector (x, y, z)"); assert(all_nested(xyz.x() >= Scalar(0) && xyz.y() > Scalar(0) && xyz.z() > Scalar(0))); Vector3 XYZ; Value mu_inv; mask_t active = true; int iterations = 0; Value sum = 0; Value num = 1; const Vector3 W(Scalar(1.0 / 5.0), Scalar(1.0 / 5.0), Scalar(3.0 / 5.0)); while (true) { Vector3 sqrt_xyz = sqrt(xyz); Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz); Value mu = hsum(xyz * W); mu_inv = rcp(mu); XYZ = fnmadd(xyz, mu_inv, Scalar(1)); Value eps = hmax(abs(XYZ)); active &= eps > Scalar(std::is_same_v ? (0.0024608 * 0.6) : (0.070154 * 0.6)); // eps ^ (1/6) * 0.6 if (none(active) || ++iterations == 10) break; masked(sum, active) += num / (sqrt(xyz.z()) * (xyz.z() + lambda)); masked(num, active) *= Scalar(0.25f); masked(xyz, mask_t(active)) = (xyz + lambda) * Scalar(0.25f); } /* Use recurrences for cheaper polynomial evaluation. Based on Numerical Recipes (3rd ed) by Press, Teukolsky, Vetterling, and Flannery */ Value z = XYZ.z(), ea = XYZ.x() * XYZ.y(), eb = z * z, ec = ea - eb, ed = fnmadd(Scalar(6), eb, ea), ee = fmadd(ec, Scalar(2), ed); Value p = ed * (-Scalar(3.0 / 14.0) + Scalar(9.0 / 88.0) * ed - Scalar(1.0 / 4.0) * z * ee) + z * (Scalar(1.0 / 6.0) * ee + z * (-Scalar(9.0 / 22.0) * ec + z * Scalar(3.0 / 26.0) * ea)); return Scalar(3) * sum + num * mu_inv * sqrt(mu_inv) * (Scalar(1.0) + p); } /** * Computes a Carlson integral of the form * * R_C(x, y) = 1/2 * \int_{0}^\infty (t + x)^(-1/2) (t + y)^-1 dt * * Based on * * Computing elliptic integrals by duplication * B. C. Carlson * Numerische Mathematik, March 1979, Volume 33, Issue 1 */ template , typename Scalar = scalar_t> Value carlson_rc(Vector2 xy) { static_assert( Vector2::Size == 2, "carlson_rc(): Expected a two-dimensional input vector (x, y)"); assert(all(xy.x() >= Scalar(0) && xy.y() > Scalar(0))); mask_t active = true; Value inv_mu, s; int iterations = 0; while (true) { Value lambda = hprod(sqrt(xy)); lambda += lambda + xy.y(); Value mu = fmadd(xy.x(), Scalar(1.0 / 3.0), xy.y() * Scalar(2.0 / 3.0)); inv_mu = rcp(mu); s = (xy.y() - mu) * inv_mu; active &= abs(s) > Scalar(std::is_same_v ? (0.0024608 * 0.48) : (0.070154 * 0.48)); // eps ^ (1/6) * 0.48 if (none(active) || ++iterations == 10) break; masked(xy, mask_t(active)) = (xy + lambda) * Scalar(0.25f); } /* Use recurrences for cheaper polynomial evaluation. Based on Numerical Recipes (3rd ed) by Press, Teukolsky, Vetterling, and Flannery */ return sqrt(inv_mu) * (Scalar(1) + s * s * (Scalar(0.3) + s * (Scalar(1.0 / 7.0) + s * (Scalar(0.375) + s * Scalar(9.0 / 22.0))))); } /** * Computes a Carlson integral of the form * * R_J(x, y, z, rho) = 3/2 * \int_{0}^\infty ((t + x) (t + y) (t + z))^(-1/2) (t+rho)^(-1) dt * * Based on * * Computing elliptic integrals by duplication * B. C. Carlson * Numerische Mathematik, March 1979, Volume 33, Issue 1 */ template , typename Vector2 = Array, typename Scalar = scalar_t> Value carlson_rj(Vector4 xyzr) { static_assert( Vector4::Size == 4, "carlson_rj(): Expected a four-dimensional input vector (x, y, z, rho)"); assert(all(xyzr.x() >= Scalar(0) && xyzr.y() > Scalar(0) && xyzr.z() > Scalar(0) && xyzr.w() > Scalar(0))); Vector4 XYZR; Value mu_inv; mask_t active = true; int iterations = 0; Value sum = 0; Value num = 1; while (true) { auto xyz = head<3>(xyzr); auto rho = xyzr.w(); auto sqrt_xyz = sqrt(xyz); Value lambda = dot(shuffle<1, 2, 0>(sqrt_xyz), sqrt_xyz); Value mu = (hsum(xyzr) + rho) * Scalar(1.0 / 5.0); mu_inv = rcp(mu); XYZR = fnmadd(xyzr, mu_inv, Scalar(1)); Value eps = hmax(abs(XYZR)); active &= eps > Scalar(std::is_same_v ? (0.0024608 * 0.6) : (0.070154 * 0.6)); // eps ^ (1/6) * 0.6 Value alpha = rho * hsum(sqrt(xyz)) + sqrt(hprod(xyz)); alpha *= alpha; Value beta = rho * (rho + lambda) * (rho + lambda); if (none(active) || ++iterations == 10) break; masked(sum, active) += num * carlson_rc(Vector2(alpha, beta)); masked(num, active) *= Scalar(0.25f); masked(xyzr, mask_t(active)) = (xyzr + lambda) * Scalar(0.25f); } /* Use recurrences for cheaper polynomial evaluation. Based on Numerical Recipes (3rd ed) by Press, Teukolsky, Vetterling, and Flannery */ Value ea = XYZR.x() * (XYZR.y() + XYZR.z()) + XYZR.y() * XYZR.z(), eb = XYZR.x() * XYZR.y() * XYZR.z(), R = XYZR.w(), ec = R * R, ed = ea - Scalar(3) * ec, ee = eb + Scalar(2) * R * (ea - ec); return Scalar(3) * sum + num * mu_inv * sqrt(mu_inv) * (Scalar(1) + ed * (-Scalar(3.0 / 14.0) + Scalar(9.0 / 88.0) * ed - Scalar(9.0 / 52.0) * ee) + eb * (Scalar(1.0 / 6.0) + R * (-Scalar(3.0 / 11.0) + R * Scalar(3.0 / 26.0))) + R * ea * (Scalar(1.0 / 3.0) - R * Scalar(3.0 / 22.0)) - Scalar(1.0 / 3.0) * R * ec); } // ----------------------------------------------------------------------- //! @{ \name Complete and incomplete elliptic integrals //! Caution: the 'k' factor is squared in the elliptic integral, which //! differs from the convention of Mathematica's EllipticK etc. // ----------------------------------------------------------------------- /// Complete elliptic integral of the first kind template , typename Scalar = scalar_t, typename Vector3 = Array> Value comp_ellint_1(K k) { return carlson_rf(Vector3(Scalar(0), Scalar(1) - k * k, Scalar(1))); } /// Incomplete elliptic integral of the first kind template , typename Scalar = scalar_t, typename Vector3 = Array> Value ellint_1(Phi phi_, K k) { Value phi = phi_, n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))), result = 0; if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) { result = comp_ellint_1(k) * n * Scalar(2); phi = fnmadd(n, Scalar(M_PI), phi); } auto [sin_phi, cos_phi] = sincos(phi); Vector3 xyz(cos_phi * cos_phi, Scalar(1) - k * k * sin_phi * sin_phi, Scalar(1)); result += sin_phi * carlson_rf(xyz); return result; } /// Complete elliptic integral of the second kind template , typename Scalar = scalar_t, typename Vector3 = Array> Value comp_ellint_2(K k) { auto k2 = k*k; Vector3 xyz(Scalar(0), Scalar(1) - k2, Scalar(1)); return carlson_rf(xyz) - Scalar(1.0 / 3.0) * k2 * carlson_rd(xyz); } /// Incomplete elliptic integral of the second kind template , typename Scalar = scalar_t, typename Vector3 = Array> Value ellint_2(Phi phi_, K k) { Value phi = phi_, k2 = k*k, n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))), result = 0; if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) { result = comp_ellint_2(k) * n * Scalar(2); phi = fnmadd(n, Scalar(M_PI), phi); } auto [sin_phi, cos_phi] = sincos(phi); auto sin_phi_k_2 = sin_phi * sin_phi * k2; Vector3 xyz(cos_phi * cos_phi, Scalar(1) - sin_phi_k_2, Scalar(1)); result += sin_phi * (carlson_rf(xyz) - Scalar(1.0 / 3.0) * sin_phi_k_2 * carlson_rd(xyz)); return result; } /// Complete elliptic integral of the third kind template , typename Scalar = scalar_t, typename Vector4 = Array> Value comp_ellint_3(K k, Nu nu) { auto k2 = k*k; Vector4 xyzr(Scalar(0), Scalar(1) - k2, Scalar(1), Scalar(1) + nu); return carlson_rf(head<3>(xyzr)) - Scalar(1.0 / 3.0) * nu * carlson_rj(xyzr); } /// Incomplete elliptic integral of the third kind template , typename Scalar = scalar_t, typename Vector4 = Array> Value ellint_3(Phi phi_, K k, Nu nu) { Value phi = phi_, k2 = k*k, n = floor(fmadd(phi, Scalar(1.0 / M_PI), Scalar(.5f))), result = 0; if (ENOKI_UNLIKELY(any(neq(n, Scalar(0))))) { result = comp_ellint_3(k, nu) * n * Scalar(2); phi = fnmadd(n, Scalar(M_PI), phi); } auto [sin_phi, cos_phi] = sincos(phi); auto sin_phi_2 = sin_phi * sin_phi; Vector4 xyzr(cos_phi * cos_phi, Scalar(1) - k2 * sin_phi_2, Scalar(1), Scalar(1) + nu * sin_phi_2); result += sin_phi * (carlson_rf(head<3>(xyzr)) - Scalar(1.0 / 3.0) * nu * sin_phi_2 * carlson_rj(xyzr)); return result; } //! @} // ----------------------------------------------------------------------- NAMESPACE_END(enoki)