/*
    enoki/array_sse42.h -- Packed SIMD array (SSE4.2 specialization)

    Enoki is a C++ template library that enables transparent vectorization
    of numerical kernels using SIMD instruction sets available on current
    processor architectures.

    Copyright (c) 2019 Wenzel Jakob

    All rights reserved. Use of this source code is governed by a BSD-style
    license that can be found in the LICENSE file.
*/

#pragma once

NAMESPACE_BEGIN(enoki)
NAMESPACE_BEGIN(detail)

/// Compressed look-up table for the store_compress() operation [256 bytes]
alignas(16) const uint8_t compress_lut_128[16*16] = {
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x04, 0x05, 0x06, 0x07, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x08, 0x09, 0x0a, 0x0b, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x80, 0x80, 0x80, 0x80,
    0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80,
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x80, 0x80, 0x80, 0x80,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

template <> struct is_native : std::true_type { };
template <> struct is_native : std::true_type { };
template <> struct is_native : std::true_type { };
template struct is_native> : std::true_type { };
template struct is_native> : std::true_type { };
template struct is_native> : std::true_type { };

NAMESPACE_END(detail)

/// Partial overload of StaticArrayImpl using SSE4.2 intrinsics (single precision)
template struct alignas(16) StaticArrayImpl : StaticArrayBase {
    ENOKI_NATIVE_ARRAY(float, 4, __m128)

    // -----------------------------------------------------------------------
    //! @{ \name Value constructors
    // -----------------------------------------------------------------------

    ENOKI_INLINE StaticArrayImpl(Value value) : m(_mm_set1_ps(value)) { }
    ENOKI_INLINE StaticArrayImpl(Value v0, Value v1, Value v2, Value v3)
        : m(_mm_setr_ps(v0, v1, v2, v3)) { }

    //! @}
    // -----------------------------------------------------------------------

    // -----------------------------------------------------------------------
    //!
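    // Illustrative usage (a sketch, not part of this header) of how the value
    // constructors above are typically reached on an SSE4.2 target:
    //
    //     enoki::Array<float, 4> x(1.f);                  // broadcast -> _mm_set1_ps
    //     enoki::Array<float, 4> y(1.f, 2.f, 3.f, 4.f);   // per-lane  -> _mm_setr_ps
    //
    // enoki::Array dispatches to this single-precision specialization.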
@{ \name Type converting constructors // ----------------------------------------------------------------------- #if defined(ENOKI_X86_F16C) ENOKI_CONVERT(half) { m = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *) a.derived().data())); } #endif ENOKI_CONVERT(float) : m(a.derived().m) { } ENOKI_CONVERT(int32_t) : m(_mm_cvtepi32_ps(a.derived().m)) { } ENOKI_CONVERT(uint32_t) { #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) m = _mm_cvtepu32_ps(a.derived().m); #else int32_array_t ai(a); Derived result = Derived(ai & 0x7fffffff) + (Derived(float(1u << 31)) & mask_t(sr<31>(ai))); m = result.m; #endif } #if defined(ENOKI_X86_AVX) ENOKI_CONVERT(double) : m(_mm256_cvtpd_ps(a.derived().m)) { } #else ENOKI_CONVERT(double) : m(_mm_shuffle_ps(_mm_cvtpd_ps(low(a).m), _mm_cvtpd_ps(high(a).m), _MM_SHUFFLE(1, 0, 1, 0))) { } #endif #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) ENOKI_CONVERT(int64_t) : m(_mm256_cvtepi64_ps(a.derived().m)) { } ENOKI_CONVERT(uint64_t) : m(_mm256_cvtepu64_ps(a.derived().m)) { } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Reinterpreting constructors, mask converters // ----------------------------------------------------------------------- ENOKI_REINTERPRET(bool) { int ival; memcpy(&ival, a.derived().data(), 4); m = _mm_castsi128_ps(_mm_cvtepi8_epi32( _mm_cmpgt_epi8(_mm_cvtsi32_si128(ival), _mm_setzero_si128()))); } ENOKI_REINTERPRET(float) : m(a.derived().m) { } ENOKI_REINTERPRET(int32_t) : m(_mm_castsi128_ps(a.derived().m)) { } ENOKI_REINTERPRET(uint32_t) : m(_mm_castsi128_ps(a.derived().m)) { } #if defined(ENOKI_X86_AVX) ENOKI_REINTERPRET(double) : m(_mm_castsi128_ps( detail::mm256_cvtepi64_epi32(_mm256_castpd_si256(a.derived().m)))) { } #else ENOKI_REINTERPRET(double) : m(_mm_castsi128_ps(detail::mm256_cvtepi64_epi32( _mm_castpd_si128(low(a).m), _mm_castpd_si128(high(a).m)))) { } #endif #if defined(ENOKI_X86_AVX2) ENOKI_REINTERPRET(uint64_t) : m(_mm_castsi128_ps( detail::mm256_cvtepi64_epi32(a.derived().m))) { } ENOKI_REINTERPRET(int64_t) : m(_mm_castsi128_ps( detail::mm256_cvtepi64_epi32(a.derived().m))) { } #else ENOKI_REINTERPRET(uint64_t) : m(_mm_castsi128_ps( detail::mm256_cvtepi64_epi32(low(a).m, high(a).m))) { } ENOKI_REINTERPRET(int64_t) : m(_mm_castsi128_ps( detail::mm256_cvtepi64_epi32(low(a).m, high(a).m))) { } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Converting from/to half size vectors // ----------------------------------------------------------------------- StaticArrayImpl(const Array1 &a1, const Array2 &a2) : m(_mm_setr_ps(a1.coeff(0), a1.coeff(1), a2.coeff(0), a2.coeff(1))) { } ENOKI_INLINE Array1 low_() const { return Array1(coeff(0), coeff(1)); } ENOKI_INLINE Array2 high_() const { return Array2(coeff(2), coeff(3)); } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
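    // Note on the uint32_t -> float conversion above: _mm_cvtepi32_ps only handles
    // *signed* inputs, so the pre-AVX-512 fallback clears the sign bit, converts,
    // and then adds 2^31 back for the lanes whose top bit was set (that mask is
    // obtained by arithmetically shifting the sign bit across the lane via sr<31>).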
@{ \name Vertical operations // ----------------------------------------------------------------------- ENOKI_INLINE Derived add_(Ref a) const { return _mm_add_ps(m, a.m); } ENOKI_INLINE Derived sub_(Ref a) const { return _mm_sub_ps(m, a.m); } ENOKI_INLINE Derived mul_(Ref a) const { return _mm_mul_ps(m, a.m); } ENOKI_INLINE Derived div_(Ref a) const { return _mm_div_ps(m, a.m); } template ENOKI_INLINE Derived or_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_ps(m, a.k, _mm_set1_ps(memcpy_cast(int32_t(-1)))); else #endif return _mm_or_ps(m, a.m); } template ENOKI_INLINE Derived and_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_maskz_mov_ps(a.k, m); else #endif return _mm_and_ps(m, a.m); } template ENOKI_INLINE Derived xor_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_xor_ps(m, a.k, m, _mm_set1_ps(memcpy_cast(int32_t(-1)))); else #endif return _mm_xor_ps(m, a.m); } template ENOKI_INLINE Derived andnot_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_ps(m, a.k, _mm_setzero_ps()); else #endif return _mm_andnot_ps(a.m, m); } #if defined(ENOKI_X86_AVX512VL) #define ENOKI_COMP(name, NAME) mask_t::from_k(_mm_cmp_ps_mask(m, a.m, _CMP_##NAME)) #elif defined(ENOKI_X86_AVX) #define ENOKI_COMP(name, NAME) mask_t(_mm_cmp_ps(m, a.m, _CMP_##NAME)) #else #define ENOKI_COMP(name, NAME) mask_t(_mm_cmp##name##_ps(m, a.m)) #endif ENOKI_INLINE auto lt_ (Ref a) const { return ENOKI_COMP(lt, LT_OQ); } ENOKI_INLINE auto gt_ (Ref a) const { return ENOKI_COMP(gt, GT_OQ); } ENOKI_INLINE auto le_ (Ref a) const { return ENOKI_COMP(le, LE_OQ); } ENOKI_INLINE auto ge_ (Ref a) const { return ENOKI_COMP(ge, GE_OQ); } ENOKI_INLINE auto eq_ (Ref a) const { using Int = int_array_t; if constexpr (IsMask_) return mask_t(eq(Int(derived()), Int(a))); else return ENOKI_COMP(eq, EQ_OQ); } ENOKI_INLINE auto neq_(Ref a) const { using Int = int_array_t; if constexpr (IsMask_) return mask_t(neq(Int(derived()), Int(a))); else return ENOKI_COMP(neq, NEQ_UQ); } #undef ENOKI_COMP ENOKI_INLINE Derived abs_() const { return _mm_andnot_ps(_mm_set1_ps(-0.f), m); } ENOKI_INLINE Derived min_(Ref b) const { return _mm_min_ps(b.m, m); } ENOKI_INLINE Derived max_(Ref b) const { return _mm_max_ps(b.m, m); } ENOKI_INLINE Derived ceil_() const { return _mm_ceil_ps(m); } ENOKI_INLINE Derived floor_() const { return _mm_floor_ps(m); } ENOKI_INLINE Derived sqrt_() const { return _mm_sqrt_ps(m); } ENOKI_INLINE Derived round_() const { return _mm_round_ps(m, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } ENOKI_INLINE Derived trunc_() const { return _mm_round_ps(m, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } template static ENOKI_INLINE Derived select_(const Mask &m, Ref t, Ref f) { #if !defined(ENOKI_X86_AVX512VL) return _mm_blendv_ps(f.m, t.m, m.m); #else return _mm_mask_blend_ps(m.k, f.m, t.m); #endif } #if defined(ENOKI_X86_FMA) ENOKI_INLINE Derived fmadd_ (Ref b, Ref c) const { return _mm_fmadd_ps (m, b.m, c.m); } ENOKI_INLINE Derived fmsub_ (Ref b, Ref c) const { return _mm_fmsub_ps (m, b.m, c.m); } ENOKI_INLINE Derived fnmadd_ (Ref b, Ref c) const { return _mm_fnmadd_ps (m, b.m, c.m); } ENOKI_INLINE Derived fnmsub_ (Ref b, Ref c) const { return _mm_fnmsub_ps (m, b.m, c.m); } ENOKI_INLINE Derived fmsubadd_(Ref b, Ref c) const { return _mm_fmsubadd_ps(m, b.m, c.m); } ENOKI_INLINE Derived fmaddsub_(Ref b, Ref c) const { return _mm_fmaddsub_ps(m, b.m, c.m); } 
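    // Note: the *addsub* variants above alternate the sign of 'c' per lane,
    // matching _mm_fmaddsub_ps / _mm_fmsubadd_ps:
    //   fmaddsub_(b, c)[i] = m[i]*b[i] - c[i] for even i,  m[i]*b[i] + c[i] for odd i
    //   fmsubadd_(b, c)[i] = m[i]*b[i] + c[i] for even i,  m[i]*b[i] - c[i] for odd i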
#endif template ENOKI_INLINE Derived shuffle_() const { #if defined(ENOKI_X86_AVX) return _mm_permute_ps(m, _MM_SHUFFLE(I3, I2, I1, I0)); #else return _mm_shuffle_ps(m, m, _MM_SHUFFLE(I3, I2, I1, I0)); #endif } template ENOKI_INLINE Derived shuffle_(const Index &index) const { #if defined(ENOKI_X86_AVX) return _mm_permutevar_ps(m, index.m); #else return Base::shuffle_(index); #endif } #if defined(ENOKI_X86_AVX512VL) ENOKI_INLINE Derived ldexp_(Ref arg) const { return _mm_scalef_ps(m, arg.m); } ENOKI_INLINE std::pair frexp_() const { return std::make_pair( _mm_getmant_ps(m, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src), _mm_getexp_ps(m)); } #endif ENOKI_INLINE Derived rcp_() const { #if defined(ENOKI_X86_AVX512ER) /* rel err < 2^28, use as is (even in non-approximate mode) */ return _mm512_castps512_ps128( _mm512_rcp28_ps(_mm512_castps128_ps512(m))); #else /* Use best reciprocal approximation available on the current hardware and refine */ __m128 r; #if defined(ENOKI_X86_AVX512VL) r = _mm_rcp14_ps(m); /* rel error < 2^-14 */ #else r = _mm_rcp_ps(m); /* rel error < 1.5*2^-12 */ #endif /* Refine using one Newton-Raphson iteration */ __m128 t0 = _mm_add_ps(r, r), t1 = _mm_mul_ps(r, m), ro = r; (void) ro; #if defined(ENOKI_X86_FMA) r = _mm_fnmadd_ps(t1, r, t0); #else r = _mm_sub_ps(t0, _mm_mul_ps(r, t1)); #endif #if defined(ENOKI_X86_AVX512VL) return _mm_fixupimm_ps(r, m, _mm_set1_epi32(0x0087A622), 0); #else return _mm_blendv_ps(r, ro, t1); /* mask bit is '1' iff t1 == nan */ #endif #endif } ENOKI_INLINE Derived rsqrt_() const { #if defined(ENOKI_X86_AVX512ER) /* rel err < 2^28, use as is (even in non-approximate mode) */ return _mm512_castps512_ps128( _mm512_rsqrt28_ps(_mm512_castps128_ps512(m))); #else /* Use best reciprocal square root approximation available on the current hardware and refine */ __m128 r; #if defined(ENOKI_X86_AVX512VL) r = _mm_rsqrt14_ps(m); /* rel error < 2^-14 */ #else r = _mm_rsqrt_ps(m); /* rel error < 1.5*2^-12 */ #endif /* Refine using one Newton-Raphson iteration */ const __m128 c0 = _mm_set1_ps(.5f), c1 = _mm_set1_ps(3.f); __m128 t0 = _mm_mul_ps(r, c0), t1 = _mm_mul_ps(r, m), ro = r; (void) ro; #if defined(ENOKI_X86_FMA) r = _mm_mul_ps(_mm_fnmadd_ps(t1, r, c1), t0); #else r = _mm_mul_ps(_mm_sub_ps(c1, _mm_mul_ps(t1, r)), t0); #endif #if defined(ENOKI_X86_AVX512VL) return _mm_fixupimm_ps(r, m, _mm_set1_epi32(0x0383A622), 0); #else return _mm_blendv_ps(r, ro, t1); /* mask bit is '1' iff t1 == nan */ #endif #endif } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Horizontal operations // ----------------------------------------------------------------------- #define ENOKI_HORIZONTAL_OP(name, op) \ ENOKI_INLINE Value name##_() const { \ __m128 t1 = _mm_movehdup_ps(m); \ __m128 t2 = _mm_##op##_ps(m, t1); \ t1 = _mm_movehl_ps(t1, t2); \ t2 = _mm_##op##_ss(t2, t1); \ return _mm_cvtss_f32(t2); \ } ENOKI_HORIZONTAL_OP(hsum, add) ENOKI_HORIZONTAL_OP(hprod, mul) ENOKI_HORIZONTAL_OP(hmin, min) ENOKI_HORIZONTAL_OP(hmax, max) #undef ENOKI_HORIZONTAL_OP ENOKI_INLINE bool all_() const { return _mm_movemask_ps(m) == 0xF;} ENOKI_INLINE bool any_() const { return _mm_movemask_ps(m) != 0x0; } ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_ps(m); } ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); } ENOKI_INLINE Value dot_(Ref a) const { return _mm_cvtss_f32(_mm_dp_ps(m, a.m, 0b11110001)); } //! 
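    // Note on dot_() above: in the immediate passed to _mm_dp_ps, the high nibble
    // (0b1111) selects which lanes enter the dot product and the low nibble
    // (0b0001) selects which output lanes receive the sum; lane 0 is then
    // extracted with _mm_cvtss_f32.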
@} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Masked versions of key operations // ----------------------------------------------------------------------- #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void massign_(const Derived &a, const Mask &mask) { m = _mm_mask_mov_ps(m, mask.k, a.m); } template ENOKI_INLINE void madd_ (const Derived &a, const Mask &mask) { m = _mm_mask_add_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void msub_ (const Derived &a, const Mask &mask) { m = _mm_mask_sub_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void mmul_ (const Derived &a, const Mask &mask) { m = _mm_mask_mul_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void mdiv_ (const Derived &a, const Mask &mask) { m = _mm_mask_div_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void mor_ (const Derived &a, const Mask &mask) { m = _mm_mask_or_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void mand_ (const Derived &a, const Mask &mask) { m = _mm_mask_and_ps(m, mask.k, m, a.m); } template ENOKI_INLINE void mxor_ (const Derived &a, const Mask &mask) { m = _mm_mask_xor_ps(m, mask.k, m, a.m); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Initialization, loading/writing data // ----------------------------------------------------------------------- ENOKI_INLINE void store_(void *ptr) const { assert((uintptr_t) ptr % 16 == 0); _mm_store_ps((Value *) ENOKI_ASSUME_ALIGNED(ptr, 16), m); } template ENOKI_INLINE void store_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_store_ps((Value *) ptr, mask.k, m); #elif defined(ENOKI_X86_AVX) _mm_maskstore_ps((Value *) ptr, _mm_castps_si128(mask.m), m); #else Base::store_(ptr, mask); #endif } ENOKI_INLINE void store_unaligned_(void *ptr) const { _mm_storeu_ps((Value *) ptr, m); } template ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_storeu_ps((Value *) ptr, mask.k, m); #elif defined(ENOKI_X86_AVX) _mm_maskstore_ps((Value *) ptr, _mm_castps_si128(mask.m), m); #else Base::store_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived load_(const void *ptr) { assert((uintptr_t) ptr % 16 == 0); return _mm_load_ps((const Value *) ENOKI_ASSUME_ALIGNED(ptr, 16)); } template static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_load_ps(mask.k, ptr); #elif defined(ENOKI_X86_AVX) return _mm_maskload_ps((const Value *) ptr, _mm_castps_si128(mask.m)); #else return Base::load_(ptr, mask); #endif } static ENOKI_INLINE Derived load_unaligned_(const void *ptr) { return _mm_loadu_ps((const Value *) ptr); } template static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_loadu_ps(mask.k, ptr); #elif defined(ENOKI_X86_AVX) return _mm_maskload_ps((const Value *) ptr, _mm_castps_si128(mask.m)); #else return Base::load_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived zero_() { return _mm_setzero_ps(); } #if defined(ENOKI_X86_AVX2) template static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) if constexpr (sizeof(scalar_t) == 4) return _mm_mmask_i32gather_ps(_mm_setzero_ps(), mask.k, index.m, (const float *) ptr, Stride); else 
return _mm256_mmask_i64gather_ps(_mm_setzero_ps(), mask.k, index.m, (const float *) ptr, Stride); #else if constexpr (sizeof(scalar_t) == 4) return _mm_mask_i32gather_ps(_mm_setzero_ps(), (const float *) ptr, index.m, mask.m, Stride); else return _mm256_mask_i64gather_ps(_mm_setzero_ps(), (const float *) ptr, index.m, mask.m, Stride); #endif } #endif #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const { if constexpr (sizeof(scalar_t) == 4) _mm_mask_i32scatter_ps(ptr, mask.k, index.m, m, Stride); else _mm256_mask_i64scatter_ps(ptr, mask.k, index.m, m, Stride); } #endif template ENOKI_INLINE Value extract_(const Mask &mask) const { #if !defined(ENOKI_X86_AVX512VL) unsigned int k = (unsigned int) _mm_movemask_ps(mask.m); return coeff((size_t) (detail::tzcnt_scalar(k) & 3)); #else return _mm_cvtss_f32(_mm_mask_compress_ps(_mm_setzero_ps(), mask.k, m)); #endif } template ENOKI_INLINE size_t compress_(float *&ptr, const Mask &mask) const { #if !defined(ENOKI_X86_AVX512VL) unsigned int k = (unsigned int) _mm_movemask_ps(mask.m); /** Fancy LUT-based partitioning algorithm, see https://deplinenoise.files.wordpress.com/2015/03/gdc2015_afredriksson_simd.pdf */ __m128i shuf = _mm_load_si128(((const __m128i *) detail::compress_lut_128) + k), perm = _mm_shuffle_epi8(_mm_castps_si128(m), shuf); _mm_storeu_si128((__m128i *) ptr, perm); #else unsigned int k = (unsigned int) mask.k; _mm_storeu_ps(ptr, _mm_mask_compress_ps(_mm_setzero_ps(), mask.k, m)); #endif size_t kn = (size_t) _mm_popcnt_u32(k); ptr += kn; return kn; } //! @} // ----------------------------------------------------------------------- } ENOKI_MAY_ALIAS; /// Partial overload of StaticArrayImpl using SSE4.2 intrinsics (double precision) template struct alignas(16) StaticArrayImpl : StaticArrayBase { ENOKI_NATIVE_ARRAY(double, 2, __m128d) // ----------------------------------------------------------------------- //! @{ \name Value constructors // ----------------------------------------------------------------------- ENOKI_INLINE StaticArrayImpl(Value value) : m(_mm_set1_pd(value)) { } ENOKI_INLINE StaticArrayImpl(Value v0, Value v1) : m(_mm_setr_pd(v0, v1)) { } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Type converting constructors // ----------------------------------------------------------------------- /* No vectorized conversions from float/[u]int32_t (too small) */ ENOKI_CONVERT(double) : m(a.derived().m) { } #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) ENOKI_CONVERT(int64_t) : m(_mm_cvtepi64_pd(a.derived().m)) { } ENOKI_CONVERT(uint64_t) : m(_mm_cvtepu64_pd(a.derived().m)) { } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
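    // Note on the pre-AVX-512 compress_() path of the single-precision array above:
    // _mm_movemask_ps() reduces the mask to a 4-bit integer k, which indexes the
    // 16-entry compress_lut_128 table defined at the top of this file. Each entry is
    // a 16-byte shuffle pattern that packs the bytes of the active lanes to the
    // front (0x80 zeroes the remainder); _mm_shuffle_epi8 applies it and popcnt(k)
    // advances the output pointer by the number of elements written.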
@{ \name Reinterpreting constructors, mask converters // ----------------------------------------------------------------------- ENOKI_REINTERPRET(bool) { int16_t ival; memcpy(&ival, a.derived().data(), 2); m = _mm_castsi128_pd(_mm_cvtepi8_epi64(_mm_cmpgt_epi8( _mm_cvtsi32_si128((int) ival), _mm_setzero_si128()))); } ENOKI_REINTERPRET(float) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, float32[2] -> double[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_castps_pd(_mm_setr_ps(v0, v0, v1, v1)); } ENOKI_REINTERPRET(int32_t) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, int32[2] -> double[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_castsi128_pd(_mm_setr_epi32(v0, v0, v1, v1)); } ENOKI_REINTERPRET(uint32_t) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, uint32[2] -> double[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_castsi128_pd(_mm_setr_epi32((int32_t) v0, (int32_t) v0, (int32_t) v1, (int32_t) v1)); } ENOKI_REINTERPRET(double) : m(a.derived().m) { } ENOKI_REINTERPRET(int64_t) : m(_mm_castsi128_pd(a.derived().m)) { } ENOKI_REINTERPRET(uint64_t) : m(_mm_castsi128_pd(a.derived().m)) { } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Converting from/to half size vectors // ----------------------------------------------------------------------- StaticArrayImpl(const Array1 &a1, const Array2 &a2) : m(_mm_setr_pd(a1.coeff(0), a2.coeff(0))) { } ENOKI_INLINE Array1 low_() const { return Array1(coeff(0)); } ENOKI_INLINE Array2 high_() const { return Array2(coeff(1)); } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
@{ \name Vertical operations // ----------------------------------------------------------------------- ENOKI_INLINE Derived add_(Ref a) const { return _mm_add_pd(m, a.m); } ENOKI_INLINE Derived sub_(Ref a) const { return _mm_sub_pd(m, a.m); } ENOKI_INLINE Derived mul_(Ref a) const { return _mm_mul_pd(m, a.m); } ENOKI_INLINE Derived div_(Ref a) const { return _mm_div_pd(m, a.m); } template ENOKI_INLINE Derived or_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_pd(m, a.k, _mm_set1_pd(memcpy_cast(int64_t(-1)))); else #endif return _mm_or_pd(m, a.m); } template ENOKI_INLINE Derived and_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_maskz_mov_pd(a.k, m); else #endif return _mm_and_pd(m, a.m); } template ENOKI_INLINE Derived xor_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_xor_pd(m, a.k, m, _mm_set1_pd(memcpy_cast(int64_t(-1)))); else #endif return _mm_xor_pd(m, a.m); } template ENOKI_INLINE Derived andnot_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_pd(m, a.k, _mm_setzero_pd()); else #endif return _mm_andnot_pd(a.m, m); } #if defined(ENOKI_X86_AVX512VL) #define ENOKI_COMP(name, NAME) mask_t::from_k(_mm_cmp_pd_mask(m, a.m, _CMP_##NAME)) #elif defined(ENOKI_X86_AVX) #define ENOKI_COMP(name, NAME) mask_t(_mm_cmp_pd(m, a.m, _CMP_##NAME)) #else #define ENOKI_COMP(name, NAME) mask_t(_mm_cmp##name##_pd(m, a.m)) #endif ENOKI_INLINE auto lt_ (Ref a) const { return ENOKI_COMP(lt, LT_OQ); } ENOKI_INLINE auto gt_ (Ref a) const { return ENOKI_COMP(gt, GT_OQ); } ENOKI_INLINE auto le_ (Ref a) const { return ENOKI_COMP(le, LE_OQ); } ENOKI_INLINE auto ge_ (Ref a) const { return ENOKI_COMP(ge, GE_OQ); } ENOKI_INLINE auto eq_ (Ref a) const { using Int = int_array_t; if constexpr (IsMask_) return mask_t(eq(Int(derived()), Int(a))); else return ENOKI_COMP(eq, EQ_OQ); } ENOKI_INLINE auto neq_(Ref a) const { using Int = int_array_t; if constexpr (IsMask_) return mask_t(neq(Int(derived()), Int(a))); else return ENOKI_COMP(neq, NEQ_UQ); } #undef ENOKI_COMP ENOKI_INLINE Derived abs_() const { return _mm_andnot_pd(_mm_set1_pd(-0.), m); } ENOKI_INLINE Derived min_(Ref b) const { return _mm_min_pd(b.m, m); } ENOKI_INLINE Derived max_(Ref b) const { return _mm_max_pd(b.m, m); } ENOKI_INLINE Derived ceil_() const { return _mm_ceil_pd(m); } ENOKI_INLINE Derived floor_() const { return _mm_floor_pd(m); } ENOKI_INLINE Derived sqrt_() const { return _mm_sqrt_pd(m); } ENOKI_INLINE Derived round_() const { return _mm_round_pd(m, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } ENOKI_INLINE Derived trunc_() const { return _mm_round_pd(m, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } template static ENOKI_INLINE Derived select_(const Mask &m, Ref t, Ref f) { #if !defined(ENOKI_X86_AVX512VL) return _mm_blendv_pd(f.m, t.m, m.m); #else return _mm_mask_blend_pd(m.k, f.m, t.m); #endif } #if defined(ENOKI_X86_FMA) ENOKI_INLINE Derived fmadd_ (Ref b, Ref c) const { return _mm_fmadd_pd (m, b.m, c.m); } ENOKI_INLINE Derived fmsub_ (Ref b, Ref c) const { return _mm_fmsub_pd (m, b.m, c.m); } ENOKI_INLINE Derived fnmadd_ (Ref b, Ref c) const { return _mm_fnmadd_pd (m, b.m, c.m); } ENOKI_INLINE Derived fnmsub_ (Ref b, Ref c) const { return _mm_fnmsub_pd (m, b.m, c.m); } ENOKI_INLINE Derived fmsubadd_(Ref b, Ref c) const { return _mm_fmsubadd_pd(m, b.m, c.m); } ENOKI_INLINE Derived fmaddsub_(Ref b, Ref c) const { return _mm_fmaddsub_pd(m, b.m, c.m); } 
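    // The rcp_() and rsqrt_() implementations below start from a hardware
    // approximation r and refine it with Newton-Raphson steps:
    //   reciprocal:   r' = r * (2 - m*r)          (computed as 2r - m*r^2)
    //   recip. sqrt:  r' = 0.5 * r * (3 - m*r^2)
    // Each step roughly doubles the number of correct mantissa bits.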
#endif #if defined(ENOKI_X86_AVX) #define ENOKI_SHUFFLE_PD(m, flags) _mm_permute_pd(m, flags) #else #define ENOKI_SHUFFLE_PD(m, flags) _mm_shuffle_pd(m, m, flags) #endif template ENOKI_INLINE Derived shuffle_() const { return ENOKI_SHUFFLE_PD(m, (I1 << 1) | I0); } template ENOKI_INLINE Derived shuffle_(const Index &index) const { #if defined(ENOKI_X86_AVX) return _mm_permutevar_pd(m, _mm_slli_epi64(index.m, 1)); #else return Base::shuffle_(index); #endif } #if defined(ENOKI_X86_AVX512VL) ENOKI_INLINE Derived ldexp_(Ref arg) const { return _mm_scalef_pd(m, arg.m); } ENOKI_INLINE std::pair frexp_() const { return std::make_pair( _mm_getmant_pd(m, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src), _mm_getexp_pd(m)); } #endif #if defined(ENOKI_X86_AVX512VL) || defined(ENOKI_X86_AVX512ER) ENOKI_INLINE Derived rcp_() const { /* Use best reciprocal approximation available on the current hardware and refine */ __m128d r; #if defined(ENOKI_X86_AVX512ER) /* rel err < 2^28 */ r = _mm512_castpd512_pd128( _mm512_rcp28_pd(_mm512_castpd128_pd512(m))); #elif defined(ENOKI_X86_AVX512VL) r = _mm_rcp14_pd(m); /* rel error < 2^-14 */ #endif __m128d ro = r, t0, t1; (void) ro; /* Refine using 1-2 Newton-Raphson iterations */ ENOKI_UNROLL for (int i = 0; i < (has_avx512er ? 1 : 2); ++i) { t0 = _mm_add_pd(r, r); t1 = _mm_mul_pd(r, m); r = _mm_fnmadd_pd(t1, r, t0); } #if defined(ENOKI_X86_AVX512VL) return _mm_fixupimm_pd(r, m, _mm_set1_epi32(0x0087A622), 0); #else return _mm_blendv_pd(r, ro, t1); /* mask bit is '1' iff t1 == nan */ #endif } ENOKI_INLINE Derived rsqrt_() const { /* Use best reciprocal square root approximation available on the current hardware and refine */ __m128d r; #if defined(ENOKI_X86_AVX512ER) /* rel err < 2^28 */ r = _mm512_castpd512_pd128( _mm512_rsqrt28_pd(_mm512_castpd128_pd512(m))); #elif defined(ENOKI_X86_AVX512VL) r = _mm_rsqrt14_pd(m); /* rel error < 2^-14 */ #endif const __m128d c0 = _mm_set1_pd(0.5), c1 = _mm_set1_pd(3.0); __m128d ro = r, t0, t1; (void) ro; /* Refine using 1-2 Newton-Raphson iterations */ ENOKI_UNROLL for (int i = 0; i < (has_avx512er ? 1 : 2); ++i) { t0 = _mm_mul_pd(r, c0); t1 = _mm_mul_pd(r, m); r = _mm_mul_pd(_mm_fnmadd_pd(t1, r, c1), t0); } #if defined(ENOKI_X86_AVX512VL) return _mm_fixupimm_pd(r, m, _mm_set1_epi32(0x0383A622), 0); #else return _mm_blendv_pd(r, ro, t1); /* mask bit is '1' iff t1 == nan */ #endif } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Horizontal operations // ----------------------------------------------------------------------- #define ENOKI_HORIZONTAL_OP(name, op) \ ENOKI_INLINE Value name##_() const { \ __m128d t0 = ENOKI_SHUFFLE_PD(m, 1); \ __m128d t1 = _mm_##op##_sd(t0, m); \ return _mm_cvtsd_f64(t1); \ } ENOKI_HORIZONTAL_OP(hsum, add) ENOKI_HORIZONTAL_OP(hprod, mul) ENOKI_HORIZONTAL_OP(hmin, min) ENOKI_HORIZONTAL_OP(hmax, max) #undef ENOKI_HORIZONTAL_OP #undef ENOKI_SHUFFLE_PD ENOKI_INLINE bool all_() const { return _mm_movemask_pd(m) == 0x3;} ENOKI_INLINE bool any_() const { return _mm_movemask_pd(m) != 0x0; } ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_pd(m); } ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); } ENOKI_INLINE Value dot_(Ref a) const { return _mm_cvtsd_f64(_mm_dp_pd(m, a.m, 0b00110001)); } //! 
@} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Masked versions of key operations // ----------------------------------------------------------------------- #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void massign_(const Derived &a, const Mask &mask) { m = _mm_mask_mov_pd(m, mask.k, a.m); } template ENOKI_INLINE void madd_ (const Derived &a, const Mask &mask) { m = _mm_mask_add_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void msub_ (const Derived &a, const Mask &mask) { m = _mm_mask_sub_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void mmul_ (const Derived &a, const Mask &mask) { m = _mm_mask_mul_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void mdiv_ (const Derived &a, const Mask &mask) { m = _mm_mask_div_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void mor_ (const Derived &a, const Mask &mask) { m = _mm_mask_or_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void mand_ (const Derived &a, const Mask &mask) { m = _mm_mask_and_pd(m, mask.k, m, a.m); } template ENOKI_INLINE void mxor_ (const Derived &a, const Mask &mask) { m = _mm_mask_xor_pd(m, mask.k, m, a.m); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Initialization, loading/writing data // ----------------------------------------------------------------------- ENOKI_INLINE void store_(void *ptr) const { assert((uintptr_t) ptr % 16 == 0); _mm_store_pd((Value *) ENOKI_ASSUME_ALIGNED(ptr, 16), m); } template ENOKI_INLINE void store_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_store_pd((Value *) ptr, mask.k, m); #elif defined(ENOKI_X86_AVX) _mm_maskstore_pd((Value *) ptr, _mm_castpd_si128(mask.m), m); #else Base::store_(ptr, mask); #endif } ENOKI_INLINE void store_unaligned_(void *ptr) const { _mm_storeu_pd((Value *) ptr, m); } template ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_storeu_pd((Value *) ptr, mask.k, m); #elif defined(ENOKI_X86_AVX) _mm_maskstore_pd((Value *) ptr, _mm_castpd_si128(mask.m), m); #else Base::store_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived load_(const void *ptr) { assert((uintptr_t) ptr % 16 == 0); return _mm_load_pd((const Value *) ENOKI_ASSUME_ALIGNED(ptr, 16)); } template static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_load_pd(mask.k, ptr); #elif defined(ENOKI_X86_AVX) return _mm_maskload_pd((const Value *) ptr, _mm_castpd_si128(mask.m)); #else return Base::load_(ptr, mask); #endif } static ENOKI_INLINE Derived load_unaligned_(const void *ptr) { return _mm_loadu_pd((const Value *) ptr); } template static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_loadu_pd(mask.k, ptr); #elif defined(ENOKI_X86_AVX) return _mm_maskload_pd((const Value *) ptr, _mm_castpd_si128(mask.m)); #else return Base::load_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived zero_() { return _mm_setzero_pd(); } #if defined(ENOKI_X86_AVX2) template static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) { if constexpr (sizeof(scalar_t) == 4) { return Base::template gather_(ptr, index, mask); } else { #if defined(ENOKI_X86_AVX512VL) return 
_mm_mmask_i64gather_pd(_mm_setzero_pd(), mask.k, index.m, (const double *) ptr, Stride); #else return _mm_mask_i64gather_pd(_mm_setzero_pd(), (const double *) ptr, index.m, mask.m, Stride); #endif } } #endif #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const { if constexpr (sizeof(scalar_t) == 4) Base::template scatter_(ptr, index, mask); else _mm_mask_i64scatter_pd(ptr, mask.k, index.m, m, Stride); } template ENOKI_INLINE Value extract_(const Mask &mask) const { return _mm_cvtsd_f64(_mm_mask_compress_pd(_mm_setzero_pd(), mask.k, m)); } template ENOKI_INLINE size_t compress_(double *&ptr, const Mask &mask) const { _mm_storeu_pd(ptr, _mm_mask_compress_pd(_mm_setzero_pd(), mask.k, m)); size_t kn = (size_t) _mm_popcnt_u32(mask.k); ptr += kn; return kn; } #endif //! @} // ----------------------------------------------------------------------- } ENOKI_MAY_ALIAS; /// Partial overload of StaticArrayImpl using SSE4.2 intrinsics (32 bit integers) template struct alignas(16) StaticArrayImpl> : StaticArrayBase { ENOKI_NATIVE_ARRAY(Value_, 4, __m128i) // ----------------------------------------------------------------------- //! @{ \name Value constructors // ----------------------------------------------------------------------- ENOKI_INLINE StaticArrayImpl(Value value) : m(_mm_set1_epi32((int32_t) value)) { } ENOKI_INLINE StaticArrayImpl(Value v0, Value v1, Value v2, Value v3) : m(_mm_setr_epi32((int32_t) v0, (int32_t) v1, (int32_t) v2, (int32_t) v3)) { } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Type converting constructors // ----------------------------------------------------------------------- ENOKI_CONVERT(float) { if constexpr (std::is_signed_v) { m = _mm_cvttps_epi32(a.derived().m); } else { #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) m = _mm_cvttps_epu32(a.derived().m); #else constexpr uint32_t limit = 1u << 31; const __m128 limit_f = _mm_set1_ps((float) limit); const __m128i limit_i = _mm_set1_epi32((int) limit); __m128 v = a.derived().m; __m128i mask = _mm_castps_si128(_mm_cmpge_ps(v, limit_f)); __m128i b2 = _mm_add_epi32( _mm_cvttps_epi32(_mm_sub_ps(v, limit_f)), limit_i); __m128i b1 = _mm_cvttps_epi32(v); m = _mm_blendv_epi8(b1, b2, mask); #endif } } ENOKI_CONVERT(int32_t) : m(a.derived().m) { } ENOKI_CONVERT(uint32_t) : m(a.derived().m) { } #if defined(ENOKI_X86_AVX) ENOKI_CONVERT(double) { if constexpr (std::is_signed_v) { m = _mm256_cvttpd_epi32(a.derived().m); } else { #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) m = _mm256_cvttpd_epu32(a.derived().m); #else ENOKI_TRACK_SCALAR("Constructor (converting, double[4] -> uint32[4])"); for (size_t i = 0; i < Size; ++i) coeff(i) = Value(a.derived().coeff(i)); #endif } } #endif #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) ENOKI_CONVERT(int64_t) { m = _mm256_cvtepi64_epi32(a.derived().m); } ENOKI_CONVERT(uint64_t) { m = _mm256_cvtepi64_epi32(a.derived().m); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
@{ \name Reinterpreting constructors, mask converters // ----------------------------------------------------------------------- ENOKI_REINTERPRET(bool) { int ival; memcpy(&ival, a.derived().data(), 4); m = _mm_cvtepi8_epi32( _mm_cmpgt_epi8(_mm_cvtsi32_si128(ival), _mm_setzero_si128())); } ENOKI_REINTERPRET(float) : m(_mm_castps_si128(a.derived().m)) { } ENOKI_REINTERPRET(int32_t) : m(a.derived().m) { } ENOKI_REINTERPRET(uint32_t) : m(a.derived().m) { } #if defined(ENOKI_X86_AVX) ENOKI_REINTERPRET(double) : m(detail::mm256_cvtepi64_epi32(_mm256_castpd_si256(a.derived().m))) { } #else ENOKI_REINTERPRET(double) : m(detail::mm256_cvtepi64_epi32(_mm_castpd_si128(low(a).m), _mm_castpd_si128(high(a).m))) { } #endif #if defined(ENOKI_X86_AVX2) ENOKI_REINTERPRET(uint64_t) : m(detail::mm256_cvtepi64_epi32(a.derived().m)) { } ENOKI_REINTERPRET(int64_t) : m(detail::mm256_cvtepi64_epi32(a.derived().m)) {} #else ENOKI_REINTERPRET(uint64_t) : m(detail::mm256_cvtepi64_epi32(low(a).m, high(a).m)) { } ENOKI_REINTERPRET(int64_t) : m(detail::mm256_cvtepi64_epi32(low(a).m, high(a).m)) { } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Converting from/to half size vectors // ----------------------------------------------------------------------- StaticArrayImpl(const Array1 &a1, const Array2 &a2) : m(_mm_setr_epi32((int32_t) a1.coeff(0), (int32_t) a1.coeff(1), (int32_t) a2.coeff(0), (int32_t) a2.coeff(1))) { } ENOKI_INLINE Array1 low_() const { return Array1(coeff(0), coeff(1)); } ENOKI_INLINE Array2 high_() const { return Array2(coeff(2), coeff(3)); } //! @} // ----------------------------------------------------------------------- ENOKI_INLINE Derived add_(Ref a) const { return _mm_add_epi32(m, a.m); } ENOKI_INLINE Derived sub_(Ref a) const { return _mm_sub_epi32(m, a.m); } ENOKI_INLINE Derived mul_(Ref a) const { return _mm_mullo_epi32(m, a.m); } template ENOKI_INLINE Derived or_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_epi32(m, a.k, _mm_set1_epi32(-1)); else #endif return _mm_or_si128(m, a.m); } template ENOKI_INLINE Derived and_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_maskz_mov_epi32(a.k, m); else #endif return _mm_and_si128(m, a.m); } template ENOKI_INLINE Derived xor_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_xor_epi32(m, a.k, m, _mm_set1_epi32(-1)); else #endif return _mm_xor_si128(m, a.m); } template ENOKI_INLINE Derived andnot_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_epi32(m, a.k, _mm_setzero_si128()); else #endif return _mm_andnot_si128(a.m, m); } template ENOKI_INLINE Derived sl_() const { return _mm_slli_epi32(m, (int) Imm); } template ENOKI_INLINE Derived sr_() const { return std::is_signed_v ? _mm_srai_epi32(m, (int) Imm) : _mm_srli_epi32(m, (int) Imm); } ENOKI_INLINE Derived sl_(size_t k) const { return _mm_sll_epi32(m, _mm_set1_epi64x((long long) k)); } ENOKI_INLINE Derived sr_(size_t k) const { return std::is_signed_v ? 
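        // signed Value: arithmetic right shift (sign-extending); unsigned: logical right shift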
_mm_sra_epi32(m, _mm_set1_epi64x((long long) k)) : _mm_srl_epi32(m, _mm_set1_epi64x((long long) k)); } ENOKI_INLINE Derived sl_(Ref k) const { #if defined(ENOKI_X86_AVX2) return _mm_sllv_epi32(m, k.m); #else Derived out; ENOKI_TRACK_SCALAR("sl"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) << (size_t) k.coeff(i); return out; #endif } ENOKI_INLINE Derived sr_(Ref k) const { #if defined(ENOKI_X86_AVX2) return std::is_signed_v ? _mm_srav_epi32(m, k.m) : _mm_srlv_epi32(m, k.m); #else Derived out; ENOKI_TRACK_SCALAR("sr"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) >> (size_t) k.coeff(i); return out; #endif } #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE Derived rol_() const { return _mm_rol_epi32(m, (int) Imm); } template ENOKI_INLINE Derived ror_() const { return _mm_ror_epi32(m, (int) Imm); } ENOKI_INLINE Derived rol_(Ref k) const { return _mm_rolv_epi32(m, k.m); } ENOKI_INLINE Derived ror_(Ref k) const { return _mm_rorv_epi32(m, k.m); } #endif ENOKI_INLINE auto eq_(Ref a) const { using Return = mask_t; #if defined(ENOKI_X86_AVX512VL) return Return::from_k(_mm_cmpeq_epi32_mask(m, a.m)); #else return Return(_mm_cmpeq_epi32(m, a.m)); #endif } ENOKI_INLINE auto neq_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(_mm_cmpneq_epi32_mask(m, a.m)); #else return ~eq_(a); #endif } ENOKI_INLINE auto lt_(Ref a) const { using Return = mask_t; #if !defined(ENOKI_X86_AVX512VL) if constexpr (std::is_signed_v) { return Return(_mm_cmpgt_epi32(a.m, m)); } else { const __m128i offset = _mm_set1_epi32((int32_t) 0x80000000ul); return Return(_mm_cmpgt_epi32(_mm_sub_epi32(a.m, offset), _mm_sub_epi32(m, offset))); } #else return Return::from_k(std::is_signed_v ? _mm_cmplt_epi32_mask(m, a.m) : _mm_cmplt_epu32_mask(m, a.m)); #endif } ENOKI_INLINE auto gt_(Ref a) const { using Return = mask_t; #if !defined(ENOKI_X86_AVX512VL) if constexpr (std::is_signed_v) { return Return(_mm_cmpgt_epi32(m, a.m)); } else { const __m128i offset = _mm_set1_epi32((int32_t) 0x80000000ul); return Return(_mm_cmpgt_epi32(_mm_sub_epi32(m, offset), _mm_sub_epi32(a.m, offset))); } #else return Return::from_k(std::is_signed_v ? _mm_cmpgt_epi32_mask(m, a.m) : _mm_cmpgt_epu32_mask(m, a.m)); #endif } ENOKI_INLINE auto le_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(std::is_signed_v ? _mm_cmple_epi32_mask(m, a.m) : _mm_cmple_epu32_mask(m, a.m)); #else return ~gt_(a); #endif } ENOKI_INLINE auto ge_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(std::is_signed_v ? _mm_cmpge_epi32_mask(m, a.m) : _mm_cmpge_epu32_mask(m, a.m)); #else return ~lt_(a); #endif } ENOKI_INLINE Derived min_(Ref a) const { return std::is_signed_v ? _mm_min_epi32(a.m, m) : _mm_min_epu32(a.m, m); } ENOKI_INLINE Derived max_(Ref a) const { return std::is_signed_v ? _mm_max_epi32(a.m, m) : _mm_max_epu32(a.m, m); } ENOKI_INLINE Derived abs_() const { return std::is_signed_v ? 
_mm_abs_epi32(m) : m; } template static ENOKI_INLINE Derived select_(const Mask &m, Ref t, Ref f) { #if !defined(ENOKI_X86_AVX512VL) return _mm_blendv_epi8(f.m, t.m, m.m); #else return _mm_mask_blend_epi32(m.k, f.m, t.m); #endif } template ENOKI_INLINE Derived shuffle_() const { return _mm_shuffle_epi32(m, _MM_SHUFFLE(I3, I2, I1, I0)); } template ENOKI_INLINE Derived shuffle_(const Index &index) const { #if defined(ENOKI_X86_AVX) return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(m), index.m)); #else return Base::shuffle_(index); #endif } ENOKI_INLINE Derived mulhi_(Ref a) const { Derived even, odd; if constexpr (std::is_signed_v) { even.m = _mm_srli_epi64(_mm_mul_epi32(m, a.m), 32); odd.m = _mm_mul_epi32(_mm_srli_epi64(m, 32), _mm_srli_epi64(a.m, 32)); } else { even.m = _mm_srli_epi64(_mm_mul_epu32(m, a.m), 32); odd.m = _mm_mul_epu32(_mm_srli_epi64(m, 32), _mm_srli_epi64(a.m, 32)); } #if defined(ENOKI_X86_AVX512VL) const mask_t blend = mask_t::from_k(0b0101); #else const mask_t blend(Value(-1), Value(0), Value(-1), Value(0)); #endif return select(blend, even, odd); } #if defined(ENOKI_X86_AVX512CD) && defined(ENOKI_X86_AVX512VL) ENOKI_INLINE Derived lzcnt_() const { return _mm_lzcnt_epi32(m); } ENOKI_INLINE Derived tzcnt_() const { return Value(32) - lzcnt(~derived() & (derived() - Value(1))); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Horizontal operations // ----------------------------------------------------------------------- #define ENOKI_HORIZONTAL_OP(name, op) \ ENOKI_INLINE Value name##_() const { \ __m128i t1 = _mm_shuffle_epi32(m, 0x4e); \ __m128i t2 = _mm_##op##_epi32(m, t1); \ t1 = _mm_shufflelo_epi16(t2, 0x4e); \ t2 = _mm_##op##_epi32(t2, t1); \ return (Value) _mm_cvtsi128_si32(t2); \ } #define ENOKI_HORIZONTAL_OP_SIGNED(name, op) \ ENOKI_INLINE Value name##_() const { \ __m128i t1 = _mm_shuffle_epi32(m, 0x4e); \ __m128i t2 = std::is_signed_v ? _mm_##op##_epi32(m, t1) : \ _mm_##op##_epu32(m, t1); \ t1 = _mm_shufflelo_epi16(t2, 0x4e); \ t2 = std::is_signed_v ? _mm_##op##_epi32(t2, t1) : \ _mm_##op##_epu32(t2, t1); \ return (Value) _mm_cvtsi128_si32(t2); \ } ENOKI_HORIZONTAL_OP(hsum, add) ENOKI_HORIZONTAL_OP(hprod, mullo) ENOKI_HORIZONTAL_OP_SIGNED(hmin, min) ENOKI_HORIZONTAL_OP_SIGNED(hmax, max) #undef ENOKI_HORIZONTAL_OP #undef ENOKI_HORIZONTAL_OP_SIGNED ENOKI_INLINE bool all_() const { return _mm_movemask_ps(_mm_castsi128_ps(m)) == 0xF;} ENOKI_INLINE bool any_() const { return _mm_movemask_ps(_mm_castsi128_ps(m)) != 0x0; } ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_ps(_mm_castsi128_ps(m)); } ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
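    // Note on mulhi_() above: SSE4.2 has no 32x32 -> high-32 multiply, so the even
    // lanes (0, 2) and the odd lanes (1, 3) are multiplied separately with
    // _mm_mul_epi32/_mm_mul_epu32 (which produce full 64-bit products), the high
    // halves are extracted by shifting each 64-bit product right by 32, and the
    // two intermediate results are recombined with a blend/select.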
@{ \name Masked versions of key operations // ----------------------------------------------------------------------- #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void massign_(const Derived &a, const Mask &mask) { m = _mm_mask_mov_epi32(m, mask.k, a.m); } template ENOKI_INLINE void madd_ (const Derived &a, const Mask &mask) { m = _mm_mask_add_epi32(m, mask.k, m, a.m); } template ENOKI_INLINE void msub_ (const Derived &a, const Mask &mask) { m = _mm_mask_sub_epi32(m, mask.k, m, a.m); } template ENOKI_INLINE void mmul_ (const Derived &a, const Mask &mask) { m = _mm_mask_mullo_epi32(m, mask.k, m, a.m); } template ENOKI_INLINE void mor_ (const Derived &a, const Mask &mask) { m = _mm_mask_or_epi32(m, mask.k, m, a.m); } template ENOKI_INLINE void mand_ (const Derived &a, const Mask &mask) { m = _mm_mask_and_epi32(m, mask.k, m, a.m); } template ENOKI_INLINE void mxor_ (const Derived &a, const Mask &mask) { m = _mm_mask_xor_epi32(m, mask.k, m, a.m); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Initialization, loading/writing data // ----------------------------------------------------------------------- ENOKI_INLINE void store_(void *ptr) const { assert((uintptr_t) ptr % 16 == 0); _mm_store_si128((__m128i *) ENOKI_ASSUME_ALIGNED(ptr, 16), m); } template ENOKI_INLINE void store_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_store_epi32(ptr, mask.k, m); #elif defined(ENOKI_X86_AVX2) _mm_maskstore_epi32((int *) ptr, mask.m, m); #else Base::store_(ptr, mask); #endif } ENOKI_INLINE void store_unaligned_(void *ptr) const { _mm_storeu_si128((__m128i *) ptr, m); } template ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const { #if defined(ENOKI_X86_AVX512VL) _mm_mask_storeu_epi32(ptr, mask.k, m); #elif defined(ENOKI_X86_AVX2) _mm_maskstore_epi32((int *) ptr, mask.m, m); #else Base::store_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived load_(const void *ptr) { assert((uintptr_t) ptr % 16 == 0); return _mm_load_si128((const __m128i *) ENOKI_ASSUME_ALIGNED(ptr, 16)); } template static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_load_epi32(mask.k, ptr); #elif defined(ENOKI_X86_AVX2) return _mm_maskload_epi32((const int *) ptr, mask.m); #else return Base::load_(ptr, mask); #endif } static ENOKI_INLINE Derived load_unaligned_(const void *ptr) { return _mm_loadu_si128((const __m128i *) ptr); } template static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) return _mm_maskz_loadu_epi32(mask.k, ptr); #elif defined(ENOKI_X86_AVX2) return _mm_maskload_epi32((const int *) ptr, mask.m); #else return Base::load_unaligned_(ptr, mask); #endif } static ENOKI_INLINE Derived zero_() { return _mm_setzero_si128(); } #if defined(ENOKI_X86_AVX2) template static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) { #if defined(ENOKI_X86_AVX512VL) if constexpr (sizeof(scalar_t) == 4) return _mm_mmask_i32gather_epi32(_mm_setzero_si128(), mask.k, index.m, (const int *) ptr, Stride); else return _mm256_mmask_i64gather_epi32(_mm_setzero_si128(), mask.k, index.m, (const int *) ptr, Stride); #else if constexpr (sizeof(scalar_t) == 4) return _mm_mask_i32gather_epi32(_mm_setzero_si128(), (const int *) ptr, index.m, mask.m, Stride); else return 
_mm256_mask_i64gather_epi32(_mm_setzero_si128(), (const int *) ptr, index.m, mask.m, Stride); #endif } #endif #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const { if constexpr (sizeof(scalar_t) == 4) _mm_mask_i32scatter_epi32(ptr, mask.k, index.m, m, Stride); else _mm256_mask_i64scatter_epi32(ptr, mask.k, index.m, m, Stride); } #endif template ENOKI_INLINE Value extract_(const Mask &mask) const { #if !defined(ENOKI_X86_AVX512VL) unsigned int k = (unsigned int) _mm_movemask_ps(_mm_castsi128_ps(mask.m)); return coeff((size_t) (detail::tzcnt_scalar(k) & 3)); #else return (Value) _mm_cvtsi128_si32(_mm_mask_compress_epi32(_mm_setzero_si128(), mask.k, m)); #endif } template ENOKI_INLINE size_t compress_(T *&ptr, const Mask &mask) const { #if !defined(ENOKI_X86_AVX512VL) unsigned int k = (unsigned int) _mm_movemask_ps(_mm_castsi128_ps(mask.m)); /** Fancy LUT-based partitioning algorithm, see https://deplinenoise.files.wordpress.com/2015/03/gdc2015_afredriksson_simd.pdf */ __m128i shuf = _mm_load_si128(((const __m128i *) detail::compress_lut_128) + k), perm = _mm_shuffle_epi8(m, shuf); _mm_storeu_si128((__m128i *) ptr, perm); #else _mm_storeu_si128((__m128i *) ptr, _mm_mask_compress_epi32(_mm_setzero_si128(), mask.k, m)); unsigned int k = (unsigned int) mask.k; #endif size_t kn = (size_t) _mm_popcnt_u32(k); ptr += kn; return kn; } //! @} // ----------------------------------------------------------------------- } ENOKI_MAY_ALIAS; /// Partial overload of StaticArrayImpl using SSE4.2 intrinsics (64 bit integers) template struct alignas(16) StaticArrayImpl> : StaticArrayBase { ENOKI_NATIVE_ARRAY(Value_, 2, __m128i) // ----------------------------------------------------------------------- //! @{ \name Value constructors // ----------------------------------------------------------------------- ENOKI_INLINE StaticArrayImpl(Value value) : m(_mm_set1_epi64x((int64_t) value)) { } ENOKI_INLINE StaticArrayImpl(Value v0, Value v1) { alignas(16) Value data[2]; data[0] = (Value) v0; data[1] = (Value) v1; m = _mm_load_si128((__m128i *) data); } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Type converting constructors // ----------------------------------------------------------------------- #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) ENOKI_CONVERT(double) { if constexpr (std::is_signed_v) m = _mm_cvttpd_epi64(a.derived().m); else m = _mm_cvttpd_epu64(a.derived().m); } #endif ENOKI_CONVERT(int64_t) : m(a.derived().m) { } ENOKI_CONVERT(uint64_t) : m(a.derived().m) { } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
@{ \name Reinterpreting constructors, mask converters // ----------------------------------------------------------------------- ENOKI_REINTERPRET(bool) { int16_t ival; memcpy(&ival, a.derived().data(), 2); m = _mm_cvtepi8_epi64( _mm_cmpgt_epi8(_mm_cvtsi32_si128((int) ival), _mm_setzero_si128())); } ENOKI_REINTERPRET(float) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, float32[2] -> int64[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_castps_si128(_mm_setr_ps(v0, v0, v1, v1)); } ENOKI_REINTERPRET(int32_t) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, int32[2] -> int64[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_setr_epi32(v0, v0, v1, v1); } ENOKI_REINTERPRET(uint32_t) { ENOKI_TRACK_SCALAR("Constructor (reinterpreting, uint32[2] -> int64[2])"); auto v0 = a.derived().coeff(0), v1 = a.derived().coeff(1); m = _mm_setr_epi32((int32_t) v0, (int32_t) v0, (int32_t) v1, (int32_t) v1); } ENOKI_REINTERPRET(double) : m(_mm_castpd_si128(a.derived().m)) { } ENOKI_REINTERPRET(int64_t) : m(a.derived().m) { } ENOKI_REINTERPRET(uint64_t) : m(a.derived().m) { } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! @{ \name Converting from/to half size vectors // ----------------------------------------------------------------------- StaticArrayImpl(const Array1 &a1, const Array2 &a2) { alignas(16) Value data[2]; data[0] = (Value) a1.coeff(0); data[1] = (Value) a2.coeff(0); m = _mm_load_si128((__m128i *) data); } ENOKI_INLINE Array1 low_() const { return Array1(coeff(0)); } ENOKI_INLINE Array2 high_() const { return Array2(coeff(1)); } //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
@{ \name Vertical operations // ----------------------------------------------------------------------- ENOKI_INLINE Derived add_(Ref a) const { return _mm_add_epi64(m, a.m); } ENOKI_INLINE Derived sub_(Ref a) const { return _mm_sub_epi64(m, a.m); } ENOKI_INLINE Derived mul_(Ref a) const { #if defined(ENOKI_X86_AVX512DQ) && defined(ENOKI_X86_AVX512VL) return _mm_mullo_epi64(m, a.m); #else Derived result; ENOKI_TRACK_SCALAR("mul"); for (size_t i = 0; i < Size; ++i) result.coeff(i) = coeff(i) * a.coeff(i); return result; #endif } template ENOKI_INLINE Derived or_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_epi64(m, a.k, _mm_set1_epi64x(-1)); else #endif return _mm_or_si128(m, a.m); } template ENOKI_INLINE Derived and_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_maskz_mov_epi64(a.k, m); else #endif return _mm_and_si128(m, a.m); } template ENOKI_INLINE Derived xor_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_xor_epi64(m, a.k, m, _mm_set1_epi64x(-1)); else #endif return _mm_xor_si128(m, a.m); } template ENOKI_INLINE Derived andnot_(const T &a) const { #if defined(ENOKI_X86_AVX512VL) if constexpr (is_mask_v) return _mm_mask_mov_epi64(m, a.k, _mm_setzero_si128()); else #endif return _mm_andnot_si128(a.m, m); } template ENOKI_INLINE Derived sl_() const { return _mm_slli_epi64(m, (int) k); } template ENOKI_INLINE Derived sr_() const { if constexpr (std::is_signed_v) { #if defined(ENOKI_X86_AVX512VL) return _mm_srai_epi64(m, (int) k); #else Derived out; ENOKI_TRACK_SCALAR("sr"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) >> k; return out; #endif } else { return _mm_srli_epi64(m, (int) k); } } ENOKI_INLINE Derived sl_(size_t k) const { return _mm_sll_epi64(m, _mm_set1_epi64x((long long) k)); } ENOKI_INLINE Derived sr_(size_t k) const { if constexpr (std::is_signed_v) { #if defined(ENOKI_X86_AVX512VL) return _mm_sra_epi64(m, _mm_set1_epi64x((long long) k)); #else Derived out; ENOKI_TRACK_SCALAR("sr"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) >> k; return out; #endif } else { return _mm_srl_epi64(m, _mm_set1_epi64x((long long) k)); } } ENOKI_INLINE Derived sl_(Ref k) const { #if defined(ENOKI_X86_AVX2) return _mm_sllv_epi64(m, k.m); #else Derived out; ENOKI_TRACK_SCALAR("sl"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) << (unsigned int) k.coeff(i); return out; #endif } ENOKI_INLINE Derived sr_(Ref k) const { if constexpr (std::is_signed_v) { #if defined(ENOKI_X86_AVX512VL) return _mm_srav_epi64(m, k.m); #endif } else { #if defined(ENOKI_X86_AVX2) return _mm_srlv_epi64(m, k.m); #endif } Derived out; ENOKI_TRACK_SCALAR("sr"); for (size_t i = 0; i < Size; ++i) out.coeff(i) = coeff(i) >> (unsigned int) k.coeff(i); return out; } #if defined(ENOKI_X86_AVX512VL) template ENOKI_INLINE Derived rol_() const { return _mm_rol_epi64(m, (int) Imm); } template ENOKI_INLINE Derived ror_() const { return _mm_ror_epi64(m, (int) Imm); } ENOKI_INLINE Derived rol_(Ref k) const { return _mm_rolv_epi64(m, k.m); } ENOKI_INLINE Derived ror_(Ref k) const { return _mm_rorv_epi64(m, k.m); } #endif ENOKI_INLINE auto eq_(Ref a) const { using Return = mask_t; #if defined(ENOKI_X86_AVX512VL) return Return::from_k(_mm_cmpeq_epi64_mask(m, a.m)); #else return Return(_mm_cmpeq_epi64(m, a.m)); #endif } ENOKI_INLINE auto neq_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(_mm_cmpneq_epi64_mask(m, a.m)); #else return ~eq_(a); 
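            // without AVX-512 mask registers, SSE4.2 only offers a 64-bit equality
            // test (_mm_cmpeq_epi64), so inequality is the complement of eq_()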
#endif } ENOKI_INLINE auto lt_(Ref a) const { using Return = mask_t; #if !defined(ENOKI_X86_AVX512VL) if constexpr (std::is_signed_v) { return Return(_mm_cmpgt_epi64(a.m, m)); } else { const __m128i offset = _mm_set1_epi64x((long long) 0x8000000000000000ull); return Return(_mm_cmpgt_epi64( _mm_sub_epi64(a.m, offset), _mm_sub_epi64(m, offset) )); } #else return Return::from_k(std::is_signed_v ? _mm_cmplt_epi64_mask(m, a.m) : _mm_cmplt_epu64_mask(m, a.m)); #endif } ENOKI_INLINE auto gt_(Ref a) const { using Return = mask_t; #if !defined(ENOKI_X86_AVX512VL) if constexpr (std::is_signed_v) { return Return(_mm_cmpgt_epi64(m, a.m)); } else { const __m128i offset = _mm_set1_epi64x((long long) 0x8000000000000000ull); return Return(_mm_cmpgt_epi64( _mm_sub_epi64(m, offset), _mm_sub_epi64(a.m, offset) )); } #else return Return::from_k(std::is_signed_v ? _mm_cmpgt_epi64_mask(m, a.m) : _mm_cmpgt_epu64_mask(m, a.m)); #endif } ENOKI_INLINE auto le_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(std::is_signed_v ? _mm_cmple_epi64_mask(m, a.m) : _mm_cmple_epu64_mask(m, a.m)); #else return ~gt_(a); #endif } ENOKI_INLINE auto ge_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return mask_t::from_k(std::is_signed_v ? _mm_cmpge_epi64_mask(m, a.m) : _mm_cmpge_epu64_mask(m, a.m)); #else return ~lt_(a); #endif } ENOKI_INLINE Derived min_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return std::is_signed_v ? _mm_min_epi64(a.m, m) : _mm_min_epu64(a.m, m); #else return select(derived() < a, derived(), a); #endif } ENOKI_INLINE Derived max_(Ref a) const { #if defined(ENOKI_X86_AVX512VL) return std::is_signed_v ? _mm_max_epi64(a.m, m) : _mm_max_epu64(a.m, m); #else return select(derived() > a, derived(), a); #endif } ENOKI_INLINE Derived abs_() const { if constexpr (std::is_signed_v) { #if defined(ENOKI_X86_AVX512VL) return _mm_abs_epi64(m); #else return select(derived() < zero(), ~derived() + Derived(Value(1)), derived()); #endif } else { return m; } } template static ENOKI_INLINE Derived select_(const Mask &m, Ref t, Ref f) { #if !defined(ENOKI_X86_AVX512VL) return _mm_blendv_epi8(f.m, t.m, m.m); #else return _mm_mask_blend_epi64(m.k, f.m, t.m); #endif } ENOKI_INLINE Derived mulhi_(Ref a) const { ENOKI_TRACK_SCALAR("mulhi"); return Derived( mulhi(coeff(0), a.coeff(0)), mulhi(coeff(1), a.coeff(1)) ); } template ENOKI_INLINE Derived shuffle_() const { return _mm_shuffle_epi32( m, _MM_SHUFFLE(I1 * 2 + 1, I1 * 2, I0 * 2 + 1, I0 * 2)); } template ENOKI_INLINE Derived shuffle_(const Index &index) const { #if defined(ENOKI_X86_AVX) return _mm_castpd_si128(_mm_permutevar_pd(_mm_castsi128_pd(m), _mm_slli_epi64(index.m, 1))); #else return Base::shuffle_(index); #endif } #if defined(ENOKI_X86_AVX512CD) && defined(ENOKI_X86_AVX512VL) ENOKI_INLINE Derived lzcnt_() const { return _mm_lzcnt_epi64(m); } ENOKI_INLINE Derived tzcnt_() const { return Value(64) - lzcnt(~derived() & (derived() - Value(1))); } #endif //! @} // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- //! 
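    // Note on the unsigned lt_()/gt_() fallbacks above: SSE4.2 only provides a
    // *signed* 64-bit comparison (_mm_cmpgt_epi64), so both operands are first
    // biased by 0x8000000000000000, which maps the unsigned ordering onto the
    // signed one.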
    // -----------------------------------------------------------------------
    //! @{ \name Masked versions of key operations
    // -----------------------------------------------------------------------

    #if defined(ENOKI_X86_AVX512VL)
        template <typename Mask>
        ENOKI_INLINE void massign_(const Derived &a, const Mask &mask) { m = _mm_mask_mov_epi64(m, mask.k, a.m); }
        template <typename Mask>
        ENOKI_INLINE void madd_   (const Derived &a, const Mask &mask) { m = _mm_mask_add_epi64(m, mask.k, m, a.m); }
        template <typename Mask>
        ENOKI_INLINE void msub_   (const Derived &a, const Mask &mask) { m = _mm_mask_sub_epi64(m, mask.k, m, a.m); }
        template <typename Mask>
        ENOKI_INLINE void mmul_   (const Derived &a, const Mask &mask) { m = _mm_mask_mullo_epi64(m, mask.k, m, a.m); }
        template <typename Mask>
        ENOKI_INLINE void mor_    (const Derived &a, const Mask &mask) { m = _mm_mask_or_epi64(m, mask.k, m, a.m); }
        template <typename Mask>
        ENOKI_INLINE void mand_   (const Derived &a, const Mask &mask) { m = _mm_mask_and_epi64(m, mask.k, m, a.m); }
        template <typename Mask>
        ENOKI_INLINE void mxor_   (const Derived &a, const Mask &mask) { m = _mm_mask_xor_epi64(m, mask.k, m, a.m); }
    #endif

    //! @}
    // -----------------------------------------------------------------------

    // -----------------------------------------------------------------------
    //! @{ \name Horizontal operations
    // -----------------------------------------------------------------------

    #define ENOKI_HORIZONTAL_OP(name, op)                                    \
        ENOKI_INLINE Value name##_() const {                                 \
            Value t1 = Value(detail::mm_extract_epi64<1>(m));                \
            Value t2 = Value(detail::mm_cvtsi128_si64(m));                   \
            return op;                                                       \
        }

    ENOKI_HORIZONTAL_OP(hsum,  t1 + t2)
    ENOKI_HORIZONTAL_OP(hprod, t1 * t2)
    ENOKI_HORIZONTAL_OP(hmin,  min(t1, t2))
    ENOKI_HORIZONTAL_OP(hmax,  max(t1, t2))

    #undef ENOKI_HORIZONTAL_OP

    ENOKI_INLINE bool all_() const { return _mm_movemask_pd(_mm_castsi128_pd(m)) == 0x3; }
    ENOKI_INLINE bool any_() const { return _mm_movemask_pd(_mm_castsi128_pd(m)) != 0x0; }

    ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_pd(_mm_castsi128_pd(m)); }
    ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); }

    //! @}
    // -----------------------------------------------------------------------
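    /* Usage sketch (hedged): the reductions generated by the macro above are
       normally reached through the user-facing hsum()/hprod()/hmin()/hmax()
       wrappers rather than by calling hsum_() etc. directly. Assuming a
       2-lane 64-bit integer array built via <enoki/array.h>:

           #include <enoki/array.h>
           #include <cstdio>

           int main() {
               using Int64P = enoki::Array<int64_t, 2>;
               Int64P x(3, 5);
               std::printf("%lld\n", (long long) enoki::hsum(x));  // prints: 8
               std::printf("%lld\n", (long long) enoki::hmax(x));  // prints: 5
               return 0;
           }
    */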
    // -----------------------------------------------------------------------
    //! @{ \name Initialization, loading/writing data
    // -----------------------------------------------------------------------

    ENOKI_INLINE void store_(void *ptr) const {
        assert((uintptr_t) ptr % 16 == 0);
        _mm_store_si128((__m128i *) ENOKI_ASSUME_ALIGNED(ptr, 16), m);
    }

    template <typename Mask>
    ENOKI_INLINE void store_(void *ptr, const Mask &mask) const {
        #if defined(ENOKI_X86_AVX512VL)
            _mm_mask_store_epi64(ptr, mask.k, m);
        #elif defined(ENOKI_X86_AVX2)
            _mm_maskstore_epi64((long long *) ptr, mask.m, m);
        #else
            Base::store_(ptr, mask);
        #endif
    }

    ENOKI_INLINE void store_unaligned_(void *ptr) const {
        _mm_storeu_si128((__m128i *) ptr, m);
    }

    template <typename Mask>
    ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const {
        #if defined(ENOKI_X86_AVX512VL)
            _mm_mask_storeu_epi64(ptr, mask.k, m);
        #elif defined(ENOKI_X86_AVX2)
            _mm_maskstore_epi64((long long *) ptr, mask.m, m);
        #else
            Base::store_unaligned_(ptr, mask);
        #endif
    }

    static ENOKI_INLINE Derived load_(const void *ptr) {
        assert((uintptr_t) ptr % 16 == 0);
        return _mm_load_si128((const __m128i *) ENOKI_ASSUME_ALIGNED(ptr, 16));
    }

    template <typename Mask>
    static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) {
        #if defined(ENOKI_X86_AVX512VL)
            return _mm_maskz_load_epi64(mask.k, ptr);
        #elif defined(ENOKI_X86_AVX2)
            return _mm_maskload_epi64((const long long *) ptr, mask.m);
        #else
            return Base::load_(ptr, mask);
        #endif
    }

    static ENOKI_INLINE Derived load_unaligned_(const void *ptr) {
        return _mm_loadu_si128((const __m128i *) ptr);
    }

    template <typename Mask>
    static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) {
        #if defined(ENOKI_X86_AVX512VL)
            return _mm_maskz_loadu_epi64(mask.k, ptr);
        #elif defined(ENOKI_X86_AVX2)
            return _mm_maskload_epi64((const long long *) ptr, mask.m);
        #else
            return Base::load_unaligned_(ptr, mask);
        #endif
    }

    static ENOKI_INLINE Derived zero_() { return _mm_setzero_si128(); }

    #if defined(ENOKI_X86_AVX2)
        template <size_t Stride, typename Index, typename Mask>
        static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) {
            if constexpr (sizeof(scalar_t<Index>) == 4) {
                return Base::template gather_<Stride>(ptr, index, mask);
            } else {
                #if defined(ENOKI_X86_AVX512VL)
                    return _mm_mmask_i64gather_epi64(_mm_setzero_si128(), mask.k, index.m,
                                                     (const long long *) ptr, Stride);
                #else
                    return _mm_mask_i64gather_epi64(_mm_setzero_si128(), (const long long *) ptr,
                                                    index.m, mask.m, Stride);
                #endif
            }
        }
    #endif

    #if defined(ENOKI_X86_AVX512VL)
        template <size_t Stride, typename Index, typename Mask>
        ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
            if constexpr (sizeof(scalar_t<Index>) == 4)
                Base::template scatter_<Stride>(ptr, index, mask);
            else
                _mm_mask_i64scatter_epi64(ptr, mask.k, index.m, m, Stride);
        }

        template <typename Mask>
        ENOKI_INLINE Value extract_(const Mask &mask) const {
            return (Value) detail::mm_cvtsi128_si64(
                _mm_mask_compress_epi64(_mm_setzero_si128(), mask.k, m));
        }

        template <typename Mask>
        ENOKI_INLINE size_t compress_(Value_ *&ptr, const Mask &mask) const {
            _mm_storeu_si128((__m128i *) ptr,
                             _mm_mask_compress_epi64(_mm_setzero_si128(), mask.k, m));
            size_t kn = (size_t) _mm_popcnt_u32(mask.k);
            ptr += kn;
            return kn;
        }
    #endif

    //! @}
    // -----------------------------------------------------------------------
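    /* Sketch (not part of the library): the address arithmetic performed by
       the gather_() path above. For a byte stride `Stride`, lane i reads
       sizeof(Value) bytes from (const uint8_t *) ptr + index[i] * Stride, and
       lanes whose mask bit is clear remain zero. Scalar equivalent with
       hypothetical data:

           #include <cstdint>
           #include <cstring>
           #include <cstdio>

           int main() {
               int64_t table[4] = { 10, 20, 30, 40 };
               int64_t index[2] = { 3, 1 };
               bool    mask[2]  = { true, false };
               const size_t Stride = sizeof(int64_t);   // 8-byte stride
               int64_t out[2] = { 0, 0 };
               for (int i = 0; i < 2; ++i)
                   if (mask[i])
                       std::memcpy(&out[i],
                                   (const uint8_t *) table + index[i] * Stride,
                                   sizeof(int64_t));
               std::printf("%lld %lld\n", (long long) out[0],
                                          (long long) out[1]);  // prints: 40 0
               return 0;
           }
    */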
} ENOKI_MAY_ALIAS;

/// Partial overload of StaticArrayImpl for the n=3 case (single precision)
template <bool IsMask_, typename Derived_> struct alignas(16)
    StaticArrayImpl<float, 3, IsMask_, Derived_>
  : StaticArrayImpl<float, 4, IsMask_, Derived_> {
    using Base = StaticArrayImpl<float, 4, IsMask_, Derived_>;

    ENOKI_DECLARE_3D_ARRAY(StaticArrayImpl)

    #if defined(ENOKI_X86_F16C)
        template <typename Derived2>
        ENOKI_INLINE StaticArrayImpl(
            const StaticArrayBase<half, 3, IsMask_, Derived2> &a) {
            uint16_t temp[4];
            memcpy(temp, a.derived().data(), sizeof(uint16_t) * 3);
            temp[3] = 0;
            m = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *) temp));
        }
    #endif

    template <int I0, int I1, int I2>
    ENOKI_INLINE Derived shuffle_() const {
        return Base::template shuffle_<I0, I1, I2, 3>();
    }

    template <typename Index>
    ENOKI_INLINE Derived shuffle_(const Index &index) const {
        return Base::shuffle_(index);
    }

    // -----------------------------------------------------------------------
    //! @{ \name Horizontal operations (adapted for the n=3 case)
    // -----------------------------------------------------------------------

    #define ENOKI_HORIZONTAL_OP(name, op)                                    \
        ENOKI_INLINE Value name##_() const {                                 \
            __m128 t1 = _mm_movehl_ps(m, m);                                 \
            __m128 t2 = _mm_##op##_ss(m, t1);                                \
            t1 = _mm_movehdup_ps(m);                                         \
            t1 = _mm_##op##_ss(t1, t2);                                      \
            return _mm_cvtss_f32(t1);                                        \
        }

    ENOKI_HORIZONTAL_OP(hsum,  add)
    ENOKI_HORIZONTAL_OP(hprod, mul)
    ENOKI_HORIZONTAL_OP(hmin,  min)
    ENOKI_HORIZONTAL_OP(hmax,  max)

    #undef ENOKI_HORIZONTAL_OP

    ENOKI_INLINE Value dot_(Ref a) const {
        return _mm_cvtss_f32(_mm_dp_ps(m, a.m, 0b01110001));
    }

    ENOKI_INLINE bool all_() const { return (_mm_movemask_ps(m) & 7) == 7; }
    ENOKI_INLINE bool any_() const { return (_mm_movemask_ps(m) & 7) != 0; }

    ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_ps(m) & 7; }
    ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); }

    //! @}
    // -----------------------------------------------------------------------
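    /* Sketch (not part of the library): meaning of the 0b01110001 immediate
       passed to _mm_dp_ps in dot_() above. The high nibble (0111) includes
       only the first three lanes in the multiply-add, the low nibble (0001)
       broadcasts the result into lane 0, which _mm_cvtss_f32 then extracts.
       Standalone illustration with arbitrary values (requires SSE4.1):

           #include <immintrin.h>
           #include <cstdio>

           int main() {
               __m128 a = _mm_setr_ps(1.f, 2.f, 3.f, 99.f);  // 4th lane ignored
               __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 99.f);
               float d = _mm_cvtss_f32(_mm_dp_ps(a, b, 0b01110001));
               std::printf("%f\n", d);  // prints: 32.000000 = 1*4 + 2*5 + 3*6
               return 0;
           }
    */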
    // -----------------------------------------------------------------------
    //! @{ \name Loading/writing data (adapted for the n=3 case)
    // -----------------------------------------------------------------------

    static ENOKI_INLINE auto mask_() {
        #if defined(ENOKI_X86_AVX512VL)
            return mask_t<Derived>::from_k((__mmask8) 7);
        #else
            return mask_t<Derived>(_mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0)));
        #endif
    }

    using Base::load_;
    using Base::load_unaligned_;
    using Base::store_;
    using Base::store_unaligned_;

    ENOKI_INLINE void store_(void *ptr) const {
        memcpy(ptr, &m, sizeof(Value) * 3);
    }

    ENOKI_INLINE void store_unaligned_(void *ptr) const {
        store_(ptr);
    }

    static ENOKI_INLINE Derived load_(const void *ptr) {
        return Base::load_unaligned_(ptr);
    }

    static ENOKI_INLINE Derived load_unaligned_(const void *ptr) {
        Derived result;
        memcpy(&result.m, ptr, sizeof(Value) * 3);
        return result;
    }

    #if defined(ENOKI_X86_AVX)
        template <typename Mask>
        ENOKI_INLINE void store_(void *ptr, const Mask &mask) const {
            Base::store_(ptr, mask & mask_());
        }

        template <typename Mask>
        ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const {
            Base::store_unaligned_(ptr, mask & mask_());
        }

        template <typename Mask>
        static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) {
            return Base::load_(ptr, mask & mask_());
        }

        template <typename Mask>
        static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) {
            return Base::load_unaligned_(ptr, mask & mask_());
        }
    #endif

    #if defined(ENOKI_X86_AVX2)
        template <size_t Stride, typename Index, typename Mask>
        static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) {
            return Base::template gather_<Stride>(ptr, index, mask & mask_());
        }
    #endif

    #if defined(ENOKI_X86_AVX512VL)
        template <size_t Stride, typename Index, typename Mask>
        ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
            Base::template scatter_<Stride>(ptr, index, mask & mask_());
        }
    #endif

    template <typename Mask>
    ENOKI_INLINE size_t compress_(float *&ptr, const Mask &mask) const {
        return Base::compress_(ptr, mask & mask_());
    }

    //! @}
    // -----------------------------------------------------------------------
} ENOKI_MAY_ALIAS;

/// Partial overload of StaticArrayImpl for the n=3 case (32 bit integers)
template <typename Value_, bool IsMask_, typename Derived_> struct alignas(16)
    StaticArrayImpl<Value_, 3, IsMask_, Derived_, enable_if_int32_t<Value_>>
  : StaticArrayImpl<Value_, 4, IsMask_, Derived_> {
    using Base = StaticArrayImpl<Value_, 4, IsMask_, Derived_>;

    ENOKI_DECLARE_3D_ARRAY(StaticArrayImpl)

    template <int I0, int I1, int I2>
    ENOKI_INLINE Derived shuffle_() const {
        return Base::template shuffle_<I0, I1, I2, 3>();
    }

    template <typename Index>
    ENOKI_INLINE Derived shuffle_(const Index &index) const {
        return Base::shuffle_(index);
    }
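    /* Sketch (not part of the library): the n=3 specializations piggyback on
       the 4-wide implementation, so every masked memory access is AND-ed with
       the mask_() helper (defined above for the float case and below for the
       integer case), whose fourth bit is clear. The padding lane is therefore
       never read from or written to memory. Minimal illustration of the bit
       pattern:

           #include <cstdio>

           int main() {
               unsigned user_mask = 0b1111;                 // caller: "all lanes"
               unsigned lane3_off = 0b0111;                 // what mask_() encodes
               std::printf("%u\n", user_mask & lane3_off);  // prints: 7
               return 0;
           }
    */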
    // -----------------------------------------------------------------------
    //! @{ \name Horizontal operations (adapted for the n=3 case)
    // -----------------------------------------------------------------------

    #define ENOKI_HORIZONTAL_OP(name, op)                                    \
        ENOKI_INLINE Value name##_() const {                                 \
            __m128i t1 = _mm_unpackhi_epi32(m, m);                           \
            __m128i t2 = _mm_##op##_epi32(m, t1);                            \
            t1 = _mm_shuffle_epi32(m, 1);                                    \
            t1 = _mm_##op##_epi32(t1, t2);                                   \
            return (Value) _mm_cvtsi128_si32(t1);                            \
        }

    #define ENOKI_HORIZONTAL_OP_SIGNED(name, op)                             \
        ENOKI_INLINE Value name##_() const {                                  \
            __m128i t2, t1 = _mm_unpackhi_epi32(m, m);                        \
            if constexpr (std::is_signed<Value>::value)                       \
                t2 = _mm_##op##_epi32(m, t1);                                 \
            else                                                              \
                t2 = _mm_##op##_epu32(m, t1);                                 \
            t1 = _mm_shuffle_epi32(m, 1);                                     \
            if constexpr (std::is_signed<Value>::value)                       \
                t1 = _mm_##op##_epi32(t1, t2);                                \
            else                                                              \
                t1 = _mm_##op##_epu32(t1, t2);                                \
            return (Value) _mm_cvtsi128_si32(t1);                             \
        }

    ENOKI_HORIZONTAL_OP(hsum,  add)
    ENOKI_HORIZONTAL_OP(hprod, mullo)
    ENOKI_HORIZONTAL_OP_SIGNED(hmin, min)
    ENOKI_HORIZONTAL_OP_SIGNED(hmax, max)

    #undef ENOKI_HORIZONTAL_OP
    #undef ENOKI_HORIZONTAL_OP_SIGNED

    ENOKI_INLINE bool all_() const { return (_mm_movemask_ps(_mm_castsi128_ps(m)) & 7) == 7; }
    ENOKI_INLINE bool any_() const { return (_mm_movemask_ps(_mm_castsi128_ps(m)) & 7) != 0; }

    ENOKI_INLINE uint32_t bitmask_() const { return (uint32_t) _mm_movemask_ps(_mm_castsi128_ps(m)) & 7; }
    ENOKI_INLINE size_t count_() const { return (size_t) _mm_popcnt_u32(bitmask_()); }

    //! @}
    // -----------------------------------------------------------------------

    // -----------------------------------------------------------------------
    //! @{ \name Loading/writing data (adapted for the n=3 case)
    // -----------------------------------------------------------------------

    static ENOKI_INLINE auto mask_() {
        #if defined(ENOKI_X86_AVX512VL)
            return mask_t<Derived>::from_k((__mmask8) 7);
        #else
            return mask_t<Derived>(_mm_setr_epi32(-1, -1, -1, 0));
        #endif
    }

    using Base::load_;
    using Base::load_unaligned_;
    using Base::store_;
    using Base::store_unaligned_;

    ENOKI_INLINE void store_(void *ptr) const {
        memcpy(ptr, &m, sizeof(Value) * 3);
    }

    ENOKI_INLINE void store_unaligned_(void *ptr) const {
        store_(ptr);
    }

    static ENOKI_INLINE Derived load_(const void *ptr) {
        return Base::load_unaligned_(ptr);
    }

    static ENOKI_INLINE Derived load_unaligned_(const void *ptr) {
        Derived result;
        memcpy(&result.m, ptr, sizeof(Value) * 3);
        return result;
    }

    #if defined(ENOKI_X86_AVX2)
        template <typename Mask>
        ENOKI_INLINE void store_unaligned_(void *ptr, const Mask &mask) const {
            return Base::store_unaligned_(ptr, mask & mask_());
        }

        template <typename Mask>
        ENOKI_INLINE void store_(void *ptr, const Mask &mask) const {
            return Base::store_(ptr, mask & mask_());
        }

        template <typename Mask>
        static ENOKI_INLINE Derived load_(const void *ptr, const Mask &mask) {
            return Base::load_(ptr, mask & mask_());
        }

        template <typename Mask>
        static ENOKI_INLINE Derived load_unaligned_(const void *ptr, const Mask &mask) {
            return Base::load_unaligned_(ptr, mask & mask_());
        }
    #endif

    #if defined(ENOKI_X86_AVX2)
        template <size_t Stride, typename Index, typename Mask>
        static ENOKI_INLINE Derived gather_(const void *ptr, const Index &index, const Mask &mask) {
            return Base::template gather_<Stride>(ptr, index, mask & mask_());
        }
    #endif

    #if defined(ENOKI_X86_AVX512VL)
        template <size_t Stride, typename Index, typename Mask>
        ENOKI_INLINE void scatter_(void *ptr, const Index &index, const Mask &mask) const {
            Base::template scatter_<Stride>(ptr, index, mask & mask_());
        }
    #endif

    template <typename T, typename Mask>
    ENOKI_INLINE size_t compress_(T *&ptr, const Mask &mask) const {
        return Base::compress_(ptr, mask & mask_());
    }

    //! @}
    // -----------------------------------------------------------------------
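    /* Sketch (not part of the library): why the ENOKI_HORIZONTAL_OP_SIGNED
       macro above dispatches between _mm_min_epi32/_mm_max_epi32 and their
       _epu32 counterparts. The same 32-bit pattern orders differently
       depending on signedness, e.g. 0xFFFFFFFF is -1 when signed but
       4294967295 when unsigned:

           #include <cstdint>
           #include <algorithm>
           #include <cstdio>

           int main() {
               uint32_t bits = 0xFFFFFFFFu;
               int32_t  s    = (int32_t) bits;
               std::printf("%d\n", std::min(s, (int32_t) 1));      // prints: -1
               std::printf("%u\n", std::min(bits, (uint32_t) 1));  // prints: 1
               return 0;
           }
    */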
} ENOKI_MAY_ALIAS;

#if defined(ENOKI_X86_AVX512VL)
template <typename Derived_>
ENOKI_DECLARE_KMASK(float, 4, Derived_, int)
template <typename Derived_>
ENOKI_DECLARE_KMASK(float, 3, Derived_, int)
template <typename Derived_>
ENOKI_DECLARE_KMASK(double, 2, Derived_, int)
template <typename Value_, typename Derived_>
ENOKI_DECLARE_KMASK(Value_, 4, Derived_, enable_if_int32_t<Value_>)
template <typename Value_, typename Derived_>
ENOKI_DECLARE_KMASK(Value_, 3, Derived_, enable_if_int32_t<Value_>)
template <typename Value_, typename Derived_>
ENOKI_DECLARE_KMASK(Value_, 2, Derived_, enable_if_int64_t<Value_>)
#endif

NAMESPACE_END(enoki)
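/* Usage sketch (hedged): this header is not meant to be included directly; it
   is typically pulled in via <enoki/array.h> when the translation unit is
   compiled with SSE4.2 support, at which point 3- and 4-wide single-precision
   and 32/64-bit integer arrays map onto the specializations above. Example
   assuming a compiler flag such as -msse4.2:

       #include <enoki/array.h>
       #include <cstdio>

       int main() {
           using Vector3f = enoki::Array<float, 3>;
           Vector3f a(1.f, 2.f, 3.f), b(4.f, 5.f, 6.f);
           std::printf("%f\n", enoki::dot(a, b));    // 32.0, uses dot_() above
           std::printf("%f\n", enoki::hmax(a + b));  // 9.0
           return 0;
       }
*/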