#pragma once //////////////////////////////////////////////////////////////////////////////// // The MIT License (MIT) // // Copyright (c) 2017 Nicholas Frechette & Animation Compression Library contributors // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. //////////////////////////////////////////////////////////////////////////////// #include "acl/core/impl/compiler_utils.h" #include "acl/core/error.h" #include "acl/core/memory_utils.h" #include #include ACL_IMPL_FILE_PRAGMA_PUSH namespace acl { inline uint32_t pack_scalar_unsigned(float input, uint32_t num_bits) { ACL_ASSERT(num_bits < 31, "Attempting to pack on too many bits"); ACL_ASSERT(input >= 0.0F && input <= 1.0F, "Expected normalized unsigned input value: %f", input); const uint32_t max_value = (1 << num_bits) - 1; return static_cast(rtm::scalar_round_symmetric(input * rtm::scalar_safe_to_float(max_value))); } inline float unpack_scalar_unsigned(uint32_t input, uint32_t num_bits) { ACL_ASSERT(num_bits < 31, "Attempting to unpack from too many bits"); const uint32_t max_value = (1 << num_bits) - 1; ACL_ASSERT(input <= max_value, "Input value too large: %ull", input); // For performance reasons, unpacking is faster when multiplying with the reciprocal const float inv_max_value = 1.0F / rtm::scalar_safe_to_float(max_value); return rtm::scalar_safe_to_float(input) * inv_max_value; } inline uint32_t pack_scalar_signed(float input, uint32_t num_bits) { return pack_scalar_unsigned((input * 0.5F) + 0.5F, num_bits); } inline float unpack_scalar_signed(uint32_t input, uint32_t num_bits) { return (unpack_scalar_unsigned(input, num_bits) * 2.0F) - 1.0F; } // Assumes the 'vector_data' is in big-endian order and is padded in order to load up to 8 bytes from it inline rtm::scalarf RTM_SIMD_CALL unpack_scalarf_32_unsafe(const uint8_t* vector_data, uint32_t bit_offset) { #if defined(RTM_SSE2_INTRINSICS) const uint32_t byte_offset = bit_offset / 8; const uint32_t shift_offset = bit_offset % 8; uint64_t vector_u64 = unaligned_load(vector_data + byte_offset + 0); vector_u64 = byte_swap(vector_u64); vector_u64 <<= shift_offset; vector_u64 >>= 32; const uint32_t x32 = uint32_t(vector_u64); return rtm::scalarf{ _mm_castsi128_ps(_mm_set1_epi32(x32)) }; #elif defined(RTM_NEON_INTRINSICS) const uint32_t byte_offset = bit_offset / 8; const uint32_t shift_offset = bit_offset % 8; uint64_t vector_u64 = unaligned_load(vector_data + byte_offset + 0); vector_u64 = byte_swap(vector_u64); vector_u64 <<= shift_offset; vector_u64 >>= 32; const uint64_t x64 = vector_u64; const uint32x2_t xy = vcreate_u32(x64); return vget_lane_f32(vreinterpret_f32_u32(xy), 0); #else const uint32_t byte_offset = bit_offset / 8; const uint32_t shift_offset = bit_offset % 8; uint64_t vector_u64 = unaligned_load(vector_data + byte_offset + 0); vector_u64 = byte_swap(vector_u64); vector_u64 <<= shift_offset; vector_u64 >>= 32; const uint64_t x64 = vector_u64; const float x = aligned_load(&x64); return rtm::scalar_set(x); #endif } // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 8 bytes from it inline rtm::scalarf RTM_SIMD_CALL unpack_scalarf_uXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) { ACL_ASSERT(num_bits <= 19, "This function does not support reading more than 19 bits per component"); struct PackedTableEntry { explicit constexpr PackedTableEntry(uint8_t num_bits_) : max_value(num_bits_ == 0 ? 1.0F : (1.0F / float((1 << num_bits_) - 1))) , mask((1 << num_bits_) - 1) {} float max_value; uint32_t mask; }; // TODO: We technically don't need the first 3 entries, which could save a few bytes alignas(64) static constexpr PackedTableEntry k_packed_constants[20] = { PackedTableEntry(0), PackedTableEntry(1), PackedTableEntry(2), PackedTableEntry(3), PackedTableEntry(4), PackedTableEntry(5), PackedTableEntry(6), PackedTableEntry(7), PackedTableEntry(8), PackedTableEntry(9), PackedTableEntry(10), PackedTableEntry(11), PackedTableEntry(12), PackedTableEntry(13), PackedTableEntry(14), PackedTableEntry(15), PackedTableEntry(16), PackedTableEntry(17), PackedTableEntry(18), PackedTableEntry(19), }; #if defined(RTM_SSE2_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const __m128 inv_max_value = _mm_load_ps1(&k_packed_constants[num_bits].max_value); uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); const uint32_t x32 = (vector_u32 >> (bit_shift - (bit_offset % 8))); const __m128 value = _mm_cvtsi32_ss(inv_max_value, x32 & mask); return rtm::scalarf{ _mm_mul_ss(value, inv_max_value) }; #elif defined(RTM_NEON_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); const uint32_t x32 = (vector_u32 >> (bit_shift - (bit_offset % 8))); const int32_t value_u32 = x32 & mask; const float value_f32 = static_cast(value_u32); return value_f32 * inv_max_value; #else const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); const uint32_t x32 = (vector_u32 >> (bit_shift - (bit_offset % 8))) & mask; return rtm::scalar_set(static_cast(x32) * inv_max_value); #endif } } ACL_IMPL_FILE_PRAGMA_POP