cocos-engine-external/sources/acl/compression/impl/quantize_streams.h

#pragma once

////////////////////////////////////////////////////////////////////////////////
// The MIT License (MIT)
//
// Copyright (c) 2017 Nicholas Frechette & Animation Compression Library contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
////////////////////////////////////////////////////////////////////////////////

#include "acl/core/iallocator.h"
#include "acl/core/impl/compiler_utils.h"
#include "acl/core/error.h"
#include "acl/core/track_formats.h"
#include "acl/core/utils.h"
#include "acl/core/variable_bit_rates.h"
#include "acl/math/quat_packing.h"
#include "acl/math/vector4_packing.h"
#include "acl/compression/impl/track_bit_rate_database.h"
#include "acl/compression/impl/transform_bit_rate_permutations.h"
#include "acl/compression/impl/clip_context.h"
#include "acl/compression/impl/sample_streams.h"
#include "acl/compression/impl/normalize_streams.h"
#include "acl/compression/impl/convert_rotation_streams.h"
#include "acl/compression/transform_error_metrics.h"
#include "acl/compression/compression_settings.h"

#include <rtm/quatf.h>
#include <rtm/vector4f.h>

#include <cstddef>
#include <cstdint>
#include <functional>

// 0 = no debug info, 1 = basic info, 2 = verbose
#define ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION		0

// 0 = no profiling, 1 = we perform quantization 10 times in a row for every segment
#define ACL_IMPL_PROFILE_MATH						0

#if ACL_IMPL_PROFILE_MATH && defined(__ANDROID__)
#include <android/log.h>
#endif

ACL_IMPL_FILE_PRAGMA_PUSH

namespace acl
{
	namespace acl_impl
	{
		struct quantization_context
		{
			iallocator& allocator;
			clip_context& clip;
			const clip_context& raw_clip;
			const clip_context& additive_base_clip;
			SegmentContext* segment;
			BoneStreams* bone_streams;
			const transform_metadata* metadata;
			uint32_t num_bones;
			const itransform_error_metric* error_metric;

			track_bit_rate_database bit_rate_database;
			single_track_query local_query;
			hierarchical_track_query object_query;

			uint32_t num_samples;
			uint32_t segment_sample_start_index;
			float sample_rate;
			float clip_duration;
			float error_threshold;					// Error threshold of the current bone being optimized
			bool has_scale;
			bool has_additive_base;
			bool needs_conversion;

			rotation_format8 rotation_format;
			vector_format8 translation_format;
			vector_format8 scale_format;
			compression_level8 compression_level;

			const BoneStreams* raw_bone_streams;

			rtm::qvvf* additive_local_pose;			// 1 per transform
			rtm::qvvf* raw_local_pose;				// 1 per transform
			rtm::qvvf* lossy_local_pose;			// 1 per transform

			uint8_t* raw_local_transforms;			// 1 per transform per sample in segment
			uint8_t* base_local_transforms;			// 1 per transform per sample in segment
			uint8_t* raw_object_transforms;			// 1 per transform per sample in segment
			uint8_t* base_object_transforms;		// 1 per transform per sample in segment

			uint8_t* local_transforms_converted;	// 1 per transform
			uint8_t* lossy_object_pose;				// 1 per transform
			size_t metric_transform_size;

			BoneBitRate* bit_rate_per_bone;			// 1 per transform
			uint32_t* parent_transform_indices;		// 1 per transform
			uint32_t* self_transform_indices;		// 1 per transform

			uint32_t* chain_bone_indices;			// 1 per transform
			uint32_t num_bones_in_chain;
			uint32_t padding1;	// unused

			quantization_context(iallocator& allocator_, clip_context& clip_, const clip_context& raw_clip_, const clip_context& additive_base_clip_, const compression_settings& settings_)
				: allocator(allocator_)
				, clip(clip_)
				, raw_clip(raw_clip_)
				, additive_base_clip(additive_base_clip_)
				, segment(nullptr)
				, bone_streams(nullptr)
				, metadata(clip_.metadata)
				, num_bones(clip_.num_bones)
				, error_metric(settings_.error_metric)
				, bit_rate_database(allocator_, settings_.rotation_format, settings_.translation_format, settings_.scale_format, clip_.segments->bone_streams, raw_clip_.segments->bone_streams, clip_.num_bones, clip_.segments->num_samples)
				, local_query()
				, object_query(allocator_)
				, num_samples(~0U)
				, segment_sample_start_index(~0U)
				, sample_rate(clip_.sample_rate)
				, clip_duration(clip_.duration)
				, error_threshold(0.0F)
				, has_scale(clip_.has_scale)
				, has_additive_base(clip_.has_additive_base)
				, rotation_format(settings_.rotation_format)
				, translation_format(settings_.translation_format)
				, scale_format(settings_.scale_format)
				, compression_level(settings_.level)
				, raw_bone_streams(raw_clip_.segments[0].bone_streams)
				, num_bones_in_chain(0)
			{
				local_query.bind(bit_rate_database);
				object_query.bind(bit_rate_database);

				needs_conversion = settings_.error_metric->needs_conversion(clip_.has_scale);
				const size_t metric_transform_size_ = settings_.error_metric->get_transform_size(clip_.has_scale);
				metric_transform_size = metric_transform_size_;

				additive_local_pose = clip_.has_additive_base ? allocate_type_array<rtm::qvvf>(allocator, num_bones) : nullptr;
				raw_local_pose = allocate_type_array<rtm::qvvf>(allocator, num_bones);
				lossy_local_pose = allocate_type_array<rtm::qvvf>(allocator, num_bones);
				raw_local_transforms = allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones * clip_.segments->num_samples, 64);
				base_local_transforms = clip_.has_additive_base ? allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones * clip_.segments->num_samples, 64) : nullptr;
				raw_object_transforms = allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones * clip_.segments->num_samples, 64);
				base_object_transforms = clip_.has_additive_base ? allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones * clip_.segments->num_samples, 64) : nullptr;
				local_transforms_converted = needs_conversion ? allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones, 64) : nullptr;
				lossy_object_pose = allocate_type_array_aligned<uint8_t>(allocator, metric_transform_size_ * num_bones, 64);
				bit_rate_per_bone = allocate_type_array<BoneBitRate>(allocator, num_bones);
				parent_transform_indices = allocate_type_array<uint32_t>(allocator, num_bones);
				self_transform_indices = allocate_type_array<uint32_t>(allocator, num_bones);
				chain_bone_indices = allocate_type_array<uint32_t>(allocator, num_bones);

				for (uint32_t transform_index = 0; transform_index < num_bones; ++transform_index)
				{
					const transform_metadata& metadata_ = clip_.metadata[transform_index];
					parent_transform_indices[transform_index] = metadata_.parent_index;
					self_transform_indices[transform_index] = transform_index;
				}
			}

			~quantization_context()
			{
				deallocate_type_array(allocator, additive_local_pose, num_bones);
				deallocate_type_array(allocator, raw_local_pose, num_bones);
				deallocate_type_array(allocator, lossy_local_pose, num_bones);
				deallocate_type_array(allocator, raw_local_transforms, metric_transform_size * num_bones * clip.segments->num_samples);
				deallocate_type_array(allocator, base_local_transforms, metric_transform_size * num_bones * clip.segments->num_samples);
				deallocate_type_array(allocator, raw_object_transforms, metric_transform_size * num_bones * clip.segments->num_samples);
				deallocate_type_array(allocator, base_object_transforms, metric_transform_size * num_bones * clip.segments->num_samples);
				deallocate_type_array(allocator, local_transforms_converted, metric_transform_size * num_bones);
				deallocate_type_array(allocator, lossy_object_pose, metric_transform_size * num_bones);
				deallocate_type_array(allocator, bit_rate_per_bone, num_bones);
				deallocate_type_array(allocator, parent_transform_indices, num_bones);
				deallocate_type_array(allocator, self_transform_indices, num_bones);
				deallocate_type_array(allocator, chain_bone_indices, num_bones);
			}

			void set_segment(SegmentContext& segment_)
			{
				segment = &segment_;
				bone_streams = segment_.bone_streams;
				num_samples = segment_.num_samples;
				segment_sample_start_index = segment_.clip_sample_offset;
				bit_rate_database.set_segment(segment_.bone_streams, segment_.num_bones, segment_.num_samples);

				// Cache every raw local/object transforms and the base local transforms since they never change
				const itransform_error_metric* error_metric_ = error_metric;
				const size_t sample_transform_size = metric_transform_size * num_bones;

				const auto convert_transforms_impl = std::mem_fn(has_scale ? &itransform_error_metric::convert_transforms : &itransform_error_metric::convert_transforms_no_scale);
				const auto apply_additive_to_base_impl = std::mem_fn(has_scale ? &itransform_error_metric::apply_additive_to_base : &itransform_error_metric::apply_additive_to_base_no_scale);
				const auto local_to_object_space_impl = std::mem_fn(has_scale ? &itransform_error_metric::local_to_object_space : &itransform_error_metric::local_to_object_space_no_scale);

				itransform_error_metric::convert_transforms_args convert_transforms_args_raw;
				convert_transforms_args_raw.dirty_transform_indices = self_transform_indices;
				convert_transforms_args_raw.num_dirty_transforms = num_bones;
				convert_transforms_args_raw.transforms = raw_local_pose;
				convert_transforms_args_raw.num_transforms = num_bones;

				itransform_error_metric::convert_transforms_args convert_transforms_args_base = convert_transforms_args_raw;
				convert_transforms_args_base.transforms = additive_local_pose;

				itransform_error_metric::apply_additive_to_base_args apply_additive_to_base_args_raw;
				apply_additive_to_base_args_raw.dirty_transform_indices = self_transform_indices;
				apply_additive_to_base_args_raw.num_dirty_transforms = num_bones;
				apply_additive_to_base_args_raw.local_transforms = nullptr;
				apply_additive_to_base_args_raw.base_transforms = nullptr;
				apply_additive_to_base_args_raw.num_transforms = num_bones;

				itransform_error_metric::local_to_object_space_args local_to_object_space_args_raw;
				local_to_object_space_args_raw.dirty_transform_indices = self_transform_indices;
				local_to_object_space_args_raw.num_dirty_transforms = num_bones;
				local_to_object_space_args_raw.parent_transform_indices = parent_transform_indices;
				local_to_object_space_args_raw.local_transforms = nullptr;
				local_to_object_space_args_raw.num_transforms = num_bones;

				for (uint32_t sample_index = 0; sample_index < segment_.num_samples; ++sample_index)
				{
					// Sample our streams and calculate the error
					// The sample time is calculated from the full clip duration to be consistent with decompression
					const float sample_time = rtm::scalar_min(float(segment_.clip_sample_offset + sample_index) / sample_rate, clip_duration);

					sample_streams(raw_bone_streams, num_bones, sample_time, raw_local_pose);

					uint8_t* sample_raw_local_transforms = raw_local_transforms + (sample_index * sample_transform_size);

					if (needs_conversion)
						convert_transforms_impl(error_metric_, convert_transforms_args_raw, sample_raw_local_transforms);
					else
						std::memcpy(sample_raw_local_transforms, raw_local_pose, sample_transform_size);

					if (has_additive_base)
					{
						const float normalized_sample_time = additive_base_clip.num_samples > 1 ? (sample_time / clip_duration) : 0.0F;
						const float additive_sample_time = additive_base_clip.num_samples > 1 ? (normalized_sample_time * additive_base_clip.duration) : 0.0F;
						sample_streams(additive_base_clip.segments[0].bone_streams, num_bones, additive_sample_time, additive_local_pose);

						uint8_t* sample_base_local_transforms = base_local_transforms + (sample_index * sample_transform_size);

						if (needs_conversion)
							convert_transforms_impl(error_metric_, convert_transforms_args_base, sample_base_local_transforms);
						else
							std::memcpy(sample_base_local_transforms, additive_local_pose, sample_transform_size);

						apply_additive_to_base_args_raw.local_transforms = sample_raw_local_transforms;
						apply_additive_to_base_args_raw.base_transforms = sample_base_local_transforms;
						apply_additive_to_base_impl(error_metric_, apply_additive_to_base_args_raw, sample_raw_local_transforms);
					}

					local_to_object_space_args_raw.local_transforms = sample_raw_local_transforms;

					uint8_t* sample_raw_object_transforms = raw_object_transforms + (sample_index * sample_transform_size);
					local_to_object_space_impl(error_metric_, local_to_object_space_args_raw, sample_raw_object_transforms);
				}
			}

			bool is_valid() const { return segment != nullptr; }

			quantization_context(const quantization_context&) = delete;
			quantization_context(quantization_context&&) = delete;
			quantization_context& operator=(const quantization_context&) = delete;
			quantization_context& operator=(quantization_context&&) = delete;
		};

		inline void quantize_fixed_rotation_stream(iallocator& allocator, const RotationTrackStream& raw_stream, rotation_format8 rotation_format, RotationTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected rotation sample size. %u != %zu", raw_stream.get_sample_size(), sizeof(rtm::vector4f));

			const uint32_t num_samples = raw_stream.get_num_samples();
			const uint32_t rotation_sample_size = get_packed_rotation_size(rotation_format);
			const float sample_rate = raw_stream.get_sample_rate();
			RotationTrackStream quantized_stream(allocator, num_samples, rotation_sample_size, sample_rate, rotation_format);

			for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
			{
				const rtm::quatf rotation = raw_stream.get_raw_sample<rtm::quatf>(sample_index);
				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

				switch (rotation_format)
				{
				case rotation_format8::quatf_full:
					pack_vector4_128(rtm::quat_to_vector(rotation), quantized_ptr);
					break;
				case rotation_format8::quatf_drop_w_full:
					pack_vector3_96(rtm::quat_to_vector(rotation), quantized_ptr);
					break;
				case rotation_format8::quatf_drop_w_variable:
				default:
					ACL_ASSERT(false, "Invalid or unsupported rotation format: %s", get_rotation_format_name(rotation_format));
					break;
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_fixed_rotation_stream(quantization_context& context, uint32_t bone_index, rotation_format8 rotation_format)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_rotation_default)
				return;

			quantize_fixed_rotation_stream(context.allocator, bone_stream.rotations, rotation_format, bone_stream.rotations);
		}

		inline void quantize_variable_rotation_stream(quantization_context& context, const RotationTrackStream& raw_clip_stream, const RotationTrackStream& raw_segment_stream, const TrackStreamRange& clip_range, uint8_t bit_rate, RotationTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_segment_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected rotation sample size. %u != %zu", raw_segment_stream.get_sample_size(), sizeof(rtm::vector4f));

			const uint32_t num_samples = is_constant_bit_rate(bit_rate) ? 1 : raw_segment_stream.get_num_samples();
			const uint32_t sample_size = sizeof(uint64_t) * 2;
			const float sample_rate = raw_segment_stream.get_sample_rate();
			RotationTrackStream quantized_stream(context.allocator, num_samples, sample_size, sample_rate, rotation_format8::quatf_drop_w_variable, bit_rate);

			if (is_constant_bit_rate(bit_rate))
			{
				rtm::vector4f rotation = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index);
				rotation = convert_rotation(rotation, rotation_format8::quatf_full, rotation_format8::quatf_drop_w_variable);

				const rtm::vector4f normalized_rotation = normalize_sample(rotation, clip_range);

				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0);
				pack_vector3_u48_unsafe(normalized_rotation, quantized_ptr);
			}
			else
			{
				const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate);

				for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
				{
					uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

					if (is_raw_bit_rate(bit_rate))
					{
						rtm::vector4f rotation = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index + sample_index);
						rotation = convert_rotation(rotation, rotation_format8::quatf_full, rotation_format8::quatf_drop_w_variable);
						pack_vector3_96(rotation, quantized_ptr);
					}
					else
					{
						const rtm::quatf rotation = raw_segment_stream.get_raw_sample<rtm::quatf>(sample_index);
						pack_vector3_uXX_unsafe(rtm::quat_to_vector(rotation), num_bits_at_bit_rate, quantized_ptr);
					}
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_variable_rotation_stream(quantization_context& context, uint32_t bone_index, uint8_t bit_rate)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_rotation_default)
				return;

			const BoneStreams& raw_bone_stream = context.raw_bone_streams[bone_index];
			const rotation_format8 highest_bit_rate = get_highest_variant_precision(rotation_variant8::quat_drop_w);
			const TrackStreamRange& bone_range = context.clip.ranges[bone_index].rotation;

			// If our format is variable, we keep them fixed at the highest bit rate in the variant
			if (bone_stream.is_rotation_constant)
				quantize_fixed_rotation_stream(context.allocator, bone_stream.rotations, highest_bit_rate, bone_stream.rotations);
			else
				quantize_variable_rotation_stream(context, raw_bone_stream.rotations, bone_stream.rotations, bone_range, bit_rate, bone_stream.rotations);
		}

		inline void quantize_fixed_translation_stream(iallocator& allocator, const TranslationTrackStream& raw_stream, vector_format8 translation_format, TranslationTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected translation sample size. %u != %zu", raw_stream.get_sample_size(), sizeof(rtm::vector4f));
			ACL_ASSERT(raw_stream.get_vector_format() == vector_format8::vector3f_full, "Expected a vector3f_full vector format, found: %s", get_vector_format_name(raw_stream.get_vector_format()));

			const uint32_t num_samples = raw_stream.get_num_samples();
			const uint32_t sample_size = get_packed_vector_size(translation_format);
			const float sample_rate = raw_stream.get_sample_rate();
			TranslationTrackStream quantized_stream(allocator, num_samples, sample_size, sample_rate, translation_format);

			for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
			{
				const rtm::vector4f translation = raw_stream.get_raw_sample<rtm::vector4f>(sample_index);
				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

				switch (translation_format)
				{
				case vector_format8::vector3f_full:
					pack_vector3_96(translation, quantized_ptr);
					break;
				case vector_format8::vector3f_variable:
				default:
					ACL_ASSERT(false, "Invalid or unsupported vector format: %s", get_vector_format_name(translation_format));
					break;
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_fixed_translation_stream(quantization_context& context, uint32_t bone_index, vector_format8 translation_format)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_translation_default)
				return;

			// Constant translation tracks store the remaining sample with full precision
			const vector_format8 format = bone_stream.is_translation_constant ? vector_format8::vector3f_full : translation_format;

			quantize_fixed_translation_stream(context.allocator, bone_stream.translations, format, bone_stream.translations);
		}

		inline void quantize_variable_translation_stream(quantization_context& context, const TranslationTrackStream& raw_clip_stream, const TranslationTrackStream& raw_segment_stream, const TrackStreamRange& clip_range, uint8_t bit_rate, TranslationTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_segment_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected translation sample size. %u != %zu", raw_segment_stream.get_sample_size(), sizeof(rtm::vector4f));
			ACL_ASSERT(raw_segment_stream.get_vector_format() == vector_format8::vector3f_full, "Expected a vector3f_full vector format, found: %s", get_vector_format_name(raw_segment_stream.get_vector_format()));

			const uint32_t num_samples = is_constant_bit_rate(bit_rate) ? 1 : raw_segment_stream.get_num_samples();
			const uint32_t sample_size = sizeof(uint64_t) * 2;
			const float sample_rate = raw_segment_stream.get_sample_rate();
			TranslationTrackStream quantized_stream(context.allocator, num_samples, sample_size, sample_rate, vector_format8::vector3f_variable, bit_rate);

			if (is_constant_bit_rate(bit_rate))
			{
				const rtm::vector4f translation = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index);
				const rtm::vector4f normalized_translation = normalize_sample(translation, clip_range);

				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0);
				pack_vector3_u48_unsafe(normalized_translation, quantized_ptr);
			}
			else
			{
				const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate);

				for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
				{
					uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

					if (is_raw_bit_rate(bit_rate))
					{
						const rtm::vector4f translation = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index + sample_index);
						pack_vector3_96(translation, quantized_ptr);
					}
					else
					{
						const rtm::vector4f translation = raw_segment_stream.get_raw_sample<rtm::vector4f>(sample_index);
						pack_vector3_uXX_unsafe(translation, num_bits_at_bit_rate, quantized_ptr);
					}
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_variable_translation_stream(quantization_context& context, uint32_t bone_index, uint8_t bit_rate)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_translation_default)
				return;

			const TrackStreamRange& bone_range = context.clip.ranges[bone_index].translation;
			const BoneStreams& raw_bone_stream = context.raw_bone_streams[bone_index];

			// Constant translation tracks store the remaining sample with full precision
			if (bone_stream.is_translation_constant)
				quantize_fixed_translation_stream(context.allocator, bone_stream.translations, vector_format8::vector3f_full, bone_stream.translations);
			else
				quantize_variable_translation_stream(context, raw_bone_stream.translations, bone_stream.translations, bone_range, bit_rate, bone_stream.translations);
		}

		inline void quantize_fixed_scale_stream(iallocator& allocator, const ScaleTrackStream& raw_stream, vector_format8 scale_format, ScaleTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected scale sample size. %u != %zu", raw_stream.get_sample_size(), sizeof(rtm::vector4f));
			ACL_ASSERT(raw_stream.get_vector_format() == vector_format8::vector3f_full, "Expected a vector3f_full vector format, found: %s", get_vector_format_name(raw_stream.get_vector_format()));

			const uint32_t num_samples = raw_stream.get_num_samples();
			const uint32_t sample_size = get_packed_vector_size(scale_format);
			const float sample_rate = raw_stream.get_sample_rate();
			ScaleTrackStream quantized_stream(allocator, num_samples, sample_size, sample_rate, scale_format);

			for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
			{
				const rtm::vector4f scale = raw_stream.get_raw_sample<rtm::vector4f>(sample_index);
				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

				switch (scale_format)
				{
				case vector_format8::vector3f_full:
					pack_vector3_96(scale, quantized_ptr);
					break;
				case vector_format8::vector3f_variable:
				default:
					ACL_ASSERT(false, "Invalid or unsupported vector format: %s", get_vector_format_name(scale_format));
					break;
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_fixed_scale_stream(quantization_context& context, uint32_t bone_index, vector_format8 scale_format)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_scale_default)
				return;

			// Constant scale tracks store the remaining sample with full precision
			const vector_format8 format = bone_stream.is_scale_constant ? vector_format8::vector3f_full : scale_format;

			quantize_fixed_scale_stream(context.allocator, bone_stream.scales, format, bone_stream.scales);
		}

		inline void quantize_variable_scale_stream(quantization_context& context, const ScaleTrackStream& raw_clip_stream, const ScaleTrackStream& raw_segment_stream, const TrackStreamRange& clip_range, uint8_t bit_rate, ScaleTrackStream& out_quantized_stream)
		{
			// We expect all our samples to have the same width of sizeof(rtm::vector4f)
			ACL_ASSERT(raw_segment_stream.get_sample_size() == sizeof(rtm::vector4f), "Unexpected scale sample size. %u != %zu", raw_segment_stream.get_sample_size(), sizeof(rtm::vector4f));
			ACL_ASSERT(raw_segment_stream.get_vector_format() == vector_format8::vector3f_full, "Expected a vector3f_full vector format, found: %s", get_vector_format_name(raw_segment_stream.get_vector_format()));

			const uint32_t num_samples = is_constant_bit_rate(bit_rate) ? 1 : raw_segment_stream.get_num_samples();
			const uint32_t sample_size = sizeof(uint64_t) * 2;
			const float sample_rate = raw_segment_stream.get_sample_rate();
			ScaleTrackStream quantized_stream(context.allocator, num_samples, sample_size, sample_rate, vector_format8::vector3f_variable, bit_rate);

			if (is_constant_bit_rate(bit_rate))
			{
				const rtm::vector4f scale = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index);
				const rtm::vector4f normalized_scale = normalize_sample(scale, clip_range);

				uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0);
				pack_vector3_u48_unsafe(normalized_scale, quantized_ptr);
			}
			else
			{
				const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate);

				for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index)
				{
					uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(sample_index);

					if (is_raw_bit_rate(bit_rate))
					{
						const rtm::vector4f scale = raw_clip_stream.get_raw_sample<rtm::vector4f>(context.segment_sample_start_index + sample_index);
						pack_vector3_96(scale, quantized_ptr);
					}
					else
					{
						const rtm::vector4f scale = raw_segment_stream.get_raw_sample<rtm::vector4f>(sample_index);
						pack_vector3_uXX_unsafe(scale, num_bits_at_bit_rate, quantized_ptr);
					}
				}
			}

			out_quantized_stream = std::move(quantized_stream);
		}

		inline void quantize_variable_scale_stream(quantization_context& context, uint32_t bone_index, uint8_t bit_rate)
		{
			ACL_ASSERT(bone_index < context.num_bones, "Invalid bone index: %u", bone_index);

			BoneStreams& bone_stream = context.bone_streams[bone_index];

			// Default tracks aren't quantized
			if (bone_stream.is_scale_default)
				return;

			const TrackStreamRange& bone_range = context.clip.ranges[bone_index].scale;
			const BoneStreams& raw_bone_stream = context.raw_bone_streams[bone_index];

			// Constant scale tracks store the remaining sample with full precision
			if (bone_stream.is_scale_constant)
				quantize_fixed_scale_stream(context.allocator, bone_stream.scales, vector_format8::vector3f_full, bone_stream.scales);
			else
				quantize_variable_scale_stream(context, raw_bone_stream.scales, bone_stream.scales, bone_range, bit_rate, bone_stream.scales);
		}

		enum class error_scan_stop_condition { until_error_too_high, until_end_of_segment };

		inline float calculate_max_error_at_bit_rate_local(quantization_context& context, uint32_t target_bone_index, error_scan_stop_condition stop_condition)
		{
			const itransform_error_metric* error_metric = context.error_metric;
			const bool needs_conversion = context.needs_conversion;
			const bool has_additive_base = context.has_additive_base;
			const transform_metadata& target_bone = context.metadata[target_bone_index];
			const uint32_t num_transforms = context.num_bones;
			const size_t sample_transform_size = context.metric_transform_size * context.num_bones;
			const float sample_rate = context.sample_rate;
			const float clip_duration = context.clip_duration;
			const rtm::scalarf error_threshold = rtm::scalar_set(context.error_threshold);

			const auto convert_transforms_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::convert_transforms : &itransform_error_metric::convert_transforms_no_scale);
			const auto apply_additive_to_base_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::apply_additive_to_base : &itransform_error_metric::apply_additive_to_base_no_scale);
			const auto calculate_error_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::calculate_error : &itransform_error_metric::calculate_error_no_scale);

			itransform_error_metric::convert_transforms_args convert_transforms_args_lossy;
			convert_transforms_args_lossy.dirty_transform_indices = &target_bone_index;
			convert_transforms_args_lossy.num_dirty_transforms = 1;
			convert_transforms_args_lossy.transforms = context.lossy_local_pose;
			convert_transforms_args_lossy.num_transforms = num_transforms;

			itransform_error_metric::apply_additive_to_base_args apply_additive_to_base_args_lossy;
			apply_additive_to_base_args_lossy.dirty_transform_indices = &target_bone_index;
			apply_additive_to_base_args_lossy.num_dirty_transforms = 1;
			apply_additive_to_base_args_lossy.local_transforms = needs_conversion ? (const void*)context.local_transforms_converted : (const void*)context.lossy_local_pose;
			apply_additive_to_base_args_lossy.base_transforms = nullptr;
			apply_additive_to_base_args_lossy.num_transforms = num_transforms;

			itransform_error_metric::calculate_error_args calculate_error_args;
			calculate_error_args.transform0 = nullptr;
			calculate_error_args.transform1 = needs_conversion ? (const void*)(context.local_transforms_converted + (context.metric_transform_size * target_bone_index)) : (const void*)(context.lossy_local_pose + target_bone_index);
			calculate_error_args.construct_sphere_shell(target_bone.shell_distance);

			const uint8_t* raw_transform = context.raw_local_transforms + (target_bone_index * context.metric_transform_size);
			const uint8_t* base_transforms = context.base_local_transforms;

			context.local_query.build(target_bone_index, context.bit_rate_per_bone[target_bone_index]);

			float sample_indexf = float(context.segment_sample_start_index);
			rtm::scalarf max_error = rtm::scalar_set(0.0F);

			for (uint32_t sample_index = 0; sample_index < context.num_samples; ++sample_index)
			{
				// Sample our streams and calculate the error
				// The sample time is calculated from the full clip duration to be consistent with decompression
				const float sample_time = rtm::scalar_min(sample_indexf / sample_rate, clip_duration);

				context.bit_rate_database.sample(context.local_query, sample_time, context.lossy_local_pose, num_transforms);

				if (needs_conversion)
					convert_transforms_impl(error_metric, convert_transforms_args_lossy, context.local_transforms_converted);

				if (has_additive_base)
				{
					apply_additive_to_base_args_lossy.base_transforms = base_transforms;
					base_transforms += sample_transform_size;

					apply_additive_to_base_impl(error_metric, apply_additive_to_base_args_lossy, context.lossy_local_pose);
				}

				calculate_error_args.transform0 = raw_transform;
				raw_transform += sample_transform_size;

				const rtm::scalarf error = calculate_error_impl(error_metric, calculate_error_args);

				max_error = rtm::scalar_max(max_error, error);
				if (stop_condition == error_scan_stop_condition::until_error_too_high && rtm::scalar_greater_equal(error, error_threshold))
					break;

				sample_indexf += 1.0F;
			}

			return rtm::scalar_cast(max_error);
		}

		inline float calculate_max_error_at_bit_rate_object(quantization_context& context, uint32_t target_bone_index, error_scan_stop_condition stop_condition)
		{
			const itransform_error_metric* error_metric = context.error_metric;
			const bool needs_conversion = context.needs_conversion;
			const bool has_additive_base = context.has_additive_base;
			const transform_metadata& target_bone = context.metadata[target_bone_index];
			const size_t sample_transform_size = context.metric_transform_size * context.num_bones;
			const float sample_rate = context.sample_rate;
			const float clip_duration = context.clip_duration;
			const rtm::scalarf error_threshold = rtm::scalar_set(context.error_threshold);

			const auto convert_transforms_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::convert_transforms : &itransform_error_metric::convert_transforms_no_scale);
			const auto apply_additive_to_base_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::apply_additive_to_base : &itransform_error_metric::apply_additive_to_base_no_scale);
			const auto local_to_object_space_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::local_to_object_space : &itransform_error_metric::local_to_object_space_no_scale);
			const auto calculate_error_impl = std::mem_fn(context.has_scale ? &itransform_error_metric::calculate_error : &itransform_error_metric::calculate_error_no_scale);

			itransform_error_metric::convert_transforms_args convert_transforms_args_lossy;
			convert_transforms_args_lossy.dirty_transform_indices = context.chain_bone_indices;
			convert_transforms_args_lossy.num_dirty_transforms = context.num_bones_in_chain;
			convert_transforms_args_lossy.transforms = context.lossy_local_pose;
			convert_transforms_args_lossy.num_transforms = context.num_bones;

			itransform_error_metric::apply_additive_to_base_args apply_additive_to_base_args_lossy;
			apply_additive_to_base_args_lossy.dirty_transform_indices = context.chain_bone_indices;
			apply_additive_to_base_args_lossy.num_dirty_transforms = context.num_bones_in_chain;
			apply_additive_to_base_args_lossy.local_transforms = needs_conversion ? (const void*)(context.local_transforms_converted) : (const void*)context.lossy_local_pose;
			apply_additive_to_base_args_lossy.base_transforms = nullptr;
			apply_additive_to_base_args_lossy.num_transforms = context.num_bones;

			itransform_error_metric::local_to_object_space_args local_to_object_space_args_lossy;
			local_to_object_space_args_lossy.dirty_transform_indices = context.chain_bone_indices;
			local_to_object_space_args_lossy.num_dirty_transforms = context.num_bones_in_chain;
			local_to_object_space_args_lossy.parent_transform_indices = context.parent_transform_indices;
			local_to_object_space_args_lossy.local_transforms = needs_conversion ? (const void*)(context.local_transforms_converted) : (const void*)context.lossy_local_pose;
			local_to_object_space_args_lossy.num_transforms = context.num_bones;

			itransform_error_metric::calculate_error_args calculate_error_args;
			calculate_error_args.transform0 = nullptr;
			calculate_error_args.transform1 = context.lossy_object_pose + (target_bone_index * context.metric_transform_size);
			calculate_error_args.construct_sphere_shell(target_bone.shell_distance);

			const uint8_t* raw_transform = context.raw_object_transforms + (target_bone_index * context.metric_transform_size);
			const uint8_t* base_transforms = context.base_local_transforms;

			context.object_query.build(target_bone_index, context.bit_rate_per_bone, context.bone_streams);

			float sample_indexf = float(context.segment_sample_start_index);
			rtm::scalarf max_error = rtm::scalar_set(0.0F);

			for (uint32_t sample_index = 0; sample_index < context.num_samples; ++sample_index)
			{
				// Sample our streams and calculate the error
				// The sample time is calculated from the full clip duration to be consistent with decompression
				const float sample_time = rtm::scalar_min(sample_indexf / sample_rate, clip_duration);

				context.bit_rate_database.sample(context.object_query, sample_time, context.lossy_local_pose, context.num_bones);

				if (needs_conversion)
					convert_transforms_impl(error_metric, convert_transforms_args_lossy, context.local_transforms_converted);

				if (has_additive_base)
				{
					apply_additive_to_base_args_lossy.base_transforms = base_transforms;
					base_transforms += sample_transform_size;

					apply_additive_to_base_impl(error_metric, apply_additive_to_base_args_lossy, context.lossy_local_pose);
				}

				local_to_object_space_impl(error_metric, local_to_object_space_args_lossy, context.lossy_object_pose);

				calculate_error_args.transform0 = raw_transform;
				raw_transform += sample_transform_size;

				const rtm::scalarf error = calculate_error_impl(error_metric, calculate_error_args);

				max_error = rtm::scalar_max(max_error, error);
				if (stop_condition == error_scan_stop_condition::until_error_too_high && rtm::scalar_greater_equal(error, error_threshold))
					break;

				sample_indexf += 1.0F;
			}

			return rtm::scalar_cast(max_error);
		}

		inline void calculate_local_space_bit_rates(quantization_context& context)
		{
			// To minimize the bit rate, we first start by trying every permutation in local space
			// until our error is acceptable.
			// We try permutations from the lowest memory footprint to the highest.

			const uint32_t num_bones = context.num_bones;
			for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index)
			{
				// Update our error threshold
				const float error_threshold = context.metadata[bone_index].precision;
				context.error_threshold = error_threshold;

				// Bit rates at this point are one of three value:
				// 0: if the segment track is normalized, it can be constant within the segment
				// 1: if the segment track isn't normalized, it starts at the lowest bit rate
				// 255: if the track is constant/default for the whole clip
				const BoneBitRate bone_bit_rates = context.bit_rate_per_bone[bone_index];

				if (bone_bit_rates.rotation == k_invalid_bit_rate && bone_bit_rates.translation == k_invalid_bit_rate && bone_bit_rates.scale == k_invalid_bit_rate)
				{
#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
					printf("%u: Best bit rates: %u | %u | %u\n", bone_index, bone_bit_rates.rotation, bone_bit_rates.translation, bone_bit_rates.scale);
#endif
					continue;	// Every track bit rate is constant/default, nothing else to do
				}

				BoneBitRate best_bit_rates = bone_bit_rates;
				float best_error = 1.0E10F;
				uint32_t prev_transform_size = ~0U;
				bool is_error_good_enough = false;

				if (context.has_scale)
				{
					const size_t num_permutations = get_array_size(acl_impl::k_local_bit_rate_permutations);
					for (size_t permutation_index = 0; permutation_index < num_permutations; ++permutation_index)
					{
						const uint8_t rotation_bit_rate = acl_impl::k_local_bit_rate_permutations[permutation_index][0];
						if (bone_bit_rates.rotation == 1)
						{
							if (rotation_bit_rate == 0)
								continue;	// Skip permutations we aren't interested in
						}
						else if (bone_bit_rates.rotation == k_invalid_bit_rate)
						{
							if (rotation_bit_rate != 0)
								continue;	// Skip permutations we aren't interested in
						}

						const uint8_t translation_bit_rate = acl_impl::k_local_bit_rate_permutations[permutation_index][1];
						if (bone_bit_rates.translation == 1)
						{
							if (translation_bit_rate == 0)
								continue;	// Skip permutations we aren't interested in
						}
						else if (bone_bit_rates.translation == k_invalid_bit_rate)
						{
							if (translation_bit_rate != 0)
								continue;	// Skip permutations we aren't interested in
						}

						const uint8_t scale_bit_rate = acl_impl::k_local_bit_rate_permutations[permutation_index][2];
						if (bone_bit_rates.scale == 1)
						{
							if (scale_bit_rate == 0)
								continue;	// Skip permutations we aren't interested in
						}
						else if (bone_bit_rates.scale == k_invalid_bit_rate)
						{
							if (scale_bit_rate != 0)
								continue;	// Skip permutations we aren't interested in
						}

						const uint32_t rotation_size = get_num_bits_at_bit_rate(rotation_bit_rate);
						const uint32_t translation_size = get_num_bits_at_bit_rate(translation_bit_rate);
						const uint32_t scale_size = get_num_bits_at_bit_rate(scale_bit_rate);
						const uint32_t transform_size = rotation_size + translation_size + scale_size;

						if (transform_size != prev_transform_size && is_error_good_enough)
						{
							// We already found the lowest transform size and we tried every permutation with that same size
							break;
						}

						prev_transform_size = transform_size;

						context.bit_rate_per_bone[bone_index].rotation = bone_bit_rates.rotation != k_invalid_bit_rate ? rotation_bit_rate : k_invalid_bit_rate;
						context.bit_rate_per_bone[bone_index].translation = bone_bit_rates.translation != k_invalid_bit_rate ? translation_bit_rate : k_invalid_bit_rate;
						context.bit_rate_per_bone[bone_index].scale = bone_bit_rates.scale != k_invalid_bit_rate ? scale_bit_rate : k_invalid_bit_rate;

						const float error = calculate_max_error_at_bit_rate_local(context, bone_index, error_scan_stop_condition::until_error_too_high);

#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION > 1
						printf("%u: %u | %u | %u (%u) = %f\n", bone_index, rotation_bit_rate, translation_bit_rate, scale_bit_rate, transform_size, error);
#endif

						if (error < best_error)
						{
							best_error = error;
							best_bit_rates = context.bit_rate_per_bone[bone_index];
							is_error_good_enough = error < error_threshold;
						}
					}
				}
				else
				{
					const size_t num_permutations = get_array_size(acl_impl::k_local_bit_rate_permutations_no_scale);
					for (size_t permutation_index = 0; permutation_index < num_permutations; ++permutation_index)
					{
						const uint8_t rotation_bit_rate = acl_impl::k_local_bit_rate_permutations_no_scale[permutation_index][0];
						if (bone_bit_rates.rotation == 1)
						{
							if (rotation_bit_rate == 0)
								continue;	// Skip permutations we aren't interested in
						}
						else if (bone_bit_rates.rotation == k_invalid_bit_rate)
						{
							if (rotation_bit_rate != 0)
								continue;	// Skip permutations we aren't interested in
						}

						const uint8_t translation_bit_rate = acl_impl::k_local_bit_rate_permutations_no_scale[permutation_index][1];
						if (bone_bit_rates.translation == 1)
						{
							if (translation_bit_rate == 0)
								continue;	// Skip permutations we aren't interested in
						}
						else if (bone_bit_rates.translation == k_invalid_bit_rate)
						{
							if (translation_bit_rate != 0)
								continue;	// Skip permutations we aren't interested in
						}

						const uint32_t rotation_size = get_num_bits_at_bit_rate(rotation_bit_rate);
						const uint32_t translation_size = get_num_bits_at_bit_rate(translation_bit_rate);
						const uint32_t transform_size = rotation_size + translation_size;

						if (transform_size != prev_transform_size && is_error_good_enough)
						{
							// We already found the lowest transform size and we tried every permutation with that same size
							break;
						}

						prev_transform_size = transform_size;

						context.bit_rate_per_bone[bone_index].rotation = bone_bit_rates.rotation != k_invalid_bit_rate ? rotation_bit_rate : k_invalid_bit_rate;
						context.bit_rate_per_bone[bone_index].translation = bone_bit_rates.translation != k_invalid_bit_rate ? translation_bit_rate : k_invalid_bit_rate;

						const float error = calculate_max_error_at_bit_rate_local(context, bone_index, error_scan_stop_condition::until_error_too_high);

#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION > 1
						printf("%u: %u | %u | %u (%u) = %f\n", bone_index, rotation_bit_rate, translation_bit_rate, k_invalid_bit_rate, transform_size, error);
#endif

						if (error < best_error)
						{
							best_error = error;
							best_bit_rates = context.bit_rate_per_bone[bone_index];
							is_error_good_enough = error < error_threshold;
						}
					}
				}

#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
				printf("%u: Best bit rates: %u | %u | %u\n", bone_index, best_bit_rates.rotation, best_bit_rates.translation, best_bit_rates.scale);
#endif

				context.bit_rate_per_bone[bone_index] = best_bit_rates;
			}
		}

		constexpr uint32_t increment_and_clamp_bit_rate(uint32_t bit_rate, uint32_t increment)
		{
			return bit_rate >= k_highest_bit_rate ? bit_rate : std::min<uint32_t>(bit_rate + increment, k_highest_bit_rate);
		}

		inline float increase_bone_bit_rate(quantization_context& context, uint32_t bone_index, uint32_t num_increments, float old_error, BoneBitRate& out_best_bit_rates)
		{
			const BoneBitRate bone_bit_rates = context.bit_rate_per_bone[bone_index];
			const uint32_t num_scale_increments = context.has_scale ? num_increments : 0;

			BoneBitRate best_bit_rates = bone_bit_rates;
			float best_error = old_error;

			for (uint32_t rotation_increment = 0; rotation_increment <= num_increments; ++rotation_increment)
			{
				const uint32_t rotation_bit_rate = increment_and_clamp_bit_rate(bone_bit_rates.rotation, rotation_increment);

				for (uint32_t translation_increment = 0; translation_increment <= num_increments; ++translation_increment)
				{
					const uint32_t translation_bit_rate = increment_and_clamp_bit_rate(bone_bit_rates.translation, translation_increment);

					for (uint32_t scale_increment = 0; scale_increment <= num_scale_increments; ++scale_increment)
					{
						const uint32_t scale_bit_rate = increment_and_clamp_bit_rate(bone_bit_rates.scale, scale_increment);

						if (rotation_increment + translation_increment + scale_increment != num_increments)
						{
							if (scale_bit_rate >= k_highest_bit_rate)
								break;
							else
								continue;
						}

						context.bit_rate_per_bone[bone_index] = BoneBitRate{ (uint8_t)rotation_bit_rate, (uint8_t)translation_bit_rate, (uint8_t)scale_bit_rate };
						const float error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_error_too_high);

						if (error < best_error)
						{
							best_error = error;
							best_bit_rates = context.bit_rate_per_bone[bone_index];
						}

						context.bit_rate_per_bone[bone_index] = bone_bit_rates;

						if (scale_bit_rate >= k_highest_bit_rate)
							break;
					}

					if (translation_bit_rate >= k_highest_bit_rate)
						break;
				}

				if (rotation_bit_rate >= k_highest_bit_rate)
					break;
			}

			out_best_bit_rates = best_bit_rates;
			return best_error;
		}

		inline float calculate_bone_permutation_error(quantization_context& context, BoneBitRate* permutation_bit_rates, uint8_t* bone_chain_permutation, uint32_t bone_index, BoneBitRate* best_bit_rates, float old_error)
		{
			const float error_threshold = context.error_threshold;
			float best_error = old_error;

			do
			{
				// Copy our current bit rates to the permutation rates
				std::memcpy(permutation_bit_rates, context.bit_rate_per_bone, sizeof(BoneBitRate) * context.num_bones);

				bool is_permutation_valid = false;
				const uint32_t num_bones_in_chain = context.num_bones_in_chain;
				for (uint32_t chain_link_index = 0; chain_link_index < num_bones_in_chain; ++chain_link_index)
				{
					if (bone_chain_permutation[chain_link_index] != 0)
					{
						// Increase bit rate
						const uint32_t chain_bone_index = context.chain_bone_indices[chain_link_index];
						BoneBitRate chain_bone_best_bit_rates;
						increase_bone_bit_rate(context, chain_bone_index, bone_chain_permutation[chain_link_index], old_error, chain_bone_best_bit_rates);
						is_permutation_valid |= chain_bone_best_bit_rates.rotation != permutation_bit_rates[chain_bone_index].rotation;
						is_permutation_valid |= chain_bone_best_bit_rates.translation != permutation_bit_rates[chain_bone_index].translation;
						is_permutation_valid |= chain_bone_best_bit_rates.scale != permutation_bit_rates[chain_bone_index].scale;
						permutation_bit_rates[chain_bone_index] = chain_bone_best_bit_rates;
					}
				}

				if (!is_permutation_valid)
					continue;	// Couldn't increase any bit rate, skip this permutation

				// Measure error
				std::swap(context.bit_rate_per_bone, permutation_bit_rates);
				const float permutation_error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_error_too_high);
				std::swap(context.bit_rate_per_bone, permutation_bit_rates);

				if (permutation_error < best_error)
				{
					best_error = permutation_error;
					std::memcpy(best_bit_rates, permutation_bit_rates, sizeof(BoneBitRate) * context.num_bones);

					if (permutation_error < error_threshold)
						break;
				}
			} while (std::next_permutation(bone_chain_permutation, bone_chain_permutation + context.num_bones_in_chain));

			return best_error;
		}

		inline uint32_t calculate_bone_chain_indices(const clip_context& clip, uint32_t bone_index, uint32_t* out_chain_bone_indices)
		{
			const BoneChain bone_chain = clip.get_bone_chain(bone_index);

			uint32_t num_bones_in_chain = 0;
			for (uint32_t chain_bone_index : bone_chain)
				out_chain_bone_indices[num_bones_in_chain++] = chain_bone_index;

			return num_bones_in_chain;
		}

		inline void initialize_bone_bit_rates(const SegmentContext& segment, rotation_format8 rotation_format, vector_format8 translation_format, vector_format8 scale_format, BoneBitRate* out_bit_rate_per_bone)
		{
			const bool is_rotation_variable = is_rotation_format_variable(rotation_format);
			const bool is_translation_variable = is_vector_format_variable(translation_format);
			const bool is_scale_variable = segment_context_has_scale(segment) && is_vector_format_variable(scale_format);

			const uint32_t num_bones = segment.num_bones;
			for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index)
			{
				BoneBitRate& bone_bit_rate = out_bit_rate_per_bone[bone_index];

				const bool rotation_supports_constant_tracks = segment.are_rotations_normalized;
				if (is_rotation_variable && !segment.bone_streams[bone_index].is_rotation_constant)
					bone_bit_rate.rotation = rotation_supports_constant_tracks ? 0 : k_lowest_bit_rate;
				else
					bone_bit_rate.rotation = k_invalid_bit_rate;

				const bool translation_supports_constant_tracks = segment.are_translations_normalized;
				if (is_translation_variable && !segment.bone_streams[bone_index].is_translation_constant)
					bone_bit_rate.translation = translation_supports_constant_tracks ? 0 : k_lowest_bit_rate;
				else
					bone_bit_rate.translation = k_invalid_bit_rate;

				const bool scale_supports_constant_tracks = segment.are_scales_normalized;
				if (is_scale_variable && !segment.bone_streams[bone_index].is_scale_constant)
					bone_bit_rate.scale = scale_supports_constant_tracks ? 0 : k_lowest_bit_rate;
				else
					bone_bit_rate.scale = k_invalid_bit_rate;
			}
		}

		inline void quantize_all_streams(quantization_context& context)
		{
			ACL_ASSERT(context.is_valid(), "quantization_context isn't valid");

			const bool is_rotation_variable = is_rotation_format_variable(context.rotation_format);
			const bool is_translation_variable = is_vector_format_variable(context.translation_format);
			const bool is_scale_variable = is_vector_format_variable(context.scale_format);

			for (uint32_t bone_index = 0; bone_index < context.num_bones; ++bone_index)
			{
				const BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[bone_index];

				if (is_rotation_variable)
					quantize_variable_rotation_stream(context, bone_index, bone_bit_rate.rotation);
				else
					quantize_fixed_rotation_stream(context, bone_index, context.rotation_format);

				if (is_translation_variable)
					quantize_variable_translation_stream(context, bone_index, bone_bit_rate.translation);
				else
					quantize_fixed_translation_stream(context, bone_index, context.translation_format);

				if (context.has_scale)
				{
					if (is_scale_variable)
						quantize_variable_scale_stream(context, bone_index, bone_bit_rate.scale);
					else
						quantize_fixed_scale_stream(context, bone_index, context.scale_format);
				}
			}
		}

		inline void find_optimal_bit_rates(quantization_context& context)
		{
			ACL_ASSERT(context.is_valid(), "quantization_context isn't valid");

			initialize_bone_bit_rates(*context.segment, context.rotation_format, context.translation_format, context.scale_format, context.bit_rate_per_bone);

			// First iterate over all bones and find the optimal bit rate for each track using the local space error.
			// We use the local space error to prime the algorithm. If each parent bone has infinite precision,
			// the local space error is equivalent. Since parents are lossy, it is a good approximation. It means
			// that whatever bit rate we find for a bone, it cannot be lower to reach our error threshold since
			// a lossy parent means we need to be equally or more accurate to maintain the threshold.
			//
			// In practice, the error from a child can compensate the error introduced by the parent but
			// this is unlikely to hold true for a whole track at every key. We thus make the assumption
			// that increasing the precision is always good regardless of the hierarchy level.

			calculate_local_space_bit_rates(context);

			// Now that we found an approximate lower bound for the bit rates, we start at the root and perform a brute force search.
			// For each bone, we do the following:
			//    - If object space error meets our error threshold, do nothing
			//    - Iterate over each bone in the chain and increment the bit rate by 1 (rotation or translation, pick lowest error)
			//    - Pick the bone that improved the error the most and increment the bit rate by 1
			//    - Repeat until we meet our error threshold
			//
			// The root is already optimal from the previous step since the local space error is equal to the object space error.
			// Next we'll add one bone to the chain under the root. Performing the above steps, we perform an exhaustive search
			// to find the smallest memory footprint that will meet our error threshold. No combination with a lower memory footprint
			// could yield a smaller error.
			// Next we'll add another bone to the chain. By performing these steps recursively, we can ensure that the accuracy always
			// increases and the memory footprint is always as low as possible.

			// 3 bone chain expansion:
			// 3:	[bone 0] + 1 [bone 1] + 0 [bone 2] + 0 (3)
			//		[bone 0] + 0 [bone 1] + 1 [bone 2] + 0 (3)
			//		[bone 0] + 0 [bone 1] + 0 [bone 2] + 1 (3)
			// 6:	[bone 0] + 2 [bone 1] + 0 [bone 2] + 0 (6)
			//		[bone 0] + 1 [bone 1] + 1 [bone 2] + 0 (6)
			//		[bone 0] + 1 [bone 1] + 0 [bone 2] + 1 (6)
			//		[bone 0] + 0 [bone 1] + 1 [bone 2] + 1 (6)
			//		[bone 0] + 0 [bone 1] + 2 [bone 2] + 0 (6)
			//		[bone 0] + 0 [bone 1] + 0 [bone 2] + 2 (6)
			//10:	[bone 0] + 3 [bone 1] + 0 [bone 2] + 0 (9)
			//		[bone 0] + 2 [bone 1] + 1 [bone 2] + 0 (9)
			//		[bone 0] + 2 [bone 1] + 0 [bone 2] + 1 (9)
			//		[bone 0] + 1 [bone 1] + 2 [bone 2] + 0 (9)
			//		[bone 0] + 1 [bone 1] + 1 [bone 2] + 1 (9)
			//		[bone 0] + 1 [bone 1] + 0 [bone 2] + 2 (9)
			//		[bone 0] + 0 [bone 1] + 3 [bone 2] + 0 (9)
			//		[bone 0] + 0 [bone 1] + 2 [bone 2] + 1 (9)
			//		[bone 0] + 0 [bone 1] + 1 [bone 2] + 2 (9)
			//		[bone 0] + 0 [bone 1] + 0 [bone 2] + 3 (9)

			uint8_t* bone_chain_permutation = allocate_type_array<uint8_t>(context.allocator, context.num_bones);
			BoneBitRate* permutation_bit_rates = allocate_type_array<BoneBitRate>(context.allocator, context.num_bones);
			BoneBitRate* best_permutation_bit_rates = allocate_type_array<BoneBitRate>(context.allocator, context.num_bones);
			BoneBitRate* best_bit_rates = allocate_type_array<BoneBitRate>(context.allocator, context.num_bones);
			std::memcpy(best_bit_rates, context.bit_rate_per_bone, sizeof(BoneBitRate) * context.num_bones);

			const uint32_t num_bones = context.num_bones;
			for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index)
			{
				// Update our context with the new bone data
				const float error_threshold = context.metadata[bone_index].precision;
				context.error_threshold = error_threshold;

				const uint32_t num_bones_in_chain = calculate_bone_chain_indices(context.clip, bone_index, context.chain_bone_indices);
				context.num_bones_in_chain = num_bones_in_chain;

				float error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_error_too_high);
				if (error < error_threshold)
					continue;

				const float initial_error = error;

				while (error >= error_threshold)
				{
					// Generate permutations for up to 3 bit rate increments
					// Perform an exhaustive search of the permutations and pick the best result
					// If our best error is under the threshold, we are done, otherwise we will try again from there
					const float original_error = error;
					float best_error = error;

					// The first permutation increases the bit rate of a single track/bone
					std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
					bone_chain_permutation[num_bones_in_chain - 1] = 1;
					error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
					if (error < best_error)
					{
						best_error = error;
						std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

						if (error < error_threshold)
							break;
					}

					if (context.compression_level >= compression_level8::high)
					{
						// The second permutation increases the bit rate of 2 track/bones
						std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
						bone_chain_permutation[num_bones_in_chain - 1] = 2;
						error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
						if (error < best_error)
						{
							best_error = error;
							std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

							if (error < error_threshold)
								break;
						}

						if (num_bones_in_chain > 1)
						{
							std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
							bone_chain_permutation[num_bones_in_chain - 2] = 1;
							bone_chain_permutation[num_bones_in_chain - 1] = 1;
							error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
							if (error < best_error)
							{
								best_error = error;
								std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

								if (error < error_threshold)
									break;
							}
						}
					}

					if (context.compression_level >= compression_level8::highest)
					{
						// The third permutation increases the bit rate of 3 track/bones
						std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
						bone_chain_permutation[num_bones_in_chain - 1] = 3;
						error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
						if (error < best_error)
						{
							best_error = error;
							std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

							if (error < error_threshold)
								break;
						}

						if (num_bones_in_chain > 1)
						{
							std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
							bone_chain_permutation[num_bones_in_chain - 2] = 2;
							bone_chain_permutation[num_bones_in_chain - 1] = 1;
							error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
							if (error < best_error)
							{
								best_error = error;
								std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

								if (error < error_threshold)
									break;
							}

							if (num_bones_in_chain > 2)
							{
								std::fill(bone_chain_permutation, bone_chain_permutation + num_bones, uint8_t(0));
								bone_chain_permutation[num_bones_in_chain - 3] = 1;
								bone_chain_permutation[num_bones_in_chain - 2] = 1;
								bone_chain_permutation[num_bones_in_chain - 1] = 1;
								error = calculate_bone_permutation_error(context, permutation_bit_rates, bone_chain_permutation, bone_index, best_permutation_bit_rates, original_error);
								if (error < best_error)
								{
									best_error = error;
									std::memcpy(best_bit_rates, best_permutation_bit_rates, sizeof(BoneBitRate) * num_bones);

									if (error < error_threshold)
										break;
								}
							}
						}
					}

					if (best_error >= original_error)
						break;	// No progress made

					error = best_error;
					if (error < original_error)
					{
#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
						std::swap(context.bit_rate_per_bone, best_bit_rates);
						float new_error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);
						std::swap(context.bit_rate_per_bone, best_bit_rates);

						for (uint32_t i = 0; i < context.num_bones; ++i)
						{
							const BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[i];
							const BoneBitRate& best_bone_bit_rate = best_bit_rates[i];
							bool rotation_differs = bone_bit_rate.rotation != best_bone_bit_rate.rotation;
							bool translation_differs = bone_bit_rate.translation != best_bone_bit_rate.translation;
							bool scale_differs = bone_bit_rate.scale != best_bone_bit_rate.scale;
							if (rotation_differs || translation_differs || scale_differs)
								printf("%u: %u | %u | %u => %u  %u %u (%f)\n", i, bone_bit_rate.rotation, bone_bit_rate.translation, bone_bit_rate.scale, best_bone_bit_rate.rotation, best_bone_bit_rate.translation, best_bone_bit_rate.scale, new_error);
						}
#endif

						std::memcpy(context.bit_rate_per_bone, best_bit_rates, sizeof(BoneBitRate) * num_bones);
					}
				}

				if (error < initial_error)
				{
#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
					std::swap(context.bit_rate_per_bone, best_bit_rates);
					float new_error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);
					std::swap(context.bit_rate_per_bone, best_bit_rates);

					for (uint32_t i = 0; i < context.num_bones; ++i)
					{
						const BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[i];
						const BoneBitRate& best_bone_bit_rate = best_bit_rates[i];
						bool rotation_differs = bone_bit_rate.rotation != best_bone_bit_rate.rotation;
						bool translation_differs = bone_bit_rate.translation != best_bone_bit_rate.translation;
						bool scale_differs = bone_bit_rate.scale != best_bone_bit_rate.scale;
						if (rotation_differs || translation_differs || scale_differs)
							printf("%u: %u | %u | %u => %u  %u %u (%f)\n", i, bone_bit_rate.rotation, bone_bit_rate.translation, bone_bit_rate.scale, best_bone_bit_rate.rotation, best_bone_bit_rate.translation, best_bone_bit_rate.scale, new_error);
					}
#endif

					std::memcpy(context.bit_rate_per_bone, best_bit_rates, sizeof(BoneBitRate) * num_bones);
				}

				// Our error remains too high, this should be rare.
				// Attempt to increase the bit rate as much as we can while still back tracking if it doesn't help.
				error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);
				while (error >= error_threshold)
				{
					// From child to parent, increase the bit rate indiscriminately
					uint32_t num_maxed_out = 0;
					for (int32_t chain_link_index = num_bones_in_chain - 1; chain_link_index >= 0; --chain_link_index)
					{
						const uint32_t chain_bone_index = context.chain_bone_indices[chain_link_index];

						// Work with a copy. We'll increase the bit rate as much as we can and retain the values
						// that yield the smallest error BUT increasing the bit rate does NOT always means
						// that the error will reduce and improve. It could get worse in which case we'll do nothing.

						BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[chain_bone_index];

						// Copy original values
						BoneBitRate best_bone_bit_rate = bone_bit_rate;
						float best_bit_rate_error = error;

						while (error >= error_threshold)
						{
							static_assert(offsetof(BoneBitRate, rotation) == 0 && offsetof(BoneBitRate, scale) == sizeof(BoneBitRate) - 1, "Invalid BoneBitRate offsets");
							uint8_t& smallest_bit_rate = *std::min_element<uint8_t*>(&bone_bit_rate.rotation, &bone_bit_rate.scale + 1);

							if (smallest_bit_rate >= k_highest_bit_rate)
							{
								num_maxed_out++;
								break;
							}

							// If rotation == translation and translation has room, bias translation
							// This seems to yield an overall tiny win but it isn't always the case.
							// TODO: Brute force this?
							if (bone_bit_rate.rotation == bone_bit_rate.translation && bone_bit_rate.translation < k_highest_bit_rate && bone_bit_rate.scale >= k_highest_bit_rate)
								bone_bit_rate.translation++;
							else
								smallest_bit_rate++;

							ACL_ASSERT((bone_bit_rate.rotation <= k_highest_bit_rate || bone_bit_rate.rotation == k_invalid_bit_rate) && (bone_bit_rate.translation <= k_highest_bit_rate || bone_bit_rate.translation == k_invalid_bit_rate) && (bone_bit_rate.scale <= k_highest_bit_rate || bone_bit_rate.scale == k_invalid_bit_rate), "Invalid bit rate! [%u, %u, %u]", bone_bit_rate.rotation, bone_bit_rate.translation, bone_bit_rate.scale);

							error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);

							if (error < best_bit_rate_error)
							{
								best_bone_bit_rate = bone_bit_rate;
								best_bit_rate_error = error;

#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
								printf("%u: => %u %u %u (%f)\n", chain_bone_index, bone_bit_rate.rotation, bone_bit_rate.translation, bone_bit_rate.scale, error);
								for (uint32_t i = chain_link_index + 1; i < num_bones_in_chain; ++i)
								{
									const uint32_t chain_bone_index2 = context.chain_bone_indices[chain_link_index];
									float error2 = calculate_max_error_at_bit_rate_object(context, chain_bone_index2, error_scan_stop_condition::until_end_of_segment);
									printf("  %u: => (%f)\n", i, error2);
								}
#endif
							}
						}

						// Only retain the lowest error bit rates
						bone_bit_rate = best_bone_bit_rate;
						error = best_bit_rate_error;

						if (error < error_threshold)
							break;
					}

					if (num_maxed_out == num_bones_in_chain)
						break;

					// TODO: Try to lower the bit rate again in the reverse direction?
				}

				// Despite our best efforts, we failed to meet the threshold with our heuristics.
				// No longer attempt to find what is best for size, max out the bit rates until we meet the threshold.
				// Only do this if the rotation format is full precision quaternions. This last step is not guaranteed
				// to reach the error threshold but it will very likely increase the memory footprint. Even if we do
				// reach the error threshold for the given bone, another sibling bone already processed might now
				// have an error higher than it used to if quantization caused its error to compensate. More often than
				// not, sibling bones will remain fairly close in their error. Some packed rotation formats, namely
				// drop W component can have a high error even with raw values, it is assumed that if such a format
				// is used then a best effort approach to reach the error threshold is entirely fine.
				if (error >= error_threshold && context.rotation_format == rotation_format8::quatf_full)
				{
					// From child to parent, max out the bit rate
					for (int32_t chain_link_index = num_bones_in_chain - 1; chain_link_index >= 0; --chain_link_index)
					{
						const uint32_t chain_bone_index = context.chain_bone_indices[chain_link_index];
						BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[chain_bone_index];
						bone_bit_rate.rotation = std::max<uint8_t>(bone_bit_rate.rotation, k_highest_bit_rate);
						bone_bit_rate.translation = std::max<uint8_t>(bone_bit_rate.translation, k_highest_bit_rate);
						bone_bit_rate.scale = std::max<uint8_t>(bone_bit_rate.scale, k_highest_bit_rate);

						error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);
						if (error < error_threshold)
							break;
					}
				}
			}

#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
			printf("Variable quantization optimization results:\n");
			for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index)
			{
				// Update our context with the new bone data
				const float error_threshold = context.metadata[bone_index].precision;
				context.error_threshold = error_threshold;

				const uint32_t num_bones_in_chain = calculate_bone_chain_indices(context.clip, bone_index, context.chain_bone_indices);
				context.num_bones_in_chain = num_bones_in_chain;

				float error = calculate_max_error_at_bit_rate_object(context, bone_index, error_scan_stop_condition::until_end_of_segment);
				const BoneBitRate& bone_bit_rate = context.bit_rate_per_bone[bone_index];
				printf("%u: %u | %u | %u => %f %s\n", bone_index, bone_bit_rate.rotation, bone_bit_rate.translation, bone_bit_rate.scale, error, error >= error_threshold ? "!" : "");
			}
#endif

			deallocate_type_array(context.allocator, bone_chain_permutation, num_bones);
			deallocate_type_array(context.allocator, permutation_bit_rates, num_bones);
			deallocate_type_array(context.allocator, best_permutation_bit_rates, num_bones);
			deallocate_type_array(context.allocator, best_bit_rates, num_bones);
		}

		inline void quantize_streams(iallocator& allocator, clip_context& clip, const compression_settings& settings, const clip_context& raw_clip_context, const clip_context& additive_base_clip_context, output_stats& out_stats)
		{
			(void)out_stats;

			const bool is_rotation_variable = is_rotation_format_variable(settings.rotation_format);
			const bool is_translation_variable = is_vector_format_variable(settings.translation_format);
			const bool is_scale_variable = is_vector_format_variable(settings.scale_format);
			const bool is_any_variable = is_rotation_variable || is_translation_variable || is_scale_variable;

			quantization_context context(allocator, clip, raw_clip_context, additive_base_clip_context, settings);

			for (SegmentContext& segment : clip.segment_iterator())
			{
#if ACL_IMPL_DEBUG_VARIABLE_QUANTIZATION
				printf("Quantizing segment %u...\n", segment.segment_index);
#endif

#if ACL_IMPL_PROFILE_MATH
				{
					scope_profiler timer;

					for (int32_t i = 0; i < 10; ++i)
					{
						context.set_segment(segment);

						if (is_any_variable)
							find_optimal_bit_rates(context);
					}

					timer.stop();

#if defined(__ANDROID__)
					__android_log_print(ANDROID_LOG_INFO, "acl", "Quantization optimization for segment %u took: %.4f ms", segment.segment_index, timer.get_elapsed_milliseconds());
#else
					printf("Quantization optimization for segment %u took: %.4f ms\n", segment.segment_index, timer.get_elapsed_milliseconds());
#endif
				}
#endif

				context.set_segment(segment);

				if (is_any_variable)
					find_optimal_bit_rates(context);

				// Quantize our streams now that we found the optimal bit rates
				quantize_all_streams(context);
			}

#if defined(SJSON_CPP_WRITER)
			if (are_all_enum_flags_set(out_stats.logging, stat_logging::detailed))
			{
				sjson::ObjectWriter& writer = *out_stats.writer;
				writer["track_bit_rate_database_size"] = static_cast<uint32_t>(context.bit_rate_database.get_allocated_size());

				size_t transform_cache_size = 0;
				transform_cache_size += sizeof(rtm::qvvf) * context.num_bones;	// raw_local_pose
				transform_cache_size += sizeof(rtm::qvvf) * context.num_bones;	// lossy_local_pose
				transform_cache_size += context.metric_transform_size * context.num_bones;	// lossy_object_pose
				transform_cache_size += context.metric_transform_size * context.num_bones * context.clip.segments->num_samples;	// raw_local_transforms
				transform_cache_size += context.metric_transform_size * context.num_bones * context.clip.segments->num_samples;	// raw_object_transforms

				if (context.needs_conversion)
					transform_cache_size += context.metric_transform_size * context.num_bones;	// local_transforms_converted

				if (context.has_additive_base)
				{
					transform_cache_size += sizeof(rtm::qvvf) * context.num_bones;	// additive_local_pose
					transform_cache_size += context.metric_transform_size * context.num_bones * context.clip.segments->num_samples;	// base_local_transforms
					transform_cache_size += context.metric_transform_size * context.num_bones * context.clip.segments->num_samples;	// base_object_transforms
				}

				writer["transform_cache_size"] = static_cast<uint32_t>(transform_cache_size);
			}
#endif
		}
	}
}

ACL_IMPL_FILE_PRAGMA_POP