// cocos-engine-external/sources/acl/compression/impl/write_decompression_stats.h

#pragma once

////////////////////////////////////////////////////////////////////////////////
// The MIT License (MIT)
//
// Copyright (c) 2019 Nicholas Frechette & Animation Compression Library contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
////////////////////////////////////////////////////////////////////////////////

#if defined(SJSON_CPP_WRITER)

#include "acl/core/compressed_tracks_version.h"
#include "acl/core/scope_profiler.h"
#include "acl/core/track_formats.h"
#include "acl/core/utils.h"
#include "acl/core/impl/compiler_utils.h"
#include "acl/core/impl/memory_cache.h"
#include "acl/compression/output_stats.h"
#include "acl/decompression/decompress.h"

#include <rtm/scalard.h>
#include <rtm/scalarf.h>

#include <algorithm>
#include <thread>
#include <chrono>
#include <cstring>
#include <random>

ACL_IMPL_FILE_PRAGMA_PUSH

namespace acl
{
	namespace acl_impl
	{
		// Number of sample times profiled for every action, they are spread evenly over the clip duration
		constexpr uint32_t k_num_decompression_samples = 100;
		// Number of clip copies and decompression contexts used while profiling, it is also the number
		// of decompressions that are timed together (and averaged) for every sample time
		constexpr uint32_t k_num_decompression_evaluations = 100;

		//////////////////////////////////////////////////////////////////////////
		// The order in which the profiled sample times are visited.
		//////////////////////////////////////////////////////////////////////////
		enum class PlaybackDirection
		{
			// Sample times are visited in increasing order
			Forward = 0,

			// Sample times are visited in decreasing order
			Backward = 1,

			// Sample times are visited in a shuffled order
			Random = 2,
		};

		//////////////////////////////////////////////////////////////////////////
		// The decompression entry point that is being profiled.
		//////////////////////////////////////////////////////////////////////////
		enum class DecompressionFunction
		{
			// The whole pose is decompressed with a single decompress_tracks(..) call
			DecompressPose = 0,

			// Every track is decompressed individually with decompress_track(..)
			DecompressBone = 1,
		};

		//////////////////////////////////////////////////////////////////////////
		// Profiles decompression for a single scenario and writes the results to 'writer'
		// as an object stored under the 'action_type' key.
		//
		// k_num_decompression_samples sample times are spread evenly over the duration of the
		// first clip and visited in the order requested by 'playback_direction'. For every
		// sample time, k_num_decompression_evaluations decompressions are timed together and
		// the average time per decompression is recorded.
		//
		//    logging: when stat_logging::exhaustive_decompression is set, every per sample timing is pushed in the "data" array
		//    writer: the object writer that receives the 'action_type' entry
		//    action_type: the key under which the results are written
		//    playback_direction: the order in which the sample times are visited
		//    decompression_function: whether the whole pose or every track individually is decompressed
		//    compressed_clips: the clips to decompress, the first one provides the track count and the duration
		//    contexts: one decompression context per clip, they are (re-)initialized by this function
		//    cache_flusher: when non-null the CPU cache is flushed before every measurement (cold cache),
		//                   when null the cache is primed before every measurement instead (warm cache)
		//    pose_writer: the track writer that receives the decompressed output
		//
		// Nothing is written if any of the contexts fails to initialize.
		//////////////////////////////////////////////////////////////////////////
		template<class DecompressionContextType>
		inline void write_decompression_performance_stats(
			stat_logging logging, sjson::ObjectWriter& writer, const char* action_type,
			PlaybackDirection playback_direction, DecompressionFunction decompression_function,
			compressed_tracks* compressed_clips[k_num_decompression_evaluations],
			DecompressionContextType* contexts[k_num_decompression_evaluations],
			CPUCacheFlusher* cache_flusher, debug_track_writer& pose_writer)
		{
			const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
			const float duration = compressed_clips[0]->get_duration();
			const bool is_cold_cache_profiling = cache_flusher != nullptr;

			// Spread our sample times evenly over the whole clip, first and last sample included
			float sample_times[k_num_decompression_samples];
			for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
			{
				const float normalized_sample_time = float(sample_index) / float(k_num_decompression_samples - 1);
				sample_times[sample_index] = rtm::scalar_clamp(normalized_sample_time, 0.0F, 1.0F) * duration;
			}

			switch (playback_direction)
			{
			case PlaybackDirection::Forward:
			default:
				// Sample times are already in increasing order
				break;
			case PlaybackDirection::Backward:
				std::reverse(&sample_times[0], &sample_times[k_num_decompression_samples]);
				break;
			case PlaybackDirection::Random:
				// The engine is always seeded with the same value
				std::shuffle(&sample_times[0], &sample_times[k_num_decompression_samples], std::default_random_engine(0));
				break;
			}

			// Initialize and clear our contexts
			// Every context must initialize successfully: the results are combined with a logical AND
			// so that a single failure is reported. The call is on the left hand side to make sure
			// every context is initialized even after a failure has been seen.
			bool init_success = true;
			for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
				init_success = contexts[clip_index]->initialize(*compressed_clips[clip_index]) && init_success;

			ACL_ASSERT(init_success, "Failed to initialize decompression context");
			if (!init_success)
				return;

			writer[action_type] = [&](sjson::ObjectWriter& action_writer)
			{

				double clip_max_ms = 0.0;
				double clip_min_ms = 1000000.0;
				double clip_total_ms = 0.0;
				double clip_time_ms[k_num_decompression_samples];

				// The statistics written after this assignment read what the lambda below accumulates
				action_writer["data"] = [&](sjson::ArrayWriter& data_writer)
				{
					for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
					{
						const float sample_time = sample_times[sample_index];

						// Clearing the context ensures the decoder cannot reuse any state cached from the last sample.
						if (playback_direction == PlaybackDirection::Random)
						{
							for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
								contexts[clip_index]->initialize(*compressed_clips[clip_index]);
						}

						// Clear the CPU cache if necessary
						if (is_cold_cache_profiling)
						{
							cache_flusher->begin_flushing();
							for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
							{
								cache_flusher->flush_buffer(contexts[clip_index], sizeof(DecompressionContextType));
								cache_flusher->flush_buffer(compressed_clips[clip_index], compressed_clips[clip_index]->get_size());
							}
							cache_flusher->end_flushing();

							// Now that the cache is cold, yield our time slice and wait for a new one
							// This helps minimize the risk that we'll be interrupted during decompression
							std::this_thread::sleep_for(std::chrono::nanoseconds(1));
						}
						else
						{
							// We yield our time slice and wait for a new one before priming the cache
							// to help keep it warm and minimize the risk that we'll be interrupted during decompression
							std::this_thread::sleep_for(std::chrono::nanoseconds(1));

							// If we want the cache warm, decompress everything once to prime it
							DecompressionContextType* context = contexts[0];
							context->seek(sample_time, sample_rounding_policy::none);
							context->decompress_tracks(pose_writer);
						}

						scope_profiler timer;

						for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
						{
							// If we measure with a cold CPU cache, we use a different context every time otherwise we use the first one
							DecompressionContextType* context = is_cold_cache_profiling ? contexts[clip_index] : contexts[0];

							context->seek(sample_time, sample_rounding_policy::none);

							switch (decompression_function)
							{
							case DecompressionFunction::DecompressPose:
								context->decompress_tracks(pose_writer);
								break;
							case DecompressionFunction::DecompressBone:
								for (uint32_t bone_index = 0; bone_index < num_tracks; ++bone_index)
									context->decompress_track(bone_index, pose_writer);
								break;
							}
						}

						timer.stop();

						// Average time of a single decompression for this sample time
						const double elapsed_ms = timer.get_elapsed_milliseconds() / k_num_decompression_evaluations;

						if (are_any_enum_flags_set(logging, stat_logging::exhaustive_decompression))
							data_writer.push(elapsed_ms);

						clip_min_ms = rtm::scalar_min(clip_min_ms, elapsed_ms);
						clip_max_ms = rtm::scalar_max(clip_max_ms, elapsed_ms);
						clip_total_ms += elapsed_ms;
						clip_time_ms[sample_index] = elapsed_ms;
					}
				};

				// Sort our timings to find the median
				std::sort(&clip_time_ms[0], &clip_time_ms[k_num_decompression_samples]);

				action_writer["min_time_ms"] = clip_min_ms;
				action_writer["max_time_ms"] = clip_max_ms;
				action_writer["avg_time_ms"] = clip_total_ms / double(k_num_decompression_samples);
				action_writer["med_time_ms"] = clip_time_ms[k_num_decompression_samples / 2];
			};
		}

		//////////////////////////////////////////////////////////////////////////
		// Measures how long it takes to memcpy a whole pose of 'num_bones' transforms into
		// 'lossy_pose_transforms' and writes the best of three passes to 'writer'.
		//
		// When 'cache_flusher' is non-null, the source buffer is flushed from the CPU cache
		// before every pass, a single copy is timed, and the result is written under "memcpy_cold".
		// When it is null, the copy is performed once before timing, ten copies are timed and
		// averaged, and the result is written under "memcpy_warm".
		//
		// The source buffer is allocated and released through 'allocator' and its content is
		// never written: only the cost of the copy matters here.
		//////////////////////////////////////////////////////////////////////////
		inline void write_memcpy_performance_stats(iallocator& allocator, sjson::ObjectWriter& writer, CPUCacheFlusher* cache_flusher, rtm::qvvf* lossy_pose_transforms, uint32_t num_bones)
		{
			constexpr uint32_t k_num_passes = 3;

			const bool is_cold_cache_profiling = cache_flusher != nullptr;
			const size_t pose_size = sizeof(rtm::qvvf) * num_bones;

			// A single cold copy is timed, a warm copy is too fast and is averaged over multiple executions
			const double execution_count = is_cold_cache_profiling ? 1.0 : 10.0;

			rtm::qvvf* src_transforms = allocate_type_array<rtm::qvvf>(allocator, num_bones);

			// Keep the fastest pass
			double best_time_ms = 1000000.0;
			for (uint32_t pass = 0; pass < k_num_passes; ++pass)
			{
				if (is_cold_cache_profiling)
				{
					cache_flusher->begin_flushing();
					cache_flusher->flush_buffer(src_transforms, pose_size);
					cache_flusher->end_flushing();

					// The cache is now cold: give up our time slice and wait for a fresh one
					// to lower the odds of being interrupted while we measure
					std::this_thread::sleep_for(std::chrono::nanoseconds(1));
				}
				else
				{
					// Give up our time slice and wait for a fresh one before we prime the cache,
					// this helps keep it warm and lowers the odds of being interrupted while we measure
					std::this_thread::sleep_for(std::chrono::nanoseconds(1));

					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
				}

				scope_profiler timer;
				if (is_cold_cache_profiling)
				{
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
				}
				else
				{
					// The copies are written out one by one, the timed region contains nothing but the ten copies
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
					std::memcpy(lossy_pose_transforms, src_transforms, pose_size);
				}
				timer.stop();

				const double pass_time_ms = timer.get_elapsed_milliseconds() / execution_count;
				best_time_ms = rtm::scalar_min(best_time_ms, pass_time_ms);
			}

			const char* action_type = is_cold_cache_profiling ? "memcpy_cold" : "memcpy_warm";
			writer[action_type] = [&](sjson::ObjectWriter& memcpy_writer)
			{
				// No per sample data, the best pass is reported as the min, max, and average
				memcpy_writer["data"] = [&](sjson::ArrayWriter&) {};
				memcpy_writer["min_time_ms"] = best_time_ms;
				memcpy_writer["max_time_ms"] = best_time_ms;
				memcpy_writer["avg_time_ms"] = best_time_ms;
			};

			deallocate_type_array(allocator, src_transforms, num_bones);
		}

		//////////////////////////////////////////////////////////////////////////
		// Runs every profiling scenario against the provided clips and contexts and writes
		// the results to 'writer': the size of a decompressed pose under "pose_size" and every
		// timing under "decompression_time_per_sample".
		//
		// The scenarios are the cold and warm memcpy baselines followed by every combination of
		// decompression function (pose, bone), cache state (cold, warm), and playback direction
		// (forward, backward, random).
		//
		// The CPU cache flusher and the pose writer needed by the scenarios are created here,
		// 'allocator' is used for both.
		//////////////////////////////////////////////////////////////////////////
		template<class DecompressionContextType>
		inline void write_decompression_performance_stats(iallocator& allocator, compressed_tracks* compressed_clips[k_num_decompression_evaluations], DecompressionContextType* contexts[k_num_decompression_evaluations], stat_logging logging, sjson::ObjectWriter& writer)
		{
			// Describes a single decompression scenario to profile
			struct profiling_scenario
			{
				const char* action_type;
				PlaybackDirection playback_direction;
				DecompressionFunction decompression_function;
				bool is_cold_cache;
			};

			CPUCacheFlusher* cache_flusher = allocate_type<CPUCacheFlusher>(allocator);

			const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
			debug_track_writer pose_writer(allocator, track_type8::qvvf, num_tracks);

			const uint32_t num_bytes_per_bone = (4 + 3 + 3) * sizeof(float);	// Rotation, Translation, Scale
			writer["pose_size"] = num_tracks * num_bytes_per_bone;

			// Scenarios are profiled and written in the order they are listed
			const profiling_scenario scenarios[] =
			{
				// Cold CPU cache, decompress_pose
				{ "forward_pose_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, true },
				{ "backward_pose_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, true },
				{ "random_pose_cold", PlaybackDirection::Random, DecompressionFunction::DecompressPose, true },

				// Warm CPU cache, decompress_pose
				{ "forward_pose_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, false },
				{ "backward_pose_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, false },
				{ "random_pose_warm", PlaybackDirection::Random, DecompressionFunction::DecompressPose, false },

				// Cold CPU cache, decompress_bone
				{ "forward_bone_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, true },
				{ "backward_bone_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, true },
				{ "random_bone_cold", PlaybackDirection::Random, DecompressionFunction::DecompressBone, true },

				// Warm CPU cache, decompress_bone
				{ "forward_bone_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, false },
				{ "backward_bone_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, false },
				{ "random_bone_warm", PlaybackDirection::Random, DecompressionFunction::DecompressBone, false },
			};

			writer["decompression_time_per_sample"] = [&](sjson::ObjectWriter& per_sample_writer)
			{
				// Cold/Warm CPU cache, memcpy
				write_memcpy_performance_stats(allocator, per_sample_writer, cache_flusher, pose_writer.tracks_typed.qvvf, num_tracks);
				write_memcpy_performance_stats(allocator, per_sample_writer, nullptr, pose_writer.tracks_typed.qvvf, num_tracks);

				for (const profiling_scenario& scenario : scenarios)
				{
					// A null cache flusher requests warm cache profiling
					CPUCacheFlusher* scenario_cache_flusher = scenario.is_cold_cache ? cache_flusher : nullptr;

					write_decompression_performance_stats(
						logging, per_sample_writer, scenario.action_type,
						scenario.playback_direction, scenario.decompression_function,
						compressed_clips, contexts, scenario_cache_flusher, pose_writer);
				}
			};

			deallocate_type(allocator, cache_flusher);
		}

		//////////////////////////////////////////////////////////////////////////
		// Decompression settings used by the profiling contexts below.
		// They derive from default_transform_decompression_settings and report
		// compressed_tracks_version16::latest as the supported version.
		// NOTE(review): presumably this lets the decoder skip the handling of older versions,
		// confirm against the decompression context implementation.
		//////////////////////////////////////////////////////////////////////////
		struct default_transform_decompression_settings_latest final : public default_transform_decompression_settings
		{
			// The only compressed tracks version these settings report as supported
			static constexpr compressed_tracks_version16 version_supported() { return compressed_tracks_version16::latest; }
		};

		//////////////////////////////////////////////////////////////////////////
		// Profiles the decompression of 'compressed_clip' and writes the results to 'writer'.
		//
		// Clips that aren't uniformly sampled are skipped and nothing is written for them.
		// k_num_decompression_evaluations copies of the clip are made, each with its own
		// decompression context, and everything is allocated and released through 'allocator'.
		//
		// 'settings' is only inspected when assert checks are enabled, to validate that the
		// clip uses variable formats for its rotations, translations, and scales.
		//////////////////////////////////////////////////////////////////////////
		inline void write_decompression_performance_stats(iallocator& allocator, const compression_settings& settings, const compressed_tracks& compressed_clip, stat_logging logging, sjson::ObjectWriter& writer)
		{
			using context_type = decompression_context<default_transform_decompression_settings_latest>;

			// Not referenced when assert checks are disabled
			(void)settings;

			if (compressed_clip.get_algorithm_type() != algorithm_type8::uniformly_sampled)
				return;	// Only uniformly sampled clips are profiled

#if defined(ACL_HAS_ASSERT_CHECKS)
			// If we can, we use a fast-path that simulates what a real game engine would use
			// by disabling the things they normally wouldn't care about like deprecated formats
			// and debugging features
			const bool is_rotation_variable = settings.rotation_format == rotation_format8::quatf_drop_w_variable;
			const bool is_translation_variable = settings.translation_format == vector_format8::vector3f_variable;
			const bool is_scale_variable = settings.scale_format == vector_format8::vector3f_variable;
			const bool use_uniform_fast_path = is_rotation_variable && is_translation_variable && is_scale_variable;

			ACL_ASSERT(use_uniform_fast_path, "We do not support profiling the debug code path");
#endif

			const auto clip_size = compressed_clip.get_size();

			// Every evaluation gets its own copy of the clip
			compressed_tracks* compressed_clips[k_num_decompression_evaluations];
			for (compressed_tracks*& clip_copy : compressed_clips)
			{
				void* buffer = allocator.allocate(clip_size, alignof(compressed_tracks));
				std::memcpy(buffer, &compressed_clip, clip_size);
				clip_copy = reinterpret_cast<compressed_tracks*>(buffer);
			}

			// ... and its own decompression context
			context_type* contexts[k_num_decompression_evaluations];
			for (context_type*& context : contexts)
				context = make_decompression_context<default_transform_decompression_settings_latest>(allocator);

			write_decompression_performance_stats(allocator, compressed_clips, contexts, logging, writer);

			// Release the contexts first, then the clip copies
			for (context_type* context : contexts)
				deallocate_type(allocator, context);

			for (compressed_tracks* clip_copy : compressed_clips)
				allocator.deallocate(clip_copy, clip_size);
		}
	}
}

ACL_IMPL_FILE_PRAGMA_POP

#endif	// #if defined(SJSON_CPP_WRITER)