#pragma once //////////////////////////////////////////////////////////////////////////////// // The MIT License (MIT) // // Copyright (c) 2019 Nicholas Frechette & Animation Compression Library contributors // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
////////////////////////////////////////////////////////////////////////////////

#if defined(SJSON_CPP_WRITER)

#include "acl/core/compressed_tracks_version.h"
#include "acl/core/scope_profiler.h"
#include "acl/core/track_formats.h"
#include "acl/core/utils.h"
#include "acl/core/impl/compiler_utils.h"
#include "acl/core/impl/memory_cache.h"
#include "acl/compression/output_stats.h"
#include "acl/decompression/decompress.h"

// NOTE(review): the original angle-bracket include names were stripped by a markup
// extraction pass; the headers below were restored from the std:: usage in this file
// (std::reverse/std::shuffle/std::sort, std::default_random_engine, std::chrono,
// std::memcpy, std::this_thread, uint32_t) plus the sjson writer types.
#include <sjson/writer.h>

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <random>
#include <thread>

ACL_IMPL_FILE_PRAGMA_PUSH

namespace acl
{
	namespace acl_impl
	{
		// Number of sample times measured per clip and number of identical clip copies
		// decompressed per measurement (to average out noise / defeat cache reuse).
		constexpr uint32_t k_num_decompression_samples = 100;
		constexpr uint32_t k_num_decompression_evaluations = 100;

		// Order in which the sample times are visited during profiling.
		enum class PlaybackDirection
		{
			Forward,
			Backward,
			Random,
		};

		// Which decompression API is exercised: whole pose at once, or one track at a time.
		enum class DecompressionFunction
		{
			DecompressPose,
			DecompressBone,
		};

		////////////////////////////////////////////////////////////////////////////////
		// Measures decompression time for a single action (direction x function x cache
		// temperature) and writes min/max/avg/median timings under `action_type`.
		//
		// compressed_clips/contexts: k_num_decompression_evaluations identical copies,
		// used so cold-cache runs never touch the same memory twice.
		// cache_flusher: non-null means profile with a cold CPU cache.
		// NOTE(review): the template parameter list was stripped by the extraction pass;
		// restored as `class DecompressionContextType` based on how it is used below.
		template<class DecompressionContextType>
		inline void write_decompression_performance_stats(stat_logging logging, sjson::ObjectWriter& writer, const char* action_type,
			PlaybackDirection playback_direction, DecompressionFunction decompression_function,
			compressed_tracks* compressed_clips[k_num_decompression_evaluations],
			DecompressionContextType* contexts[k_num_decompression_evaluations],
			CPUCacheFlusher* cache_flusher, debug_track_writer& pose_writer)
		{
			const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
			const float duration = compressed_clips[0]->get_duration();
			const bool is_cold_cache_profiling = cache_flusher != nullptr;

			// Build evenly spaced sample times over [0, duration].
			float sample_times[k_num_decompression_samples];
			for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
			{
				const float normalized_sample_time = float(sample_index) / float(k_num_decompression_samples - 1);
				sample_times[sample_index] = rtm::scalar_clamp(normalized_sample_time, 0.0F, 1.0F) * duration;
			}

			switch (playback_direction)
			{
			case PlaybackDirection::Forward:
			default:
				break;
			case PlaybackDirection::Backward:
				std::reverse(&sample_times[0], &sample_times[k_num_decompression_samples]);
				break;
			case PlaybackDirection::Random:
				// Fixed seed: the shuffled order must be reproducible across runs.
				std::shuffle(&sample_times[0], &sample_times[k_num_decompression_samples], std::default_random_engine(0));
				break;
			}

			// Initialize and clear our contexts
			// BUGFIX(review): was `|=` — with init_success starting at true, a failed
			// initialize() could never flip it to false, making the guard below dead.
			bool init_success = true;
			for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
				init_success &= contexts[clip_index]->initialize(*compressed_clips[clip_index]);

			ACL_ASSERT(init_success, "Failed to initialize decompression context");
			if (!init_success)
				return;

			writer[action_type] = [&](sjson::ObjectWriter& action_writer)
			{
				double clip_max_ms = 0.0;
				double clip_min_ms = 1000000.0;
				double clip_total_ms = 0.0;
				double clip_time_ms[k_num_decompression_samples];

				action_writer["data"] = [&](sjson::ArrayWriter& data_writer)
				{
					for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
					{
						const float sample_time = sample_times[sample_index];

						// Clearing the context ensures the decoder cannot reuse any state cached from the last sample.
						if (playback_direction == PlaybackDirection::Random)
						{
							for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
								contexts[clip_index]->initialize(*compressed_clips[clip_index]);
						}

						// Clear the CPU cache if necessary
						if (is_cold_cache_profiling)
						{
							cache_flusher->begin_flushing();
							for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
							{
								cache_flusher->flush_buffer(contexts[clip_index], sizeof(DecompressionContextType));
								cache_flusher->flush_buffer(compressed_clips[clip_index], compressed_clips[clip_index]->get_size());
							}
							cache_flusher->end_flushing();
						}
						else
						{
							// If we want the cache warm, decompress everything once to prime it
							DecompressionContextType* context = contexts[0];
							context->seek(sample_time, sample_rounding_policy::none);
							context->decompress_tracks(pose_writer);
						}

						// We yield our time slice and wait for a new one before priming the cache
						// to help keep it warm and minimize the risk that we'll be interrupted during decompression
						std::this_thread::sleep_for(std::chrono::nanoseconds(1));

						scope_profiler timer;

						for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
						{
							// If we measure with a cold CPU cache, we use a different context every time otherwise we use the first one
							DecompressionContextType* context = is_cold_cache_profiling ? contexts[clip_index] : contexts[0];
							context->seek(sample_time, sample_rounding_policy::none);

							switch (decompression_function)
							{
							case DecompressionFunction::DecompressPose:
								context->decompress_tracks(pose_writer);
								break;
							case DecompressionFunction::DecompressBone:
								for (uint32_t bone_index = 0; bone_index < num_tracks; ++bone_index)
									context->decompress_track(bone_index, pose_writer);
								break;
							}
						}

						timer.stop();

						// Per-clip average for this sample time.
						const double elapsed_ms = timer.get_elapsed_milliseconds() / k_num_decompression_evaluations;

						if (are_any_enum_flags_set(logging, stat_logging::exhaustive_decompression))
							data_writer.push(elapsed_ms);

						clip_min_ms = rtm::scalar_min(clip_min_ms, elapsed_ms);
						clip_max_ms = rtm::scalar_max(clip_max_ms, elapsed_ms);
						clip_total_ms += elapsed_ms;
						clip_time_ms[sample_index] = elapsed_ms;
					}
				};

				// Sort to extract the median below.
				std::sort(&clip_time_ms[0], &clip_time_ms[k_num_decompression_samples]);

				action_writer["min_time_ms"] = clip_min_ms;
				action_writer["max_time_ms"] = clip_max_ms;
				action_writer["avg_time_ms"] = clip_total_ms / double(k_num_decompression_samples);
				action_writer["med_time_ms"] = clip_time_ms[k_num_decompression_samples / 2];
			};
		}

		////////////////////////////////////////////////////////////////////////////////
		// Measures the cost of a plain memcpy of a full pose, as a baseline to compare
		// decompression timings against. cache_flusher non-null => cold-cache variant.
		inline void write_memcpy_performance_stats(iallocator& allocator, sjson::ObjectWriter& writer, CPUCacheFlusher* cache_flusher, rtm::qvvf* lossy_pose_transforms, uint32_t num_bones)
		{
			// NOTE(review): explicit template argument restored; it was stripped by the extraction pass.
			rtm::qvvf* memcpy_src_transforms = allocate_type_array<rtm::qvvf>(allocator, num_bones);

			double decompression_time_ms = 1000000.0;

			// Best of three passes to reduce scheduler noise.
			for (uint32_t pass_index = 0; pass_index < 3; ++pass_index)
			{
				if (cache_flusher != nullptr)
				{
					cache_flusher->begin_flushing();
					cache_flusher->flush_buffer(memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					cache_flusher->end_flushing();

					// Now that the cache is cold, yield our time slice and wait for a new one
					// This helps minimize the risk that we'll be interrupted during decompression
					std::this_thread::sleep_for(std::chrono::nanoseconds(1));
				}
				else
				{
					// We yield our time slice and wait for a new one before priming the cache
					// to help keep it warm and minimize the risk that we'll be interrupted during decompression
					std::this_thread::sleep_for(std::chrono::nanoseconds(1));

					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
				}

				double execution_count;

				scope_profiler timer;
				if (cache_flusher != nullptr)
				{
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					execution_count = 1.0;
				}
				else
				{
					// Warm cache is too fast, execute multiple times and divide by the count
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					std::memcpy(lossy_pose_transforms, memcpy_src_transforms, sizeof(rtm::qvvf) * num_bones);
					execution_count = 10.0;
				}
				timer.stop();

				const double elapsed_ms = timer.get_elapsed_milliseconds() / execution_count;
				decompression_time_ms = rtm::scalar_min(decompression_time_ms, elapsed_ms);
			}

			writer[cache_flusher != nullptr ? "memcpy_cold" : "memcpy_warm"] = [&](sjson::ObjectWriter& memcpy_writer)
			{
				memcpy_writer["data"] = [&](sjson::ArrayWriter&) {};
				memcpy_writer["min_time_ms"] = decompression_time_ms;
				memcpy_writer["max_time_ms"] = decompression_time_ms;
				memcpy_writer["avg_time_ms"] = decompression_time_ms;
			};

			deallocate_type_array(allocator, memcpy_src_transforms, num_bones);
		}

		////////////////////////////////////////////////////////////////////////////////
		// Runs every profiling permutation (memcpy baseline, then pose/bone decompression
		// in forward/backward/random order with cold and warm caches) and writes the
		// results under "decompression_time_per_sample".
		// NOTE(review): template parameter list restored (stripped by the extraction pass).
		template<class DecompressionContextType>
		inline void write_decompression_performance_stats(iallocator& allocator,
			compressed_tracks* compressed_clips[k_num_decompression_evaluations],
			DecompressionContextType* contexts[k_num_decompression_evaluations],
			stat_logging logging, sjson::ObjectWriter& writer)
		{
			// NOTE(review): explicit template argument restored (stripped by the extraction pass).
			CPUCacheFlusher* cache_flusher = allocate_type<CPUCacheFlusher>(allocator);

			const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
			debug_track_writer pose_writer(allocator, track_type8::qvvf, num_tracks);

			const uint32_t num_bytes_per_bone = (4 + 3 + 3) * sizeof(float);	// Rotation, Translation, Scale
			writer["pose_size"] = num_tracks * num_bytes_per_bone;

			writer["decompression_time_per_sample"] = [&](sjson::ObjectWriter& per_sample_writer)
			{
				// Cold/Warm CPU cache, memcpy
				write_memcpy_performance_stats(allocator, per_sample_writer, cache_flusher, pose_writer.tracks_typed.qvvf, num_tracks);
				write_memcpy_performance_stats(allocator, per_sample_writer, nullptr, pose_writer.tracks_typed.qvvf, num_tracks);

				// Cold CPU cache, decompress_pose
				write_decompression_performance_stats(logging, per_sample_writer, "forward_pose_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, compressed_clips, contexts, cache_flusher, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "backward_pose_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, compressed_clips, contexts, cache_flusher, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "random_pose_cold", PlaybackDirection::Random, DecompressionFunction::DecompressPose, compressed_clips, contexts, cache_flusher, pose_writer);

				// Warm CPU cache, decompress_pose
				write_decompression_performance_stats(logging, per_sample_writer, "forward_pose_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, compressed_clips, contexts, nullptr, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "backward_pose_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, compressed_clips, contexts, nullptr, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "random_pose_warm", PlaybackDirection::Random, DecompressionFunction::DecompressPose, compressed_clips, contexts, nullptr, pose_writer);

				// Cold CPU cache, decompress_bone
				write_decompression_performance_stats(logging, per_sample_writer, "forward_bone_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, compressed_clips, contexts, cache_flusher, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "backward_bone_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, compressed_clips, contexts, cache_flusher, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "random_bone_cold", PlaybackDirection::Random, DecompressionFunction::DecompressBone, compressed_clips, contexts, cache_flusher, pose_writer);

				// Warm CPU cache, decompress_bone
				write_decompression_performance_stats(logging, per_sample_writer, "forward_bone_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, compressed_clips, contexts, nullptr, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "backward_bone_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, compressed_clips, contexts, nullptr, pose_writer);
				write_decompression_performance_stats(logging, per_sample_writer, "random_bone_warm", PlaybackDirection::Random, DecompressionFunction::DecompressBone, compressed_clips, contexts, nullptr, pose_writer);
			};

			deallocate_type(allocator, cache_flusher);
		}

		// Decompression settings pinned to the latest compressed tracks version,
		// mirroring what a shipping game runtime would compile against.
		struct default_transform_decompression_settings_latest final : public default_transform_decompression_settings
		{
			static constexpr compressed_tracks_version16 version_supported() { return compressed_tracks_version16::latest; }
		};

		////////////////////////////////////////////////////////////////////////////////
		// Public entry point: duplicates the clip and contexts (one copy per evaluation
		// so cold-cache runs never reuse memory), profiles everything, then frees it all.
		inline void write_decompression_performance_stats(iallocator& allocator, const compression_settings& settings, const compressed_tracks& compressed_clip, stat_logging logging, sjson::ObjectWriter& writer)
		{
			(void)settings;

			if (compressed_clip.get_algorithm_type() != algorithm_type8::uniformly_sampled)
				return;

#if defined(ACL_HAS_ASSERT_CHECKS)
			// If we can, we use a fast-path that simulates what a real game engine would use
			// by disabling the things they normally wouldn't care about like deprecated formats
			// and debugging features
			const bool use_uniform_fast_path = settings.rotation_format == rotation_format8::quatf_drop_w_variable
				&& settings.translation_format == vector_format8::vector3f_variable
				&& settings.scale_format == vector_format8::vector3f_variable;
			ACL_ASSERT(use_uniform_fast_path, "We do not support profiling the debug code path");
#endif

			compressed_tracks* compressed_clips[k_num_decompression_evaluations];
			for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
			{
				void* clip = allocator.allocate(compressed_clip.get_size(), alignof(compressed_tracks));
				std::memcpy(clip, &compressed_clip, compressed_clip.get_size());
				// NOTE(review): cast target type restored (stripped by the extraction pass).
				compressed_clips[clip_index] = reinterpret_cast<compressed_tracks*>(clip);
			}

			// NOTE(review): the decompression settings template argument was stripped by the
			// extraction pass; restored as default_transform_decompression_settings_latest,
			// the only settings type declared in this file — confirm against upstream.
			decompression_context<default_transform_decompression_settings_latest>* contexts[k_num_decompression_evaluations];
			for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
				contexts[clip_index] = make_decompression_context<default_transform_decompression_settings_latest>(allocator);

			write_decompression_performance_stats(allocator, compressed_clips, contexts, logging, writer);

			for (uint32_t pass_index = 0; pass_index < k_num_decompression_evaluations; ++pass_index)
				deallocate_type(allocator, contexts[pass_index]);

			for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
				allocator.deallocate(compressed_clips[clip_index], compressed_clip.get_size());
		}
	}
}

ACL_IMPL_FILE_PRAGMA_POP

#endif	// #if defined(SJSON_CPP_WRITER)