// Source: cocos-engine-external/sources/acl/compression/impl/write_decompression_stats.h
// (353 lines, 16 KiB, C++)

#pragma once
////////////////////////////////////////////////////////////////////////////////
// The MIT License (MIT)
//
// Copyright (c) 2019 Nicholas Frechette & Animation Compression Library contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
////////////////////////////////////////////////////////////////////////////////
#if defined(SJSON_CPP_WRITER)
#include "acl/core/compressed_tracks_version.h"
#include "acl/core/scope_profiler.h"
#include "acl/core/track_formats.h"
#include "acl/core/utils.h"
#include "acl/core/impl/compiler_utils.h"
#include "acl/core/impl/memory_cache.h"
#include "acl/compression/output_stats.h"
#include "acl/decompression/decompress.h"
#include <rtm/scalard.h>
#include <rtm/scalarf.h>
#include <algorithm>
#include <thread>
#include <chrono>
#include <cstring>
#include <random>
ACL_IMPL_FILE_PRAGMA_PUSH
namespace acl
{
namespace acl_impl
{
constexpr uint32_t k_num_decompression_samples = 100;
constexpr uint32_t k_num_decompression_evaluations = 100;
enum class PlaybackDirection
{
Forward,
Backward,
Random,
};
// Which decompression entry point is being profiled.
enum class DecompressionFunction
{
	// The whole pose is decompressed with a single `decompress_tracks` call.
	DecompressPose = 0,

	// Every track is decompressed on its own with one `decompress_track` call per track.
	DecompressBone = 1,
};
//////////////////////////////////////////////////////////////////////////
// Profiles a single decompression scenario and writes its timings into `writer`
// as an object stored under the key `action_type`.
//
// The written object contains:
//    - "data": the per-sample timings (only populated when `logging` has the
//      `stat_logging::exhaustive_decompression` flag set)
//    - "min_time_ms", "max_time_ms", "avg_time_ms", "med_time_ms"
//
// Each timing is the elapsed time of `k_num_decompression_evaluations`
// decompression calls divided by that count.
//
// logging: controls whether the individual timings are written to "data"
// writer: the object writer that receives the `action_type` entry
// action_type: the key under which the results are written
// playback_direction: the order in which the sample times are visited
// decompression_function: whether the whole pose or each track individually is decompressed
// compressed_clips: `k_num_decompression_evaluations` clips, entry 0 provides the track count and duration
// contexts: `k_num_decompression_evaluations` contexts, each one is initialized with the clip at the same index
// cache_flusher: when not null, the CPU cache is flushed before every measurement (cold cache profiling),
//                when null the cache is primed instead (warm cache profiling)
// pose_writer: receives the decompressed output
//
// Nothing is written if any context fails to initialize.
//////////////////////////////////////////////////////////////////////////
template<class DecompressionContextType>
inline void write_decompression_performance_stats(
	stat_logging logging, sjson::ObjectWriter& writer, const char* action_type,
	PlaybackDirection playback_direction, DecompressionFunction decompression_function,
	compressed_tracks* compressed_clips[k_num_decompression_evaluations],
	DecompressionContextType* contexts[k_num_decompression_evaluations],
	CPUCacheFlusher* cache_flusher, debug_track_writer& pose_writer)
{
	const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
	const float duration = compressed_clips[0]->get_duration();
	const bool is_cold_cache_profiling = cache_flusher != nullptr;

	// Spread the sample times evenly over [0, duration]
	float sample_times[k_num_decompression_samples];
	for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
	{
		const float normalized_sample_time = float(sample_index) / float(k_num_decompression_samples - 1);
		sample_times[sample_index] = rtm::scalar_clamp(normalized_sample_time, 0.0F, 1.0F) * duration;
	}

	// Pointer arithmetic instead of `&sample_times[N]`, which subscripts one past the end of the array
	float* const sample_times_end = sample_times + k_num_decompression_samples;

	switch (playback_direction)
	{
	case PlaybackDirection::Forward:
	default:
		break;
	case PlaybackDirection::Backward:
		std::reverse(sample_times, sample_times_end);
		break;
	case PlaybackDirection::Random:
		// The engine is always seeded with the same constant
		std::shuffle(sample_times, sample_times_end, std::default_random_engine(0));
		break;
	}

	// Initialize and clear our contexts
	// Every context is initialized, a single failure is enough to fail the whole run.
	// Note: the flag must be cleared on failure. OR-ing the results into a flag that starts
	// out as 'true' can never report a failure.
	bool init_success = true;
	for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
	{
		if (!contexts[clip_index]->initialize(*compressed_clips[clip_index]))
			init_success = false;
	}

	ACL_ASSERT(init_success, "Failed to initialize decompression context");
	if (!init_success)
		return;

	writer[action_type] = [&](sjson::ObjectWriter& action_writer)
	{
		double clip_max_ms = 0.0;
		double clip_min_ms = 1000000.0;
		double clip_total_ms = 0.0;
		double clip_time_ms[k_num_decompression_samples];

		// The statistics written after this assignment rely on the lambda below
		// having already run and filled 'clip_time_ms'.
		action_writer["data"] = [&](sjson::ArrayWriter& data_writer)
		{
			for (uint32_t sample_index = 0; sample_index < k_num_decompression_samples; ++sample_index)
			{
				const float sample_time = sample_times[sample_index];

				// Clearing the context ensures the decoder cannot reuse any state cached from the last sample.
				if (playback_direction == PlaybackDirection::Random)
				{
					for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
						contexts[clip_index]->initialize(*compressed_clips[clip_index]);
				}

				// Clear the CPU cache if necessary
				if (is_cold_cache_profiling)
				{
					cache_flusher->begin_flushing();

					for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
					{
						cache_flusher->flush_buffer(contexts[clip_index], sizeof(DecompressionContextType));
						cache_flusher->flush_buffer(compressed_clips[clip_index], compressed_clips[clip_index]->get_size());
					}

					cache_flusher->end_flushing();
				}
				else
				{
					// If we want the cache warm, decompress everything once to prime it
					DecompressionContextType* context = contexts[0];
					context->seek(sample_time, sample_rounding_policy::none);
					context->decompress_tracks(pose_writer);
				}

				// Now that the cache is in the desired state, yield our time slice and wait for a new one
				// right before timing to minimize the risk that we'll be interrupted during decompression
				std::this_thread::sleep_for(std::chrono::nanoseconds(1));

				scope_profiler timer;

				for (uint32_t clip_index = 0; clip_index < k_num_decompression_evaluations; ++clip_index)
				{
					// If we measure with a cold CPU cache, we use a different context every time otherwise we use the first one
					DecompressionContextType* context = is_cold_cache_profiling ? contexts[clip_index] : contexts[0];

					context->seek(sample_time, sample_rounding_policy::none);

					switch (decompression_function)
					{
					case DecompressionFunction::DecompressPose:
						context->decompress_tracks(pose_writer);
						break;
					case DecompressionFunction::DecompressBone:
						for (uint32_t bone_index = 0; bone_index < num_tracks; ++bone_index)
							context->decompress_track(bone_index, pose_writer);
						break;
					}
				}

				timer.stop();

				// Average time of a single evaluation for this sample
				const double elapsed_ms = timer.get_elapsed_milliseconds() / k_num_decompression_evaluations;

				if (are_any_enum_flags_set(logging, stat_logging::exhaustive_decompression))
					data_writer.push(elapsed_ms);

				clip_min_ms = rtm::scalar_min(clip_min_ms, elapsed_ms);
				clip_max_ms = rtm::scalar_max(clip_max_ms, elapsed_ms);
				clip_total_ms += elapsed_ms;
				clip_time_ms[sample_index] = elapsed_ms;
			}
		};

		// Sort the timings so that the middle entry can be reported as the median
		std::sort(clip_time_ms, clip_time_ms + k_num_decompression_samples);

		action_writer["min_time_ms"] = clip_min_ms;
		action_writer["max_time_ms"] = clip_max_ms;
		action_writer["avg_time_ms"] = clip_total_ms / double(k_num_decompression_samples);
		action_writer["med_time_ms"] = clip_time_ms[k_num_decompression_samples / 2];
	};
}
//////////////////////////////////////////////////////////////////////////
// Times a raw memcpy of `num_bones` transforms into `lossy_pose_transforms` and
// writes the result into `writer` under the key "memcpy_cold" (when
// `cache_flusher` is not null) or "memcpy_warm" (when it is null).
//
// Three passes are measured and the fastest one is kept. The written object
// has an empty "data" array and the same value for "min_time_ms",
// "max_time_ms" and "avg_time_ms".
//
// allocator: used for the temporary copy source, released before returning
// writer: the object writer that receives the result
// cache_flusher: when not null, the copy source is flushed from the CPU cache before each pass
// lossy_pose_transforms: the copy destination, must have room for `num_bones` transforms
// num_bones: how many transforms are copied
//////////////////////////////////////////////////////////////////////////
inline void write_memcpy_performance_stats(iallocator& allocator, sjson::ObjectWriter& writer, CPUCacheFlusher* cache_flusher, rtm::qvvf* lossy_pose_transforms, uint32_t num_bones)
{
	const bool flush_cache = cache_flusher != nullptr;
	const std::size_t pose_size = sizeof(rtm::qvvf) * num_bones;

	rtm::qvvf* src_pose = allocate_type_array<rtm::qvvf>(allocator, num_bones);

	double best_time_ms = 1000000.0;

	for (uint32_t pass = 0; pass < 3; ++pass)
	{
		if (!flush_cache)
		{
			// Wait for a fresh time slice first, then prime the cache with one copy:
			// this helps keep it warm and lowers the odds of being interrupted while timing
			std::this_thread::sleep_for(std::chrono::nanoseconds(1));

			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
		}
		else
		{
			cache_flusher->begin_flushing();
			cache_flusher->flush_buffer(src_pose, pose_size);
			cache_flusher->end_flushing();

			// The cache is cold now, wait for a fresh time slice
			// to lower the odds of being interrupted while timing
			std::this_thread::sleep_for(std::chrono::nanoseconds(1));
		}

		double num_copies;
		scope_profiler timer;

		if (!flush_cache)
		{
			// A warm copy is too fast to time on its own: run it ten times and divide by the count
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			num_copies = 10.0;
		}
		else
		{
			// Only the first copy is cold, a single one is timed
			std::memcpy(lossy_pose_transforms, src_pose, pose_size);
			num_copies = 1.0;
		}

		timer.stop();

		const double pass_time_ms = timer.get_elapsed_milliseconds() / num_copies;
		best_time_ms = rtm::scalar_min(best_time_ms, pass_time_ms);
	}

	const char* entry_name = flush_cache ? "memcpy_cold" : "memcpy_warm";
	writer[entry_name] = [&](sjson::ObjectWriter& memcpy_writer)
	{
		// No per-sample data for memcpy, only the best time is reported
		memcpy_writer["data"] = [&](sjson::ArrayWriter&) {};
		memcpy_writer["min_time_ms"] = best_time_ms;
		memcpy_writer["max_time_ms"] = best_time_ms;
		memcpy_writer["avg_time_ms"] = best_time_ms;
	};

	deallocate_type_array(allocator, src_pose, num_bones);
}
//////////////////////////////////////////////////////////////////////////
// Runs every profiling scenario and writes the results into `writer`:
//    - "pose_size": the track count multiplied by the size of 10 floats
//    - "decompression_time_per_sample": an object with one entry per scenario
//
// The scenarios are written in this order: memcpy (cold, warm), pose cold,
// pose warm, bone cold, bone warm, with the forward, backward and random
// playback directions for each of the pose/bone groups.
//
// allocator: used for the cache flusher and the pose writer
// compressed_clips: `k_num_decompression_evaluations` clips, entry 0 provides the track count
// contexts: `k_num_decompression_evaluations` decompression contexts
// logging: forwarded to every decompression scenario
// writer: the object writer that receives the results
//////////////////////////////////////////////////////////////////////////
template<class DecompressionContextType>
inline void write_decompression_performance_stats(iallocator& allocator, compressed_tracks* compressed_clips[k_num_decompression_evaluations], DecompressionContextType* contexts[k_num_decompression_evaluations], stat_logging logging, sjson::ObjectWriter& writer)
{
	// Describes one decompression scenario to profile
	struct profiling_scenario
	{
		const char* name;
		PlaybackDirection direction;
		DecompressionFunction function;
		bool use_cold_cache;
	};

	CPUCacheFlusher* cache_flusher = allocate_type<CPUCacheFlusher>(allocator);

	const uint32_t num_tracks = compressed_clips[0]->get_num_tracks();
	debug_track_writer pose_writer(allocator, track_type8::qvvf, num_tracks);

	const uint32_t num_bytes_per_bone = (4 + 3 + 3) * sizeof(float);	// Rotation, Translation, Scale
	writer["pose_size"] = num_tracks * num_bytes_per_bone;

	// The scenarios run and are written in the order listed here
	const profiling_scenario scenarios[] =
	{
		// Cold CPU cache, decompress_pose
		{ "forward_pose_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, true },
		{ "backward_pose_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, true },
		{ "random_pose_cold", PlaybackDirection::Random, DecompressionFunction::DecompressPose, true },

		// Warm CPU cache, decompress_pose
		{ "forward_pose_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressPose, false },
		{ "backward_pose_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressPose, false },
		{ "random_pose_warm", PlaybackDirection::Random, DecompressionFunction::DecompressPose, false },

		// Cold CPU cache, decompress_bone
		{ "forward_bone_cold", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, true },
		{ "backward_bone_cold", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, true },
		{ "random_bone_cold", PlaybackDirection::Random, DecompressionFunction::DecompressBone, true },

		// Warm CPU cache, decompress_bone
		{ "forward_bone_warm", PlaybackDirection::Forward, DecompressionFunction::DecompressBone, false },
		{ "backward_bone_warm", PlaybackDirection::Backward, DecompressionFunction::DecompressBone, false },
		{ "random_bone_warm", PlaybackDirection::Random, DecompressionFunction::DecompressBone, false },
	};

	writer["decompression_time_per_sample"] = [&](sjson::ObjectWriter& per_sample_writer)
	{
		// Cold/Warm CPU cache, memcpy
		write_memcpy_performance_stats(allocator, per_sample_writer, cache_flusher, pose_writer.tracks_typed.qvvf, num_tracks);
		write_memcpy_performance_stats(allocator, per_sample_writer, nullptr, pose_writer.tracks_typed.qvvf, num_tracks);

		for (const profiling_scenario& scenario : scenarios)
		{
			// A null flusher requests warm cache profiling
			CPUCacheFlusher* scenario_flusher = scenario.use_cold_cache ? cache_flusher : nullptr;

			write_decompression_performance_stats(logging, per_sample_writer, scenario.name, scenario.direction, scenario.function, compressed_clips, contexts, scenario_flusher, pose_writer);
		}
	};

	deallocate_type(allocator, cache_flusher);
}
//////////////////////////////////////////////////////////////////////////
// The default transform decompression settings, with the supported
// compressed tracks version set to `latest`.
// These are the settings of the decompression contexts used for profiling.
//////////////////////////////////////////////////////////////////////////
struct default_transform_decompression_settings_latest final : default_transform_decompression_settings
{
	// Reports `latest` as the supported version
	static constexpr compressed_tracks_version16 version_supported()
	{
		return compressed_tracks_version16::latest;
	}
};
//////////////////////////////////////////////////////////////////////////
// Entry point of the decompression profiling: duplicates `compressed_clip`
// `k_num_decompression_evaluations` times, creates as many decompression
// contexts, profiles them, and releases everything it allocated.
//
// Nothing is written when the clip does not use the uniformly sampled algorithm.
//
// allocator: used for the clip copies and the decompression contexts
// settings: only read when assert checks are enabled, to validate the formats
// compressed_clip: the clip to profile, it is copied and never modified
// logging: forwarded to the profiling scenarios
// writer: the object writer that receives the results
//////////////////////////////////////////////////////////////////////////
inline void write_decompression_performance_stats(iallocator& allocator, const compression_settings& settings, const compressed_tracks& compressed_clip, stat_logging logging, sjson::ObjectWriter& writer)
{
	// Only read when assert checks are enabled
	(void)settings;

	if (compressed_clip.get_algorithm_type() != algorithm_type8::uniformly_sampled)
		return;

#if defined(ACL_HAS_ASSERT_CHECKS)
	// If we can, we use a fast-path that simulates what a real game engine would use
	// by disabling the things they normally wouldn't care about like deprecated formats
	// and debugging features
	const bool use_uniform_fast_path = settings.rotation_format == rotation_format8::quatf_drop_w_variable
		&& settings.translation_format == vector_format8::vector3f_variable
		&& settings.scale_format == vector_format8::vector3f_variable;

	ACL_ASSERT(use_uniform_fast_path, "We do not support profiling the debug code path");
#endif

	using context_type = decompression_context<default_transform_decompression_settings_latest>;

	const auto clip_size = compressed_clip.get_size();

	// Each evaluation gets its own copy of the clip
	compressed_tracks* clip_copies[k_num_decompression_evaluations];
	for (uint32_t index = 0; index < k_num_decompression_evaluations; ++index)
	{
		void* buffer = allocator.allocate(clip_size, alignof(compressed_tracks));
		std::memcpy(buffer, &compressed_clip, clip_size);
		clip_copies[index] = static_cast<compressed_tracks*>(buffer);
	}

	// ... and its own decompression context
	context_type* clip_contexts[k_num_decompression_evaluations];
	for (uint32_t index = 0; index < k_num_decompression_evaluations; ++index)
		clip_contexts[index] = make_decompression_context<default_transform_decompression_settings_latest>(allocator);

	write_decompression_performance_stats(allocator, clip_copies, clip_contexts, logging, writer);

	// Release the contexts first, then the clip copies
	for (uint32_t index = 0; index < k_num_decompression_evaluations; ++index)
		deallocate_type(allocator, clip_contexts[index]);

	for (uint32_t index = 0; index < k_num_decompression_evaluations; ++index)
		allocator.deallocate(clip_copies[index], clip_size);
}
}
}
ACL_IMPL_FILE_PRAGMA_POP
#endif // #if defined(SJSON_CPP_WRITER)