diff --git a/Axiom/CMakeLists.txt b/Axiom/CMakeLists.txt index d6413812..0e4f8bce 100644 --- a/Axiom/CMakeLists.txt +++ b/Axiom/CMakeLists.txt @@ -351,6 +351,9 @@ function(axiom_configure_module_target target_name) AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}" AXIOM_ENABLE_WEBRTC=$,1,0> AXIOM_THREADED_RENDER=$,1,0> + AXIOM_PARALLEL_CULL=$,1,0> + AXIOM_VERIFY_PARALLEL_CULL=$,1,0> + AXIOM_FRAME_TASK_GRAPH=$,1,0> AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED} ) @@ -370,13 +373,7 @@ function(axiom_configure_module_target target_name) target_link_options(${target_name} PUBLIC ${AXIOM_WEBRTC_LINK_OPTIONS}) endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") - target_compile_options(${target_name} PRIVATE - $<$:-O3 -march=native> - $<$:-O2 -g -march=native> - $<$:-Os> - ) - endif() + axiom_apply_performance_options(${target_name}) if(AXIOM_ENABLE_TSAN) if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") @@ -539,6 +536,9 @@ target_compile_definitions(AxiomRendererVulkanObjects PRIVATE AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}" AXIOM_ENABLE_WEBRTC=$,1,0> AXIOM_THREADED_RENDER=$,1,0> + AXIOM_PARALLEL_CULL=$,1,0> + AXIOM_VERIFY_PARALLEL_CULL=$,1,0> + AXIOM_FRAME_TASK_GRAPH=$,1,0> AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED} ) if(AXIOM_ENABLE_TSAN) @@ -547,6 +547,7 @@ if(AXIOM_ENABLE_TSAN) -fno-omit-frame-pointer ) endif() +axiom_apply_performance_options(AxiomRendererVulkanObjects) if(AXIOM_ENABLE_SCRIPTING) set(AXIOM_CORAL_MANAGED_DIR diff --git a/Axiom/Core/Application.cpp b/Axiom/Core/Application.cpp index e07bc508..98d5cf40 100644 --- a/Axiom/Core/Application.cpp +++ b/Axiom/Core/Application.cpp @@ -66,6 +66,8 @@ Application::Application(const ApplicationConfig &Config, .Width = m_Window->GetWidth(), .Height = m_Window->GetHeight(), .EnableThreadedRendering = m_Config.EnableThreadedRendering, + .EnableParallelCull = m_Config.EnableParallelCull, + .VerifyParallelCull = m_Config.VerifyParallelCull, }); } Jobs::Startup(); diff --git a/Axiom/Core/Application.h b/Axiom/Core/Application.h 
index c6510fea..f4c774a3 100644 --- a/Axiom/Core/Application.h +++ b/Axiom/Core/Application.h @@ -15,6 +15,15 @@ namespace Axiom { #ifndef AXIOM_THREADED_RENDER #define AXIOM_THREADED_RENDER 0 #endif +#ifndef AXIOM_PARALLEL_CULL +#define AXIOM_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_VERIFY_PARALLEL_CULL +#define AXIOM_VERIFY_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_FRAME_TASK_GRAPH +#define AXIOM_FRAME_TASK_GRAPH 0 +#endif class Renderer; struct RendererDeleter { @@ -40,6 +49,9 @@ struct ApplicationConfig { RuntimeMode Mode{RuntimeMode::LocalWindowedEditor}; IViewportFrameOutput *FrameOutput{nullptr}; bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0}; + bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; + bool EnableFrameTaskGraph{AXIOM_FRAME_TASK_GRAPH != 0}; }; class Application { @@ -65,6 +77,9 @@ class Application { [[nodiscard]] float GetDeltaTime() const { return m_DeltaTime; } [[nodiscard]] uint64_t GetFrameIndex() const { return m_FrameIndex; } [[nodiscard]] RuntimeMode GetRuntimeMode() const { return m_Config.Mode; } + [[nodiscard]] bool IsFrameTaskGraphEnabled() const { + return m_Config.EnableFrameTaskGraph; + } [[nodiscard]] Renderer &GetRenderer() const; [[nodiscard]] Renderer *TryGetRenderer() const; [[nodiscard]] ModuleManager &GetModuleManager() { return m_ModuleManager; } diff --git a/Axiom/Jobs/JobSystem.cpp b/Axiom/Jobs/JobSystem.cpp index 0c393e1b..6ba31c43 100644 --- a/Axiom/Jobs/JobSystem.cpp +++ b/Axiom/Jobs/JobSystem.cpp @@ -3,7 +3,10 @@ #include "Core/Threading.h" #include "Jobs/TaskScheduler.h" +#include #include +#include +#include #include #include #include @@ -11,23 +14,37 @@ #include namespace Axiom::Jobs { -struct JobState { - std::unique_ptr Task; - std::vector DependencyHandles; -}; - namespace { +constexpr size_t kJobPoolCapacity = 131072; +constexpr size_t kMaxInlineDependencies = 16; + void OnWorkerThreadStart(uint32_t ThreadNum); class LambdaTaskSet final : public 
enki::ITaskSet { public: - explicit LambdaTaskSet(JobFn Function) - : enki::ITaskSet(1), m_Function(std::move(Function)) {} + LambdaTaskSet() : enki::ITaskSet(1) {} + + void Reset(JobFn Function, std::span Dependencies) { + m_SetSize = 1; + m_MinRange = 1; + m_Function = std::move(Function); + m_DependencyCount = std::min(Dependencies.size(), m_Dependencies.size()); + for (size_t Index = 0; Index < m_DependencyCount; ++Index) { + m_Dependencies[Index] = Dependencies[Index]; + } + } - void ExecuteRange(enki::TaskSetPartition, uint32_t) override { m_Function(); } + void Clear() { + m_Function.Reset(); + m_DependencyCount = 0; + } + + void ExecuteRange(enki::TaskSetPartition, uint32_t) override; private: JobFn m_Function; + std::array m_Dependencies; + size_t m_DependencyCount{0}; }; class ParallelForTaskSet final : public enki::ITaskSet { @@ -51,6 +68,16 @@ class ParallelForTaskSet final : public enki::ITaskSet { ParallelForFn m_Function; }; +} // namespace + +struct JobState { + LambdaTaskSet Task; + std::atomic Generation{0}; + std::atomic Recycled{true}; +}; + +namespace { + class JobSystem { public: void Startup() { @@ -60,7 +87,9 @@ class JobSystem { m_Scheduler = std::make_unique(); enki::TaskSchedulerConfig Config = m_Scheduler->GetConfig(); Config.profilerCallbacks.threadStart = &OnWorkerThreadStart; + Config.numExternalTaskThreads = 4; m_Scheduler->Initialize(Config); + ResetPool(); } } @@ -74,54 +103,77 @@ class JobSystem { if (m_StartupCount == 0 && m_Scheduler != nullptr) { m_Scheduler->WaitforAllAndShutdown(); m_Scheduler.reset(); + ResetPool(); } } JobHandle ScheduleJob(JobFn Function) { - auto State = std::make_shared(); - State->Task = std::make_unique(std::move(Function)); - m_Scheduler->AddTaskSetToPipe(State->Task.get()); - return {.State = std::move(State)}; + if (!CanUseScheduler()) { + Function(); + return {}; + } + + JobHandle Handle = AcquireTask(Function, {}); + if (!Handle.IsValid()) { + Function(); + return {}; + } + + 
m_Scheduler->AddTaskSetToPipe(&Handle.State->Task); + return Handle; } JobHandle ScheduleJobAfter(JobFn Function, std::span Deps) { - auto State = std::make_shared(); - State->DependencyHandles.reserve(Deps.size()); - for (const JobHandle &Dependency : Deps) { - if (!Dependency.IsValid() || Dependency.State->Task == nullptr) { - continue; + if (!CanUseScheduler()) { + for (const JobHandle &Dependency : Deps) { + Wait(Dependency); } - State->DependencyHandles.push_back(Dependency); + Function(); + return {}; } - State->Task = std::make_unique( - [this, State, Function = std::move(Function)]() mutable { - for (const JobHandle &Dependency : State->DependencyHandles) { - if (!Dependency.IsValid() || Dependency.State->Task == nullptr) { - continue; - } + if (Deps.size() > kMaxInlineDependencies) { + for (size_t Index = kMaxInlineDependencies; Index < Deps.size(); + ++Index) { + Wait(Deps[Index]); + } + } - m_Scheduler->WaitforTask(Dependency.State->Task.get()); - } + const size_t InlineDependencyCount = + std::min(Deps.size(), kMaxInlineDependencies); + JobHandle Handle = + AcquireTask(Function, Deps.first(InlineDependencyCount)); + if (!Handle.IsValid()) { + for (size_t Index = 0; Index < InlineDependencyCount; ++Index) { + Wait(Deps[Index]); + } + Function(); + return {}; + } - Function(); - }); - m_Scheduler->AddTaskSetToPipe(State->Task.get()); - return {.State = std::move(State)}; + m_Scheduler->AddTaskSetToPipe(&Handle.State->Task); + return Handle; } void Wait(JobHandle Handle) { - if (!Handle.IsValid() || Handle.State->Task == nullptr) { + if (m_Scheduler == nullptr || !IsCurrent(Handle)) { return; } - m_Scheduler->WaitforTask(Handle.State->Task.get()); + m_Scheduler->WaitforTask(&Handle.State->Task); + ReleaseTask(Handle); } void ParallelFor(size_t Count, ParallelForFn Function) { if (Count == 0) { return; } + if (!CanUseScheduler()) { + for (size_t Index = 0; Index < Count; ++Index) { + Function(Index); + } + return; + } ParallelForTaskSet Task(Count, 
std::move(Function)); m_Scheduler->AddTaskSetToPipe(&Task); @@ -131,7 +183,72 @@ class JobSystem { private: std::mutex m_Mutex; std::unique_ptr m_Scheduler; + std::unique_ptr m_TaskPool; + std::vector m_FreeList; size_t m_StartupCount{0}; + + void ResetPool() { + if (m_TaskPool == nullptr) { + m_TaskPool = std::make_unique(kJobPoolCapacity); + } + + m_FreeList.reserve(kJobPoolCapacity); + m_FreeList.clear(); + for (size_t Index = 0; Index < kJobPoolCapacity; ++Index) { + JobState &State = m_TaskPool[Index]; + State.Task.Clear(); + State.Generation.fetch_add(1, std::memory_order_relaxed); + State.Recycled.store(true, std::memory_order_relaxed); + m_FreeList.push_back(&State); + } + } + + JobHandle AcquireTask(JobFn &Function, std::span Deps) { + std::scoped_lock Lock(m_Mutex); + if (m_TaskPool == nullptr || m_FreeList.empty()) { + return {}; + } + + JobState *State = m_FreeList.back(); + m_FreeList.pop_back(); + State->Recycled.store(false, std::memory_order_relaxed); + const uint32_t Generation = + State->Generation.fetch_add(1, std::memory_order_relaxed) + 1; + State->Task.Reset(std::move(Function), Deps); + return {.State = State, .Generation = Generation}; + } + + void ReleaseTask(JobHandle Handle) { + JobState *State = Handle.State; + if (State == nullptr || + State->Generation.load(std::memory_order_acquire) != + Handle.Generation || + State->Recycled.exchange(true, std::memory_order_acq_rel)) { + return; + } + + State->Task.Clear(); + + std::scoped_lock Lock(m_Mutex); + m_FreeList.push_back(State); + } + + bool IsCurrent(JobHandle Handle) const { + return Handle.State != nullptr && + Handle.State->Generation.load(std::memory_order_acquire) == + Handle.Generation && + !Handle.State->Recycled.load(std::memory_order_acquire); + } + + bool CanUseScheduler() { + if (m_Scheduler == nullptr) { + return false; + } + if (m_Scheduler->GetThreadNum() != enki::NO_THREAD_NUM) { + return true; + } + return m_Scheduler->RegisterExternalTaskThread(); + } }; JobSystem 
&GetJobSystem() { @@ -143,6 +260,14 @@ void OnWorkerThreadStart(uint32_t ThreadNum) { Threading::SetCurrentThreadName("Axiom Job Worker " + std::to_string(ThreadNum)); } + +void LambdaTaskSet::ExecuteRange(enki::TaskSetPartition, uint32_t) { + for (size_t Index = 0; Index < m_DependencyCount; ++Index) { + GetJobSystem().Wait(m_Dependencies[Index]); + } + + m_Function(); +} } // namespace void Startup() { GetJobSystem().Startup(); } diff --git a/Axiom/Jobs/JobSystem.h b/Axiom/Jobs/JobSystem.h index 50c92122..0cfa9f84 100644 --- a/Axiom/Jobs/JobSystem.h +++ b/Axiom/Jobs/JobSystem.h @@ -1,20 +1,160 @@ #pragma once #include +#include #include -#include +#include #include +#include +#include namespace Axiom::Jobs { -using JobFn = std::function; -using ParallelForFn = std::function; +namespace Detail { +template class SmallFunction; + +template +class SmallFunction { +public: + SmallFunction() = default; + SmallFunction(std::nullptr_t) noexcept {} + + template , + std::enable_if_t, int> = 0> + SmallFunction(Callable &&Function) { + Emplace(std::forward(Function)); + } + + SmallFunction(const SmallFunction &Other) { CopyFrom(Other); } + + SmallFunction(SmallFunction &&Other) noexcept { MoveFrom(std::move(Other)); } + + SmallFunction &operator=(const SmallFunction &Other) { + if (this != &Other) { + Reset(); + CopyFrom(Other); + } + return *this; + } + + SmallFunction &operator=(SmallFunction &&Other) noexcept { + if (this != &Other) { + Reset(); + MoveFrom(std::move(Other)); + } + return *this; + } + + SmallFunction &operator=(std::nullptr_t) noexcept { + Reset(); + return *this; + } + + template , + std::enable_if_t, int> = 0> + SmallFunction &operator=(Callable &&Function) { + Reset(); + Emplace(std::forward(Function)); + return *this; + } + + ~SmallFunction() { Reset(); } + + explicit operator bool() const noexcept { return m_Invoke != nullptr; } + + R operator()(Args... 
Arguments) { + if (m_Invoke == nullptr) { + throw std::bad_function_call(); + } + return m_Invoke(&m_Storage, std::forward(Arguments)...); + } + + void Reset() noexcept { + if (m_Destroy != nullptr) { + m_Destroy(&m_Storage); + } + m_Invoke = nullptr; + m_Destroy = nullptr; + m_Copy = nullptr; + m_Move = nullptr; + } + +private: + using Storage = std::aligned_storage_t; + + template void Emplace(Callable &&Function) { + using Stored = std::decay_t; + static_assert(sizeof(Stored) <= StorageSize, + "Job callable capture is too large for inline storage"); + static_assert(alignof(Stored) <= alignof(Storage), + "Job callable alignment is too large for inline storage"); + static_assert(std::is_copy_constructible_v, + "Job callables must be copy constructible"); + + new (&m_Storage) Stored(std::forward(Function)); + m_Invoke = [](void *StoragePtr, Args... Arguments) -> R { + return (*std::launder(reinterpret_cast(StoragePtr)))( + std::forward(Arguments)...); + }; + m_Destroy = [](void *StoragePtr) noexcept { + std::launder(reinterpret_cast(StoragePtr))->~Stored(); + }; + m_Copy = [](void *Destination, const void *Source) { + new (Destination) + Stored(*std::launder(reinterpret_cast(Source))); + }; + m_Move = [](void *Destination, void *Source) noexcept { + new (Destination) + Stored(std::move(*std::launder(reinterpret_cast(Source)))); + std::launder(reinterpret_cast(Source))->~Stored(); + }; + } + + void CopyFrom(const SmallFunction &Other) { + if (Other.m_Invoke == nullptr) { + return; + } + Other.m_Copy(&m_Storage, &Other.m_Storage); + m_Invoke = Other.m_Invoke; + m_Destroy = Other.m_Destroy; + m_Copy = Other.m_Copy; + m_Move = Other.m_Move; + } + + void MoveFrom(SmallFunction &&Other) noexcept { + if (Other.m_Invoke == nullptr) { + return; + } + Other.m_Move(&m_Storage, &Other.m_Storage); + m_Invoke = Other.m_Invoke; + m_Destroy = Other.m_Destroy; + m_Copy = Other.m_Copy; + m_Move = Other.m_Move; + Other.m_Invoke = nullptr; + Other.m_Destroy = nullptr; + Other.m_Copy = 
nullptr; + Other.m_Move = nullptr; + } + + Storage m_Storage; + R (*m_Invoke)(void *, Args...) = nullptr; + void (*m_Destroy)(void *) noexcept = nullptr; + void (*m_Copy)(void *, const void *) = nullptr; + void (*m_Move)(void *, void *) noexcept = nullptr; +}; +} // namespace Detail + +using JobFn = Detail::SmallFunction; +using ParallelForFn = Detail::SmallFunction; struct JobState; struct JobHandle { [[nodiscard]] bool IsValid() const noexcept { return State != nullptr; } - std::shared_ptr State; + JobState *State = nullptr; + uint32_t Generation = 0; }; void Startup(); diff --git a/Axiom/Renderer/RendererFrameModule.cpp b/Axiom/Renderer/RendererFrameModule.cpp index 34c581ba..0f36a63b 100644 --- a/Axiom/Renderer/RendererFrameModule.cpp +++ b/Axiom/Renderer/RendererFrameModule.cpp @@ -3,17 +3,30 @@ #include "Core/Application.h" #include "Renderer/Renderer.h" +#include +#include + namespace Axiom { std::string_view RendererFrameModule::GetName() const { return "Core.RendererFrame"; } bool RendererFrameModule::Initialize(Application &App) { - (void)App; + m_UseFrameTaskGraph = App.IsFrameTaskGraphEnabled(); + ResetTaskGraph(); return true; } void RendererFrameModule::Update(const ModuleUpdateContext &Context) { + if (m_UseFrameTaskGraph) { + UpdateTaskGraph(Context); + return; + } + + UpdateSerial(Context); +} + +void RendererFrameModule::UpdateSerial(const ModuleUpdateContext &Context) { switch (Context.Phase) { case ModuleUpdatePhase::FrameStart: Context.App.GetRenderer().SetCpuFrameTime(Context.DeltaTimeSeconds * @@ -33,5 +46,45 @@ void RendererFrameModule::Update(const ModuleUpdateContext &Context) { } } -void RendererFrameModule::Shutdown(Application &App) { (void)App; } +void RendererFrameModule::UpdateTaskGraph(const ModuleUpdateContext &Context) { + Renderer &Renderer = Context.App.GetRenderer(); + switch (Context.Phase) { + case ModuleUpdatePhase::FrameStart: + ResetTaskGraph(); + Renderer.SetCpuFrameTime(Context.DeltaTimeSeconds * 1000.0f); + break; + 
case ModuleUpdatePhase::RenderBegin: + m_BeginFrameJob = Jobs::ScheduleJob([&Renderer]() { Renderer.BeginFrame(); }); + Jobs::Wait(m_BeginFrameJob); + break; + case ModuleUpdatePhase::Render: { + std::array Dependencies = {m_BeginFrameJob}; + m_RenderJob = Jobs::ScheduleJobAfter( + [&Renderer]() { Renderer.Render(); }, + std::span(Dependencies)); + break; + } + case ModuleUpdatePhase::ImGuiRender: + break; + case ModuleUpdatePhase::RenderEnd: { + std::array Dependencies = {m_RenderJob}; + m_EndFrameJob = Jobs::ScheduleJobAfter( + [&Renderer]() { Renderer.EndFrame(); }, + std::span(Dependencies)); + Jobs::Wait(m_EndFrameJob); + break; + } + } +} + +void RendererFrameModule::ResetTaskGraph() { + m_BeginFrameJob = {}; + m_RenderJob = {}; + m_EndFrameJob = {}; +} + +void RendererFrameModule::Shutdown(Application &App) { + (void)App; + ResetTaskGraph(); +} } // namespace Axiom diff --git a/Axiom/Renderer/RendererFrameModule.h b/Axiom/Renderer/RendererFrameModule.h index f60f97b8..849e935f 100644 --- a/Axiom/Renderer/RendererFrameModule.h +++ b/Axiom/Renderer/RendererFrameModule.h @@ -1,6 +1,7 @@ #pragma once #include "Core/IModule.h" +#include "Jobs/JobSystem.h" namespace Axiom { class RendererFrameModule final : public IModule { @@ -9,5 +10,15 @@ class RendererFrameModule final : public IModule { bool Initialize(Application &App) override; void Update(const ModuleUpdateContext &Context) override; void Shutdown(Application &App) override; + +private: + void UpdateSerial(const ModuleUpdateContext &Context); + void UpdateTaskGraph(const ModuleUpdateContext &Context); + void ResetTaskGraph(); + + Jobs::JobHandle m_BeginFrameJob; + Jobs::JobHandle m_RenderJob; + Jobs::JobHandle m_EndFrameJob; + bool m_UseFrameTaskGraph{false}; }; } // namespace Axiom diff --git a/Axiom/Renderer/RendererTypes.h b/Axiom/Renderer/RendererTypes.h index 26aa1744..bf7814fa 100644 --- a/Axiom/Renderer/RendererTypes.h +++ b/Axiom/Renderer/RendererTypes.h @@ -14,6 +14,12 @@ namespace Axiom { #ifndef 
AXIOM_THREADED_RENDER #define AXIOM_THREADED_RENDER 0 #endif +#ifndef AXIOM_PARALLEL_CULL +#define AXIOM_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_VERIFY_PARALLEL_CULL +#define AXIOM_VERIFY_PARALLEL_CULL 0 +#endif enum class RendererBackendType : uint32_t { Vulkan = 0, @@ -37,6 +43,8 @@ struct RendererCreateInfo { uint32_t Width{0}; uint32_t Height{0}; bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0}; + bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; std::function ThreadedRenderSceneStartCallback; std::function ThreadedRenderSceneCompleteCallback; RendererBackendType BackendType{RendererBackendType::Vulkan}; diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp index 074533dc..70ff15db 100644 --- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp +++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp @@ -6,6 +6,7 @@ #include "AxiomRHI/Vulkan/VulkanInitializers.h" #include "AxiomRHI/Vulkan/VulkanMesh.h" #include "AxiomRHI/Vulkan/VulkanRhiDevice.h" +#include "Jobs/JobSystem.h" #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -61,6 +63,8 @@ uint64_t PackTranslucentSortKey(float SortDepth, uint32_t SubmissionIndex) { void VulkanSceneRenderer::Init(IRHIDevice &Device, const RendererCreateInfo &CreateInfo) { m_Device = static_cast(&Device); + m_EnableParallelCull = CreateInfo.EnableParallelCull; + m_VerifyParallelCull = CreateInfo.VerifyParallelCull; if (m_Device != nullptr) { m_FrameOutput = CreateInfo.FrameOutput; m_Device->GetDrawSubmissionSystem().SetRecordPreparedScenePasses( @@ -238,35 +242,37 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) { auto &Candidates = m_CandidateScratch; auto &VisibleSubmissions = m_PreparedSceneState.VisibleSubmissions; + auto &CullInputs = m_CullInputScratch; Candidates.clear(); VisibleSubmissions.Clear(); + CullInputs.clear(); 
Candidates.reserve(SubmissionCount); + CullInputs.resize(SubmissionCount); VisibleSubmissions.OpaqueGraphics.reserve(SubmissionCount); VisibleSubmissions.TranslucentGraphics.reserve(SubmissionCount); VisibleSubmissions.Compute.reserve(SubmissionCount); for (size_t Index = 0; Index < SubmissionCount; ++Index) { const auto &Submission = Scene.Submissions[Index]; - VulkanMesh *VulkanMeshRef = m_Device->ResolveMeshHandle(Submission.MeshHandle); - if (VulkanMeshRef == nullptr) { - continue; - } + CullInputs[Index].Mesh = m_Device->ResolveMeshHandle(Submission.MeshHandle); + } - if (!m_PreparedSceneState.ForceWireframe && - !m_Device->GetOcclusionCulling().IsBoundsVisible( - m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, - VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { - ++FrameStats.FrustumCulledMeshCount; - continue; + size_t FrustumCulledCount = 0; + if (ShouldUseParallelCull(SubmissionCount)) { + FrustumCulledCount = BuildCullCandidatesParallel(Scene, CullInputs, Candidates); + if (m_VerifyParallelCull) { + auto &SerialCandidates = m_VerificationCandidateScratch; + const size_t SerialFrustumCulledCount = + BuildCullCandidatesSerial(Scene, CullInputs, SerialCandidates); + assert(SerialFrustumCulledCount == FrustumCulledCount && + "Parallel cull frustum count diverged from serial cull"); + assert(SerialCandidates == Candidates && + "Parallel cull candidates diverged from serial cull"); } - - const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); - const glm::vec3 Delta = WorldCenter - Scene.ActiveCamera->GetPosition(); - Candidates.push_back({.SubmissionIndex = static_cast(Index), - .MeshHandle = Submission.MeshHandle, - .Mesh = VulkanMeshRef, - .SortDepth = glm::dot(Delta, Delta)}); + } else { + FrustumCulledCount = BuildCullCandidatesSerial(Scene, CullInputs, Candidates); } + FrameStats.FrustumCulledMeshCount = static_cast(FrustumCulledCount); if (!m_PreparedSceneState.ForceWireframe) { 
std::sort(Candidates.begin(), Candidates.end(), @@ -333,6 +339,103 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) { PrepareGraphicsMaterialDescriptors(); } +size_t VulkanSceneRenderer::BuildCullCandidatesSerial( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const { + Candidates.clear(); + Candidates.reserve(Inputs.size()); + + size_t FrustumCulledCount = 0; + const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition(); + for (size_t Index = 0; Index < Inputs.size(); ++Index) { + const auto &Submission = Scene.Submissions[Index]; + VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh; + if (VulkanMeshRef == nullptr) { + continue; + } + + if (!m_PreparedSceneState.ForceWireframe && + !m_Device->GetOcclusionCulling().IsBoundsVisible( + m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, + VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { + ++FrustumCulledCount; + continue; + } + + const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); + const glm::vec3 Delta = WorldCenter - CameraPosition; + Candidates.push_back({.SubmissionIndex = static_cast(Index), + .MeshHandle = Submission.MeshHandle, + .Mesh = VulkanMeshRef, + .SortDepth = glm::dot(Delta, Delta)}); + } + + return FrustumCulledCount; +} + +size_t VulkanSceneRenderer::BuildCullCandidatesParallel( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const { + Candidates.clear(); + Candidates.reserve(Inputs.size()); + + const unsigned HardwareThreads = std::max(1u, std::thread::hardware_concurrency()); + const size_t BucketCount = + std::min(Inputs.size(), std::max(1u, HardwareThreads)); + const size_t BucketSize = (Inputs.size() + BucketCount - 1u) / BucketCount; + const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition(); + + struct CandidateBucket { + std::vector Candidates; + size_t FrustumCulledCount{0}; + }; + std::vector Buckets(BucketCount); + + Jobs::ParallelFor(BucketCount, [&](size_t 
BucketIndex) { + const size_t Begin = BucketIndex * BucketSize; + const size_t End = std::min(Inputs.size(), Begin + BucketSize); + CandidateBucket &Bucket = Buckets[BucketIndex]; + Bucket.Candidates.reserve(End - Begin); + + for (size_t Index = Begin; Index < End; ++Index) { + const auto &Submission = Scene.Submissions[Index]; + VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh; + if (VulkanMeshRef == nullptr) { + continue; + } + + if (!m_PreparedSceneState.ForceWireframe && + !m_Device->GetOcclusionCulling().IsBoundsVisible( + m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, + VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { + ++Bucket.FrustumCulledCount; + continue; + } + + const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); + const glm::vec3 Delta = WorldCenter - CameraPosition; + Bucket.Candidates.push_back({.SubmissionIndex = static_cast(Index), + .MeshHandle = Submission.MeshHandle, + .Mesh = VulkanMeshRef, + .SortDepth = glm::dot(Delta, Delta)}); + } + }); + + size_t FrustumCulledCount = 0; + for (const CandidateBucket &Bucket : Buckets) { + FrustumCulledCount += Bucket.FrustumCulledCount; + Candidates.insert(Candidates.end(), Bucket.Candidates.begin(), + Bucket.Candidates.end()); + } + + return FrustumCulledCount; +} + +bool VulkanSceneRenderer::ShouldUseParallelCull(size_t SubmissionCount) const { + constexpr size_t kParallelCullSubmissionThreshold = 512; + return m_EnableParallelCull && SubmissionCount >= kParallelCullSubmissionThreshold; +} + void VulkanSceneRenderer::RecordBackground() { QueueScenePass(ScenePassPrimitive::Background); } diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h index 60c8c1a8..7e1545a3 100644 --- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h +++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h @@ -4,6 +4,7 @@ #include "AxiomRHI/Vulkan/VulkanRendererTypes.h" #include +#include #include namespace 
Axiom { @@ -48,6 +49,12 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { MeshHandle MeshHandle{}; VulkanMesh *Mesh{nullptr}; float SortDepth{0.0f}; + + constexpr bool operator==(const CandidateSubmission &) const = default; + }; + + struct SubmissionCullInput { + VulkanMesh *Mesh{nullptr}; }; struct PreparedSceneState { @@ -60,6 +67,13 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { }; void PrepareSceneFrame(RenderScene &Scene); + size_t BuildCullCandidatesSerial( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const; + size_t BuildCullCandidatesParallel( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const; + bool ShouldUseParallelCull(size_t SubmissionCount) const; void RecordBackground(); void RecordDepthPrepass(); void BuildHzb(); @@ -107,7 +121,11 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { IViewportFrameOutput *m_FrameOutput{nullptr}; PreparedSceneState m_PreparedSceneState{}; std::vector m_CandidateScratch; + std::vector m_VerificationCandidateScratch; + std::vector m_CullInputScratch; std::vector m_QueuedScenePasses; + bool m_EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool m_VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; VkImageLayout m_SceneDrawImageLayout{VK_IMAGE_LAYOUT_UNDEFINED}; VkImageLayout m_SceneRasterDepthLayout{VK_IMAGE_LAYOUT_UNDEFINED}; }; diff --git a/CMakeLists.txt b/CMakeLists.txt index b2e02f93..746ecc2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,8 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +include(cmake/AxiomCompilerOptions.cmake) + option(BUILD_TESTING "Build tests" OFF) option(AXIOM_ENABLE_SCRIPTING "Enable the Coral C# scripting host" OFF) @@ -18,7 +20,13 @@ option(AXIOM_ENABLE_WEBRTC option(AXIOM_ENABLE_PHYSICS "Enable the JoltPhysics runtime simulation seam" ON) option(AXIOM_THREADED_RENDER - "Enable the experimental threaded 
renderer and worker job system" OFF) + "Enable the threaded renderer and worker job system" ON) +option(AXIOM_PARALLEL_CULL + "Enable guarded parallel CPU culling in the Vulkan scene renderer" ON) +option(AXIOM_VERIFY_PARALLEL_CULL + "Verify parallel CPU culling against the serial path in debug/test builds" OFF) +option(AXIOM_FRAME_TASK_GRAPH + "Enable the minimal renderer frame task graph backed by enkiTS jobs" ON) option(AXIOM_ENABLE_TSAN "Build with ThreadSanitizer instrumentation" OFF) set(AXIOM_WEBRTC_FRAMEWORK_PATH "" CACHE PATH diff --git a/CMakePresets.json b/CMakePresets.json index a1a00267..0c2147a6 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -39,17 +39,20 @@ { "name": "debug", "displayName": "Build Debug", - "configurePreset": "debug" + "configurePreset": "debug", + "jobs": 0 }, { "name": "release", "displayName": "Build Release", - "configurePreset": "release" + "configurePreset": "release", + "jobs": 0 }, { "name": "minsizerel", "displayName": "Build MinSizeRel", - "configurePreset": "minsizerel" + "configurePreset": "minsizerel", + "jobs": 0 } ] } diff --git a/Content/Cooked/sponza_atrium_3.wmesh b/Content/Cooked/sponza_atrium_3.wmesh index 295f5566..bb2d74d1 100644 Binary files a/Content/Cooked/sponza_atrium_3.wmesh and b/Content/Cooked/sponza_atrium_3.wmesh differ diff --git a/Docs/DistributedWraithEngineDesign.md b/Docs/DistributedWraithEngineDesign.md index f8404113..96fd0854 100644 --- a/Docs/DistributedWraithEngineDesign.md +++ b/Docs/DistributedWraithEngineDesign.md @@ -1,13 +1,13 @@ # Distributed Wraith Engine Design ## Document Status -- Status: Draft -- Date: 2026-05-25 +- Status: Draft, current-state refresh +- Date: 2026-06-07 - Audience: Engine, tools, networking, web, and infrastructure contributors - Intended outcome: Establish the target architecture for evolving WraithEngine into a distributed game engine and browser-based collaborative editor ## Implementation Progress -- `event-system` branch now contains the first 
local authoritative editor-session slice +- The current tree contains the first local authoritative editor-session slice - Added engine-owned `EditorSession`, `EditorCommand`, `EditorEvent`, `SessionId`, and `SessionUserId` foundations in `Axiom` - The native editor now translates GLFW input into commands and renders from session-owned camera/scene state instead of mutating camera state directly in the layer - Added deterministic in-process command draining, authoritative event publication, and focused tests for camera/look state transitions and command rejection @@ -23,6 +23,9 @@ - `AxiomRemoteViewportServer` now treats WebRTC as the only supported remote viewport media path - `WraithNetworking` now exposes initialization state and connection metrics for future CVAR/config integration - Removed the largest remote-viewport performance bottlenecks by unthrottling the headless server loop and tuning the encoder/input path for latency +- Headless offscreen rendering now uses asynchronous readback polling instead of waiting immediately after graphics submit, with completed captures published on later ticks in submit order +- Remote render views now track dirty/burst scheduling state; idle connected clients are throttled through round-robin cadence instead of forcing full-rate rendering for every client every tick +- Performance-oriented defaults are enabled for first-party builds: Release is the implicit single-config default, preset builds run with parallel jobs, and optimized targets use native CPU tuning plus IPO/LTO when supported - The remote viewport now runs at acceptable frame rate, but still has noticeable residual input latency that likely requires deeper WebRTC sender/playout tuning - A root-level `EditorFrontend` workspace now serves as the longer-lived browser editor shell using Next.js, React, and Tailwind CSS - `EditorFrontend` contains a docked editor UI with a menu bar, toolbar, outliner, details panel, content browser, and the active WebRTC viewport 
client @@ -33,7 +36,7 @@ - a delayed-readback frame attribution bug in multi-pass headless rendering was fixed by stamping each offscreen capture with the submitting `SessionUserId` at submission time - The next browser-facing step after the migration is turning the browser shell plus authoritative session into a real single-user scene editor, not more work on a server-hosted prototype page - Collaboration should continue to follow that same authoritative command/event path after the single-user authoring loop is stable, rather than leading the roadmap ahead of core editor behavior -- `scene-editing` branch introduces the first authoritative object-lifecycle commands: `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand`, with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` authoritative events +- The current tree includes the first authoritative object-lifecycle commands: `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand`, with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` authoritative events - All scene objects are now backed by an Instance-class hierarchy rooted at a `DataModel` node, mimicking the Roblox object model; `EditorSession` owns the live `DataModel` tree and keeps `EditorSceneState::Items` synchronized as a derived projection - Concrete scene Instance subclasses introduced: `SceneFolder`, `SceneMeshObject`, `SceneLight`, `SceneCamera`, and `SceneActor` under `Axiom/CoreInstance/SceneInstances.h` - `SetSceneState` and `SetSceneItems` now rebuild the Instance tree from snapshot data, enabling round-trip snapshot rehydration diff --git a/Docs/HeadlessAxiomSessionPrototype.md b/Docs/HeadlessAxiomSessionPrototype.md index e1048281..3cc4e5b6 100644 --- a/Docs/HeadlessAxiomSessionPrototype.md +++ b/Docs/HeadlessAxiomSessionPrototype.md @@ -13,8 +13,8 @@ That macOS-specific media path now lives behind the engine-wide `HAL/` layer, so ## Current Status - Status: working prototype -- Verified on 
Windows as of 2026-05-05 -- Builds on macOS as of 2026-05-07 +- Current state reviewed on 2026-06-07 +- Builds on macOS with the `debug`, `release`, and `minsizerel` CMake presets - Runtime validation on macOS requires a Vulkan/MoltenVK-capable environment with Metal available - This subphase is complete for the runtime-side seam restoration work - `AxiomHeadless` is a command-driven authoritative runtime, not a full editor client @@ -46,6 +46,9 @@ That macOS-specific media path now lives behind the engine-wide `HAL/` layer, so - the browser client now pumps camera/input updates on `requestAnimationFrame` and flushes pointer-lock look input immediately instead of batching on a fixed timer - the current stream no longer has the severe FPS collapse seen in the older prototype, but there is still roughly half a second of residual input latency to investigate later - a multi-client frame-routing bug was fixed by stamping each offscreen capture with the submitting `SessionUserId` at render time instead of inferring ownership later from mutable active-pass state +- headless offscreen capture no longer waits immediately after graphics submit; completed readbacks are polled and published on later ticks with submit-time user attribution preserved +- remote render views now have dirty/burst scheduling state, so idle views are throttled instead of forcing one full render pass per connected client every engine tick +- high-performance release builds now enable threaded rendering, the renderer frame task graph, parallel CPU culling, native CPU tuning, and IPO/LTO when supported - the headless command/protocol layer and the remote project/script HTTP JSON helpers now use `rapidjson` internally; command/event/session payload schemas are unchanged, but the earlier handwritten JSON serializer/parser code on those paths has been removed - a root-level `EditorFrontend` workspace now hosts the primary browser editor shell using Next.js, React, and Tailwind CSS - `EditorFrontend` 
includes the docked editor layout, menu bar, toolbar, outliner, details panel, content browser, and the active WebRTC viewport client in `components/engine/viewport.tsx` @@ -108,9 +111,13 @@ Dev-client example: Remote viewport server example: ```sh -./AxiomRemoteViewportServer --host 127.0.0.1 --port 8080 --width 1280 --height 720 +./build/release/Headless/AxiomRemoteViewportServer --host 127.0.0.1 --port 8080 --width 1280 --height 720 ``` +For the browser WebRTC viewport, configure the server with +`-DAXIOM_ENABLE_WEBRTC=ON` and a local WebRTC framework or library path before +building `AxiomRemoteViewportServer`. + On startup, the process registers `WraithNetworking` with `ModuleManager`; the module initializes the `uWebSockets` transport, reports whether networking initialized successfully, and keeps per-connection metrics available for future runtime introspection. Then start the browser editor: @@ -256,7 +263,7 @@ This prototype does not yet provide: ## Scene Authoring Progress -The authoritative scene-authoring loop has advanced on the `scene-editing` branch: +The authoritative scene-authoring loop is implemented in the current tree: - `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand` are now implemented as validated authoritative commands with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` events - all scene objects are now backed by a `DataModel`-rooted Instance hierarchy (`Axiom/CoreInstance/SceneInstances.h`); `EditorSession` owns the live tree and keeps `EditorSceneState::Items` synchronized as a derived projection @@ -283,7 +290,7 @@ The authoritative scene-authoring loop has advanced on the `scene-editing` branc ## Gizmo System -A server-side transform gizmo is now fully implemented on the `scene-editing` branch: +A server-side transform gizmo is now fully implemented in the current tree: ### Rendering (`VulkanGizmoRenderer`) - a dedicated Vulkan pipeline draws gizmo handles as billboard line-segment quads inserted 
between mesh rendering and the offscreen capture step diff --git a/Docs/HeadlessScalabilityBaseline.md b/Docs/HeadlessScalabilityBaseline.md index 57eb63e0..3335ca81 100644 --- a/Docs/HeadlessScalabilityBaseline.md +++ b/Docs/HeadlessScalabilityBaseline.md @@ -1,6 +1,11 @@ # Headless Scalability Baseline -This note captures where to read the new Phase 0 / Phase 1 headless scalability counters before changing scheduling or asynchronous readback behavior. +This note captures where to read the Phase 0 / Phase 1 headless scalability counters while tuning scheduling and asynchronous readback behavior. + +Phase 1 has landed: offscreen readbacks are now published after later fence polling +instead of waiting immediately after submit, and idle remote views are throttled by +dirty/burst scheduling. Treat this file as the current instrumentation guide, not +as a pre-Phase-1 TODO. ## Where The Counters Live @@ -8,7 +13,7 @@ This note captures where to read the new Phase 0 / Phase 1 headless scalability - Headless render-pass scheduling hook in [Headless/HeadlessSessionHost.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Headless/HeadlessSessionHost.cpp) - Offscreen readback hook in [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp) -## What To Capture Before Refactors +## What To Capture During Tuning - `LastTickRenderPassCount`: current render passes scheduled for one engine tick. - `TotalRenderPasses`: cumulative render-pass work over a benchmark window. @@ -35,4 +40,7 @@ For each run, record the snapshot after a fixed tick window and compare: - pending readbacks during steady state - per-client cadence symmetry between active and idle clients -The current baseline is expected to show roughly one render pass per connected remote client per engine tick, with no scheduler distinction yet between active and idle remote clients. 
+The old pre-Phase-1 baseline showed roughly one render pass per connected remote +client per engine tick. The current expected behavior is different: active or +recently mutated clients should receive a short full-rate burst, while idle +clients should settle into the scheduler's throttled cadence. diff --git a/Docs/WraithEngineRefactorPlan.md b/Docs/WraithEngineRefactorPlan.md index 55f0a555..082191ed 100644 --- a/Docs/WraithEngineRefactorPlan.md +++ b/Docs/WraithEngineRefactorPlan.md @@ -1,8 +1,8 @@ # WraithEngine Refactor Plan ## Document Status -- Status: Draft -- Date: 2026-05-25 +- Status: Draft, updated after Phase 1 headless scalability work +- Date: 2026-06-07 - Audience: Engine, rendering, headless runtime, and editor contributors - Intended outcome: Turn the current engineering audit into an executable refactor roadmap ordered by dependency, risk, and team size @@ -14,8 +14,8 @@ The most important current facts are: - Scene authority already lives in editor-owned structs in `EditorSession`, but that data is mirrored into a recursive heap-owned `Instance` tree for hierarchy operations and projection. - Render submission still carries `shared_ptr` ownership and still recovers backend-specific Vulkan types through `dynamic_cast` in the submission build path. -- Headless offscreen rendering still blocks on `vkWaitForFences` immediately after submit, which defeats frames-in-flight for the headless path. -- Multi-client headless rendering still performs one render pass per remote client per engine tick. +- Phase 1 headless offscreen rendering no longer waits immediately after submit; completed readbacks are polled and published on later ticks. +- Multi-client headless rendering now has dirty/burst scheduling and idle-client throttling, but still needs broader policy tuning and production load validation. 
- `RemoteViewportServer` still mixes transport, WebRTC, project lifecycle, script workspace, asset upload, presence, input routing, and frame delivery in one class. - String-keyed maps remain widespread in editor, headless, scripting, physics, and scene serialization paths even where stable integer handles would make the authority layer simpler and cheaper. @@ -54,12 +54,12 @@ Validation: Current implementation shape: - Headless uses an offscreen render surface and publishes captured frames through the renderer frame-output seam in [Axiom/Core/Application.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Core/Application.cpp:152), [Axiom/Renderer/Vulkan/VulkanRendererBackend.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanRendererBackend.cpp:150), and [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp:633). -- In the offscreen path, the draw submission system submits graphics work, marks the capture pending, then immediately waits on `CurrentFrame.RenderFence` before publishing the frame in [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp:770). +- In the offscreen path, the draw submission system submits graphics work, records capture ownership at submit time, and publishes completed readbacks from later polling rather than waiting immediately after submit. Validation: -- This finding is fully accurate. -- The current implementation preserves frame attribution correctness, but it serializes headless rendering at the point where frames-in-flight should be helping. +- The original immediate-fence finding has been addressed by the Phase 1 implementation. +- Remaining risk is around queue depth, capture latency, and production-scale scheduling policy rather than the old unconditional immediate wait. ### 4. 
Multi-client rendering @@ -71,8 +71,8 @@ Current implementation shape: Validation: -- This finding is fully accurate. -- The engine currently re-renders once per active remote client per tick, even when the scene is shared and only camera/view overlays differ. +- The original full-rate-per-client baseline has been improved by dirty/burst scheduling and idle-client throttling. +- The engine still needs deeper policy tuning for many clients, shared-scene reuse, and active/idle fairness under real browser workloads. ### 5. `RemoteViewportServer` @@ -105,19 +105,21 @@ Validation: ### Phase 1: Headless scalability slice +Status: implemented. See [HeadlessPhase1ImplementationNote.md](/Users/joshua/Documents/GitHub/WraithEngine/Docs/HeadlessPhase1ImplementationNote.md). + Why first: - Highest current scalability payoff. - Lowest semantic blast radius compared with scene storage rewrites. - Directly addresses the clearest N-client cost center. -Target architecture: +Implemented architecture: - Offscreen rendering uses true asynchronous readback. - Completed frames are published when fences signal on a later tick rather than by waiting immediately after submit. -- Headless render scheduling becomes policy-driven per view, with at least dirty-state or cadence-based throttling for inactive clients. +- Headless render scheduling is policy-driven per view, with dirty-state and cadence-based throttling for inactive clients. -Migration strategy: +Completed migration: 1. Remove the immediate fence wait from the headless offscreen path. 2. Let pending readbacks complete in later frames through the existing `PublishCompletedOffscreenFrames()` path. @@ -143,9 +145,9 @@ Test strategy: - Add a renderer-level regression test or harness for two queued offscreen frames with distinct users. - Add instrumentation assertions in headless integration tests for render-pass count versus active-client count. -Incremental or staged: +Follow-up: -- Incremental. 
+- Use the instrumentation baseline to tune policy thresholds and validate many-client behavior. ### Phase 2: Render submission cleanup diff --git a/Editor/CMakeLists.txt b/Editor/CMakeLists.txt index 8bc705b2..071b0fa5 100644 --- a/Editor/CMakeLists.txt +++ b/Editor/CMakeLists.txt @@ -4,6 +4,8 @@ add_executable(AxiomEditor GlfwEditorModule.cpp ) +axiom_apply_performance_options(AxiomEditor) + target_link_libraries(AxiomEditor PRIVATE AxiomCore) target_link_libraries(AxiomEditor PRIVATE AxiomScene diff --git a/FRAME_GRAPH.md b/FRAME_GRAPH.md new file mode 100644 index 00000000..bb6ba7b9 --- /dev/null +++ b/FRAME_GRAPH.md @@ -0,0 +1,51 @@ +# Minimal Frame Task Graph + +This is the first low-risk frame graph for WraithEngine. The existing +`ModuleManager` phase walk remains authoritative for modules that have not been +ported. The renderer frame module is the only ported module. + +## Current Stages + +```mermaid +flowchart LR + Simulate["simulate: FrameStart modules"] --> Begin["begin: Renderer::BeginFrame"] + Begin --> RecordCommands["legacy Render phase modules submit RenderCommand data"] + RecordCommands --> CullRecord["cull + record: Renderer::Render"] + CullRecord --> Submit["submit: Renderer::EndFrame"] +``` + +`BeginFrame` is scheduled through `Jobs::ScheduleJob` and then waited on at the end +of `RenderBegin`, because render-command producers still run later in the +regular `Render` phase and require an open `RenderScene`. + +`Renderer::Render` is scheduled after the begin job. It owns the Vulkan scene +preparation work, including the guarded parallel CPU cull path. `Renderer::EndFrame` +is scheduled with `Jobs::ScheduleJobAfter` after render and waited on before +`Application::Step` returns. + +## Determinism Rules + +- CPU cull resolves mesh handles serially into an immutable per-frame snapshot. +- Parallel cull workers write only to range-local candidate buckets.
+- Buckets are merged in ascending input-range order before the existing sort and + visible-list classification. +- `AXIOM_VERIFY_PARALLEL_CULL=ON` reruns the serial cull builder and asserts + that candidate lists and frustum-cull counts match before downstream work. + +## Config Flags + +- `AXIOM_PARALLEL_CULL`: enables the Vulkan parallel CPU cull path. +- `AXIOM_VERIFY_PARALLEL_CULL`: compares parallel cull output against serial + output. +- `AXIOM_FRAME_TASK_GRAPH`: enables the renderer frame module job chain. + +Each flag is also exposed through `ApplicationConfig` / `RendererCreateInfo` so +tests and hosts can force a specific mode without changing global build flags. + +## Next Ports + +Future ports should move command-producing modules onto explicit producer tasks +instead of relying on `RenderCommand` global scene state. Once those producers +return immutable render packets, `BeginFrame` no longer needs to be synchronized +before the `Render` phase, and the graph can express true +`simulate -> cull(parallel) -> record -> submit` overlap. 
diff --git a/HAL/CMakeLists.txt b/HAL/CMakeLists.txt index f4fb0d09..df20707f 100644 --- a/HAL/CMakeLists.txt +++ b/HAL/CMakeLists.txt @@ -16,6 +16,8 @@ if(APPLE) ) endif() +axiom_apply_performance_options(AxiomHAL) + target_include_directories(AxiomHAL PUBLIC "${CMAKE_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/Axiom" diff --git a/Headless/CMakeLists.txt b/Headless/CMakeLists.txt index 43d983c7..4dabd7f7 100644 --- a/Headless/CMakeLists.txt +++ b/Headless/CMakeLists.txt @@ -7,6 +7,8 @@ add_library(uSockets STATIC "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src/eventing/epoll_kqueue.c" ) +axiom_apply_performance_options(uSockets) + target_include_directories(uSockets PUBLIC "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src" ) @@ -31,6 +33,8 @@ add_library(WraithNetworking STATIC WraithNetworkingModule.cpp ) +axiom_apply_performance_options(WraithNetworking) + target_include_directories(WraithNetworking PUBLIC "${CMAKE_SOURCE_DIR}/Headless" "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" @@ -66,6 +70,8 @@ add_executable(AxiomHeadless WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomHeadless) + add_executable(AxiomRemoteViewportDevClient DevRemoteViewportClient.cpp HeadlessCommandProtocol.cpp @@ -76,6 +82,8 @@ add_executable(AxiomRemoteViewportDevClient WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomRemoteViewportDevClient) + target_include_directories(AxiomHeadless PRIVATE "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" ) @@ -121,6 +129,8 @@ add_executable(AxiomRemoteViewportServer WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomRemoteViewportServer) + add_executable(AxiomPackagedRuntime AxiomPackagedRuntime.cpp HeadlessOverlayModule.cpp @@ -129,6 +139,8 @@ add_executable(AxiomPackagedRuntime HeadlessSessionModule.cpp ) +axiom_apply_performance_options(AxiomPackagedRuntime) + target_include_directories(AxiomRemoteViewportServer PRIVATE "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" ) diff --git a/README.md b/README.md index 9ec578f1..2a749f8c 
100644 --- a/README.md +++ b/README.md @@ -133,13 +133,35 @@ AxiomCore runtime flow ## Build -### Quick start (minimal — no scripting, no WebRTC) +By default, single-config CMake generators use `Release` when no build type is +specified. The checked-in presets still provide explicit `debug`, `release`, and +`minsizerel` configurations, and preset builds run with parallel jobs enabled. + +### Quick start (debug — no scripting, no WebRTC) ```bash cmake --preset debug cmake --build build/debug ``` +### Performance server build (scripting + WebRTC) + +This is the normal high-performance local command for the browser-facing server +on macOS when using a locally built `WebRTC.framework`: + +```bash +cmake --preset release \ + -DAXIOM_ENABLE_SCRIPTING=ON \ + -DAXIOM_ENABLE_WEBRTC=ON \ + -DAXIOM_WEBRTC_FRAMEWORK_PATH=/absolute/path/to/webrtc-checkout/src/out/Default/WebRTC.framework + +cmake --build build/release --target AxiomRemoteViewportServer +``` + +The release preset uses optimized first-party compile defaults, native CPU +tuning, IPO/LTO when the toolchain supports it, threaded rendering, the renderer +frame task graph, parallel CPU culling, and parallel build jobs. + ### With physics enabled Physics uses Jolt and is currently enabled by default, but this is the explicit build if you want to guarantee it is on: @@ -233,6 +255,9 @@ cmake --preset release -DAXIOM_ENABLE_SCRIPTING=ON cmake --build build/release ``` +The release preset is the recommended baseline for runtime performance. Use the +debug preset for development diagnostics and tests. + --- ## CMake Options Reference @@ -245,6 +270,12 @@ cmake --build build/release | `AXIOM_SCRIPTING_TRUST_DEFAULT` | `STRING` | `Restricted` | Default sandbox tier for user scripts. `Restricted` (hosted — blocks `System.Net.*`, `System.Reflection.Emit`, etc.)
or `Trusted` (local dev — full BCL access) | | `AXIOM_ENABLE_WEBRTC` | `BOOL` | `OFF` | Enable the macOS WebRTC transport | | `AXIOM_ENABLE_PHYSICS` | `BOOL` | `ON` | Enable the JoltPhysics runtime simulation seam | +| `AXIOM_THREADED_RENDER` | `BOOL` | `ON` | Enable the threaded renderer and worker job system | +| `AXIOM_PARALLEL_CULL` | `BOOL` | `ON` | Enable guarded parallel CPU culling in the Vulkan scene renderer | +| `AXIOM_FRAME_TASK_GRAPH` | `BOOL` | `ON` | Enable the renderer frame task graph backed by enkiTS jobs | +| `AXIOM_ENABLE_PERFORMANCE_DEFAULTS` | `BOOL` | `ON` | Apply optimized compile/link defaults to first-party targets | +| `AXIOM_OPTIMIZE_FOR_NATIVE_ARCH` | `BOOL` | `ON` | Tune optimized builds for the host CPU architecture | +| `AXIOM_ENABLE_IPO` | `BOOL` | `ON` | Enable interprocedural optimization/LTO for optimized builds when supported | | `AXIOM_WEBRTC_FRAMEWORK_PATH` | `PATH` | _(empty)_ | Path to a `WebRTC.framework` bundle (macOS framework variant) | | `AXIOM_WEBRTC_LIBRARY_PATH` | `FILEPATH` | _(empty)_ | Path to a `libwebrtc` static/shared binary (non-framework variant) | | `AXIOM_WEBRTC_INCLUDE_DIR` | `PATH` | _(empty)_ | Include directory for the non-framework libwebrtc variant | @@ -262,11 +293,25 @@ cmake --build build/release ### Remote viewport server +Release build: + +```bash +./build/release/Headless/AxiomRemoteViewportServer \ + --host 127.0.0.1 --port 8080 --width 1280 --height 720 +``` + +Debug build: + ```bash ./build/debug/Headless/AxiomRemoteViewportServer \ --host 127.0.0.1 --port 8080 --width 1280 --height 720 ``` +If WebRTC was not enabled at configure time, the server starts but the browser +viewport cannot receive the H.264 WebRTC media path. Reconfigure with +`-DAXIOM_ENABLE_WEBRTC=ON` and either `AXIOM_WEBRTC_FRAMEWORK_PATH` or the +`AXIOM_WEBRTC_LIBRARY_PATH` / `AXIOM_WEBRTC_INCLUDE_DIR` pair. + At startup, `AxiomRemoteViewportServer` registers the toggleable `WraithNetworking` module with `ModuleManager`. 
That module owns transport initialization, publishes connection metrics/state snapshots, and keeps the existing WebRTC session logic active behind the same public server API. ### Browser editor @@ -309,7 +354,7 @@ To test the packaged runtime binary built in `build/` against an existing staged package: ```bash -./build/debug/Headless/AxiomPackagedRuntime \ +./build/release/Headless/AxiomPackagedRuntime \ --package-root /absolute/path/to/Projects//Package ``` @@ -318,7 +363,7 @@ Open `http://localhost:3000` in your browser. ### Local native editor (no browser required) ```bash -./build/debug/Editor/AxiomEditor +./build/release/Editor/AxiomEditor ``` --- diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index f83ba47c..cca09e34 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -19,6 +19,8 @@ add_executable(AxiomTests ../Headless/WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomTests) + if(APPLE AND EXISTS "${CMAKE_SOURCE_DIR}/Headless/MacOSWebRtcSession.mm") target_sources(AxiomTests PRIVATE ../Headless/MacOSWebRtcSession.mm diff --git a/Tests/RenderSubmissionTests.cpp b/Tests/RenderSubmissionTests.cpp index 8b91c92e..eed409ed 100644 --- a/Tests/RenderSubmissionTests.cpp +++ b/Tests/RenderSubmissionTests.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -107,6 +108,12 @@ Axiom::MeshData MakeTriangleMesh() { .BoundsMax = {0.25f, 0.25f, 0.0f}, }; } + +class ScopedJobSystem { +public: + ScopedJobSystem() { Axiom::Jobs::Startup(); } + ~ScopedJobSystem() { Axiom::Jobs::Shutdown(); } +}; } // namespace TEST(RenderSubmissionTests, EditorSceneRendererAdapterReusesCachedMeshUntilAssetChanges) { @@ -166,22 +173,25 @@ TEST(RenderSubmissionTests, EditorSceneRendererAdapterDropsDeletedObjectsFromCac } TEST(RenderSubmissionTests, - VulkanRendererRendersAllFiveThousandSubmittedMeshesOffscreen) { + VulkanRendererRendersAllTenThousandSubmittedMeshesOffscreen) { constexpr uint32_t Width = 1280; constexpr uint32_t 
Height = 720; - constexpr size_t MeshCount = 5000; + constexpr size_t MeshCount = 10000; EnsureLoggingInitialized(); if (!Axiom::CanInitializeHeadlessVulkan()) { GTEST_SKIP() << "Headless Vulkan is unavailable on this host"; } + ScopedJobSystem Jobs; auto Surface = std::make_shared(Width, Height); Axiom::Renderer Renderer; Renderer.Init({ .TargetSurface = Surface, .Width = Width, .Height = Height, + .EnableParallelCull = true, + .VerifyParallelCull = true, }); Renderer.SetViewMode(Axiom::RendererViewMode::Wireframe); diff --git a/Tests/ThreadingTests.cpp b/Tests/ThreadingTests.cpp index 793f2ce7..3bea6f16 100644 --- a/Tests/ThreadingTests.cpp +++ b/Tests/ThreadingTests.cpp @@ -16,11 +16,16 @@ #include #include #include +#include #include +#include #include #include namespace { +std::atomic g_CountAllocations{false}; +std::atomic g_AllocationCount{0}; + void EnsureLoggingInitialized() { static bool Initialized = false; if (!Initialized) { @@ -60,6 +65,65 @@ Axiom::CommandContext MakeContext(uint64_t FrameIndex = 1, } } // namespace +void *operator new(std::size_t Size) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + if (void *Pointer = std::malloc(Size)) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new[](std::size_t Size) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + if (void *Pointer = std::malloc(Size)) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new(std::size_t Size, std::align_val_t Alignment) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + void *Pointer = nullptr; + if (posix_memalign(&Pointer, static_cast(Alignment), Size) == 0) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new[](std::size_t Size, std::align_val_t Alignment) { + if 
(g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + void *Pointer = nullptr; + if (posix_memalign(&Pointer, static_cast(Alignment), Size) == 0) { + return Pointer; + } + throw std::bad_alloc(); +} + +void operator delete(void *Pointer) noexcept { std::free(Pointer); } +void operator delete[](void *Pointer) noexcept { std::free(Pointer); } +void operator delete(void *Pointer, std::size_t) noexcept { std::free(Pointer); } +void operator delete[](void *Pointer, std::size_t) noexcept { std::free(Pointer); } +void operator delete(void *Pointer, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete[](void *Pointer, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete(void *Pointer, std::size_t, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete[](void *Pointer, std::size_t, std::align_val_t) noexcept { + std::free(Pointer); +} + TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) { Axiom::Jobs::Startup(); @@ -85,6 +149,34 @@ TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) { Axiom::Jobs::Shutdown(); } +TEST(ThreadingTests, SchedulingTrivialJobsDoesNotAllocatePerJob) { + constexpr size_t JobCount = 100000; + static std::array Handles; + + Axiom::Jobs::Startup(); + + Axiom::Jobs::JobHandle Warmup = Axiom::Jobs::ScheduleJob([]() {}); + Axiom::Jobs::Wait(Warmup); + + g_AllocationCount.store(0, std::memory_order_relaxed); + g_CountAllocations.store(true, std::memory_order_release); + + for (Axiom::Jobs::JobHandle &Handle : Handles) { + Handle = Axiom::Jobs::ScheduleJob([]() {}); + } + for (Axiom::Jobs::JobHandle Handle : Handles) { + Axiom::Jobs::Wait(Handle); + } + + g_CountAllocations.store(false, std::memory_order_release); + const size_t AllocationCount = + g_AllocationCount.load(std::memory_order_relaxed); + + Axiom::Jobs::Shutdown(); + + EXPECT_EQ(AllocationCount, 0u); +} + TEST(ThreadingTests, 
ThreadedRendererRunsHeadlessForThousandFramesWithoutDeadlock) { #if AXIOM_THREADED_RENDER == 0 GTEST_SKIP() << "Threaded renderer is disabled in this build"; diff --git a/cmake/AxiomCompilerOptions.cmake b/cmake/AxiomCompilerOptions.cmake new file mode 100644 index 00000000..5f5809dc --- /dev/null +++ b/cmake/AxiomCompilerOptions.cmake @@ -0,0 +1,78 @@ +include(CheckIPOSupported) + +if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Build type for single-configuration generators" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + Debug Release RelWithDebInfo MinSizeRel) +endif() + +option(AXIOM_ENABLE_PERFORMANCE_DEFAULTS + "Use high-performance compile defaults for first-party targets" ON) +option(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH + "Tune optimized builds for the host CPU architecture" ON) +option(AXIOM_ENABLE_IPO + "Enable interprocedural optimization/LTO for optimized builds" ON) + +set(AXIOM_IPO_SUPPORTED OFF) +if(AXIOM_ENABLE_IPO) + check_ipo_supported(RESULT AXIOM_IPO_SUPPORTED OUTPUT AXIOM_IPO_OUTPUT) + if(NOT AXIOM_IPO_SUPPORTED) + message(WARNING + "IPO/LTO was requested but is not supported by this toolchain: " + "${AXIOM_IPO_OUTPUT}") + endif() +endif() + +function(axiom_apply_performance_options target_name) + if(NOT AXIOM_ENABLE_PERFORMANCE_DEFAULTS) + return() + endif() + + if(NOT TARGET ${target_name}) + message(FATAL_ERROR + "axiom_apply_performance_options called for missing target: ${target_name}") + endif() + + get_target_property(AXIOM_TARGET_TYPE ${target_name} TYPE) + if(AXIOM_TARGET_TYPE STREQUAL "INTERFACE_LIBRARY" OR + AXIOM_TARGET_TYPE STREQUAL "UTILITY") + return() + endif() + + if(MSVC) + target_compile_options(${target_name} PRIVATE + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + ) + target_link_options(${target_name} PRIVATE + $<$:/OPT:REF /OPT:ICF> + $<$:/OPT:REF /OPT:ICF> + $<$:/OPT:REF /OPT:ICF> + ) + 
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + target_compile_options(${target_name} PRIVATE + $<$:-O3> + $<$:-O3> + $<$:-Os> + ) + + if(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH) + target_compile_options(${target_name} PRIVATE + $<$:-march=native> + $<$:-march=native> + $<$:-march=native> + ) + endif() + endif() + + if(AXIOM_IPO_SUPPORTED) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) + endif() +endfunction()