From c7f866cf0082183a441049f40fc62e5e20f545b4 Mon Sep 17 00:00:00 2001 From: Tamely Date: Sun, 7 Jun 2026 10:47:42 -0500 Subject: [PATCH 1/4] Turn the frame into a job graph --- Axiom/CMakeLists.txt | 6 + Axiom/Core/Application.cpp | 2 + Axiom/Core/Application.h | 15 ++ Axiom/Jobs/JobSystem.cpp | 30 ++++ Axiom/Renderer/RendererFrameModule.cpp | 57 +++++++- Axiom/Renderer/RendererFrameModule.h | 11 ++ Axiom/Renderer/RendererTypes.h | 8 + .../AxiomRHI/Vulkan/VulkanSceneRenderer.cpp | 137 +++++++++++++++--- .../AxiomRHI/Vulkan/VulkanSceneRenderer.h | 18 +++ CMakeLists.txt | 6 + FRAME_GRAPH.md | 51 +++++++ Tests/RenderSubmissionTests.cpp | 14 +- 12 files changed, 334 insertions(+), 21 deletions(-) create mode 100644 FRAME_GRAPH.md diff --git a/Axiom/CMakeLists.txt b/Axiom/CMakeLists.txt index d641381..7bc7fed 100644 --- a/Axiom/CMakeLists.txt +++ b/Axiom/CMakeLists.txt @@ -351,6 +351,9 @@ function(axiom_configure_module_target target_name) AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}" AXIOM_ENABLE_WEBRTC=$,1,0> AXIOM_THREADED_RENDER=$,1,0> + AXIOM_PARALLEL_CULL=$,1,0> + AXIOM_VERIFY_PARALLEL_CULL=$,1,0> + AXIOM_FRAME_TASK_GRAPH=$,1,0> AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED} ) @@ -539,6 +542,9 @@ target_compile_definitions(AxiomRendererVulkanObjects PRIVATE AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}" AXIOM_ENABLE_WEBRTC=$,1,0> AXIOM_THREADED_RENDER=$,1,0> + AXIOM_PARALLEL_CULL=$,1,0> + AXIOM_VERIFY_PARALLEL_CULL=$,1,0> + AXIOM_FRAME_TASK_GRAPH=$,1,0> AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED} ) if(AXIOM_ENABLE_TSAN) diff --git a/Axiom/Core/Application.cpp b/Axiom/Core/Application.cpp index e07bc50..98d5cf4 100644 --- a/Axiom/Core/Application.cpp +++ b/Axiom/Core/Application.cpp @@ -66,6 +66,8 @@ Application::Application(const ApplicationConfig &Config, .Width = m_Window->GetWidth(), .Height = m_Window->GetHeight(), .EnableThreadedRendering = m_Config.EnableThreadedRendering, + .EnableParallelCull = m_Config.EnableParallelCull, + .VerifyParallelCull = m_Config.VerifyParallelCull, }); } Jobs::Startup(); diff --git a/Axiom/Core/Application.h b/Axiom/Core/Application.h index c6510fe..f4c774a 100644 --- a/Axiom/Core/Application.h +++ b/Axiom/Core/Application.h @@ -15,6 +15,15 @@ namespace Axiom { #ifndef AXIOM_THREADED_RENDER #define AXIOM_THREADED_RENDER 0 #endif +#ifndef AXIOM_PARALLEL_CULL +#define AXIOM_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_VERIFY_PARALLEL_CULL +#define AXIOM_VERIFY_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_FRAME_TASK_GRAPH +#define AXIOM_FRAME_TASK_GRAPH 0 +#endif class Renderer; struct RendererDeleter { @@ -40,6 +49,9 @@ struct ApplicationConfig { RuntimeMode Mode{RuntimeMode::LocalWindowedEditor}; IViewportFrameOutput *FrameOutput{nullptr}; bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0}; + bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; + bool EnableFrameTaskGraph{AXIOM_FRAME_TASK_GRAPH != 0}; }; class Application { @@ -65,6 +77,9 @@ class Application { [[nodiscard]] float GetDeltaTime() const { return m_DeltaTime; } [[nodiscard]] uint64_t GetFrameIndex() const { return m_FrameIndex; } [[nodiscard]] RuntimeMode GetRuntimeMode() const { return m_Config.Mode; } + [[nodiscard]] bool IsFrameTaskGraphEnabled() const { + return m_Config.EnableFrameTaskGraph; + } [[nodiscard]] Renderer &GetRenderer() const; [[nodiscard]] Renderer *TryGetRenderer() const; [[nodiscard]] ModuleManager &GetModuleManager() { return m_ModuleManager; } diff --git a/Axiom/Jobs/JobSystem.cpp b/Axiom/Jobs/JobSystem.cpp index 0c393e1..16e674b 100644 --- a/Axiom/Jobs/JobSystem.cpp +++ b/Axiom/Jobs/JobSystem.cpp @@ -60,6 +60,7 @@ class JobSystem { m_Scheduler = std::make_unique(); enki::TaskSchedulerConfig Config = m_Scheduler->GetConfig(); Config.profilerCallbacks.threadStart = &OnWorkerThreadStart; + Config.numExternalTaskThreads = 4; m_Scheduler->Initialize(Config); } } @@ -78,6 +79,11 @@ class JobSystem { } JobHandle ScheduleJob(JobFn Function) { + if (!CanUseScheduler()) { + Function(); + return {}; + } + auto State = std::make_shared(); State->Task = std::make_unique(std::move(Function)); m_Scheduler->AddTaskSetToPipe(State->Task.get()); @@ -85,6 +91,14 @@ class JobSystem { } JobHandle ScheduleJobAfter(JobFn Function, std::span Deps) { + if (!CanUseScheduler()) { + for (const JobHandle &Dependency : Deps) { + Wait(Dependency); + } + Function(); + return {}; + } + auto State = std::make_shared(); State->DependencyHandles.reserve(Deps.size()); for (const JobHandle &Dependency : Deps) { @@ -122,6 +136,12 @@ class JobSystem { if (Count == 0) { return; } + if (!CanUseScheduler()) { + for (size_t Index = 0; Index < Count; ++Index) { + Function(Index); + } + return; + } ParallelForTaskSet Task(Count, std::move(Function)); m_Scheduler->AddTaskSetToPipe(&Task); @@ -132,6 +152,16 @@ class JobSystem { std::mutex m_Mutex; std::unique_ptr m_Scheduler; size_t m_StartupCount{0}; + + bool CanUseScheduler() { + if (m_Scheduler == nullptr) { + return false; + } + if (m_Scheduler->GetThreadNum() != enki::NO_THREAD_NUM) { + return true; + } + return m_Scheduler->RegisterExternalTaskThread(); + } }; JobSystem &GetJobSystem() { diff --git a/Axiom/Renderer/RendererFrameModule.cpp b/Axiom/Renderer/RendererFrameModule.cpp index 34c581b..0f36a63 100644 --- a/Axiom/Renderer/RendererFrameModule.cpp +++ b/Axiom/Renderer/RendererFrameModule.cpp @@ -3,17 +3,30 @@ #include "Core/Application.h" #include "Renderer/Renderer.h" +#include +#include + namespace Axiom { std::string_view RendererFrameModule::GetName() const { return "Core.RendererFrame"; } bool RendererFrameModule::Initialize(Application &App) { - (void)App; + m_UseFrameTaskGraph = App.IsFrameTaskGraphEnabled(); + ResetTaskGraph(); return true; } void RendererFrameModule::Update(const ModuleUpdateContext &Context) { + if (m_UseFrameTaskGraph) { + UpdateTaskGraph(Context); + return; + } + + UpdateSerial(Context); +} + +void RendererFrameModule::UpdateSerial(const ModuleUpdateContext &Context) { switch (Context.Phase) { case ModuleUpdatePhase::FrameStart: Context.App.GetRenderer().SetCpuFrameTime(Context.DeltaTimeSeconds * @@ -33,5 +46,45 @@ void RendererFrameModule::Update(const ModuleUpdateContext &Context) { } } -void RendererFrameModule::Shutdown(Application &App) { (void)App; } +void RendererFrameModule::UpdateTaskGraph(const ModuleUpdateContext &Context) { + Renderer &Renderer = Context.App.GetRenderer(); + switch (Context.Phase) { + case ModuleUpdatePhase::FrameStart: + ResetTaskGraph(); + Renderer.SetCpuFrameTime(Context.DeltaTimeSeconds * 1000.0f); + break; + case ModuleUpdatePhase::RenderBegin: + m_BeginFrameJob = Jobs::ScheduleJob([&Renderer]() { Renderer.BeginFrame(); }); + Jobs::Wait(m_BeginFrameJob); + break; + case ModuleUpdatePhase::Render: { + std::array Dependencies = {m_BeginFrameJob}; + m_RenderJob = Jobs::ScheduleJobAfter( + [&Renderer]() { Renderer.Render(); }, + std::span(Dependencies)); + break; + } + case ModuleUpdatePhase::ImGuiRender: + break; + case ModuleUpdatePhase::RenderEnd: { + std::array Dependencies = {m_RenderJob}; + m_EndFrameJob = Jobs::ScheduleJobAfter( + [&Renderer]() { Renderer.EndFrame(); }, + std::span(Dependencies)); + Jobs::Wait(m_EndFrameJob); + break; + } + } +} + +void RendererFrameModule::ResetTaskGraph() { + m_BeginFrameJob = {}; + m_RenderJob = {}; + m_EndFrameJob = {}; +} + +void RendererFrameModule::Shutdown(Application &App) { + (void)App; + ResetTaskGraph(); +} } // namespace Axiom diff --git a/Axiom/Renderer/RendererFrameModule.h b/Axiom/Renderer/RendererFrameModule.h index f60f97b..849e935 100644 --- a/Axiom/Renderer/RendererFrameModule.h +++ b/Axiom/Renderer/RendererFrameModule.h @@ -1,6 +1,7 @@ #pragma once #include "Core/IModule.h" +#include "Jobs/JobSystem.h" namespace Axiom { class RendererFrameModule final : public IModule { @@ -9,5 +10,15 @@ class RendererFrameModule final : public IModule { bool Initialize(Application &App) override; void Update(const ModuleUpdateContext &Context) override; void Shutdown(Application &App) override; + +private: + void UpdateSerial(const ModuleUpdateContext &Context); + void UpdateTaskGraph(const ModuleUpdateContext &Context); + void ResetTaskGraph(); + + Jobs::JobHandle m_BeginFrameJob; + Jobs::JobHandle m_RenderJob; + Jobs::JobHandle m_EndFrameJob; + bool m_UseFrameTaskGraph{false}; }; } // namespace Axiom diff --git a/Axiom/Renderer/RendererTypes.h b/Axiom/Renderer/RendererTypes.h index 26aa174..bf7814f 100644 --- a/Axiom/Renderer/RendererTypes.h +++ b/Axiom/Renderer/RendererTypes.h @@ -14,6 +14,12 @@ namespace Axiom { #ifndef AXIOM_THREADED_RENDER #define AXIOM_THREADED_RENDER 0 #endif +#ifndef AXIOM_PARALLEL_CULL +#define AXIOM_PARALLEL_CULL 0 +#endif +#ifndef AXIOM_VERIFY_PARALLEL_CULL +#define AXIOM_VERIFY_PARALLEL_CULL 0 +#endif enum class RendererBackendType : uint32_t { Vulkan = 0, @@ -37,6 +43,8 @@ struct RendererCreateInfo { uint32_t Width{0}; uint32_t Height{0}; bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0}; + bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; std::function ThreadedRenderSceneStartCallback; std::function ThreadedRenderSceneCompleteCallback; RendererBackendType BackendType{RendererBackendType::Vulkan}; diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp index 074533d..70ff15d 100644 --- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp +++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp @@ -6,6 +6,7 @@ #include "AxiomRHI/Vulkan/VulkanInitializers.h" #include "AxiomRHI/Vulkan/VulkanMesh.h" #include "AxiomRHI/Vulkan/VulkanRhiDevice.h" +#include "Jobs/JobSystem.h" #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include @@ -61,6 +63,8 @@ uint64_t PackTranslucentSortKey(float SortDepth, uint32_t SubmissionIndex) { void VulkanSceneRenderer::Init(IRHIDevice &Device, const RendererCreateInfo &CreateInfo) { m_Device = static_cast(&Device); + m_EnableParallelCull = CreateInfo.EnableParallelCull; + m_VerifyParallelCull = CreateInfo.VerifyParallelCull; if (m_Device != nullptr) { m_FrameOutput = CreateInfo.FrameOutput; m_Device->GetDrawSubmissionSystem().SetRecordPreparedScenePasses( @@ -238,35 +242,37 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) { auto &Candidates = m_CandidateScratch; auto &VisibleSubmissions = m_PreparedSceneState.VisibleSubmissions; + auto &CullInputs = m_CullInputScratch; Candidates.clear(); VisibleSubmissions.Clear(); + CullInputs.clear(); Candidates.reserve(SubmissionCount); + CullInputs.resize(SubmissionCount); VisibleSubmissions.OpaqueGraphics.reserve(SubmissionCount); VisibleSubmissions.TranslucentGraphics.reserve(SubmissionCount); VisibleSubmissions.Compute.reserve(SubmissionCount); for (size_t Index = 0; Index < SubmissionCount; ++Index) { const auto &Submission = Scene.Submissions[Index]; - VulkanMesh *VulkanMeshRef = m_Device->ResolveMeshHandle(Submission.MeshHandle); - if (VulkanMeshRef == nullptr) { - continue; - } + CullInputs[Index].Mesh = m_Device->ResolveMeshHandle(Submission.MeshHandle); + } - if (!m_PreparedSceneState.ForceWireframe && - !m_Device->GetOcclusionCulling().IsBoundsVisible( - m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, - VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { - ++FrameStats.FrustumCulledMeshCount; - continue; + size_t FrustumCulledCount = 0; + if (ShouldUseParallelCull(SubmissionCount)) { + FrustumCulledCount = BuildCullCandidatesParallel(Scene, CullInputs, Candidates); + if (m_VerifyParallelCull) { + auto &SerialCandidates = m_VerificationCandidateScratch; + const size_t SerialFrustumCulledCount = + BuildCullCandidatesSerial(Scene, CullInputs, SerialCandidates); + assert(SerialFrustumCulledCount == FrustumCulledCount && + "Parallel cull frustum count diverged from serial cull"); + assert(SerialCandidates == Candidates && + "Parallel cull candidates diverged from serial cull"); } - - const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); - const glm::vec3 Delta = WorldCenter - Scene.ActiveCamera->GetPosition(); - Candidates.push_back({.SubmissionIndex = static_cast(Index), - .MeshHandle = Submission.MeshHandle, - .Mesh = VulkanMeshRef, - .SortDepth = glm::dot(Delta, Delta)}); + } else { + FrustumCulledCount = BuildCullCandidatesSerial(Scene, CullInputs, Candidates); } + FrameStats.FrustumCulledMeshCount = static_cast(FrustumCulledCount); if (!m_PreparedSceneState.ForceWireframe) { std::sort(Candidates.begin(), Candidates.end(), @@ -333,6 +339,103 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) { PrepareGraphicsMaterialDescriptors(); } +size_t VulkanSceneRenderer::BuildCullCandidatesSerial( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const { + Candidates.clear(); + Candidates.reserve(Inputs.size()); + + size_t FrustumCulledCount = 0; + const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition(); + for (size_t Index = 0; Index < Inputs.size(); ++Index) { + const auto &Submission = Scene.Submissions[Index]; + VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh; + if (VulkanMeshRef == nullptr) { + continue; + } + + if (!m_PreparedSceneState.ForceWireframe && + !m_Device->GetOcclusionCulling().IsBoundsVisible( + m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, + VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { + ++FrustumCulledCount; + continue; + } + + const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); + const glm::vec3 Delta = WorldCenter - CameraPosition; + Candidates.push_back({.SubmissionIndex = static_cast(Index), + .MeshHandle = Submission.MeshHandle, + .Mesh = VulkanMeshRef, + .SortDepth = glm::dot(Delta, Delta)}); + } + + return FrustumCulledCount; +} + +size_t VulkanSceneRenderer::BuildCullCandidatesParallel( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const { + Candidates.clear(); + Candidates.reserve(Inputs.size()); + + const unsigned HardwareThreads = std::max(1u, std::thread::hardware_concurrency()); + const size_t BucketCount = + std::min(Inputs.size(), std::max(1u, HardwareThreads)); + const size_t BucketSize = (Inputs.size() + BucketCount - 1u) / BucketCount; + const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition(); + + struct CandidateBucket { + std::vector Candidates; + size_t FrustumCulledCount{0}; + }; + std::vector Buckets(BucketCount); + + Jobs::ParallelFor(BucketCount, [&](size_t BucketIndex) { + const size_t Begin = BucketIndex * BucketSize; + const size_t End = std::min(Inputs.size(), Begin + BucketSize); + CandidateBucket &Bucket = Buckets[BucketIndex]; + Bucket.Candidates.reserve(End - Begin); + + for (size_t Index = Begin; Index < End; ++Index) { + const auto &Submission = Scene.Submissions[Index]; + VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh; + if (VulkanMeshRef == nullptr) { + continue; + } + + if (!m_PreparedSceneState.ForceWireframe && + !m_Device->GetOcclusionCulling().IsBoundsVisible( + m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform, + VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) { + ++Bucket.FrustumCulledCount; + continue; + } + + const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef); + const glm::vec3 Delta = WorldCenter - CameraPosition; + Bucket.Candidates.push_back({.SubmissionIndex = static_cast(Index), + .MeshHandle = Submission.MeshHandle, + .Mesh = VulkanMeshRef, + .SortDepth = glm::dot(Delta, Delta)}); + } + }); + + size_t FrustumCulledCount = 0; + for (const CandidateBucket &Bucket : Buckets) { + FrustumCulledCount += Bucket.FrustumCulledCount; + Candidates.insert(Candidates.end(), Bucket.Candidates.begin(), + Bucket.Candidates.end()); + } + + return FrustumCulledCount; +} + +bool VulkanSceneRenderer::ShouldUseParallelCull(size_t SubmissionCount) const { + constexpr size_t kParallelCullSubmissionThreshold = 512; + return m_EnableParallelCull && SubmissionCount >= kParallelCullSubmissionThreshold; +} + void VulkanSceneRenderer::RecordBackground() { QueueScenePass(ScenePassPrimitive::Background); } diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h index 60c8c1a..7e1545a 100644 --- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h +++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h @@ -4,6 +4,7 @@ #include "AxiomRHI/Vulkan/VulkanRendererTypes.h" #include +#include #include namespace Axiom { @@ -48,6 +49,12 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { MeshHandle MeshHandle{}; VulkanMesh *Mesh{nullptr}; float SortDepth{0.0f}; + + constexpr bool operator==(const CandidateSubmission &) const = default; + }; + + struct SubmissionCullInput { + VulkanMesh *Mesh{nullptr}; }; struct PreparedSceneState { @@ -60,6 +67,13 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { }; void PrepareSceneFrame(RenderScene &Scene); + size_t BuildCullCandidatesSerial( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const; + size_t BuildCullCandidatesParallel( + const RenderScene &Scene, std::span Inputs, + std::vector &Candidates) const; + bool ShouldUseParallelCull(size_t SubmissionCount) const; void RecordBackground(); void RecordDepthPrepass(); void BuildHzb(); @@ -107,7 +121,11 @@ class VulkanSceneRenderer final : public ISceneRendererBackend { IViewportFrameOutput *m_FrameOutput{nullptr}; PreparedSceneState m_PreparedSceneState{}; std::vector m_CandidateScratch; + std::vector m_VerificationCandidateScratch; + std::vector m_CullInputScratch; std::vector m_QueuedScenePasses; + bool m_EnableParallelCull{AXIOM_PARALLEL_CULL != 0}; + bool m_VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0}; VkImageLayout m_SceneDrawImageLayout{VK_IMAGE_LAYOUT_UNDEFINED}; VkImageLayout m_SceneRasterDepthLayout{VK_IMAGE_LAYOUT_UNDEFINED}; }; diff --git a/CMakeLists.txt b/CMakeLists.txt index b2e02f9..4e6a318 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,12 @@ option(AXIOM_ENABLE_PHYSICS "Enable the JoltPhysics runtime simulation seam" ON) option(AXIOM_THREADED_RENDER "Enable the experimental threaded renderer and worker job system" OFF) +option(AXIOM_PARALLEL_CULL + "Enable guarded parallel CPU culling in the Vulkan scene renderer" ON) +option(AXIOM_VERIFY_PARALLEL_CULL + "Verify parallel CPU culling against the serial path in debug/test builds" OFF) +option(AXIOM_FRAME_TASK_GRAPH + "Enable the minimal renderer frame task graph backed by enkiTS jobs" OFF) option(AXIOM_ENABLE_TSAN "Build with ThreadSanitizer instrumentation" OFF) set(AXIOM_WEBRTC_FRAMEWORK_PATH "" CACHE PATH diff --git a/FRAME_GRAPH.md b/FRAME_GRAPH.md new file mode 100644 index 0000000..bb6ba7b --- /dev/null +++ b/FRAME_GRAPH.md @@ -0,0 +1,51 @@ +# Minimal Frame Task Graph + +This is the first low-risk frame graph for WraithEngine. The existing +`ModuleManager` phase walk remains authoritative for modules that have not been +ported. The renderer frame module is the only ported module. + +## Current Stages + +```mermaid +flowchart LR + Simulate["simulate: FrameStart modules"] --> Begin["begin: Renderer::BeginFrame"] + Begin --> RecordCommands["legacy Render phase modules submit RenderCommand data"] + RecordCommands --> CullRecord["cull + record: Renderer::Render"] + CullRecord --> Submit["submit: Renderer::EndFrame"] +``` + +`BeginFrame` is scheduled through `Jobs::ScheduleJob` and then waited at the end +of `RenderBegin`, because render-command producers still run later in the +regular `Render` phase and require an open `RenderScene`. + +`Renderer::Render` is scheduled after the begin job. It owns the Vulkan scene +preparation work, including the guarded parallel CPU cull path. `Renderer::EndFrame` +is scheduled with `Jobs::ScheduleJobAfter` after render and waited before +`Application::Step` returns. + +## Determinism Rules + +- CPU cull resolves mesh handles serially into an immutable per-frame snapshot. +- Parallel cull workers write only to range-local candidate buckets. +- Buckets are merged in ascending input-range order before the existing sort and + visible-list classification. +- `AXIOM_VERIFY_PARALLEL_CULL=ON` reruns the serial cull builder and asserts + that candidate lists and frustum-cull counts match before downstream work. + +## Config Flags + +- `AXIOM_PARALLEL_CULL`: enables the Vulkan parallel CPU cull path. +- `AXIOM_VERIFY_PARALLEL_CULL`: compares parallel cull output against serial + output. +- `AXIOM_FRAME_TASK_GRAPH`: enables the renderer frame module job chain. + +Each flag is also exposed through `ApplicationConfig` / `RendererCreateInfo` so +tests and hosts can force a specific mode without changing global build flags. + +## Next Ports + +Future ports should move command-producing modules onto explicit producer tasks +instead of relying on `RenderCommand` global scene state. Once those producers +return immutable render packets, `BeginFrame` no longer needs to be synchronized +before the `Render` phase, and the graph can express true +`simulate -> cull(parallel) -> record -> submit` overlap. diff --git a/Tests/RenderSubmissionTests.cpp b/Tests/RenderSubmissionTests.cpp index 8b91c92..eed409e 100644 --- a/Tests/RenderSubmissionTests.cpp +++ b/Tests/RenderSubmissionTests.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -107,6 +108,12 @@ Axiom::MeshData MakeTriangleMesh() { .BoundsMax = {0.25f, 0.25f, 0.0f}, }; } + +class ScopedJobSystem { +public: + ScopedJobSystem() { Axiom::Jobs::Startup(); } + ~ScopedJobSystem() { Axiom::Jobs::Shutdown(); } +}; } // namespace TEST(RenderSubmissionTests, EditorSceneRendererAdapterReusesCachedMeshUntilAssetChanges) { @@ -166,22 +173,25 @@ TEST(RenderSubmissionTests, EditorSceneRendererAdapterDropsDeletedObjectsFromCac } TEST(RenderSubmissionTests, - VulkanRendererRendersAllFiveThousandSubmittedMeshesOffscreen) { + VulkanRendererRendersAllTenThousandSubmittedMeshesOffscreen) { constexpr uint32_t Width = 1280; constexpr uint32_t Height = 720; - constexpr size_t MeshCount = 5000; + constexpr size_t MeshCount = 10000; EnsureLoggingInitialized(); if (!Axiom::CanInitializeHeadlessVulkan()) { GTEST_SKIP() << "Headless Vulkan is unavailable on this host"; } + ScopedJobSystem Jobs; auto Surface = std::make_shared(Width, Height); Axiom::Renderer Renderer; Renderer.Init({ .TargetSurface = Surface, .Width = Width, .Height = Height, + .EnableParallelCull = true, + .VerifyParallelCull = true, }); Renderer.SetViewMode(Axiom::RendererViewMode::Wireframe); From f3ffb1c7fb69219ad224f9e8c3d9bae22f4bcdea Mon Sep 17 00:00:00 2001 From: Tamely Date: Sun, 7 Jun 2026 10:57:48 -0500 Subject: [PATCH 2/4] Fix JobSystem hot-path allocations --- Axiom/Jobs/JobSystem.cpp | 161 +++++++++++++++++++++++++++++++-------- Axiom/Jobs/JobSystem.h | 148 ++++++++++++++++++++++++++++++++++- Tests/ThreadingTests.cpp | 92 ++++++++++++++++++++++ 3 files changed, 364 insertions(+), 37 deletions(-) diff --git a/Axiom/Jobs/JobSystem.cpp b/Axiom/Jobs/JobSystem.cpp index 16e674b..6ba31c4 100644 --- a/Axiom/Jobs/JobSystem.cpp +++ b/Axiom/Jobs/JobSystem.cpp @@ -3,7 +3,10 @@ #include "Core/Threading.h" #include "Jobs/TaskScheduler.h" +#include #include +#include +#include #include #include #include @@ -11,23 +14,37 @@ #include namespace Axiom::Jobs { -struct JobState { - std::unique_ptr Task; - std::vector DependencyHandles; -}; - namespace { +constexpr size_t kJobPoolCapacity = 131072; +constexpr size_t kMaxInlineDependencies = 16; + void OnWorkerThreadStart(uint32_t ThreadNum); class LambdaTaskSet final : public enki::ITaskSet { public: - explicit LambdaTaskSet(JobFn Function) - : enki::ITaskSet(1), m_Function(std::move(Function)) {} + LambdaTaskSet() : enki::ITaskSet(1) {} + + void Reset(JobFn Function, std::span Dependencies) { + m_SetSize = 1; + m_MinRange = 1; + m_Function = std::move(Function); + m_DependencyCount = std::min(Dependencies.size(), m_Dependencies.size()); + for (size_t Index = 0; Index < m_DependencyCount; ++Index) { + m_Dependencies[Index] = Dependencies[Index]; + } + } - void ExecuteRange(enki::TaskSetPartition, uint32_t) override { m_Function(); } + void Clear() { + m_Function.Reset(); + m_DependencyCount = 0; + } + + void ExecuteRange(enki::TaskSetPartition, uint32_t) override; private: JobFn m_Function; + std::array m_Dependencies; + size_t m_DependencyCount{0}; }; class ParallelForTaskSet final : public enki::ITaskSet { @@ -51,6 +68,16 @@ class ParallelForTaskSet final : public enki::ITaskSet { ParallelForFn m_Function; }; +} // namespace + +struct JobState { + LambdaTaskSet Task; + std::atomic Generation{0}; + std::atomic Recycled{true}; +}; + +namespace { + class JobSystem { public: void Startup() { @@ -62,6 +89,7 @@ class JobSystem { Config.profilerCallbacks.threadStart = &OnWorkerThreadStart; Config.numExternalTaskThreads = 4; m_Scheduler->Initialize(Config); + ResetPool(); } } @@ -75,6 +103,7 @@ class JobSystem { if (m_StartupCount == 0 && m_Scheduler != nullptr) { m_Scheduler->WaitforAllAndShutdown(); m_Scheduler.reset(); + ResetPool(); } } @@ -84,10 +113,14 @@ class JobSystem { return {}; } - auto State = std::make_shared(); - State->Task = std::make_unique(std::move(Function)); - m_Scheduler->AddTaskSetToPipe(State->Task.get()); - return {.State = std::move(State)}; + JobHandle Handle = AcquireTask(Function, {}); + if (!Handle.IsValid()) { + Function(); + return {}; + } + + m_Scheduler->AddTaskSetToPipe(&Handle.State->Task); + return Handle; } JobHandle ScheduleJobAfter(JobFn Function, std::span Deps) { @@ -99,37 +132,36 @@ class JobSystem { return {}; } - auto State = std::make_shared(); - State->DependencyHandles.reserve(Deps.size()); - for (const JobHandle &Dependency : Deps) { - if (!Dependency.IsValid() || Dependency.State->Task == nullptr) { - continue; + if (Deps.size() > kMaxInlineDependencies) { + for (size_t Index = kMaxInlineDependencies; Index < Deps.size(); + ++Index) { + Wait(Deps[Index]); } - State->DependencyHandles.push_back(Dependency); } - State->Task = std::make_unique( - [this, State, Function = std::move(Function)]() mutable { - for (const JobHandle &Dependency : State->DependencyHandles) { - if (!Dependency.IsValid() || Dependency.State->Task == nullptr) { - continue; - } - - m_Scheduler->WaitforTask(Dependency.State->Task.get()); - } + const size_t InlineDependencyCount = + std::min(Deps.size(), kMaxInlineDependencies); + JobHandle Handle = + AcquireTask(Function, Deps.first(InlineDependencyCount)); + if (!Handle.IsValid()) { + for (size_t Index = 0; Index < InlineDependencyCount; ++Index) { + Wait(Deps[Index]); + } + Function(); + return {}; + } - Function(); - }); - m_Scheduler->AddTaskSetToPipe(State->Task.get()); - return {.State = std::move(State)}; + m_Scheduler->AddTaskSetToPipe(&Handle.State->Task); + return Handle; } void Wait(JobHandle Handle) { - if (!Handle.IsValid() || Handle.State->Task == nullptr) { + if (m_Scheduler == nullptr || !IsCurrent(Handle)) { return; } - m_Scheduler->WaitforTask(Handle.State->Task.get()); + m_Scheduler->WaitforTask(&Handle.State->Task); + ReleaseTask(Handle); } void ParallelFor(size_t Count, ParallelForFn Function) { @@ -151,8 +183,63 @@ class JobSystem { private: std::mutex m_Mutex; std::unique_ptr m_Scheduler; + std::unique_ptr m_TaskPool; + std::vector m_FreeList; size_t m_StartupCount{0}; + void ResetPool() { + if (m_TaskPool == nullptr) { + m_TaskPool = std::make_unique(kJobPoolCapacity); + } + + m_FreeList.reserve(kJobPoolCapacity); + m_FreeList.clear(); + for (size_t Index = 0; Index < kJobPoolCapacity; ++Index) { + JobState &State = m_TaskPool[Index]; + State.Task.Clear(); + State.Generation.fetch_add(1, std::memory_order_relaxed); + State.Recycled.store(true, std::memory_order_relaxed); + m_FreeList.push_back(&State); + } + } + + JobHandle AcquireTask(JobFn &Function, std::span Deps) { + std::scoped_lock Lock(m_Mutex); + if (m_TaskPool == nullptr || m_FreeList.empty()) { + return {}; + } + + JobState *State = m_FreeList.back(); + m_FreeList.pop_back(); + State->Recycled.store(false, std::memory_order_relaxed); + const uint32_t Generation = + State->Generation.fetch_add(1, std::memory_order_relaxed) + 1; + State->Task.Reset(std::move(Function), Deps); + return {.State = State, .Generation = Generation}; + } + + void ReleaseTask(JobHandle Handle) { + JobState *State = Handle.State; + if (State == nullptr || + State->Generation.load(std::memory_order_acquire) != + Handle.Generation || + State->Recycled.exchange(true, std::memory_order_acq_rel)) { + return; + } + + State->Task.Clear(); + + std::scoped_lock Lock(m_Mutex); + m_FreeList.push_back(State); + } + + bool IsCurrent(JobHandle Handle) const { + return Handle.State != nullptr && + Handle.State->Generation.load(std::memory_order_acquire) == + Handle.Generation && + !Handle.State->Recycled.load(std::memory_order_acquire); + } + bool CanUseScheduler() { if (m_Scheduler == nullptr) { return false; @@ -173,6 +260,14 @@ void OnWorkerThreadStart(uint32_t ThreadNum) { Threading::SetCurrentThreadName("Axiom Job Worker " + std::to_string(ThreadNum)); } + +void LambdaTaskSet::ExecuteRange(enki::TaskSetPartition, uint32_t) { + for (size_t Index = 0; Index < m_DependencyCount; ++Index) { + GetJobSystem().Wait(m_Dependencies[Index]); + } + + m_Function(); +} } // namespace void Startup() { GetJobSystem().Startup(); } diff --git a/Axiom/Jobs/JobSystem.h b/Axiom/Jobs/JobSystem.h index 50c9212..0cfa9f8 100644 --- a/Axiom/Jobs/JobSystem.h +++ b/Axiom/Jobs/JobSystem.h @@ -1,20 +1,160 @@ #pragma once #include +#include #include -#include +#include #include +#include +#include namespace Axiom::Jobs { -using JobFn = std::function; -using ParallelForFn = std::function; +namespace Detail { +template class SmallFunction; + +template +class SmallFunction { +public: + SmallFunction() = default; + SmallFunction(std::nullptr_t) noexcept {} + + template , + std::enable_if_t, int> = 0> + SmallFunction(Callable &&Function) { + Emplace(std::forward(Function)); + } + + SmallFunction(const SmallFunction &Other) { CopyFrom(Other); } + + SmallFunction(SmallFunction &&Other) noexcept { MoveFrom(std::move(Other)); } + + SmallFunction &operator=(const SmallFunction &Other) { + if (this != &Other) { + Reset(); + CopyFrom(Other); + } + return *this; + } + + SmallFunction &operator=(SmallFunction &&Other) noexcept { + if (this != &Other) { + Reset(); + MoveFrom(std::move(Other)); + } + return *this; + } + + SmallFunction &operator=(std::nullptr_t) noexcept { + Reset(); + return *this; + } + + template , + std::enable_if_t, int> = 0> + SmallFunction &operator=(Callable &&Function) { + Reset(); + Emplace(std::forward(Function)); + return *this; + } + + ~SmallFunction() { Reset(); } + + explicit operator bool() const noexcept { return m_Invoke != nullptr; } + + R operator()(Args... Arguments) { + if (m_Invoke == nullptr) { + throw std::bad_function_call(); + } + return m_Invoke(&m_Storage, std::forward(Arguments)...); + } + + void Reset() noexcept { + if (m_Destroy != nullptr) { + m_Destroy(&m_Storage); + } + m_Invoke = nullptr; + m_Destroy = nullptr; + m_Copy = nullptr; + m_Move = nullptr; + } + +private: + using Storage = std::aligned_storage_t; + + template void Emplace(Callable &&Function) { + using Stored = std::decay_t; + static_assert(sizeof(Stored) <= StorageSize, + "Job callable capture is too large for inline storage"); + static_assert(alignof(Stored) <= alignof(Storage), + "Job callable alignment is too large for inline storage"); + static_assert(std::is_copy_constructible_v, + "Job callables must be copy constructible"); + + new (&m_Storage) Stored(std::forward(Function)); + m_Invoke = [](void *StoragePtr, Args... Arguments) -> R { + return (*std::launder(reinterpret_cast(StoragePtr)))( + std::forward(Arguments)...); + }; + m_Destroy = [](void *StoragePtr) noexcept { + std::launder(reinterpret_cast(StoragePtr))->~Stored(); + }; + m_Copy = [](void *Destination, const void *Source) { + new (Destination) + Stored(*std::launder(reinterpret_cast(Source))); + }; + m_Move = [](void *Destination, void *Source) noexcept { + new (Destination) + Stored(std::move(*std::launder(reinterpret_cast(Source)))); + std::launder(reinterpret_cast(Source))->~Stored(); + }; + } + + void CopyFrom(const SmallFunction &Other) { + if (Other.m_Invoke == nullptr) { + return; + } + Other.m_Copy(&m_Storage, &Other.m_Storage); + m_Invoke = Other.m_Invoke; + m_Destroy = Other.m_Destroy; + m_Copy = Other.m_Copy; + m_Move = Other.m_Move; + } + + void MoveFrom(SmallFunction &&Other) noexcept { + if (Other.m_Invoke == nullptr) { + return; + } + Other.m_Move(&m_Storage, &Other.m_Storage); + m_Invoke = Other.m_Invoke; + m_Destroy = Other.m_Destroy; + m_Copy = Other.m_Copy; + m_Move = Other.m_Move; + Other.m_Invoke = nullptr; + Other.m_Destroy = nullptr; + Other.m_Copy = nullptr; + Other.m_Move = nullptr; + } + + Storage m_Storage; + R (*m_Invoke)(void *, Args...) = nullptr; + void (*m_Destroy)(void *) noexcept = nullptr; + void (*m_Copy)(void *, const void *) = nullptr; + void (*m_Move)(void *, void *) noexcept = nullptr; +}; +} // namespace Detail + +using JobFn = Detail::SmallFunction; +using ParallelForFn = Detail::SmallFunction; struct JobState; struct JobHandle { [[nodiscard]] bool IsValid() const noexcept { return State != nullptr; } - std::shared_ptr State; + JobState *State = nullptr; + uint32_t Generation = 0; }; void Startup(); diff --git a/Tests/ThreadingTests.cpp b/Tests/ThreadingTests.cpp index 793f2ce..3bea6f1 100644 --- a/Tests/ThreadingTests.cpp +++ b/Tests/ThreadingTests.cpp @@ -16,11 +16,16 @@ #include #include #include +#include #include +#include #include #include namespace { +std::atomic g_CountAllocations{false}; +std::atomic g_AllocationCount{0}; + void EnsureLoggingInitialized() { static bool Initialized = false; if (!Initialized) { @@ -60,6 +65,65 @@ Axiom::CommandContext MakeContext(uint64_t FrameIndex = 1, } } // namespace +void *operator new(std::size_t Size) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + if (void *Pointer = std::malloc(Size)) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new[](std::size_t Size) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + if (void *Pointer = std::malloc(Size)) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new(std::size_t Size, std::align_val_t Alignment) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + void *Pointer = nullptr; + if (posix_memalign(&Pointer, static_cast(Alignment), Size) == 0) { + return Pointer; + } + throw std::bad_alloc(); +} + +void *operator new[](std::size_t Size, std::align_val_t Alignment) { + if (g_CountAllocations.load(std::memory_order_relaxed)) { + g_AllocationCount.fetch_add(1, std::memory_order_relaxed); + } + void *Pointer = nullptr; + if (posix_memalign(&Pointer, static_cast(Alignment), Size) == 0) { + return Pointer; + } + throw std::bad_alloc(); +} + +void operator delete(void *Pointer) noexcept { std::free(Pointer); } +void operator delete[](void *Pointer) noexcept { std::free(Pointer); } +void operator delete(void *Pointer, std::size_t) noexcept { std::free(Pointer); } +void operator delete[](void *Pointer, std::size_t) noexcept { std::free(Pointer); } +void operator delete(void *Pointer, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete[](void *Pointer, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete(void *Pointer, std::size_t, std::align_val_t) noexcept { + std::free(Pointer); +} +void operator delete[](void *Pointer, std::size_t, std::align_val_t) noexcept { + std::free(Pointer); +} + TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) { Axiom::Jobs::Startup(); @@ -85,6 +149,34 @@ TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) { Axiom::Jobs::Shutdown(); } +TEST(ThreadingTests, SchedulingTrivialJobsDoesNotAllocatePerJob) { + constexpr size_t JobCount = 100000; + static std::array Handles; + + Axiom::Jobs::Startup(); + + Axiom::Jobs::JobHandle Warmup = Axiom::Jobs::ScheduleJob([]() {}); + Axiom::Jobs::Wait(Warmup); + + g_AllocationCount.store(0, std::memory_order_relaxed); + g_CountAllocations.store(true, std::memory_order_release); + + for (Axiom::Jobs::JobHandle &Handle : Handles) { + Handle = Axiom::Jobs::ScheduleJob([]() {}); + } + for (Axiom::Jobs::JobHandle Handle : Handles) { + Axiom::Jobs::Wait(Handle); + } + + g_CountAllocations.store(false, std::memory_order_release); + const size_t AllocationCount = + g_AllocationCount.load(std::memory_order_relaxed); + + Axiom::Jobs::Shutdown(); + + EXPECT_EQ(AllocationCount, 0u); +} + TEST(ThreadingTests, ThreadedRendererRunsHeadlessForThousandFramesWithoutDeadlock) { #if AXIOM_THREADED_RENDER == 0 GTEST_SKIP() << "Threaded renderer is disabled in this build"; From 423efafdf500843022eab392f93ebd9e3530fcf6 Mon Sep 17 00:00:00 2001 From: Tamely Date: Sun, 7 Jun 2026 11:03:42 -0500 Subject: [PATCH 3/4] Compile with the most performant options by default --- Axiom/CMakeLists.txt | 9 +--- CMakeLists.txt | 6 ++- CMakePresets.json | 9 ++-- Editor/CMakeLists.txt | 2 + HAL/CMakeLists.txt | 2 + Headless/CMakeLists.txt | 12 +++++ README.md | 6 +++ Tests/CMakeLists.txt | 2 + cmake/AxiomCompilerOptions.cmake | 78 ++++++++++++++++++++++++++++++++ 9 files changed, 114 insertions(+), 12 deletions(-) create mode 100644 cmake/AxiomCompilerOptions.cmake diff --git a/Axiom/CMakeLists.txt b/Axiom/CMakeLists.txt index 7bc7fed..0e4f8bc 100644 --- a/Axiom/CMakeLists.txt +++ b/Axiom/CMakeLists.txt @@ -373,13 +373,7 @@ function(axiom_configure_module_target target_name) target_link_options(${target_name} PUBLIC ${AXIOM_WEBRTC_LINK_OPTIONS}) endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") - target_compile_options(${target_name} PRIVATE - $<$:-O3 -march=native> - $<$:-O2 -g -march=native> - $<$:-Os> - ) - endif() + axiom_apply_performance_options(${target_name}) if(AXIOM_ENABLE_TSAN) if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") @@ -553,6 +547,7 @@ if(AXIOM_ENABLE_TSAN) -fno-omit-frame-pointer ) endif() +axiom_apply_performance_options(AxiomRendererVulkanObjects) if(AXIOM_ENABLE_SCRIPTING) set(AXIOM_CORAL_MANAGED_DIR diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e6a318..746ecc2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,8 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +include(cmake/AxiomCompilerOptions.cmake) + option(BUILD_TESTING "Build tests" OFF) option(AXIOM_ENABLE_SCRIPTING "Enable the Coral C# scripting host" OFF) @@ -18,13 +20,13 @@ option(AXIOM_ENABLE_WEBRTC option(AXIOM_ENABLE_PHYSICS "Enable the JoltPhysics runtime simulation seam" ON) option(AXIOM_THREADED_RENDER - "Enable the experimental threaded renderer and worker job system" OFF) + "Enable the threaded renderer and worker job system" ON) option(AXIOM_PARALLEL_CULL "Enable guarded parallel CPU culling in the Vulkan scene renderer" ON) option(AXIOM_VERIFY_PARALLEL_CULL "Verify parallel CPU culling against the serial path in debug/test builds" OFF) option(AXIOM_FRAME_TASK_GRAPH - "Enable the minimal renderer frame task graph backed by enkiTS jobs" OFF) + "Enable the minimal renderer frame task graph backed by enkiTS jobs" ON) option(AXIOM_ENABLE_TSAN "Build with ThreadSanitizer instrumentation" OFF) set(AXIOM_WEBRTC_FRAMEWORK_PATH "" CACHE PATH diff --git a/CMakePresets.json b/CMakePresets.json index a1a0026..0c2147a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -39,17 +39,20 @@ { "name": "debug", "displayName": "Build Debug", - "configurePreset": "debug" + "configurePreset": "debug", + "jobs": 0 }, { "name": "release", "displayName": "Build Release", - "configurePreset": "release" + "configurePreset": "release", + "jobs": 0 }, { "name": "minsizerel", "displayName": "Build MinSizeRel", - "configurePreset": "minsizerel" + "configurePreset": "minsizerel", + "jobs": 0 } ] } diff --git a/Editor/CMakeLists.txt b/Editor/CMakeLists.txt index 8bc705b..071b0fa 100644 --- a/Editor/CMakeLists.txt +++ b/Editor/CMakeLists.txt @@ -4,6 +4,8 @@ add_executable(AxiomEditor GlfwEditorModule.cpp ) +axiom_apply_performance_options(AxiomEditor) + target_link_libraries(AxiomEditor PRIVATE AxiomCore) target_link_libraries(AxiomEditor PRIVATE AxiomScene diff --git a/HAL/CMakeLists.txt b/HAL/CMakeLists.txt index f4fb0d0..df20707 100644 --- a/HAL/CMakeLists.txt +++ b/HAL/CMakeLists.txt @@ -16,6 +16,8 @@ if(APPLE) ) endif() +axiom_apply_performance_options(AxiomHAL) + target_include_directories(AxiomHAL PUBLIC "${CMAKE_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/Axiom" diff --git a/Headless/CMakeLists.txt b/Headless/CMakeLists.txt index 43d983c..4dabd7f 100644 --- a/Headless/CMakeLists.txt +++ b/Headless/CMakeLists.txt @@ -7,6 +7,8 @@ add_library(uSockets STATIC "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src/eventing/epoll_kqueue.c" ) +axiom_apply_performance_options(uSockets) + target_include_directories(uSockets PUBLIC "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src" ) @@ -31,6 +33,8 @@ add_library(WraithNetworking STATIC WraithNetworkingModule.cpp ) +axiom_apply_performance_options(WraithNetworking) + target_include_directories(WraithNetworking PUBLIC "${CMAKE_SOURCE_DIR}/Headless" "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" @@ -66,6 +70,8 @@ add_executable(AxiomHeadless WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomHeadless) + add_executable(AxiomRemoteViewportDevClient DevRemoteViewportClient.cpp HeadlessCommandProtocol.cpp @@ -76,6 +82,8 @@ add_executable(AxiomRemoteViewportDevClient WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomRemoteViewportDevClient) + target_include_directories(AxiomHeadless PRIVATE "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" ) @@ -121,6 +129,8 @@ add_executable(AxiomRemoteViewportServer WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomRemoteViewportServer) + add_executable(AxiomPackagedRuntime AxiomPackagedRuntime.cpp HeadlessOverlayModule.cpp @@ -129,6 +139,8 @@ add_executable(AxiomPackagedRuntime HeadlessSessionModule.cpp ) +axiom_apply_performance_options(AxiomPackagedRuntime) + target_include_directories(AxiomRemoteViewportServer PRIVATE "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps" ) diff --git a/README.md b/README.md index 9ec578f..9517558 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,12 @@ cmake --build build/release | `AXIOM_SCRIPTING_TRUST_DEFAULT` | `STRING` | `Restricted` | Default sandbox tier for user scripts. `Restricted` (hosted — blocks `System.Net.*`, `System.Reflection.Emit`, etc.) or `Trusted` (local dev — full BCL access) | | `AXIOM_ENABLE_WEBRTC` | `BOOL` | `OFF` | Enable the macOS WebRTC transport | | `AXIOM_ENABLE_PHYSICS` | `BOOL` | `ON` | Enable the JoltPhysics runtime simulation seam | +| `AXIOM_THREADED_RENDER` | `BOOL` | `ON` | Enable the threaded renderer and worker job system | +| `AXIOM_PARALLEL_CULL` | `BOOL` | `ON` | Enable guarded parallel CPU culling in the Vulkan scene renderer | +| `AXIOM_FRAME_TASK_GRAPH` | `BOOL` | `ON` | Enable the renderer frame task graph backed by enkiTS jobs | +| `AXIOM_ENABLE_PERFORMANCE_DEFAULTS` | `BOOL` | `ON` | Apply optimized compile/link defaults to first-party targets | +| `AXIOM_OPTIMIZE_FOR_NATIVE_ARCH` | `BOOL` | `ON` | Tune optimized builds for the host CPU architecture | +| `AXIOM_ENABLE_IPO` | `BOOL` | `ON` | Enable interprocedural optimization/LTO for optimized builds when supported | | `AXIOM_WEBRTC_FRAMEWORK_PATH` | `PATH` | _(empty)_ | Path to a `WebRTC.framework` bundle (macOS framework variant) | | `AXIOM_WEBRTC_LIBRARY_PATH` | `FILEPATH` | _(empty)_ | Path to a `libwebrtc` static/shared binary (non-framework variant) | | `AXIOM_WEBRTC_INCLUDE_DIR` | `PATH` | _(empty)_ | Include directory for the non-framework libwebrtc variant | diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index f83ba47..cca09e3 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -19,6 +19,8 @@ add_executable(AxiomTests ../Headless/WebRtcSession.cpp ) +axiom_apply_performance_options(AxiomTests) + if(APPLE AND EXISTS "${CMAKE_SOURCE_DIR}/Headless/MacOSWebRtcSession.mm") target_sources(AxiomTests PRIVATE ../Headless/MacOSWebRtcSession.mm diff --git a/cmake/AxiomCompilerOptions.cmake b/cmake/AxiomCompilerOptions.cmake new file mode 100644 index 0000000..5f5809d --- /dev/null +++ b/cmake/AxiomCompilerOptions.cmake @@ -0,0 +1,78 @@ +include(CheckIPOSupported) + +if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Build type for single-configuration generators" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + Debug Release RelWithDebInfo MinSizeRel) +endif() + +option(AXIOM_ENABLE_PERFORMANCE_DEFAULTS + "Use high-performance compile defaults for first-party targets" ON) +option(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH + "Tune optimized builds for the host CPU architecture" ON) +option(AXIOM_ENABLE_IPO + "Enable interprocedural optimization/LTO for optimized builds" ON) + +set(AXIOM_IPO_SUPPORTED OFF) +if(AXIOM_ENABLE_IPO) + check_ipo_supported(RESULT AXIOM_IPO_SUPPORTED OUTPUT AXIOM_IPO_OUTPUT) + if(NOT AXIOM_IPO_SUPPORTED) + message(WARNING + "IPO/LTO was requested but is not supported by this toolchain: " + "${AXIOM_IPO_OUTPUT}") + endif() +endif() + +function(axiom_apply_performance_options target_name) + if(NOT AXIOM_ENABLE_PERFORMANCE_DEFAULTS) + return() + endif() + + if(NOT TARGET ${target_name}) + message(FATAL_ERROR + "axiom_apply_performance_options called for missing target: ${target_name}") + endif() + + get_target_property(AXIOM_TARGET_TYPE ${target_name} TYPE) + if(AXIOM_TARGET_TYPE STREQUAL "INTERFACE_LIBRARY" OR + AXIOM_TARGET_TYPE STREQUAL "UTILITY") + return() + endif() + + if(MSVC) + target_compile_options(${target_name} PRIVATE + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + $<$:/O2 /Ob3 /Oi /Ot /Gy /Gw> + ) + target_link_options(${target_name} PRIVATE + $<$:/OPT:REF /OPT:ICF> + $<$:/OPT:REF /OPT:ICF> + $<$:/OPT:REF /OPT:ICF> + ) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + target_compile_options(${target_name} PRIVATE + $<$:-O3> + $<$:-O3> + $<$:-Os> + ) + + if(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH) + target_compile_options(${target_name} PRIVATE + $<$:-march=native> + $<$:-march=native> + $<$:-march=native> + ) + endif() + endif() + + if(AXIOM_IPO_SUPPORTED) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE) + set_property(TARGET ${target_name} PROPERTY + INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE) + endif() +endfunction() From 01e5e84ff3e8bc8391990233b9e270a70538707f Mon Sep 17 00:00:00 2001 From: Tamely Date: Sun, 7 Jun 2026 11:16:36 -0500 Subject: [PATCH 4/4] Update readme and docs --- Content/Cooked/sponza_atrium_3.wmesh | Bin 9323716 -> 9323716 bytes Docs/DistributedWraithEngineDesign.md | 11 ++++--- Docs/HeadlessAxiomSessionPrototype.md | 17 +++++++--- Docs/HeadlessScalabilityBaseline.md | 14 ++++++-- Docs/WraithEngineRefactorPlan.md | 30 +++++++++-------- README.md | 45 ++++++++++++++++++++++++-- 6 files changed, 88 insertions(+), 29 deletions(-) diff --git a/Content/Cooked/sponza_atrium_3.wmesh b/Content/Cooked/sponza_atrium_3.wmesh index 295f556624b02815af7f97f9abdb3b745f55a9cf..bb2d74d1952caa18a567149e5b528038cf4d281b 100644 GIT binary patch delta 442 zcmWN|M>`Mz0Dxh$Y?9At2!)cY2uWm>JtJh4nN7;d?#n1W{)v-AcRwtL_xN6&{0q6d zcAgLzu#W^vkOWJJgvt%MDPa;W5fUj;5-l+jD{*p5;^np^$aPPYBuSPOxg&Swp4^vI zc_3+$E*bJr9!aKTNwz$eCz2yiC0CwFp5#k`6iSgimtrZA7g8!^QZ6s0LS9Lwyp}4d zmN!x(wNfYb(jaf;ois|5G)s%LN}IGxhjdDpbW4xCmtN_U57I9KGAJKqNQPxZM&*-y zmN6NZ37M2BnU*gyBeOCm^Rgg|vLws0B41@yzR7o4lXcmUP1%wkvMoEZD|@mp2XZJs WMfoL1ax5ouD!=8AoL%zwqV7MPYJ-RX delta 442 zcmWN|S34B|0D$4=aBMO*;`>6Jd|mjM}+AsLqE@/Package ``` @@ -324,7 +363,7 @@ Open `http://localhost:3000` in your browser. ### Local native editor (no browser required) ```bash -./build/debug/Editor/AxiomEditor +./build/release/Editor/AxiomEditor ``` ---