diff --git a/Axiom/CMakeLists.txt b/Axiom/CMakeLists.txt
index d6413812..0e4f8bce 100644
--- a/Axiom/CMakeLists.txt
+++ b/Axiom/CMakeLists.txt
@@ -351,6 +351,9 @@ function(axiom_configure_module_target target_name)
     AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}"
     AXIOM_ENABLE_WEBRTC=$<IF:$<BOOL:${AXIOM_ENABLE_WEBRTC}>,1,0>
     AXIOM_THREADED_RENDER=$<IF:$<BOOL:${AXIOM_THREADED_RENDER}>,1,0>
+    AXIOM_PARALLEL_CULL=$<IF:$<BOOL:${AXIOM_PARALLEL_CULL}>,1,0>
+    AXIOM_VERIFY_PARALLEL_CULL=$<IF:$<BOOL:${AXIOM_VERIFY_PARALLEL_CULL}>,1,0>
+    AXIOM_FRAME_TASK_GRAPH=$<IF:$<BOOL:${AXIOM_FRAME_TASK_GRAPH}>,1,0>
     AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED}
   )
 
@@ -370,13 +373,7 @@ function(axiom_configure_module_target target_name)
     target_link_options(${target_name} PUBLIC ${AXIOM_WEBRTC_LINK_OPTIONS})
   endif()
 
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
-    target_compile_options(${target_name} PRIVATE
-      $<$<CONFIG:Release>:-O3 -march=native>
-      $<$<CONFIG:RelWithDebInfo>:-O2 -g -march=native>
-      $<$<CONFIG:MinSizeRel>:-Os>
-    )
-  endif()
+  axiom_apply_performance_options(${target_name})
 
   if(AXIOM_ENABLE_TSAN)
     if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
@@ -539,6 +536,9 @@ target_compile_definitions(AxiomRendererVulkanObjects PRIVATE
   AXIOM_SOURCE_DIR="${CMAKE_SOURCE_DIR}"
   AXIOM_ENABLE_WEBRTC=$<IF:$<BOOL:${AXIOM_ENABLE_WEBRTC}>,1,0>
   AXIOM_THREADED_RENDER=$<IF:$<BOOL:${AXIOM_THREADED_RENDER}>,1,0>
+  AXIOM_PARALLEL_CULL=$<IF:$<BOOL:${AXIOM_PARALLEL_CULL}>,1,0>
+  AXIOM_VERIFY_PARALLEL_CULL=$<IF:$<BOOL:${AXIOM_VERIFY_PARALLEL_CULL}>,1,0>
+  AXIOM_FRAME_TASK_GRAPH=$<IF:$<BOOL:${AXIOM_FRAME_TASK_GRAPH}>,1,0>
   AXIOM_WEBRTC_LINKED=${AXIOM_WEBRTC_LINKED}
 )
 if(AXIOM_ENABLE_TSAN)
@@ -547,6 +547,7 @@ if(AXIOM_ENABLE_TSAN)
     -fno-omit-frame-pointer
   )
 endif()
+axiom_apply_performance_options(AxiomRendererVulkanObjects)
 
 if(AXIOM_ENABLE_SCRIPTING)
   set(AXIOM_CORAL_MANAGED_DIR
diff --git a/Axiom/Core/Application.cpp b/Axiom/Core/Application.cpp
index e07bc508..98d5cf40 100644
--- a/Axiom/Core/Application.cpp
+++ b/Axiom/Core/Application.cpp
@@ -66,6 +66,8 @@ Application::Application(const ApplicationConfig &Config,
         .Width = m_Window->GetWidth(),
         .Height = m_Window->GetHeight(),
         .EnableThreadedRendering = m_Config.EnableThreadedRendering,
+        .EnableParallelCull = m_Config.EnableParallelCull,
+        .VerifyParallelCull = m_Config.VerifyParallelCull,
     });
   }
   Jobs::Startup();
diff --git a/Axiom/Core/Application.h b/Axiom/Core/Application.h
index c6510fea..f4c774a3 100644
--- a/Axiom/Core/Application.h
+++ b/Axiom/Core/Application.h
@@ -15,6 +15,15 @@ namespace Axiom {
 #ifndef AXIOM_THREADED_RENDER
 #define AXIOM_THREADED_RENDER 0
 #endif
+#ifndef AXIOM_PARALLEL_CULL
+#define AXIOM_PARALLEL_CULL 0
+#endif
+#ifndef AXIOM_VERIFY_PARALLEL_CULL
+#define AXIOM_VERIFY_PARALLEL_CULL 0
+#endif
+#ifndef AXIOM_FRAME_TASK_GRAPH
+#define AXIOM_FRAME_TASK_GRAPH 0
+#endif
 
 class Renderer;
 struct RendererDeleter {
@@ -40,6 +49,9 @@ struct ApplicationConfig {
   RuntimeMode Mode{RuntimeMode::LocalWindowedEditor};
   IViewportFrameOutput *FrameOutput{nullptr};
   bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0};
+  bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0};
+  bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0};
+  bool EnableFrameTaskGraph{AXIOM_FRAME_TASK_GRAPH != 0};
 };
 
 class Application {
@@ -65,6 +77,9 @@ class Application {
   [[nodiscard]] float GetDeltaTime() const { return m_DeltaTime; }
   [[nodiscard]] uint64_t GetFrameIndex() const { return m_FrameIndex; }
   [[nodiscard]] RuntimeMode GetRuntimeMode() const { return m_Config.Mode; }
+  [[nodiscard]] bool IsFrameTaskGraphEnabled() const {
+    return m_Config.EnableFrameTaskGraph;
+  }
   [[nodiscard]] Renderer &GetRenderer() const;
   [[nodiscard]] Renderer *TryGetRenderer() const;
   [[nodiscard]] ModuleManager &GetModuleManager() { return m_ModuleManager; }
diff --git a/Axiom/Jobs/JobSystem.cpp b/Axiom/Jobs/JobSystem.cpp
index 0c393e1b..6ba31c43 100644
--- a/Axiom/Jobs/JobSystem.cpp
+++ b/Axiom/Jobs/JobSystem.cpp
@@ -3,7 +3,10 @@
 #include "Core/Threading.h"
 #include "Jobs/TaskScheduler.h"
 
+#include <array>
 #include <algorithm>
+#include <atomic>
+#include <cstdint>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -11,23 +14,37 @@
 #include <vector>
 
 namespace Axiom::Jobs {
-struct JobState {
-  std::unique_ptr<enki::ITaskSet> Task;
-  std::vector<JobHandle> DependencyHandles;
-};
-
 namespace {
+constexpr size_t kJobPoolCapacity = 131072;
+constexpr size_t kMaxInlineDependencies = 16;
+
 void OnWorkerThreadStart(uint32_t ThreadNum);
 
 class LambdaTaskSet final : public enki::ITaskSet {
 public:
-  explicit LambdaTaskSet(JobFn Function)
-      : enki::ITaskSet(1), m_Function(std::move(Function)) {}
+  LambdaTaskSet() : enki::ITaskSet(1) {}
+
+  void Reset(JobFn Function, std::span<const JobHandle> Dependencies) {
+    m_SetSize = 1;
+    m_MinRange = 1;
+    m_Function = std::move(Function);
+    m_DependencyCount = std::min(Dependencies.size(), m_Dependencies.size());
+    for (size_t Index = 0; Index < m_DependencyCount; ++Index) {
+      m_Dependencies[Index] = Dependencies[Index];
+    }
+  }
 
-  void ExecuteRange(enki::TaskSetPartition, uint32_t) override { m_Function(); }
+  void Clear() {
+    m_Function.Reset();
+    m_DependencyCount = 0;
+  }
+
+  void ExecuteRange(enki::TaskSetPartition, uint32_t) override;
 
 private:
   JobFn m_Function;
+  std::array<JobHandle, kMaxInlineDependencies> m_Dependencies;
+  size_t m_DependencyCount{0};
 };
 
 class ParallelForTaskSet final : public enki::ITaskSet {
@@ -51,6 +68,16 @@ class ParallelForTaskSet final : public enki::ITaskSet {
   ParallelForFn m_Function;
 };
 
+} // namespace
+
+struct JobState {
+  LambdaTaskSet Task;
+  std::atomic<uint32_t> Generation{0};
+  std::atomic<bool> Recycled{true};
+};
+
+namespace {
+
 class JobSystem {
 public:
   void Startup() {
@@ -60,7 +87,9 @@ class JobSystem {
       m_Scheduler = std::make_unique<enki::TaskScheduler>();
       enki::TaskSchedulerConfig Config = m_Scheduler->GetConfig();
       Config.profilerCallbacks.threadStart = &OnWorkerThreadStart;
+      Config.numExternalTaskThreads = 4;
       m_Scheduler->Initialize(Config);
+      ResetPool();
     }
   }
 
@@ -74,54 +103,77 @@ class JobSystem {
     if (m_StartupCount == 0 && m_Scheduler != nullptr) {
       m_Scheduler->WaitforAllAndShutdown();
       m_Scheduler.reset();
+      ResetPool();
     }
   }
 
   JobHandle ScheduleJob(JobFn Function) {
-    auto State = std::make_shared<JobState>();
-    State->Task = std::make_unique<LambdaTaskSet>(std::move(Function));
-    m_Scheduler->AddTaskSetToPipe(State->Task.get());
-    return {.State = std::move(State)};
+    if (!CanUseScheduler()) {
+      Function();
+      return {};
+    }
+
+    JobHandle Handle = AcquireTask(Function, {});
+    if (!Handle.IsValid()) {
+      Function();
+      return {};
+    }
+
+    m_Scheduler->AddTaskSetToPipe(&Handle.State->Task);
+    return Handle;
   }
 
   JobHandle ScheduleJobAfter(JobFn Function, std::span<JobHandle> Deps) {
-    auto State = std::make_shared<JobState>();
-    State->DependencyHandles.reserve(Deps.size());
-    for (const JobHandle &Dependency : Deps) {
-      if (!Dependency.IsValid() || Dependency.State->Task == nullptr) {
-        continue;
+    if (!CanUseScheduler()) {
+      for (const JobHandle &Dependency : Deps) {
+        Wait(Dependency);
       }
-      State->DependencyHandles.push_back(Dependency);
+      Function();
+      return {};
     }
 
-    State->Task = std::make_unique<LambdaTaskSet>(
-        [this, State, Function = std::move(Function)]() mutable {
-          for (const JobHandle &Dependency : State->DependencyHandles) {
-            if (!Dependency.IsValid() || Dependency.State->Task == nullptr) {
-              continue;
-            }
+    if (Deps.size() > kMaxInlineDependencies) {
+      for (size_t Index = kMaxInlineDependencies; Index < Deps.size();
+           ++Index) {
+        Wait(Deps[Index]);
+      }
+    }
 
-            m_Scheduler->WaitforTask(Dependency.State->Task.get());
-          }
+    const size_t InlineDependencyCount =
+        std::min(Deps.size(), kMaxInlineDependencies);
+    JobHandle Handle =
+        AcquireTask(Function, Deps.first(InlineDependencyCount));
+    if (!Handle.IsValid()) {
+      for (size_t Index = 0; Index < InlineDependencyCount; ++Index) {
+        Wait(Deps[Index]);
+      }
+      Function();
+      return {};
+    }
 
-          Function();
-        });
-    m_Scheduler->AddTaskSetToPipe(State->Task.get());
-    return {.State = std::move(State)};
+    m_Scheduler->AddTaskSetToPipe(&Handle.State->Task);
+    return Handle;
   }
 
   void Wait(JobHandle Handle) {
-    if (!Handle.IsValid() || Handle.State->Task == nullptr) {
+    if (m_Scheduler == nullptr || !IsCurrent(Handle)) {
       return;
     }
 
-    m_Scheduler->WaitforTask(Handle.State->Task.get());
+    m_Scheduler->WaitforTask(&Handle.State->Task);
+    ReleaseTask(Handle);
   }
 
   void ParallelFor(size_t Count, ParallelForFn Function) {
     if (Count == 0) {
       return;
     }
+    if (!CanUseScheduler()) {
+      for (size_t Index = 0; Index < Count; ++Index) {
+        Function(Index);
+      }
+      return;
+    }
 
     ParallelForTaskSet Task(Count, std::move(Function));
     m_Scheduler->AddTaskSetToPipe(&Task);
@@ -131,7 +183,72 @@ class JobSystem {
 private:
   std::mutex m_Mutex;
   std::unique_ptr<enki::TaskScheduler> m_Scheduler;
+  std::unique_ptr<JobState[]> m_TaskPool;
+  std::vector<JobState *> m_FreeList;
   size_t m_StartupCount{0};
+
+  void ResetPool() {
+    if (m_TaskPool == nullptr) {
+      m_TaskPool = std::make_unique<JobState[]>(kJobPoolCapacity);
+    }
+
+    m_FreeList.reserve(kJobPoolCapacity);
+    m_FreeList.clear();
+    for (size_t Index = 0; Index < kJobPoolCapacity; ++Index) {
+      JobState &State = m_TaskPool[Index];
+      State.Task.Clear();
+      State.Generation.fetch_add(1, std::memory_order_relaxed);
+      State.Recycled.store(true, std::memory_order_relaxed);
+      m_FreeList.push_back(&State);
+    }
+  }
+
+  JobHandle AcquireTask(JobFn &Function, std::span<const JobHandle> Deps) {
+    std::scoped_lock Lock(m_Mutex);
+    if (m_TaskPool == nullptr || m_FreeList.empty()) {
+      return {};
+    }
+    JobState *State = m_FreeList.back();
+    m_FreeList.pop_back();
+    const uint32_t Generation =
+        State->Generation.fetch_add(1, std::memory_order_relaxed) + 1;
+    State->Task.Reset(std::move(Function), Deps);
+    State->Recycled.store(false, std::memory_order_release); // publish last
+    return {.State = State, .Generation = Generation};
+  }
+
+  // Validation holds m_Mutex so a stale handle cannot race AcquireTask.
+  void ReleaseTask(JobHandle Handle) {
+    JobState *State = Handle.State;
+    std::unique_lock Lock(m_Mutex);
+    if (State == nullptr ||
+        State->Generation.load(std::memory_order_acquire) !=
+            Handle.Generation ||
+        State->Recycled.exchange(true, std::memory_order_acq_rel)) {
+      return;
+    }
+    Lock.unlock(); // Slot is recycled but unlisted, so clearing needs no lock.
+    State->Task.Clear();
+    Lock.lock();
+    m_FreeList.push_back(State);
+  }
+
+  bool IsCurrent(JobHandle Handle) const { // Reads Recycled before Generation.
+    return Handle.State != nullptr &&
+           !Handle.State->Recycled.load(std::memory_order_acquire) &&
+           Handle.State->Generation.load(std::memory_order_acquire) ==
+               Handle.Generation;
+  }
+
+  bool CanUseScheduler() {
+    if (m_Scheduler == nullptr) {
+      return false;
+    }
+    if (m_Scheduler->GetThreadNum() != enki::NO_THREAD_NUM) {
+      return true;
+    }
+    return m_Scheduler->RegisterExternalTaskThread();
+  }
 };
 
 JobSystem &GetJobSystem() {
@@ -143,6 +260,14 @@ void OnWorkerThreadStart(uint32_t ThreadNum) {
   Threading::SetCurrentThreadName("Axiom Job Worker " +
                                   std::to_string(ThreadNum));
 }
+
+void LambdaTaskSet::ExecuteRange(enki::TaskSetPartition, uint32_t) {
+  for (size_t Index = 0; Index < m_DependencyCount; ++Index) {
+    GetJobSystem().Wait(m_Dependencies[Index]);
+  }
+
+  m_Function();
+}
 } // namespace
 
 void Startup() { GetJobSystem().Startup(); }
diff --git a/Axiom/Jobs/JobSystem.h b/Axiom/Jobs/JobSystem.h
index 50c92122..0cfa9f84 100644
--- a/Axiom/Jobs/JobSystem.h
+++ b/Axiom/Jobs/JobSystem.h
@@ -1,20 +1,160 @@
 #pragma once
 
 #include <cstddef>
+#include <cstdint>
 #include <functional>
-#include <memory>
+#include <new>
 #include <span>
+#include <type_traits>
+#include <utility>
 
 namespace Axiom::Jobs {
-using JobFn = std::function<void()>;
-using ParallelForFn = std::function<void(size_t)>;
+namespace Detail {
+template <typename Signature, size_t StorageSize> class SmallFunction;
+
+// Fixed-capacity callable wrapper: the target is constructed in StorageSize
+// bytes of inline storage and must be copy constructible (see Emplace).
+template <typename R, typename... Args, size_t StorageSize>
+class SmallFunction<R(Args...), StorageSize> {
+public:
+  SmallFunction() = default;
+  SmallFunction(std::nullptr_t) noexcept {}
+
+  template <typename Callable,
+            typename Decayed = std::decay_t<Callable>,
+            std::enable_if_t<!std::is_same_v<Decayed, SmallFunction>, int> = 0>
+  SmallFunction(Callable &&Function) {
+    // Emplace deduces its own parameter so lvalue callables bind and get copied.
+    Emplace(std::forward<Callable>(Function));
+  }
+
+  SmallFunction(const SmallFunction &Other) { CopyFrom(Other); }
+  SmallFunction(SmallFunction &&Other) noexcept { MoveFrom(std::move(Other)); }
+
+  SmallFunction &operator=(const SmallFunction &Other) {
+    if (this != &Other) {
+      Reset();
+      CopyFrom(Other);
+    }
+    return *this;
+  }
+
+  SmallFunction &operator=(SmallFunction &&Other) noexcept {
+    if (this != &Other) {
+      Reset();
+      MoveFrom(std::move(Other));
+    }
+    return *this;
+  }
+  SmallFunction &operator=(std::nullptr_t) noexcept {
+    Reset();
+    return *this;
+  }
+
+  template <typename Callable,
+            typename Decayed = std::decay_t<Callable>,
+            std::enable_if_t<!std::is_same_v<Decayed, SmallFunction>, int> = 0>
+  SmallFunction &operator=(Callable &&Function) {
+    Reset();
+    Emplace(std::forward<Callable>(Function));
+    return *this;
+  }
+
+  ~SmallFunction() { Reset(); }
+  explicit operator bool() const noexcept { return m_Invoke != nullptr; }
+
+  R operator()(Args... Arguments) {
+    if (m_Invoke == nullptr) {
+      throw std::bad_function_call();
+    }
+    return m_Invoke(&m_Storage, std::forward<Args>(Arguments)...);
+  }
+
+  void Reset() noexcept {
+    if (m_Destroy != nullptr) {
+      m_Destroy(&m_Storage);
+    }
+    m_Invoke = nullptr;
+    m_Destroy = nullptr;
+    m_Copy = nullptr;
+    m_Move = nullptr;
+  }
+
+private:
+  using Storage = std::aligned_storage_t<StorageSize, alignof(std::max_align_t)>;
+
+  template <typename Callable> void Emplace(Callable &&Function) {
+    using Stored = std::decay_t<Callable>;
+    static_assert(sizeof(Stored) <= StorageSize,
+                  "Job callable capture is too large for inline storage");
+    static_assert(alignof(Stored) <= alignof(Storage),
+                  "Job callable alignment is too large for inline storage");
+    static_assert(std::is_copy_constructible_v<Stored>,
+                  "Job callables must be copy constructible");
+
+    new (&m_Storage) Stored(std::forward<Callable>(Function));
+    m_Invoke = [](void *StoragePtr, Args... Arguments) -> R {
+      return (*std::launder(reinterpret_cast<Stored *>(StoragePtr)))(
+          std::forward<Args>(Arguments)...);
+    };
+    m_Destroy = [](void *StoragePtr) noexcept {
+      std::launder(reinterpret_cast<Stored *>(StoragePtr))->~Stored();
+    };
+    m_Copy = [](void *Destination, const void *Source) {
+      new (Destination)
+          Stored(*std::launder(reinterpret_cast<const Stored *>(Source)));
+    };
+    m_Move = [](void *Destination, void *Source) noexcept {
+      new (Destination)
+          Stored(std::move(*std::launder(reinterpret_cast<Stored *>(Source))));
+      std::launder(reinterpret_cast<Stored *>(Source))->~Stored();
+    };
+  }
+
+  void CopyFrom(const SmallFunction &Other) {
+    if (Other.m_Invoke == nullptr) {
+      return;
+    }
+    Other.m_Copy(&m_Storage, &Other.m_Storage);
+    m_Invoke = Other.m_Invoke;
+    m_Destroy = Other.m_Destroy;
+    m_Copy = Other.m_Copy;
+    m_Move = Other.m_Move;
+  }
+
+  void MoveFrom(SmallFunction &&Other) noexcept {
+    if (Other.m_Invoke == nullptr) {
+      return;
+    }
+    Other.m_Move(&m_Storage, &Other.m_Storage);
+    m_Invoke = Other.m_Invoke;
+    m_Destroy = Other.m_Destroy;
+    m_Copy = Other.m_Copy;
+    m_Move = Other.m_Move;
+    Other.m_Invoke = nullptr;
+    Other.m_Destroy = nullptr;
+    Other.m_Copy = nullptr;
+    Other.m_Move = nullptr;
+  }
+
+  Storage m_Storage;
+  R (*m_Invoke)(void *, Args...) = nullptr;
+  void (*m_Destroy)(void *) noexcept = nullptr;
+  void (*m_Copy)(void *, const void *) = nullptr;
+  void (*m_Move)(void *, void *) noexcept = nullptr;
+};
+} // namespace Detail
+
+using JobFn = Detail::SmallFunction<void(), 64>;
+using ParallelForFn = Detail::SmallFunction<void(size_t), 64>;
 
 struct JobState;
 
 struct JobHandle {
   [[nodiscard]] bool IsValid() const noexcept { return State != nullptr; }
 
-  std::shared_ptr<JobState> State;
+  JobState *State = nullptr;
+  uint32_t Generation = 0;
 };
 
 void Startup();
diff --git a/Axiom/Renderer/RendererFrameModule.cpp b/Axiom/Renderer/RendererFrameModule.cpp
index 34c581ba..0f36a63b 100644
--- a/Axiom/Renderer/RendererFrameModule.cpp
+++ b/Axiom/Renderer/RendererFrameModule.cpp
@@ -3,17 +3,30 @@
 #include "Core/Application.h"
 #include "Renderer/Renderer.h"
 
+#include <array>
+#include <span>
+
 namespace Axiom {
 std::string_view RendererFrameModule::GetName() const {
   return "Core.RendererFrame";
 }
 
 bool RendererFrameModule::Initialize(Application &App) {
-  (void)App;
+  m_UseFrameTaskGraph = App.IsFrameTaskGraphEnabled();
+  ResetTaskGraph();
   return true;
 }
 
 void RendererFrameModule::Update(const ModuleUpdateContext &Context) {
+  if (m_UseFrameTaskGraph) {
+    UpdateTaskGraph(Context);
+    return;
+  }
+
+  UpdateSerial(Context);
+}
+
+void RendererFrameModule::UpdateSerial(const ModuleUpdateContext &Context) {
   switch (Context.Phase) {
   case ModuleUpdatePhase::FrameStart:
     Context.App.GetRenderer().SetCpuFrameTime(Context.DeltaTimeSeconds *
@@ -33,5 +46,45 @@ void RendererFrameModule::Update(const ModuleUpdateContext &Context) {
   }
 }
 
-void RendererFrameModule::Shutdown(Application &App) { (void)App; }
+void RendererFrameModule::UpdateTaskGraph(const ModuleUpdateContext &Context) {
+  Renderer &Renderer = Context.App.GetRenderer();
+  switch (Context.Phase) {
+  case ModuleUpdatePhase::FrameStart:
+    ResetTaskGraph();
+    Renderer.SetCpuFrameTime(Context.DeltaTimeSeconds * 1000.0f);
+    break;
+  case ModuleUpdatePhase::RenderBegin:
+    m_BeginFrameJob = Jobs::ScheduleJob([&Renderer]() { Renderer.BeginFrame(); });
+    Jobs::Wait(m_BeginFrameJob);
+    break;
+  case ModuleUpdatePhase::Render: {
+    std::array<Jobs::JobHandle, 1> Dependencies = {m_BeginFrameJob};
+    m_RenderJob = Jobs::ScheduleJobAfter(
+        [&Renderer]() { Renderer.Render(); },
+        std::span<Jobs::JobHandle>(Dependencies));
+    break;
+  }
+  case ModuleUpdatePhase::ImGuiRender:
+    break;
+  case ModuleUpdatePhase::RenderEnd: {
+    std::array<Jobs::JobHandle, 1> Dependencies = {m_RenderJob};
+    m_EndFrameJob = Jobs::ScheduleJobAfter(
+        [&Renderer]() { Renderer.EndFrame(); },
+        std::span<Jobs::JobHandle>(Dependencies));
+    Jobs::Wait(m_EndFrameJob);
+    break;
+  }
+  }
+}
+
+void RendererFrameModule::ResetTaskGraph() {
+  m_BeginFrameJob = {};
+  m_RenderJob = {};
+  m_EndFrameJob = {};
+}
+
+void RendererFrameModule::Shutdown(Application &App) {
+  (void)App;
+  ResetTaskGraph();
+}
 } // namespace Axiom
diff --git a/Axiom/Renderer/RendererFrameModule.h b/Axiom/Renderer/RendererFrameModule.h
index f60f97b8..849e935f 100644
--- a/Axiom/Renderer/RendererFrameModule.h
+++ b/Axiom/Renderer/RendererFrameModule.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "Core/IModule.h"
+#include "Jobs/JobSystem.h"
 
 namespace Axiom {
 class RendererFrameModule final : public IModule {
@@ -9,5 +10,15 @@ class RendererFrameModule final : public IModule {
   bool Initialize(Application &App) override;
   void Update(const ModuleUpdateContext &Context) override;
   void Shutdown(Application &App) override;
+
+private:
+  void UpdateSerial(const ModuleUpdateContext &Context);
+  void UpdateTaskGraph(const ModuleUpdateContext &Context);
+  void ResetTaskGraph();
+
+  Jobs::JobHandle m_BeginFrameJob;
+  Jobs::JobHandle m_RenderJob;
+  Jobs::JobHandle m_EndFrameJob;
+  bool m_UseFrameTaskGraph{false};
 };
 } // namespace Axiom
diff --git a/Axiom/Renderer/RendererTypes.h b/Axiom/Renderer/RendererTypes.h
index 26aa1744..bf7814fa 100644
--- a/Axiom/Renderer/RendererTypes.h
+++ b/Axiom/Renderer/RendererTypes.h
@@ -14,6 +14,12 @@ namespace Axiom {
 #ifndef AXIOM_THREADED_RENDER
 #define AXIOM_THREADED_RENDER 0
 #endif
+#ifndef AXIOM_PARALLEL_CULL
+#define AXIOM_PARALLEL_CULL 0
+#endif
+#ifndef AXIOM_VERIFY_PARALLEL_CULL
+#define AXIOM_VERIFY_PARALLEL_CULL 0
+#endif
 
 enum class RendererBackendType : uint32_t {
   Vulkan = 0,
@@ -37,6 +43,8 @@ struct RendererCreateInfo {
   uint32_t Width{0};
   uint32_t Height{0};
   bool EnableThreadedRendering{AXIOM_THREADED_RENDER != 0};
+  bool EnableParallelCull{AXIOM_PARALLEL_CULL != 0};
+  bool VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0};
   std::function<void(uint64_t)> ThreadedRenderSceneStartCallback;
   std::function<void(uint64_t)> ThreadedRenderSceneCompleteCallback;
   RendererBackendType BackendType{RendererBackendType::Vulkan};
diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp
index 074533dc..70ff15db 100644
--- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp
+++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.cpp
@@ -6,6 +6,7 @@
 #include "AxiomRHI/Vulkan/VulkanInitializers.h"
 #include "AxiomRHI/Vulkan/VulkanMesh.h"
 #include "AxiomRHI/Vulkan/VulkanRhiDevice.h"
+#include "Jobs/JobSystem.h"
 
 #include <algorithm>
 #include <array>
@@ -13,6 +14,7 @@
 #include <cmath>
 #include <cstring>
 #include <limits>
+#include <thread>
 #include <unordered_set>
 
 #include <glm/geometric.hpp>
@@ -61,6 +63,8 @@ uint64_t PackTranslucentSortKey(float SortDepth, uint32_t SubmissionIndex) {
 void VulkanSceneRenderer::Init(IRHIDevice &Device,
                                const RendererCreateInfo &CreateInfo) {
   m_Device = static_cast<VulkanRhiDevice *>(&Device);
+  m_EnableParallelCull = CreateInfo.EnableParallelCull;
+  m_VerifyParallelCull = CreateInfo.VerifyParallelCull;
   if (m_Device != nullptr) {
     m_FrameOutput = CreateInfo.FrameOutput;
     m_Device->GetDrawSubmissionSystem().SetRecordPreparedScenePasses(
@@ -238,35 +242,37 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) {
 
   auto &Candidates = m_CandidateScratch;
   auto &VisibleSubmissions = m_PreparedSceneState.VisibleSubmissions;
+  auto &CullInputs = m_CullInputScratch;
   Candidates.clear();
   VisibleSubmissions.Clear();
+  CullInputs.clear();
   Candidates.reserve(SubmissionCount);
+  CullInputs.resize(SubmissionCount);
   VisibleSubmissions.OpaqueGraphics.reserve(SubmissionCount);
   VisibleSubmissions.TranslucentGraphics.reserve(SubmissionCount);
   VisibleSubmissions.Compute.reserve(SubmissionCount);
 
   for (size_t Index = 0; Index < SubmissionCount; ++Index) {
     const auto &Submission = Scene.Submissions[Index];
-    VulkanMesh *VulkanMeshRef = m_Device->ResolveMeshHandle(Submission.MeshHandle);
-    if (VulkanMeshRef == nullptr) {
-      continue;
-    }
+    CullInputs[Index].Mesh = m_Device->ResolveMeshHandle(Submission.MeshHandle);
+  }
 
-    if (!m_PreparedSceneState.ForceWireframe &&
-        !m_Device->GetOcclusionCulling().IsBoundsVisible(
-            m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform,
-            VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) {
-      ++FrameStats.FrustumCulledMeshCount;
-      continue;
+  size_t FrustumCulledCount = 0;
+  if (ShouldUseParallelCull(SubmissionCount)) {
+    FrustumCulledCount = BuildCullCandidatesParallel(Scene, CullInputs, Candidates);
+    if (m_VerifyParallelCull) {
+      auto &SerialCandidates = m_VerificationCandidateScratch;
+      const size_t SerialFrustumCulledCount =
+          BuildCullCandidatesSerial(Scene, CullInputs, SerialCandidates);
+      assert(SerialFrustumCulledCount == FrustumCulledCount &&
+             "Parallel cull frustum count diverged from serial cull");
+      assert(SerialCandidates == Candidates &&
+             "Parallel cull candidates diverged from serial cull");
     }
-
-    const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef);
-    const glm::vec3 Delta = WorldCenter - Scene.ActiveCamera->GetPosition();
-    Candidates.push_back({.SubmissionIndex = static_cast<uint32_t>(Index),
-                          .MeshHandle = Submission.MeshHandle,
-                          .Mesh = VulkanMeshRef,
-                          .SortDepth = glm::dot(Delta, Delta)});
+  } else {
+    FrustumCulledCount = BuildCullCandidatesSerial(Scene, CullInputs, Candidates);
   }
+  FrameStats.FrustumCulledMeshCount = static_cast<uint32_t>(FrustumCulledCount);
 
   if (!m_PreparedSceneState.ForceWireframe) {
     std::sort(Candidates.begin(), Candidates.end(),
@@ -333,6 +339,103 @@ void VulkanSceneRenderer::PrepareSceneFrame(RenderScene &Scene) {
   PrepareGraphicsMaterialDescriptors();
 }
 
+// Reference cull path: walks Inputs in order on the calling thread.
+size_t VulkanSceneRenderer::BuildCullCandidatesSerial(
+    const RenderScene &Scene, std::span<const SubmissionCullInput> Inputs,
+    std::vector<CandidateSubmission> &Candidates) const {
+  Candidates.clear();
+  Candidates.reserve(Inputs.size());
+  size_t FrustumCulledCount = 0;
+  const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition();
+  for (size_t Index = 0; Index < Inputs.size(); ++Index) {
+    const auto &Submission = Scene.Submissions[Index];
+    VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh;
+    if (VulkanMeshRef == nullptr) {
+      continue;
+    }
+    // ForceWireframe bypasses the bounds visibility test.
+    if (!m_PreparedSceneState.ForceWireframe &&
+        !m_Device->GetOcclusionCulling().IsBoundsVisible(
+            m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform,
+            VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) {
+      ++FrustumCulledCount;
+      continue;
+    }
+    // SortDepth is the squared distance from the camera to WorldCenter.
+    const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef);
+    const glm::vec3 Delta = WorldCenter - CameraPosition;
+    Candidates.push_back({.SubmissionIndex = static_cast<uint32_t>(Index),
+                          .MeshHandle = Submission.MeshHandle,
+                          .Mesh = VulkanMeshRef,
+                          .SortDepth = glm::dot(Delta, Delta)});
+  }
+
+  return FrustumCulledCount;
+}
+
+// Culls Inputs in contiguous buckets on the job system and merges the buckets
+// in index order; returns how many submissions failed the visibility test.
+size_t VulkanSceneRenderer::BuildCullCandidatesParallel(
+    const RenderScene &Scene, std::span<const SubmissionCullInput> Inputs,
+    std::vector<CandidateSubmission> &Candidates) const {
+  Candidates.clear();
+  if (Inputs.empty()) {
+    return 0; // A zero bucket count would divide by zero below.
+  }
+  Candidates.reserve(Inputs.size());
+  const size_t BucketCount = std::min<size_t>(
+      Inputs.size(), std::max(1u, std::thread::hardware_concurrency()));
+  const size_t BucketSize = (Inputs.size() + BucketCount - 1u) / BucketCount;
+  const glm::vec3 CameraPosition = Scene.ActiveCamera->GetPosition();
+
+  struct CandidateBucket {
+    std::vector<CandidateSubmission> Candidates;
+    size_t FrustumCulledCount{0};
+  };
+  std::vector<CandidateBucket> Buckets(BucketCount);
+
+  Jobs::ParallelFor(BucketCount, [&](size_t BucketIndex) {
+    // Ceil-rounded BucketSize can overshoot; clamp so End - Begin cannot wrap.
+    const size_t Begin = std::min(Inputs.size(), BucketIndex * BucketSize);
+    const size_t End = std::min(Inputs.size(), Begin + BucketSize);
+    CandidateBucket &Bucket = Buckets[BucketIndex];
+    Bucket.Candidates.reserve(End - Begin);
+    for (size_t Index = Begin; Index < End; ++Index) {
+      const auto &Submission = Scene.Submissions[Index];
+      VulkanMesh *VulkanMeshRef = Inputs[Index].Mesh;
+      if (VulkanMeshRef == nullptr) {
+        continue;
+      }
+      if (!m_PreparedSceneState.ForceWireframe &&
+          !m_Device->GetOcclusionCulling().IsBoundsVisible(
+              m_PreparedSceneState.CameraData.ViewProjection, Submission.Transform,
+              VulkanMeshRef->BoundsMin, VulkanMeshRef->BoundsMax)) {
+        ++Bucket.FrustumCulledCount;
+        continue;
+      }
+      const glm::vec3 WorldCenter = ComputeWorldCenter(Submission, *VulkanMeshRef);
+      const glm::vec3 Delta = WorldCenter - CameraPosition;
+      Bucket.Candidates.push_back({.SubmissionIndex = static_cast<uint32_t>(Index),
+                                   .MeshHandle = Submission.MeshHandle,
+                                   .Mesh = VulkanMeshRef,
+                                   .SortDepth = glm::dot(Delta, Delta)});
+    }
+  });
+
+  size_t FrustumCulledCount = 0;
+  for (const CandidateBucket &Bucket : Buckets) {
+    FrustumCulledCount += Bucket.FrustumCulledCount;
+    Candidates.insert(Candidates.end(), Bucket.Candidates.begin(),
+                      Bucket.Candidates.end());
+  }
+  return FrustumCulledCount;
+}
+
+bool VulkanSceneRenderer::ShouldUseParallelCull(size_t SubmissionCount) const {
+  constexpr size_t kParallelCullSubmissionThreshold = 512;
+  return m_EnableParallelCull && SubmissionCount >= kParallelCullSubmissionThreshold;
+}
+
 void VulkanSceneRenderer::RecordBackground() {
   QueueScenePass(ScenePassPrimitive::Background);
 }
diff --git a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h
index 60c8c1a8..7e1545a3 100644
--- a/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h
+++ b/AxiomInternal/AxiomRHI/Vulkan/VulkanSceneRenderer.h
@@ -4,6 +4,7 @@
 #include "AxiomRHI/Vulkan/VulkanRendererTypes.h"
 
 #include <optional>
+#include <span>
 #include <vector>
 
 namespace Axiom {
@@ -48,6 +49,12 @@ class VulkanSceneRenderer final : public ISceneRendererBackend {
     MeshHandle MeshHandle{};
     VulkanMesh *Mesh{nullptr};
     float SortDepth{0.0f};
+
+    constexpr bool operator==(const CandidateSubmission &) const = default;
+  };
+
+  struct SubmissionCullInput {
+    VulkanMesh *Mesh{nullptr};
   };
 
   struct PreparedSceneState {
@@ -60,6 +67,13 @@ class VulkanSceneRenderer final : public ISceneRendererBackend {
   };
 
   void PrepareSceneFrame(RenderScene &Scene);
+  size_t BuildCullCandidatesSerial(
+      const RenderScene &Scene, std::span<const SubmissionCullInput> Inputs,
+      std::vector<CandidateSubmission> &Candidates) const;
+  size_t BuildCullCandidatesParallel(
+      const RenderScene &Scene, std::span<const SubmissionCullInput> Inputs,
+      std::vector<CandidateSubmission> &Candidates) const;
+  bool ShouldUseParallelCull(size_t SubmissionCount) const;
   void RecordBackground();
   void RecordDepthPrepass();
   void BuildHzb();
@@ -107,7 +121,11 @@ class VulkanSceneRenderer final : public ISceneRendererBackend {
   IViewportFrameOutput *m_FrameOutput{nullptr};
   PreparedSceneState m_PreparedSceneState{};
   std::vector<CandidateSubmission> m_CandidateScratch;
+  std::vector<CandidateSubmission> m_VerificationCandidateScratch;
+  std::vector<SubmissionCullInput> m_CullInputScratch;
   std::vector<ScenePassPrimitive> m_QueuedScenePasses;
+  bool m_EnableParallelCull{AXIOM_PARALLEL_CULL != 0};
+  bool m_VerifyParallelCull{AXIOM_VERIFY_PARALLEL_CULL != 0};
   VkImageLayout m_SceneDrawImageLayout{VK_IMAGE_LAYOUT_UNDEFINED};
   VkImageLayout m_SceneRasterDepthLayout{VK_IMAGE_LAYOUT_UNDEFINED};
 };
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2e02f93..746ecc2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,8 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+include(cmake/AxiomCompilerOptions.cmake)
+
 option(BUILD_TESTING "Build tests" OFF)
 option(AXIOM_ENABLE_SCRIPTING
        "Enable the Coral C# scripting host" OFF)
@@ -18,7 +20,13 @@ option(AXIOM_ENABLE_WEBRTC
 option(AXIOM_ENABLE_PHYSICS
        "Enable the JoltPhysics runtime simulation seam" ON)
 option(AXIOM_THREADED_RENDER
-       "Enable the experimental threaded renderer and worker job system" OFF)
+       "Enable the threaded renderer and worker job system" ON)
+option(AXIOM_PARALLEL_CULL
+       "Enable guarded parallel CPU culling in the Vulkan scene renderer" ON)
+option(AXIOM_VERIFY_PARALLEL_CULL
+       "Verify parallel CPU culling against the serial path in debug/test builds" OFF)
+option(AXIOM_FRAME_TASK_GRAPH
+       "Enable the minimal renderer frame task graph backed by enkiTS jobs" ON)
 option(AXIOM_ENABLE_TSAN
        "Build with ThreadSanitizer instrumentation" OFF)
 set(AXIOM_WEBRTC_FRAMEWORK_PATH "" CACHE PATH
diff --git a/CMakePresets.json b/CMakePresets.json
index a1a00267..0c2147a6 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -39,17 +39,20 @@
     {
       "name": "debug",
       "displayName": "Build Debug",
-      "configurePreset": "debug"
+      "configurePreset": "debug",
+      "jobs": 0
     },
     {
       "name": "release",
       "displayName": "Build Release",
-      "configurePreset": "release"
+      "configurePreset": "release",
+      "jobs": 0
     },
     {
       "name": "minsizerel",
       "displayName": "Build MinSizeRel",
-      "configurePreset": "minsizerel"
+      "configurePreset": "minsizerel",
+      "jobs": 0
     }
   ]
 }
diff --git a/Content/Cooked/sponza_atrium_3.wmesh b/Content/Cooked/sponza_atrium_3.wmesh
index 295f5566..bb2d74d1 100644
Binary files a/Content/Cooked/sponza_atrium_3.wmesh and b/Content/Cooked/sponza_atrium_3.wmesh differ
diff --git a/Docs/DistributedWraithEngineDesign.md b/Docs/DistributedWraithEngineDesign.md
index f8404113..96fd0854 100644
--- a/Docs/DistributedWraithEngineDesign.md
+++ b/Docs/DistributedWraithEngineDesign.md
@@ -1,13 +1,13 @@
 # Distributed Wraith Engine Design
 
 ## Document Status
-- Status: Draft
-- Date: 2026-05-25
+- Status: Draft, current-state refresh
+- Date: 2026-06-07
 - Audience: Engine, tools, networking, web, and infrastructure contributors
 - Intended outcome: Establish the target architecture for evolving WraithEngine into a distributed game engine and browser-based collaborative editor
 
 ## Implementation Progress
-- `event-system` branch now contains the first local authoritative editor-session slice
+- The current tree contains the first local authoritative editor-session slice
 - Added engine-owned `EditorSession`, `EditorCommand`, `EditorEvent`, `SessionId`, and `SessionUserId` foundations in `Axiom`
 - The native editor now translates GLFW input into commands and renders from session-owned camera/scene state instead of mutating camera state directly in the layer
 - Added deterministic in-process command draining, authoritative event publication, and focused tests for camera/look state transitions and command rejection
@@ -23,6 +23,9 @@
 - `AxiomRemoteViewportServer` now treats WebRTC as the only supported remote viewport media path
 - `WraithNetworking` now exposes initialization state and connection metrics for future CVAR/config integration
 - Removed the largest remote-viewport performance bottlenecks by unthrottling the headless server loop and tuning the encoder/input path for latency
+- Headless offscreen rendering now uses asynchronous readback polling instead of waiting immediately after graphics submit, with completed captures published on later ticks in submit order
+- Remote render views now track dirty/burst scheduling state; idle connected clients are throttled through round-robin cadence instead of forcing full-rate rendering for every client every tick
+- Performance-oriented defaults are enabled for first-party builds: Release is the implicit single-config default, preset builds run with parallel jobs, and optimized targets use native CPU tuning plus IPO/LTO when supported
 - The remote viewport now runs at acceptable frame rate, but still has noticeable residual input latency that likely requires deeper WebRTC sender/playout tuning
 - A root-level `EditorFrontend` workspace now serves as the longer-lived browser editor shell using Next.js, React, and Tailwind CSS
 - `EditorFrontend` contains a docked editor UI with a menu bar, toolbar, outliner, details panel, content browser, and the active WebRTC viewport client
@@ -33,7 +36,7 @@
 - a delayed-readback frame attribution bug in multi-pass headless rendering was fixed by stamping each offscreen capture with the submitting `SessionUserId` at submission time
 - The next browser-facing step after the migration is turning the browser shell plus authoritative session into a real single-user scene editor, not more work on a server-hosted prototype page
 - Collaboration should continue to follow that same authoritative command/event path after the single-user authoring loop is stable, rather than leading the roadmap ahead of core editor behavior
-- `scene-editing` branch introduces the first authoritative object-lifecycle commands: `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand`, with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` authoritative events
+- The current tree includes the first authoritative object-lifecycle commands: `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand`, with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` authoritative events
 - All scene objects are now backed by an Instance-class hierarchy rooted at a `DataModel` node, mimicking the Roblox object model; `EditorSession` owns the live `DataModel` tree and keeps `EditorSceneState::Items` synchronized as a derived projection
 - Concrete scene Instance subclasses introduced: `SceneFolder`, `SceneMeshObject`, `SceneLight`, `SceneCamera`, and `SceneActor` under `Axiom/CoreInstance/SceneInstances.h`
 - `SetSceneState` and `SetSceneItems` now rebuild the Instance tree from snapshot data, enabling round-trip snapshot rehydration
diff --git a/Docs/HeadlessAxiomSessionPrototype.md b/Docs/HeadlessAxiomSessionPrototype.md
index e1048281..3cc4e5b6 100644
--- a/Docs/HeadlessAxiomSessionPrototype.md
+++ b/Docs/HeadlessAxiomSessionPrototype.md
@@ -13,8 +13,8 @@ That macOS-specific media path now lives behind the engine-wide `HAL/` layer, so
 ## Current Status
 
 - Status: working prototype
-- Verified on Windows as of 2026-05-05
-- Builds on macOS as of 2026-05-07
+- Current state reviewed on 2026-06-07
+- Builds on macOS with the `debug`, `release`, and `minsizerel` CMake presets
 - Runtime validation on macOS requires a Vulkan/MoltenVK-capable environment with Metal available
 - This subphase is complete for the runtime-side seam restoration work
 - `AxiomHeadless` is a command-driven authoritative runtime, not a full editor client
@@ -46,6 +46,9 @@ That macOS-specific media path now lives behind the engine-wide `HAL/` layer, so
 - the browser client now pumps camera/input updates on `requestAnimationFrame` and flushes pointer-lock look input immediately instead of batching on a fixed timer
 - the current stream no longer has the severe FPS collapse seen in the older prototype, but there is still roughly half a second of residual input latency to investigate later
 - a multi-client frame-routing bug was fixed by stamping each offscreen capture with the submitting `SessionUserId` at render time instead of inferring ownership later from mutable active-pass state
+- headless offscreen capture no longer waits immediately after graphics submit; completed readbacks are polled and published on later ticks with submit-time user attribution preserved
+- remote render views now have dirty/burst scheduling state, so idle views are throttled instead of forcing one full render pass per connected client every engine tick
+- high-performance release builds now enable threaded rendering, the renderer frame task graph, parallel CPU culling, native CPU tuning, and IPO/LTO when supported
 - the headless command/protocol layer and the remote project/script HTTP JSON helpers now use `rapidjson` internally; command/event/session payload schemas are unchanged, but the earlier handwritten JSON serializer/parser code on those paths has been removed
 - a root-level `EditorFrontend` workspace now hosts the primary browser editor shell using Next.js, React, and Tailwind CSS
 - `EditorFrontend` includes the docked editor layout, menu bar, toolbar, outliner, details panel, content browser, and the active WebRTC viewport client in `components/engine/viewport.tsx`
@@ -108,9 +111,13 @@ Dev-client example:
 Remote viewport server example:
 
 ```sh
-./AxiomRemoteViewportServer --host 127.0.0.1 --port 8080 --width 1280 --height 720
+./build/release/Headless/AxiomRemoteViewportServer --host 127.0.0.1 --port 8080 --width 1280 --height 720
 ```
 
+For the browser WebRTC viewport, configure the server with
+`-DAXIOM_ENABLE_WEBRTC=ON` and a local WebRTC framework or library path before
+building `AxiomRemoteViewportServer`.
+
 On startup, the process registers `WraithNetworking` with `ModuleManager`; the module initializes the `uWebSockets` transport, reports whether networking initialized successfully, and keeps per-connection metrics available for future runtime introspection.
 
 Then start the browser editor:
@@ -256,7 +263,7 @@ This prototype does not yet provide:
 
 ## Scene Authoring Progress
 
-The authoritative scene-authoring loop has advanced on the `scene-editing` branch:
+The authoritative scene-authoring loop is implemented in the current tree:
 
 - `CreateObjectCommand`, `DuplicateObjectCommand`, and `DeleteObjectCommand` are now implemented as validated authoritative commands with matching `ObjectCreatedEvent` and `ObjectDeletedEvent` events
 - all scene objects are now backed by a `DataModel`-rooted Instance hierarchy (`Axiom/CoreInstance/SceneInstances.h`); `EditorSession` owns the live tree and keeps `EditorSceneState::Items` synchronized as a derived projection
@@ -283,7 +290,7 @@ The authoritative scene-authoring loop has advanced on the `scene-editing` branc
 
 ## Gizmo System
 
-A server-side transform gizmo is now fully implemented on the `scene-editing` branch:
+A server-side transform gizmo is now fully implemented in the current tree:
 
 ### Rendering (`VulkanGizmoRenderer`)
 - a dedicated Vulkan pipeline draws gizmo handles as billboard line-segment quads inserted between mesh rendering and the offscreen capture step
diff --git a/Docs/HeadlessScalabilityBaseline.md b/Docs/HeadlessScalabilityBaseline.md
index 57eb63e0..3335ca81 100644
--- a/Docs/HeadlessScalabilityBaseline.md
+++ b/Docs/HeadlessScalabilityBaseline.md
@@ -1,6 +1,11 @@
 # Headless Scalability Baseline
 
-This note captures where to read the new Phase 0 / Phase 1 headless scalability counters before changing scheduling or asynchronous readback behavior.
+This note captures where to read the Phase 0 / Phase 1 headless scalability counters while tuning scheduling and asynchronous readback behavior.
+
+Phase 1 has landed: offscreen readbacks are now published after later fence polling
+instead of waiting immediately after submit, and idle remote views are throttled by
+dirty/burst scheduling. Treat this file as the current instrumentation guide, not
+as a pre-Phase-1 TODO.
 
 ## Where The Counters Live
 
@@ -8,7 +13,7 @@ This note captures where to read the new Phase 0 / Phase 1 headless scalability
 - Headless render-pass scheduling hook in [Headless/HeadlessSessionHost.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Headless/HeadlessSessionHost.cpp)
 - Offscreen readback hook in [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp)
 
-## What To Capture Before Refactors
+## What To Capture During Tuning
 
 - `LastTickRenderPassCount`: current render passes scheduled for one engine tick.
 - `TotalRenderPasses`: cumulative render-pass work over a benchmark window.
@@ -35,4 +40,7 @@ For each run, record the snapshot after a fixed tick window and compare:
 - pending readbacks during steady state
 - per-client cadence symmetry between active and idle clients
 
-The current baseline is expected to show roughly one render pass per connected remote client per engine tick, with no scheduler distinction yet between active and idle remote clients.
+The old pre-Phase-1 baseline showed roughly one render pass per connected remote
+client per engine tick. The current expected behavior is different: active or
+recently mutated clients should receive a short full-rate burst, while idle
+clients should settle into the scheduler's throttled cadence.
diff --git a/Docs/WraithEngineRefactorPlan.md b/Docs/WraithEngineRefactorPlan.md
index 55f0a555..082191ed 100644
--- a/Docs/WraithEngineRefactorPlan.md
+++ b/Docs/WraithEngineRefactorPlan.md
@@ -1,8 +1,8 @@
 # WraithEngine Refactor Plan
 
 ## Document Status
-- Status: Draft
-- Date: 2026-05-25
+- Status: Draft, updated after Phase 1 headless scalability work
+- Date: 2026-06-07
 - Audience: Engine, rendering, headless runtime, and editor contributors
 - Intended outcome: Turn the current engineering audit into an executable refactor roadmap ordered by dependency, risk, and team size
 
@@ -14,8 +14,8 @@ The most important current facts are:
 
 - Scene authority already lives in editor-owned structs in `EditorSession`, but that data is mirrored into a recursive heap-owned `Instance` tree for hierarchy operations and projection.
 - Render submission still carries `shared_ptr` ownership and still recovers backend-specific Vulkan types through `dynamic_cast` in the submission build path.
-- Headless offscreen rendering still blocks on `vkWaitForFences` immediately after submit, which defeats frames-in-flight for the headless path.
-- Multi-client headless rendering still performs one render pass per remote client per engine tick.
+- Phase 1 headless offscreen rendering no longer waits immediately after submit; completed readbacks are polled and published on later ticks.
+- Multi-client headless rendering now has dirty/burst scheduling and idle-client throttling, but still needs broader policy tuning and production load validation.
 - `RemoteViewportServer` still mixes transport, WebRTC, project lifecycle, script workspace, asset upload, presence, input routing, and frame delivery in one class.
 - String-keyed maps remain widespread in editor, headless, scripting, physics, and scene serialization paths even where stable integer handles would make the authority layer simpler and cheaper.
 
@@ -54,12 +54,12 @@ Validation:
 Current implementation shape:
 
 - Headless uses an offscreen render surface and publishes captured frames through the renderer frame-output seam in [Axiom/Core/Application.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Core/Application.cpp:152), [Axiom/Renderer/Vulkan/VulkanRendererBackend.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanRendererBackend.cpp:150), and [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp:633).
-- In the offscreen path, the draw submission system submits graphics work, marks the capture pending, then immediately waits on `CurrentFrame.RenderFence` before publishing the frame in [Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp](/Users/joshua/Documents/GitHub/WraithEngine/Axiom/Renderer/Vulkan/VulkanDrawSubmissionSystem.cpp:770).
+- In the offscreen path, the draw submission system submits graphics work, records capture ownership at submit time, and publishes completed readbacks from later polling rather than waiting immediately after submit.
 
 Validation:
 
-- This finding is fully accurate.
-- The current implementation preserves frame attribution correctness, but it serializes headless rendering at the point where frames-in-flight should be helping.
+- The original immediate-fence finding has been addressed by the Phase 1 implementation.
+- Remaining risk is around queue depth, capture latency, and production-scale scheduling policy rather than the old unconditional immediate wait.
 
 ### 4. Multi-client rendering
 
@@ -71,8 +71,8 @@ Current implementation shape:
 
 Validation:
 
-- This finding is fully accurate.
-- The engine currently re-renders once per active remote client per tick, even when the scene is shared and only camera/view overlays differ.
+- The original full-rate-per-client baseline has been improved by dirty/burst scheduling and idle-client throttling.
+- The engine still needs deeper policy tuning for many clients, shared-scene reuse, and active/idle fairness under real browser workloads.
 
 ### 5. `RemoteViewportServer`
 
@@ -105,19 +105,21 @@ Validation:
 
 ### Phase 1: Headless scalability slice
 
+Status: implemented. See [HeadlessPhase1ImplementationNote.md](HeadlessPhase1ImplementationNote.md).
+
 Why first:
 
 - Highest current scalability payoff.
 - Lowest semantic blast radius compared with scene storage rewrites.
 - Directly addresses the clearest N-client cost center.
 
-Target architecture:
+Implemented architecture:
 
 - Offscreen rendering uses true asynchronous readback.
 - Completed frames are published when fences signal on a later tick rather than by waiting immediately after submit.
-- Headless render scheduling becomes policy-driven per view, with at least dirty-state or cadence-based throttling for inactive clients.
+- Headless render scheduling is policy-driven per view, with dirty-state and cadence-based throttling for inactive clients.
 
-Migration strategy:
+Completed migration:
 
 1. Remove the immediate fence wait from the headless offscreen path.
 2. Let pending readbacks complete in later frames through the existing `PublishCompletedOffscreenFrames()` path.
@@ -143,9 +145,9 @@ Test strategy:
 - Add a renderer-level regression test or harness for two queued offscreen frames with distinct users.
 - Add instrumentation assertions in headless integration tests for render-pass count versus active-client count.
 
-Incremental or staged:
+Follow-up:
 
-- Incremental.
+- Use the instrumentation baseline to tune policy thresholds and validate many-client behavior.
 
 ### Phase 2: Render submission cleanup
 
diff --git a/Editor/CMakeLists.txt b/Editor/CMakeLists.txt
index 8bc705b2..071b0fa5 100644
--- a/Editor/CMakeLists.txt
+++ b/Editor/CMakeLists.txt
@@ -4,6 +4,8 @@ add_executable(AxiomEditor
   GlfwEditorModule.cpp
 )
 
+axiom_apply_performance_options(AxiomEditor)
+
 target_link_libraries(AxiomEditor PRIVATE AxiomCore)
 target_link_libraries(AxiomEditor PRIVATE
   AxiomScene
diff --git a/FRAME_GRAPH.md b/FRAME_GRAPH.md
new file mode 100644
index 00000000..bb6ba7b9
--- /dev/null
+++ b/FRAME_GRAPH.md
@@ -0,0 +1,51 @@
+# Minimal Frame Task Graph
+
+This is the first low-risk frame graph for WraithEngine. The existing
+`ModuleManager` phase walk remains authoritative for modules that have not been
+ported. The renderer frame module is the only ported module.
+
+## Current Stages
+
+```mermaid
+flowchart LR
+  Simulate["simulate: FrameStart modules"] --> Begin["begin: Renderer::BeginFrame"]
+  Begin --> RecordCommands["legacy Render phase modules submit RenderCommand data"]
+  RecordCommands --> CullRecord["cull + record: Renderer::Render"]
+  CullRecord --> Submit["submit: Renderer::EndFrame"]
+```
+
+`BeginFrame` is scheduled through `Jobs::ScheduleJob` and then waited on at the
+end of `RenderBegin`, because render-command producers still run later in the
+regular `Render` phase and require an open `RenderScene`.
+
+`Renderer::Render` is scheduled after the begin job. It owns the Vulkan scene
+preparation work, including the guarded parallel CPU cull path. `Renderer::EndFrame`
+is scheduled with `Jobs::ScheduleJobAfter` after render and waited on before
+`Application::Step` returns.
+
+## Determinism Rules
+
+- CPU cull resolves mesh handles serially into an immutable per-frame snapshot.
+- Parallel cull workers write only to range-local candidate buckets.
+- Buckets are merged in ascending input-range order before the existing sort and
+  visible-list classification.
+- `AXIOM_VERIFY_PARALLEL_CULL=ON` reruns the serial cull builder and asserts
+  that candidate lists and frustum-cull counts match before downstream work.
+
+## Config Flags
+
+- `AXIOM_PARALLEL_CULL`: enables the Vulkan parallel CPU cull path.
+- `AXIOM_VERIFY_PARALLEL_CULL`: compares parallel cull output against serial
+  output.
+- `AXIOM_FRAME_TASK_GRAPH`: enables the renderer frame module job chain.
+
+Each flag is also exposed through `ApplicationConfig` / `RendererCreateInfo` so
+tests and hosts can force a specific mode without changing global build flags.
+
+## Next Ports
+
+Future ports should move command-producing modules onto explicit producer tasks
+instead of relying on `RenderCommand` global scene state. Once those producers
+return immutable render packets, `BeginFrame` no longer needs to be synchronized
+before the `Render` phase, and the graph can express true
+`simulate -> cull(parallel) -> record -> submit` overlap.
diff --git a/HAL/CMakeLists.txt b/HAL/CMakeLists.txt
index f4fb0d09..df20707f 100644
--- a/HAL/CMakeLists.txt
+++ b/HAL/CMakeLists.txt
@@ -16,6 +16,8 @@ if(APPLE)
   )
 endif()
 
+axiom_apply_performance_options(AxiomHAL)
+
 target_include_directories(AxiomHAL PUBLIC
   "${CMAKE_SOURCE_DIR}"
   "${CMAKE_SOURCE_DIR}/Axiom"
diff --git a/Headless/CMakeLists.txt b/Headless/CMakeLists.txt
index 43d983c7..4dabd7f7 100644
--- a/Headless/CMakeLists.txt
+++ b/Headless/CMakeLists.txt
@@ -7,6 +7,8 @@ add_library(uSockets STATIC
   "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src/eventing/epoll_kqueue.c"
 )
 
+axiom_apply_performance_options(uSockets)
+
 target_include_directories(uSockets PUBLIC
   "${CMAKE_SOURCE_DIR}/ThirdParty/uWebSockets/uSockets/src"
 )
@@ -31,6 +33,8 @@ add_library(WraithNetworking STATIC
   WraithNetworkingModule.cpp
 )
 
+axiom_apply_performance_options(WraithNetworking)
+
 target_include_directories(WraithNetworking PUBLIC
   "${CMAKE_SOURCE_DIR}/Headless"
   "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps"
@@ -66,6 +70,8 @@ add_executable(AxiomHeadless
   WebRtcSession.cpp
 )
 
+axiom_apply_performance_options(AxiomHeadless)
+
 add_executable(AxiomRemoteViewportDevClient
   DevRemoteViewportClient.cpp
   HeadlessCommandProtocol.cpp
@@ -76,6 +82,8 @@ add_executable(AxiomRemoteViewportDevClient
   WebRtcSession.cpp
 )
 
+axiom_apply_performance_options(AxiomRemoteViewportDevClient)
+
 target_include_directories(AxiomHeadless PRIVATE
   "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps"
 )
@@ -121,6 +129,8 @@ add_executable(AxiomRemoteViewportServer
   WebRtcSession.cpp
 )
 
+axiom_apply_performance_options(AxiomRemoteViewportServer)
+
 add_executable(AxiomPackagedRuntime
   AxiomPackagedRuntime.cpp
   HeadlessOverlayModule.cpp
@@ -129,6 +139,8 @@ add_executable(AxiomPackagedRuntime
   HeadlessSessionModule.cpp
 )
 
+axiom_apply_performance_options(AxiomPackagedRuntime)
+
 target_include_directories(AxiomRemoteViewportServer PRIVATE
   "${CMAKE_SOURCE_DIR}/ThirdParty/glfw/deps"
 )
diff --git a/README.md b/README.md
index 9ec578f1..2a749f8c 100644
--- a/README.md
+++ b/README.md
@@ -133,13 +133,35 @@ AxiomCore runtime flow
 
 ## Build
 
-### Quick start (minimal — no scripting, no WebRTC)
+By default, single-config CMake generators use `Release` when no build type is
+specified. The checked-in presets still provide explicit `debug`, `release`, and
+`minsizerel` configurations; `cmake --build --preset <name>` builds in parallel.
+
+### Quick start (debug — no scripting, no WebRTC)
 
 ```bash
 cmake --preset debug
 cmake --build build/debug
 ```
 
+### Performance server build (scripting + WebRTC)
+
+This is the normal high-performance local command for the browser-facing server
+on macOS when using a locally built `WebRTC.framework`:
+
+```bash
+cmake --preset release \
+  -DAXIOM_ENABLE_SCRIPTING=ON \
+  -DAXIOM_ENABLE_WEBRTC=ON \
+  -DAXIOM_WEBRTC_FRAMEWORK_PATH=/absolute/path/to/WebRTC.framework
+
+cmake --build build/release --target AxiomRemoteViewportServer
+```
+
+The release preset uses optimized first-party compile defaults, native CPU tuning,
+IPO/LTO when the toolchain supports it, threaded rendering, the renderer frame task
+graph, and parallel CPU culling; `cmake --build --preset release` adds parallel jobs.
+
 ### With physics enabled
 
 Physics uses Jolt and is currently enabled by default, but this is the explicit build if you want to guarantee it is on:
@@ -233,6 +255,9 @@ cmake --preset release -DAXIOM_ENABLE_SCRIPTING=ON
 cmake --build build/release
 ```
 
+The release preset is the recommended baseline for runtime performance. Use the
+debug preset for development diagnostics and tests.
+
 ---
 
 ## CMake Options Reference
@@ -245,6 +270,12 @@ cmake --build build/release
 | `AXIOM_SCRIPTING_TRUST_DEFAULT` | `STRING` | `Restricted` | Default sandbox tier for user scripts. `Restricted` (hosted — blocks `System.Net.*`, `System.Reflection.Emit`, etc.) or `Trusted` (local dev — full BCL access) |
 | `AXIOM_ENABLE_WEBRTC` | `BOOL` | `OFF` | Enable the macOS WebRTC transport |
 | `AXIOM_ENABLE_PHYSICS` | `BOOL` | `ON` | Enable the JoltPhysics runtime simulation seam |
+| `AXIOM_THREADED_RENDER` | `BOOL` | `ON` | Enable the threaded renderer and worker job system |
+| `AXIOM_PARALLEL_CULL` | `BOOL` | `ON` | Enable guarded parallel CPU culling in the Vulkan scene renderer |
+| `AXIOM_FRAME_TASK_GRAPH` | `BOOL` | `ON` | Enable the renderer frame task graph backed by enkiTS jobs |
+| `AXIOM_ENABLE_PERFORMANCE_DEFAULTS` | `BOOL` | `ON` | Apply optimized compile/link defaults to first-party targets |
+| `AXIOM_OPTIMIZE_FOR_NATIVE_ARCH` | `BOOL` | `ON` | Tune optimized builds for the host CPU architecture |
+| `AXIOM_ENABLE_IPO` | `BOOL` | `ON` | Enable interprocedural optimization/LTO for optimized builds when supported |
 | `AXIOM_WEBRTC_FRAMEWORK_PATH` | `PATH` | _(empty)_ | Path to a `WebRTC.framework` bundle (macOS framework variant) |
 | `AXIOM_WEBRTC_LIBRARY_PATH` | `FILEPATH` | _(empty)_ | Path to a `libwebrtc` static/shared binary (non-framework variant) |
 | `AXIOM_WEBRTC_INCLUDE_DIR` | `PATH` | _(empty)_ | Include directory for the non-framework libwebrtc variant |
@@ -262,11 +293,25 @@ cmake --build build/release
 
 ### Remote viewport server
 
+Release build:
+
+```bash
+./build/release/Headless/AxiomRemoteViewportServer \
+  --host 127.0.0.1 --port 8080 --width 1280 --height 720
+```
+
+Debug build:
+
 ```bash
 ./build/debug/Headless/AxiomRemoteViewportServer \
   --host 127.0.0.1 --port 8080 --width 1280 --height 720
 ```
 
+If WebRTC was not enabled at configure time, the server starts but the browser
+viewport cannot receive the H.264 WebRTC media stream. Reconfigure with
+`-DAXIOM_ENABLE_WEBRTC=ON` and either `AXIOM_WEBRTC_FRAMEWORK_PATH` or the
+`AXIOM_WEBRTC_LIBRARY_PATH` / `AXIOM_WEBRTC_INCLUDE_DIR` pair.
+
 At startup, `AxiomRemoteViewportServer` registers the toggleable `WraithNetworking` module with `ModuleManager`. That module owns transport initialization, publishes connection metrics/state snapshots, and keeps the existing WebRTC session logic active behind the same public server API.
 
 ### Browser editor
@@ -309,7 +354,7 @@ To test the packaged runtime binary built in `build/` against an existing staged
 package:
 
 ```bash
-./build/debug/Headless/AxiomPackagedRuntime \
+./build/release/Headless/AxiomPackagedRuntime \
   --package-root /absolute/path/to/Projects/<project-slug>/Package
 ```
 
@@ -318,7 +363,7 @@ Open `http://localhost:3000` in your browser.
 ### Local native editor (no browser required)
 
 ```bash
-./build/debug/Editor/AxiomEditor
+./build/release/Editor/AxiomEditor
 ```
 
 ---
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index f83ba47c..cca09e34 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -19,6 +19,8 @@ add_executable(AxiomTests
   ../Headless/WebRtcSession.cpp
 )
 
+axiom_apply_performance_options(AxiomTests)
+
 if(APPLE AND EXISTS "${CMAKE_SOURCE_DIR}/Headless/MacOSWebRtcSession.mm")
   target_sources(AxiomTests PRIVATE
     ../Headless/MacOSWebRtcSession.mm
diff --git a/Tests/RenderSubmissionTests.cpp b/Tests/RenderSubmissionTests.cpp
index 8b91c92e..eed409ed 100644
--- a/Tests/RenderSubmissionTests.cpp
+++ b/Tests/RenderSubmissionTests.cpp
@@ -2,6 +2,7 @@
 #include <Core/VulkanLoader.h>
 #include <Renderer/Mesh.h>
 #include <Renderer/Camera.h>
+#include <Jobs/JobSystem.h>
 #include <Renderer/OffscreenRenderSurface.h>
 #include <Renderer/RenderCommand.h>
 #include <Renderer/Renderer.h>
@@ -107,6 +108,12 @@ Axiom::MeshData MakeTriangleMesh() {
       .BoundsMax = {0.25f, 0.25f, 0.0f},
   };
 }
+
+class ScopedJobSystem {
+public:
+  ScopedJobSystem() { Axiom::Jobs::Startup(); }
+  ~ScopedJobSystem() { Axiom::Jobs::Shutdown(); }
+};
 } // namespace
 
 TEST(RenderSubmissionTests, EditorSceneRendererAdapterReusesCachedMeshUntilAssetChanges) {
@@ -166,22 +173,25 @@ TEST(RenderSubmissionTests, EditorSceneRendererAdapterDropsDeletedObjectsFromCac
 }
 
 TEST(RenderSubmissionTests,
-     VulkanRendererRendersAllFiveThousandSubmittedMeshesOffscreen) {
+     VulkanRendererRendersAllTenThousandSubmittedMeshesOffscreen) {
   constexpr uint32_t Width = 1280;
   constexpr uint32_t Height = 720;
-  constexpr size_t MeshCount = 5000;
+  constexpr size_t MeshCount = 10000;
 
   EnsureLoggingInitialized();
   if (!Axiom::CanInitializeHeadlessVulkan()) {
     GTEST_SKIP() << "Headless Vulkan is unavailable on this host";
   }
 
+  ScopedJobSystem Jobs;
   auto Surface = std::make_shared<Axiom::OffscreenRenderSurface>(Width, Height);
   Axiom::Renderer Renderer;
   Renderer.Init({
       .TargetSurface = Surface,
       .Width = Width,
       .Height = Height,
+      .EnableParallelCull = true,
+      .VerifyParallelCull = true,
   });
   Renderer.SetViewMode(Axiom::RendererViewMode::Wireframe);
 
diff --git a/Tests/ThreadingTests.cpp b/Tests/ThreadingTests.cpp
index 793f2ce7..3bea6f16 100644
--- a/Tests/ThreadingTests.cpp
+++ b/Tests/ThreadingTests.cpp
@@ -16,11 +16,16 @@
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
+#include <cstdlib>
 #include <mutex>
+#include <new>
 #include <string>
 #include <thread>
 
 namespace {
+std::atomic<bool> g_CountAllocations{false};
+std::atomic<size_t> g_AllocationCount{0};
+
 void EnsureLoggingInitialized() {
   static bool Initialized = false;
   if (!Initialized) {
@@ -60,6 +65,65 @@ Axiom::CommandContext MakeContext(uint64_t FrameIndex = 1,
 }
 } // namespace
 
+void *operator new(std::size_t Size) {
+  if (g_CountAllocations.load(std::memory_order_relaxed)) {
+    g_AllocationCount.fetch_add(1, std::memory_order_relaxed);
+  }
+  if (void *Pointer = std::malloc(Size)) {
+    return Pointer;
+  }
+  throw std::bad_alloc();
+}
+
+void *operator new[](std::size_t Size) {
+  if (g_CountAllocations.load(std::memory_order_relaxed)) {
+    g_AllocationCount.fetch_add(1, std::memory_order_relaxed);
+  }
+  if (void *Pointer = std::malloc(Size)) {
+    return Pointer;
+  }
+  throw std::bad_alloc();
+}
+
+void *operator new(std::size_t Size, std::align_val_t Alignment) {
+  if (g_CountAllocations.load(std::memory_order_relaxed)) {
+    g_AllocationCount.fetch_add(1, std::memory_order_relaxed);
+  }
+  void *Pointer = nullptr;
+  if (posix_memalign(&Pointer, static_cast<std::size_t>(Alignment), Size) == 0) {
+    return Pointer;
+  }
+  throw std::bad_alloc();
+}
+
+void *operator new[](std::size_t Size, std::align_val_t Alignment) {
+  if (g_CountAllocations.load(std::memory_order_relaxed)) {
+    g_AllocationCount.fetch_add(1, std::memory_order_relaxed);
+  }
+  void *Pointer = nullptr;
+  if (posix_memalign(&Pointer, static_cast<std::size_t>(Alignment), Size) == 0) {
+    return Pointer;
+  }
+  throw std::bad_alloc();
+}
+
+void operator delete(void *Pointer) noexcept { std::free(Pointer); }
+void operator delete[](void *Pointer) noexcept { std::free(Pointer); }
+void operator delete(void *Pointer, std::size_t) noexcept { std::free(Pointer); }
+void operator delete[](void *Pointer, std::size_t) noexcept { std::free(Pointer); }
+void operator delete(void *Pointer, std::align_val_t) noexcept {
+  std::free(Pointer);
+}
+void operator delete[](void *Pointer, std::align_val_t) noexcept {
+  std::free(Pointer);
+}
+void operator delete(void *Pointer, std::size_t, std::align_val_t) noexcept {
+  std::free(Pointer);
+}
+void operator delete[](void *Pointer, std::size_t, std::align_val_t) noexcept {
+  std::free(Pointer);
+}
+
 TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) {
   Axiom::Jobs::Startup();
 
@@ -85,6 +149,34 @@ TEST(ThreadingTests, JobsRunWithDependenciesAndParallelFor) {
   Axiom::Jobs::Shutdown();
 }
 
+TEST(ThreadingTests, SchedulingTrivialJobsDoesNotAllocatePerJob) {
+  namespace Jobs = Axiom::Jobs;
+  constexpr size_t JobCount = 100000;
+  // Static storage duration, so the handle array is not placed on the stack.
+  static std::array<Jobs::JobHandle, JobCount> Handles;
+
+  Jobs::Startup();
+  // One job is scheduled and awaited before the counter is armed.
+  Jobs::JobHandle Warmup = Jobs::ScheduleJob([]() {});
+  Jobs::Wait(Warmup);
+
+  // Zero the tally, then let the replacement operator new overloads count.
+  g_AllocationCount.store(0, std::memory_order_relaxed);
+  g_CountAllocations.store(true, std::memory_order_release);
+  for (auto &Slot : Handles) {
+    Slot = Jobs::ScheduleJob([]() {});
+  }
+  for (auto Pending : Handles) {
+    Jobs::Wait(Pending);
+  }
+  g_CountAllocations.store(false, std::memory_order_release);
+
+  // Counting is already off, so Shutdown cannot add to the sampled total.
+  const size_t Observed = g_AllocationCount.load(std::memory_order_relaxed);
+  Jobs::Shutdown();
+  EXPECT_EQ(Observed, 0u);
+}
+
 TEST(ThreadingTests, ThreadedRendererRunsHeadlessForThousandFramesWithoutDeadlock) {
 #if AXIOM_THREADED_RENDER == 0
   GTEST_SKIP() << "Threaded renderer is disabled in this build";
diff --git a/cmake/AxiomCompilerOptions.cmake b/cmake/AxiomCompilerOptions.cmake
new file mode 100644
index 00000000..5f5809dc
--- /dev/null
+++ b/cmake/AxiomCompilerOptions.cmake
@@ -0,0 +1,78 @@
+include(CheckIPOSupported)
+
+get_property(AXIOM_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(NOT AXIOM_IS_MULTI_CONFIG AND NOT CMAKE_BUILD_TYPE)  # single-config, unset
+  set(CMAKE_BUILD_TYPE Release CACHE STRING
+      "Build type for single-configuration generators" FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+      Debug Release RelWithDebInfo MinSizeRel)
+endif()
+# Switches for the optimized-build defaults; every one of them defaults to ON.
+option(AXIOM_ENABLE_PERFORMANCE_DEFAULTS
+       "Use high-performance compile defaults for first-party targets" ON)
+option(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH
+       "Tune optimized builds for the host CPU architecture" ON)
+option(AXIOM_ENABLE_IPO
+       "Enable interprocedural optimization/LTO for optimized builds" ON)
+# Probe the toolchain once; AXIOM_IPO_SUPPORTED is ON only if the probe passes.
+set(AXIOM_IPO_SUPPORTED OFF)
+if(AXIOM_ENABLE_IPO)
+  check_ipo_supported(RESULT AXIOM_IPO_SUPPORTED OUTPUT AXIOM_IPO_OUTPUT)
+  if(NOT AXIOM_IPO_SUPPORTED)
+    message(WARNING "IPO/LTO was requested but is not supported by this "
+                    "toolchain: ${AXIOM_IPO_OUTPUT}")
+  endif()
+endif()
+
+# Apply Axiom's optimized-build defaults to <target_name>: per-configuration
+# optimization flags, -march=native on GNU/Clang when enabled, and IPO when
+# the toolchain probe succeeded. A missing target is a fatal error; INTERFACE
+# and UTILITY targets, or AXIOM_ENABLE_PERFORMANCE_DEFAULTS=OFF, are no-ops.
+function(axiom_apply_performance_options target_name)
+  # Checked before the opt-out so a misspelled target fails in every setup.
+  if(NOT TARGET ${target_name})
+    message(FATAL_ERROR
+      "axiom_apply_performance_options called for missing target: ${target_name}")
+  endif()
+  if(NOT AXIOM_ENABLE_PERFORMANCE_DEFAULTS)
+    return()
+  endif()
+
+  get_target_property(AXIOM_TARGET_TYPE ${target_name} TYPE)
+  if(AXIOM_TARGET_TYPE STREQUAL "INTERFACE_LIBRARY" OR
+     AXIOM_TARGET_TYPE STREQUAL "UTILITY")
+    return()
+  endif()
+
+  # Multi-flag generator expressions are quoted and ';'-separated so argument
+  # parsing cannot split them at the whitespace between flags.
+  set(AXIOM_OPTIMIZED_CONFIG
+      "$<OR:$<CONFIG:Release>,$<CONFIG:RelWithDebInfo>,$<CONFIG:MinSizeRel>>")
+  if(MSVC)
+    target_compile_options(${target_name} PRIVATE
+      "$<$<CONFIG:Release>:/O2;/Ob3;/Oi;/Ot;/Gy;/Gw>"
+      "$<$<CONFIG:RelWithDebInfo>:/O2;/Ob3;/Oi;/Ot;/Gy;/Gw>"
+      # Size-optimized, mirroring -Os in the GNU/Clang branch below.
+      "$<$<CONFIG:MinSizeRel>:/O1;/Gy;/Gw>"
+    )
+    target_link_options(${target_name} PRIVATE
+      "$<${AXIOM_OPTIMIZED_CONFIG}:/OPT:REF;/OPT:ICF>")
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+    target_compile_options(${target_name} PRIVATE
+      $<$<CONFIG:Release>:-O3>
+      $<$<CONFIG:RelWithDebInfo>:-O3>
+      $<$<CONFIG:MinSizeRel>:-Os>
+    )
+    if(AXIOM_OPTIMIZE_FOR_NATIVE_ARCH)
+      target_compile_options(${target_name} PRIVATE
+        "$<${AXIOM_OPTIMIZED_CONFIG}:-march=native>")
+    endif()
+  endif()
+
+  if(AXIOM_IPO_SUPPORTED)
+    set_target_properties(${target_name} PROPERTIES
+      INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE
+      INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE
+      INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE)
+  endif()
+endfunction()