From a016b6ba50bd44eff82fa8c34acef85b0999805a Mon Sep 17 00:00:00 2001
From: haykh <haykh.astro@gmail.com>
Date: Mon, 18 May 2026 08:51:01 -0400
Subject: [PATCH 01/33] version bump

---
 CMakeLists.txt | 75 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd5f0c258..5482348f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(PROJECT_NAME entity)
 
 project(
   ${PROJECT_NAME}
-  VERSION 1.4.0
+  VERSION 1.5.0
   LANGUAGES CXX C)
 add_compile_options("-D ENTITY_VERSION=\"${PROJECT_VERSION}\"")
 set(hash_cmd "git diff --quiet src/ && echo $(git rev-parse HEAD) ")
@@ -28,35 +28,35 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/defaults.cmake)
 
 # defaults
 set(DEBUG
-  ${default_debug}
-  CACHE BOOL "Debug mode")
+    ${default_debug}
+    CACHE BOOL "Debug mode")
 
 set(precision
-  ${default_precision}
-  CACHE STRING "Precision")
+    ${default_precision}
+    CACHE STRING "Precision")
 
 set(deposit
-  ${default_deposit}
-  CACHE STRING "Deposit")
+    ${default_deposit}
+    CACHE STRING "Deposit")
 
 set(shape_order
-  ${default_shape_order}
-  CACHE STRING "Shape function")
+    ${default_shape_order}
+    CACHE STRING "Shape function")
 
 set(pgen
-  ${default_pgen}
-  CACHE STRING "Problem generator")
+    ${default_pgen}
+    CACHE STRING "Problem generator")
 
 set(output
-  ${default_output}
-  CACHE BOOL "Enable output")
+    ${default_output}
+    CACHE BOOL "Enable output")
 set(mpi
-  ${default_mpi}
-  CACHE BOOL "Use MPI")
+    ${default_mpi}
+    CACHE BOOL "Use MPI")
 
 set(gpu_aware_mpi
-  ${default_gpu_aware_mpi}
-  CACHE BOOL "Enable GPU-aware MPI")
+    ${default_gpu_aware_mpi}
+    CACHE BOOL "Enable GPU-aware MPI")
 
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
@@ -65,42 +65,42 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 if(${DEBUG} STREQUAL "OFF")
   set(CMAKE_BUILD_TYPE
-    Release
-    CACHE STRING "CMake build type")
+      Release
+      CACHE STRING "CMake build type")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -O3")
 else()
   set(CMAKE_BUILD_TYPE
-    Debug
-    CACHE STRING "CMake build type")
+      Debug
+      CACHE STRING "CMake build type")
   set(CMAKE_CXX_FLAGS
-    "${CMAKE_CXX_FLAGS} -DDEBUG -Wall -Wextra -Wno-unknown-pragmas")
+      "${CMAKE_CXX_FLAGS} -DDEBUG -Wall -Wextra -Wno-unknown-pragmas")
 endif()
 
 # options
 set(precisions
-  "single" "double"
-  CACHE STRING "Precisions")
+    "single" "double"
+    CACHE STRING "Precisions")
 
 set(deposits
-  "zigzag" "esirkepov"
-  CACHE STRING "Deposits")
+    "zigzag" "esirkepov"
+    CACHE STRING "Deposits")
 
 if(${deposit} STREQUAL "zigzag")
   set(shape_order
-    ${default_shape_order}
-    CACHE STRING "Shape functions")
+      ${default_shape_order}
+      CACHE STRING "Shape functions")
 endif()
 
 set(shape_orders
-  "1;2;3;4;5;6;7;8;9;10;11"
-  CACHE STRING "Shape orders")
+    "1;2;3;4;5;6;7;8;9;10;11"
+    CACHE STRING "Shape orders")
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.cmake)
 
 # ------------------------- Third-Party Tests ------------------------------ #
 set(BUILD_TESTING
-  OFF
-  CACHE BOOL "Build tests")
+    OFF
+    CACHE BOOL "Build tests")
 
 # ------------------------ Third-party dependencies ------------------------ #
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/dependencies.cmake)
@@ -129,8 +129,8 @@ else()
 endif()
 
 if(("${Kokkos_DEVICES}" MATCHES "CUDA")
-  OR ("${Kokkos_DEVICES}" MATCHES "HIP")
-  OR ("${Kokkos_DEVICES}" MATCHES "SYCL"))
+   OR ("${Kokkos_DEVICES}" MATCHES "HIP")
+   OR ("${Kokkos_DEVICES}" MATCHES "SYCL"))
   set(DEVICE_ENABLED ON)
 else()
   set(DEVICE_ENABLED OFF)
@@ -148,8 +148,8 @@ if(${mpi})
     endif()
   else()
     set(gpu_aware_mpi
-      OFF
-      CACHE BOOL "Use explicit copy when using MPI + GPU")
+        OFF
+        CACHE BOOL "Use explicit copy when using MPI + GPU")
   endif()
 endif()
 
@@ -195,7 +195,8 @@ else()
     string(REPLACE "/" "_" pg_nodir ${pg_nodir})
     set(pgen_suffix "_${pg_nodir}")
     set_problem_generator(${pg})
-    add_subdirectory(${SRC_DIR}/engines ${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/engines)
+    add_subdirectory(${SRC_DIR}/engines
+                     ${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/engines)
     add_subdirectory(${SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/src)
     list(APPEND pgens_short ${PGEN})
   endforeach()

From 00fa0bf65a35c111c94dc8924ee4e6eda8285629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:25:22 +0200
Subject: [PATCH 02/33] port: team policy and vendor specific sort

---
 CMakeLists.txt                              |   66 +
 cmake/defaults.cmake                        |   15 +
 cmake/report.cmake                          |   25 +
 src/engines/srpic/currents.h                |  135 +-
 src/framework/containers/particles.h        |   58 +
 src/framework/containers/particles_sort.cpp |  341 ++++-
 src/global/arch/kokkos_aliases.h            |   27 +
 src/global/utils/sort_dispatch.h            |  171 +++
 src/global/utils/sorting.h                  |  155 ++-
 src/kernels/currents_deposit.hpp            | 1395 ++++++++++++-------
 tests/framework/CMakeLists.txt              |    6 +
 tests/framework/sort_by_key.cpp             |  110 ++
 tests/global/tiling.cpp                     |   29 +-
 tests/kernels/CMakeLists.txt                |    3 +
 tests/kernels/deposit_tiled.cpp             |  262 ++++
 15 files changed, 2286 insertions(+), 512 deletions(-)
 create mode 100644 src/global/utils/sort_dispatch.h
 create mode 100644 tests/framework/sort_by_key.cpp
 create mode 100644 tests/kernels/deposit_tiled.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd5f0c258..acac1d7d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,16 @@ set(gpu_aware_mpi
   ${default_gpu_aware_mpi}
   CACHE BOOL "Enable GPU-aware MPI")
 
+set(team_policy
+  ${default_team_policy}
+  CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")
+set(team_policy_tile_size
+  ${default_team_policy_tile_size}
+  CACHE STRING "team_policy tile edge length in cells")
+set(team_policy_tile_sizes
+  "4;6;8;10;12"
+  CACHE STRING "team_policy tile-size choices")
+
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -136,6 +146,62 @@ else()
   set(DEVICE_ENABLED OFF)
 endif()
 
+# ------------------------------ team_policy wiring ------------------------ #
+if(${team_policy})
+  list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
+  if(_tps_idx EQUAL -1)
+    message(FATAL_ERROR
+      "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
+      "got '${team_policy_tile_size}'${ColorReset}")
+  endif()
+  add_compile_options("-D TEAM_POLICY")
+  add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")
+
+  # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
+  # when found; falls back to Kokkos::BinSort otherwise.
+  if("${Kokkos_DEVICES}" MATCHES "SYCL")
+    find_package(oneDPL QUIET)
+    if(oneDPL_FOUND)
+      message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
+      add_compile_options("-D ONEDPL_ENABLED")
+      set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
+    else()
+      message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
+                     "for SYCL sort_by_key")
+    endif()
+  endif()
+
+  if("${Kokkos_DEVICES}" MATCHES "CUDA")
+    find_package(Thrust QUIET)
+    if(Thrust_FOUND)
+      message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
+      add_compile_options("-D THRUST_ENABLED")
+    else()
+      message(STATUS "team_policy: Thrust not found; using BinSort fallback "
+                     "for CUDA sort_by_key")
+    endif()
+  endif()
+
+  if("${Kokkos_DEVICES}" MATCHES "HIP")
+    # rocThrust ships with ROCm and exposes the same thrust:: API. Using
+    # it lets the HIP backend build a single permutation via
+    # sort_by_key and gather all SoA members through one reused buffer,
+    # instead of the legacy per-member Kokkos::BinSort path which
+    # allocates a fresh `sorted_values` buffer for every member every
+    # step (the dominant source of allocator churn / fragmentation on
+    # ROCm).
+    find_package(rocthrust QUIET)
+    if(rocthrust_FOUND)
+      message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
+      add_compile_options("-D ROCTHRUST_ENABLED")
+      set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
+    else()
+      message(STATUS "team_policy: rocThrust not found; using BinSort "
+                     "fallback for HIP sort_by_key")
+    endif()
+  endif()
+endif()
+
 # MPI
 if(${mpi})
   find_or_fetch_dependency(MPI FALSE REQUIRED)
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index fb8790019..a85accf84 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -92,3 +92,18 @@ else()
 endif()
 
 set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)
+
+if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
+  set(default_team_policy
+      $ENV{Entity_ENABLE_TEAM_POLICY}
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+else()
+  set(default_team_policy
+      OFF
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+endif()
+set_property(CACHE default_team_policy PROPERTY TYPE BOOL)
+
+set(default_team_policy_tile_size
+    8
+    CACHE INTERNAL "Default tile edge length in cells for team_policy")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index 7b145bfc7..65a22a7a6 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED})
     GPU_AWARE_MPI_REPORT
     46)
 endif()
+printchoices(
+  "Team Policy"
+  "team_policy"
+  "${ON_OFF_VALUES}"
+  ${team_policy}
+  OFF
+  "${Green}"
+  TEAM_POLICY_REPORT
+  46)
+if(${team_policy})
+  printchoices(
+    "Team Tile Size"
+    "team_policy_tile_size"
+    "${team_policy_tile_sizes}"
+    ${team_policy_tile_size}
+    ${default_team_policy_tile_size}
+    "${Blue}"
+    TEAM_POLICY_TILE_SIZE_REPORT
+    46)
+endif()
 printchoices(
   "Debug mode"
   "DEBUG"
@@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED})
   string(APPEND REPORT_TEXT "  " ${GPU_AWARE_MPI_REPORT} "\n")
 endif()
 
+string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
+if(${team_policy})
+  string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
+endif()
+
 string(
   APPEND
   REPORT_TEXT
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 3afabea8a..faf0bb3ad 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -2,7 +2,8 @@
  * @file engines/srpic/currents.h
  * @brief Current deposition and filtering routines for the SRPIC engine
  * @implements
- *   - ntt::srpic::CallDepositKernel<> -> void
+ *   - ntt::srpic::CallDepositKernel<> -> void                 (flat path)
+ *   - ntt::srpic::CallDepositKernelTiled<> -> void            (TEAM_POLICY)
  *   - ntt::srpic::CurrentsDeposit<> -> void
  *   - ntt::srpic::CurrentsFilter<> -> void
  * @namespaces:
@@ -61,11 +62,142 @@ namespace ntt {
                              dt));
     }
 
+#if defined(TEAM_POLICY)
+    /**
+     * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
+     *
+     * Iterates over `tile_layout.ntiles_total` teams; each team accumulates
+     * its tile's particle contributions in SLM scratch and atomically
+     * flushes to the global J. Requires the species to have been sorted
+     * with `team_policy` enabled (`tile_layout` populated by
+     * `SortSpatially`).
+     *
+     * Falls back to the flat kernel if `tile_offsets` is empty — this
+     * happens on the first step before the first sort, or for very small
+     * species that exited early in `SortSpatially`. The fallback uses the
+     * passed-in `scatter_cur` so the caller still composes correctly.
+     */
+    template <SRMetricClass M, unsigned short O>
+    void CallDepositKernelTiled(
+      const Particles<M::Dim, M::CoordType>& species,
+      const M&                               local_metric,
+      const ndfield_t<M::Dim, 3>&            cur,
+      real_t                                 dt) {
+      static_assert(O <= 11u, "Shape order must be <= 11");
+      constexpr unsigned short T = static_cast<unsigned short>(
+        TEAM_POLICY_TILE_SIZE);
+      const auto& layout = species.tile_layout();
+      raise::ErrorIf(layout.ntiles_total == 0u,
+                     "CallDepositKernelTiled: tile_layout has 0 tiles — call "
+                     "SortSpatially before CurrentsDeposit",
+                     HERE);
+      raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
+                     "CallDepositKernelTiled: tile_offsets size inconsistent "
+                     "with ntiles_total",
+                     HERE);
+
+      using kernel_t = kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC,
+                                                            M,
+                                                            O,
+                                                            T>;
+      kernel_t kern { cur,
+                      species.i1,
+                      species.i2,
+                      species.i3,
+                      species.i1_prev,
+                      species.i2_prev,
+                      species.i3_prev,
+                      species.dx1,
+                      species.dx2,
+                      species.dx3,
+                      species.dx1_prev,
+                      species.dx2_prev,
+                      species.dx3_prev,
+                      species.ux1,
+                      species.ux2,
+                      species.ux3,
+                      species.phi,
+                      species.weight,
+                      species.tag,
+                      local_metric,
+                      (real_t)(species.charge()),
+                      dt,
+                      layout };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, kern);
+    }
+#endif // TEAM_POLICY
+
     template <SRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
                          const prm::Parameters&       engine_params) {
       const auto dt = engine_params.get<real_t>("dt");
       Kokkos::deep_copy(domain.fields.cur, ZERO);
+
+#if defined(TEAM_POLICY)
+
+      // First-step fallback: if any contributing species has not been
+      // sorted yet (tile_layout still empty), fall back to the flat
+      // scatter-view path for that step. Subsequent steps see populated
+      // layouts and use the tiled kernel.
+      bool any_unsorted = false;
+      for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
+        if (species.tile_layout().ntiles_total == 0u or
+            species.tile_layout().tile_offsets.extent(0) == 0u) {
+          any_unsorted = true;
+          break;
+        }
+      }
+      if (any_unsorted) {
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(
+          domain.fields.cur);
+        for (auto& species : domain.species) {
+          if ((species.pusher() == ParticlePusher::NONE) or
+              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+            continue;
+          }
+          logger::Checkpoint(
+            fmt::format("Launching currents deposit (flat fallback, no sort yet) "
+                        "for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          CallDepositKernel<M, SHAPE_ORDER>(species,
+                                             domain.mesh.metric,
+                                             scatter_cur,
+                                             dt);
+        }
+        Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+      } else {
+        for (auto& species : domain.species) {
+          if ((species.pusher() == ParticlePusher::NONE) or
+              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+            continue;
+          }
+          logger::Checkpoint(
+            fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+
+          CallDepositKernelTiled<M, SHAPE_ORDER>(species,
+                                                  domain.mesh.metric,
+                                                  domain.fields.cur,
+                                                  dt);
+        }
+      }
+#else
       auto scatter_cur = Kokkos::Experimental::create_scatter_view(
         domain.fields.cur);
       for (auto& species : domain.species) {
@@ -84,6 +216,7 @@ namespace ntt {
         CallDepositKernel<M, SHAPE_ORDER>(species, domain.mesh.metric, scatter_cur, dt);
       }
       Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+#endif
     }
 
     template <SRMetricClass M>
diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 895026552..0a15cb5d9 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -25,6 +25,7 @@
 #include "traits/metric.h"
 #include "utils/error.h"
 #include "utils/formatting.h"
+#include "utils/sorting.h"
 
 #include "framework/containers/species.h"
 #include "framework/domain/grid.h"
@@ -90,6 +91,30 @@ namespace ntt {
     const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) };
 #endif
 
+    // team_policy: tile metadata produced by SortSpatially
+    // and consumed by the tiled deposit / pusher kernels. Lazily
+    // allocated on first sort. The sort backend itself (oneDPL on SYCL,
+    // Thrust on CUDA, std::sort on Host, Kokkos::BinSort otherwise) is
+    // selected at compile time based on the Kokkos device and the
+    // vendor libraries detected by CMake.
+    TileLayout<D> m_tile_layout {};
+
+#if defined(TEAM_POLICY) &&                                                    \
+  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
+   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
+    // Persistent byte scratch reused by every SoA-member gather in
+    // `apply_permutation_to_soa`, across all members and all timesteps.
+    // Without this each member would allocate (and free) its own
+    // transient buffer every sort; recycling one persistent buffer
+    // removes that allocation churn entirely — the structural fix for
+    // the ROCm sort slowdown / fragmentation. Grown monotonically to
+    // the largest required size, never shrunk. Kokkos device
+    // allocations are over-aligned (>= 8 B), so reinterpreting the
+    // bytes as any SoA element type (<= 8 B PODs) is well-defined.
+    array_t<char*> m_perm_scratch {};
+#endif
+
   public:
     // for empty allocation
     Particles() {}
@@ -276,9 +301,42 @@ namespace ntt {
     /**
      * @brief Sort particles spatially by their cell indices
      * @param grid The grid object to get the cell information for sorting
+     * @note In team_policy mode (compile-time `team_policy=ON`), also
+     *       populates `m_tile_layout` with tile-offset and per-tile
+     *       permutation metadata that the tiled deposit/pusher kernels
+     *       consume.
      */
     void SortSpatially(const Grid<D>&);
 
+#if defined(TEAM_POLICY) &&                                                    \
+  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
+   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
+  private:
+    /**
+     * @brief Apply a particle-index permutation (built by oneDPL/Thrust
+     *        sort_by_key) to every SoA member array. Sequential — one
+     *        transient buffer at a time, fenced before scope exit.
+     *        Only compiled when a vendor sort backend is enabled; the
+     *        BinSort path applies the permutation in place via
+     *        `sorter.sort(view)` instead.
+     */
+    void apply_permutation_to_soa(const prtl_perm_t& perm);
+
+  public:
+#endif
+
+    /**
+     * @brief Read-only access to the tile layout produced by the most
+     *        recent SortSpatially call. Returns a default-constructed
+     *        layout (`ntiles_total == 0`) when the species has not yet
+     *        been sorted.
+     */
+    [[nodiscard]]
+    auto tile_layout() const -> const TileLayout<D>& {
+      return m_tile_layout;
+    }
+
     /**
      * @brief Copy particle data from device to host.
      */
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 904fb3fc7..c04813e12 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -8,10 +8,22 @@
 #include "framework/containers/particles.h"
 #include "framework/domain/grid.h"
 
+#if defined(TEAM_POLICY)
+  #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                    \
+      (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                    \
+      (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))
+    #define TEAM_POLICY_USE_VENDOR_SORT
+    #include "utils/sort_dispatch.h"
+  #endif
+#endif
+
 #include <Kokkos_Core.hpp>
 #include <Kokkos_ScatterView.hpp>
+#include <Kokkos_Sort.hpp>
 #include <Kokkos_StdAlgorithms.hpp>
 
+#include <algorithm>
+#include <cstddef>
 #include <string>
 #include <utility>
 #include <vector>
@@ -195,6 +207,192 @@ namespace ntt {
 
   template <Dimension D, Coord::type C>
   void Particles<D, C>::SortSpatially(const Grid<D>& grid) {
+#if defined(TEAM_POLICY)
+    // ---------------------- team_policy: tile-based sort ------------------ //
+    const auto npart_local = npart();
+    if (npart_local == 0u) {
+      m_tile_layout = TileLayout<D> {};
+      m_is_sorted   = true;
+      return;
+    }
+
+    constexpr unsigned short T = static_cast<unsigned short>(
+      TEAM_POLICY_TILE_SIZE);
+    static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0");
+
+    // 1. Compute per-axis tile counts and total_tiles.
+    const auto ncells_active = grid.n_active();
+    ncells_t   ntx[3] { 1u, 1u, 1u };
+    ncells_t   total_tiles { 1u };
+    if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[0]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[0]) / static_cast<double>(T)));
+      total_tiles *= ntx[0];
+    }
+    if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[1]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[1]) / static_cast<double>(T)));
+      total_tiles *= ntx[1];
+    }
+    if constexpr (D == Dim::_3D) {
+      ntx[2]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[2]) / static_cast<double>(T)));
+      total_tiles *= ntx[2];
+    }
+
+    // 2. Compute per-particle tile key (with min(i, i_prev)).
+    array_t<ncells_t*> tile_indices { "tile_indices", npart_local };
+    Kokkos::parallel_for(
+      "FillTileIndices",
+      rangeActiveParticles(),
+      sort::PositionToTileIndex<D, false, true> { i1,
+                                                   i2,
+                                                   i3,
+                                                   tag,
+                                                   tile_indices,
+                                                   ncells_active,
+                                                   static_cast<ncells_t>(T),
+                                                   array_t<npart_t*> {},
+                                                   i1_prev,
+                                                   i2_prev,
+                                                   i3_prev });
+
+    // 3. Sort. Vendor library (oneDPL/Thrust) when compiled in;
+    //    Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers
+    //    the dead-particle sentinel bin (total_tiles + 1u).
+    const ncells_t n_bins = total_tiles + 2u;
+    const auto     slice  = prtl_slice_t(0, npart_local);
+  #if defined(TEAM_POLICY_USE_VENDOR_SORT)
+    // Vendor path: produce an explicit permutation via sort_by_key,
+    // then apply it to each SoA member with a sequential one-buffer
+    // gather (peak transient = one `npart × sizeof(member)` buffer.
+    prtl_perm_t perm { "tile_perm", npart_local };
+    #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::OneDPL {});
+    #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Rocthrust {});
+    #else
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Thrust {});
+    #endif
+    Kokkos::fence("SortSpatially: pre-gather drain");
+    apply_permutation_to_soa(perm);
+  #else
+    // BinSort path: same mechanism as legacy SortSpatially (BinSort
+    // allocates one temp View per `sorter.sort(view)` call and frees
+    // it before the next), so peak transient memory is bounded.
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { tile_indices, bin_op, false };
+    sorter.create_permute_vector();
+    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i1, slice));
+      sorter.sort(Kokkos::subview(i1_prev, slice));
+      sorter.sort(Kokkos::subview(dx1, slice));
+      sorter.sort(Kokkos::subview(dx1_prev, slice));
+    }
+    if constexpr (D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i2, slice));
+      sorter.sort(Kokkos::subview(i2_prev, slice));
+      sorter.sort(Kokkos::subview(dx2, slice));
+      sorter.sort(Kokkos::subview(dx2_prev, slice));
+    }
+    if constexpr (D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i3, slice));
+      sorter.sort(Kokkos::subview(i3_prev, slice));
+      sorter.sort(Kokkos::subview(dx3, slice));
+      sorter.sort(Kokkos::subview(dx3_prev, slice));
+    }
+    sorter.sort(Kokkos::subview(ux1, slice));
+    sorter.sort(Kokkos::subview(ux2, slice));
+    sorter.sort(Kokkos::subview(ux3, slice));
+    sorter.sort(Kokkos::subview(weight, slice));
+    sorter.sort(Kokkos::subview(tag, slice));
+    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+      sorter.sort(Kokkos::subview(phi, slice));
+    }
+    for (auto pldr { 0u }; pldr < npld_r(); ++pldr) {
+      sorter.sort(Kokkos::subview(pld_r, slice, pldr));
+    }
+    for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
+      sorter.sort(Kokkos::subview(pld_i, slice, pldi));
+    }
+    // Apply the same permutation to `tile_indices` itself so it ends
+    // monotonically non-decreasing for the offsets pass below.
+    sorter.sort(tile_indices);
+  #endif // TEAM_POLICY_USE_VENDOR_SORT
+
+    // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled
+    //    pusher. `tile_indices` is now sorted (monotonically
+    //    non-decreasing for alive particles, dead sentinel
+    //    `total_tiles + 1` clustered at the end) — vendor sort_by_key
+    //    sorts keys in place; the BinSort path explicitly applies the
+    //    same permutation to `tile_indices` above. Transition-detect
+    //    directly on it: the start of each non-empty tile is the only
+    //    place a write happens — atomic-free in the dense branch.
+    //    Empty tiles (no particles) are filled by a reverse pass on a
+    //    small host mirror (`total_tiles ≈ 176K` at production scale →
+    //    ~700 KB).
+    {
+      array_t<npart_t*> tile_offsets { "tile_offsets", total_tiles + 1u };
+      Kokkos::deep_copy(tile_offsets, static_cast<npart_t>(npart_local));
+
+      const auto total_tiles_v = total_tiles;
+      auto       ti_v          = tile_indices;
+      Kokkos::parallel_for(
+        "DetectTileBoundaries",
+        rangeActiveParticles(),
+        Lambda(prtlidx_t p) {
+          const auto t_curr   = ti_v(p);
+          const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr);
+          if (!boundary) {
+            return;
+          }
+          if (t_curr < total_tiles_v) {
+            tile_offsets(t_curr) = p;
+          } else {
+            // First dead particle — also marks the alive_count boundary
+            // stored at index total_tiles.
+            Kokkos::atomic_min(&tile_offsets(total_tiles_v), p);
+          }
+        });
+
+      auto h_offsets = Kokkos::create_mirror_view(tile_offsets);
+      Kokkos::deep_copy(h_offsets, tile_offsets);
+      for (auto t = static_cast<std::size_t>(total_tiles); t-- > 0u;) {
+        if (h_offsets(t) > h_offsets(t + 1u)) {
+          h_offsets(t) = h_offsets(t + 1u);
+        }
+      }
+      Kokkos::deep_copy(tile_offsets, h_offsets);
+
+      m_tile_layout.tile_offsets = tile_offsets;
+    }
+
+    // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used
+    //    in the current design — the SoA arrays are physically permuted
+    //    into tile order, so consumers iterate
+    //    `[tile_offsets(t), tile_offsets(t+1))` directly without a
+    //    separate permutation indirection.
+    m_tile_layout.ntiles_per_axis[0] = ntx[0];
+    m_tile_layout.ntiles_per_axis[1] = ntx[1];
+    m_tile_layout.ntiles_per_axis[2] = ntx[2];
+    m_tile_layout.ntiles_total       = total_tiles;
+    m_tile_layout.tile_size          = T;
+    m_tile_layout.tile_perm          = prtl_perm_t {};
+    m_is_sorted                      = true;
+
+    Kokkos::fence("SortSpatially: end of team_policy path");
+#else  // !TEAM_POLICY — legacy in-place BinSort by global cell index
     const auto nx2         = grid.n_active(in::x2);
     const auto nx3         = grid.n_active(in::x3);
     const auto total_cells = grid.num_active();
@@ -250,13 +448,153 @@ namespace ntt {
     for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
       sorter.sort(Kokkos::subview(pld_i, slice, pldi));
     }
+#endif // TEAM_POLICY
+  }
+
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  namespace permute_helpers {
+
+    // Permute a 1D SoA member array `arr` in place by `perm`, gathering
+    // through `scratch` — a persistent byte buffer reused by every
+    // member and every timestep (no per-call allocation). An unmanaged
+    // typed view aliases the scratch bytes; the caller guarantees
+    // `scratch` is large enough and that Kokkos' device over-alignment
+    // covers the element type.
+    template <typename V>
+    inline void permute_1d_inplace(V&                    arr,
+                                   const prtl_perm_t&    perm,
+                                   npart_t               n,
+                                   const array_t<char*>& scratch) {
+      if (n == 0u) {
+        return;
+      }
+      using value_t = typename V::non_const_value_type;
+      using buf_t   = Kokkos::View<value_t*,
+                                 typename V::array_layout,
+                                 typename V::memory_space,
+                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n);
+      auto  perm_v = perm;
+      auto  arr_v  = arr;
+      Kokkos::parallel_for(
+        "Permute1D",
+        n,
+        KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); });
+      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf);
+      Kokkos::fence("permute_1d_inplace: end");
+    }
+
+    // 2D analogue for `pld_r` / `pld_i`.
+    template <typename V>
+    inline void permute_2d_inplace(V&                    arr,
+                                   const prtl_perm_t&    perm,
+                                   npart_t               n,
+                                   npart_t               ncols,
+                                   const array_t<char*>& scratch) {
+      if (n == 0u or ncols == 0u) {
+        return;
+      }
+      using value_t = typename V::non_const_value_type;
+      using buf_t   = Kokkos::View<value_t**,
+                                 typename V::array_layout,
+                                 typename V::memory_space,
+                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n, ncols);
+      auto  perm_v = perm;
+      auto  arr_v  = arr;
+      Kokkos::parallel_for(
+        "Permute2D",
+        CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
+        KOKKOS_LAMBDA(const npart_t p, const npart_t l) {
+          buf(p, l) = arr_v(perm_v(p), l);
+        });
+      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL),
+                        buf);
+      Kokkos::fence("permute_2d_inplace: end");
+    }
+
+  } // namespace permute_helpers
+
+  template <Dimension D, Coord::type C>
+  void Particles<D, C>::apply_permutation_to_soa(const prtl_perm_t& perm) {
+    const auto n = npart();
+    if (n == 0u) {
+      return;
+    }
+
+    // Size the persistent scratch once to the largest gather any member
+    // needs this call: 1D members need n * sizeof(real_t) bytes (the
+    // widest element); the 2D payloads need n * ncols * elem bytes.
+    // Grown monotonically, never shrunk — so after warmup this incurs
+    // no allocation at all.
+    std::size_t need = static_cast<std::size_t>(n) * sizeof(real_t);
+    if (npld_r() > 0) {
+      need = std::max(need,
+                      static_cast<std::size_t>(n) *
+                        static_cast<std::size_t>(npld_r()) * sizeof(real_t));
+    }
+    if (npld_i() > 0) {
+      need = std::max(need,
+                      static_cast<std::size_t>(n) *
+                        static_cast<std::size_t>(npld_i()) * sizeof(npart_t));
+    }
+    if (m_perm_scratch.extent(0) < need) {
+      m_perm_scratch = array_t<char*> { "perm_scratch", need };
+    }
+    const auto& scratch = m_perm_scratch;
+
+    using permute_helpers::permute_1d_inplace;
+    using permute_helpers::permute_2d_inplace;
+
+    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+      permute_1d_inplace(i1, perm, n, scratch);
+      permute_1d_inplace(dx1, perm, n, scratch);
+      permute_1d_inplace(i1_prev, perm, n, scratch);
+      permute_1d_inplace(dx1_prev, perm, n, scratch);
+    }
+    if constexpr (D == Dim::_2D or D == Dim::_3D) {
+      permute_1d_inplace(i2, perm, n, scratch);
+      permute_1d_inplace(dx2, perm, n, scratch);
+      permute_1d_inplace(i2_prev, perm, n, scratch);
+      permute_1d_inplace(dx2_prev, perm, n, scratch);
+    }
+    if constexpr (D == Dim::_3D) {
+      permute_1d_inplace(i3, perm, n, scratch);
+      permute_1d_inplace(dx3, perm, n, scratch);
+      permute_1d_inplace(i3_prev, perm, n, scratch);
+      permute_1d_inplace(dx3_prev, perm, n, scratch);
+    }
+    permute_1d_inplace(ux1, perm, n, scratch);
+    permute_1d_inplace(ux2, perm, n, scratch);
+    permute_1d_inplace(ux3, perm, n, scratch);
+    permute_1d_inplace(weight, perm, n, scratch);
+    permute_1d_inplace(tag, perm, n, scratch);
+    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+      permute_1d_inplace(phi, perm, n, scratch);
+    }
+    if (npld_r() > 0) {
+      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()), scratch);
+    }
+    if (npld_i() > 0) {
+      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()), scratch);
+    }
   }
+#endif // TEAM_POLICY_USE_VENDOR_SORT
+
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  #define APPLY_PERM_INSTANTIATE(D, C)                                         \
+    template void Particles<D, C>::apply_permutation_to_soa(                   \
+      const prtl_perm_t&);
+#else
+  #define APPLY_PERM_INSTANTIATE(D, C)
+#endif
 
 #define PARTICLES_SORT(D, C)                                                   \
   template auto Particles<D, C>::NpartsPerTagAndOffsets() const                \
     -> std::pair<std::vector<npart_t>, array_t<npart_t*>>;                     \
   template void Particles<D, C>::RemoveDead();                                 \
-  template void Particles<D, C>::SortSpatially(const Grid<D>&);
+  template void Particles<D, C>::SortSpatially(const Grid<D>&);                \
+  APPLY_PERM_INSTANTIATE(D, C)
 
   PARTICLES_SORT(Dim::_1D, Coord::Cartesian)
   PARTICLES_SORT(Dim::_2D, Coord::Cartesian)
@@ -266,5 +604,6 @@ namespace ntt {
   PARTICLES_SORT(Dim::_3D, Coord::Spherical)
   PARTICLES_SORT(Dim::_3D, Coord::Qspherical)
 #undef PARTICLES_SORT
+#undef APPLY_PERM_INSTANTIATE
 
 } // namespace ntt
diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h
index 314e78c0e..43f21ce47 100644
--- a/src/global/arch/kokkos_aliases.h
+++ b/src/global/arch/kokkos_aliases.h
@@ -10,6 +10,7 @@
  *   - CreateRangePolicy, CreateRangePolicyOnHost
  *   - random_number_pool_t, random_generator_t
  *   - Random function
+ *   - prtl_perm_t, TileLayout<>
  * @cpp:
  *   - arch/kokkos_aliases.cpp
  * @namespaces:
@@ -258,6 +259,32 @@ template <Dimension D>
 auto CreateRangePolicyOnHost(const tuple_t<ncells_t, D>&,
                              const tuple_t<ncells_t, D>&) -> range_h_t<D>;
 
+// --------------------------- team_policy types ---------------------------- //
+// Particle permutation index: maps a sorted-position p in [0, npart) to a
+// pre-sort particle index. Produced by SortSpatially, consumed by tiled
+// pusher and deposit kernels to walk particles tile-by-tile without
+// physically re-permuting the SoA arrays in lock step every step.
+using prtl_perm_t = array_t<npart_t*>;
+
+// Tile layout metadata: the contract between Stream 1 (sort) and Streams
+// 2/3 (tiled deposit / pusher). All members are device-resident.
+//   ntiles_per_axis : number of tiles along each axis (1 for unused axes).
+//   ntiles_total    : product of ntiles_per_axis = league size for TeamPolicy.
+//   tile_size       : tile edge length in cells (compile-time CMake knob,
+//                     replicated here for runtime checks).
+//   tile_offsets    : prefix-sum of per-tile particle counts; size
+//                     ntiles_total + 1; tile t owns particles
+//                     [tile_offsets(t), tile_offsets(t+1)).
+//   tile_perm       : size npart, particle index sorted by tile.
+template <Dimension D>
+struct TileLayout {
+  ncells_t          ntiles_per_axis[3] { 1u, 1u, 1u };
+  ncells_t          ntiles_total { 0u };
+  unsigned short    tile_size { 0u };
+  array_t<npart_t*> tile_offsets;
+  prtl_perm_t       tile_perm;
+};
+
 // Random number pool/generator type alias
 // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias)
 using random_number_pool_t = math::Random_XorShift64_Pool<Kokkos::DefaultExecutionSpace>;
diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
new file mode 100644
index 000000000..506a90742
--- /dev/null
+++ b/src/global/utils/sort_dispatch.h
@@ -0,0 +1,171 @@
+/**
+ * @file utils/sort_dispatch.h
+ * @brief Backend-dispatched sort_by_key for team_policy SortSpatially.
+ * @implements
+ *   - sort_helpers::sort_by_key_dispatch -> void  (BinSort, OneDPL, Thrust, StdSort)
+ * @namespaces:
+ *   - ntt::sort_helpers::
+ * @macros:
+ *   - TEAM_POLICY
+ *   - SYCL_ENABLED, ONEDPL_ENABLED  (oneDPL overload)
+ *   - CUDA_ENABLED, THRUST_ENABLED  (Thrust overload)
+ *
+ * @note Each overload produces a permutation `perm` of size N such that
+ *       keys[perm[0]] <= keys[perm[1]] <= ... in stable order.
+ *       Always-available overloads: BinSort (uses Kokkos::BinSort) and
+ *       StdSort (host-side std::stable_sort fallback). The vendor-library
+ *       overloads (OneDPL on SYCL, Thrust on CUDA) are conditional on the
+ *       respective build flags.
+ */
+
+#ifndef GLOBAL_UTILS_SORT_DISPATCH_H
+#define GLOBAL_UTILS_SORT_DISPATCH_H
+
+#if !defined(TEAM_POLICY)
+  #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined"
+#endif
+
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Sort.hpp>
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  #include <oneapi/dpl/algorithm>
+  #include <oneapi/dpl/execution>
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  #include <thrust/device_ptr.h>
+  #include <thrust/sequence.h>
+  #include <thrust/sort.h>
+#endif
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  #include <thrust/device_ptr.h>
+  #include <thrust/execution_policy.h>
+  #include <thrust/sequence.h>
+  #include <thrust/sort.h>
+#endif
+
+#include <algorithm>
+#include <numeric>
+
+namespace ntt::sort_helpers {
+
+  // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an
+  // upper bound on distinct key values.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t                  n_bins,
+                                   ::sort::backend::BinSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { keys, bin_op, false };
+    sorter.create_permute_vector();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort");
+    sorter.sort(perm);
+    Kokkos::fence("sort_by_key_dispatch BinSort: post-sort");
+  }
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::OneDPL) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto*  keys_ptr = keys.data();
+    auto*  perm_ptr = perm.data();
+    auto   exec     = Kokkos::DefaultExecutionSpace();
+    auto   perm_v   = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    // Drain Kokkos's queue so oneDPL's policy sees the iota'd perm even
+    // if oneDPL submits to a different SYCL queue internally.
+    exec.fence("sort_by_key_dispatch OneDPL: pre-sort");
+    auto queue  = exec.sycl_queue();
+    auto policy = oneapi::dpl::execution::make_device_policy(queue);
+    oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr);
+    exec.fence("sort_by_key_dispatch OneDPL: post-sort");
+  }
+#endif
+
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Thrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort");
+    thrust::device_ptr<ncells_t> kp(keys.data());
+    thrust::device_ptr<npart_t>  pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    thrust::sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Thrust: post-sort");
+  }
+#endif
+
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc
+  // device_ptr-based algorithms dispatch to the HIP backend. Mirrors
+  // the CUDA Thrust overload.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Rocthrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort");
+    thrust::device_ptr<ncells_t> kp(keys.data());
+    thrust::device_ptr<npart_t>  pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    thrust::sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort");
+  }
+#endif
+
+  // Host fallback: indirect-sort via std::stable_sort.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::StdSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                       keys);
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u });
+    std::stable_sort(perm_h.data(),
+                     perm_h.data() + n,
+                     [&](npart_t a, npart_t b) {
+                       return keys_h(a) < keys_h(b);
+                     });
+    Kokkos::deep_copy(perm, perm_h);
+  }
+
+} // namespace ntt::sort_helpers
+
+#endif // GLOBAL_UTILS_SORT_DISPATCH_H
diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h
index dbe774402..8442f5ddb 100644
--- a/src/global/utils/sorting.h
+++ b/src/global/utils/sorting.h
@@ -5,6 +5,7 @@
  *   - sort::BinBool<>
  *   - sort::BinTag<>
  *   - sort::PositionToTileIndex<>
+ *   - sort::backend tag types (compile-time tag dispatch for sort_by_key)
  * @namespaces:
  *   - sort::
  * @note BinBool sorts by boolean values "true" then "false"
@@ -62,9 +63,27 @@ namespace sort {
     const int m_max_bins;
   };
 
-  template <Dimension D, bool Count>
+  /**
+   * @brief Bin a particle into a tile of edge length `tile_size` cells.
+   * @tparam D     Dimension.
+   * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live
+   *               particle (used to populate per-tile counts in one pass).
+   * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead
+   *                 of `i_curr` alone. This guarantees that a particle whose
+   *                 Esirkepov stencil straddles a tile boundary (because it
+   *                 crossed the boundary during the pusher) lands in the
+   *                 lower-indexed of the two tiles. Combined with a halo of
+   *                 `O+1` cells in the deposit's per-tile scratch, this
+   *                 keeps every particle's stencil inside its assigned
+   *                 tile's interior+halo region. See plan §S2.4.
+   *
+   * Dead particles get the sentinel `total_tiles + 1u` so they sort to the
+   * end (or get skipped, depending on the consumer).
+   */
+  template <Dimension D, bool Count, bool UsePrev = false>
   struct PositionToTileIndex {
     const array_t<int*>   i1, i2, i3;
+    const array_t<int*>   i1_prev, i2_prev, i3_prev;
     const array_t<short*> tag;
     array_t<ncells_t*>    tile_indices;
     ncells_t              tile_size;
@@ -72,20 +91,37 @@ namespace sort {
 
     ncells_t ntx2 { 0u }, ntx3 { 0u };
     ncells_t total_tiles { 0u };
-
-    PositionToTileIndex(const array_t<int*>&         i1,
-                        const array_t<int*>&         i2,
-                        const array_t<int*>&         i3,
-                        const array_t<short*>&       tag,
-                        array_t<ncells_t*>&          tile_indices,
+    // Active-cell extents per axis. Used to clamp the bin key when
+    // UsePrev=true, since `i_prev` can be transiently negative after the
+    // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an
+    // MPI receive that hasn't translated frames. Without clamping, the
+    // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces
+    // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`,
+    // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)`
+    // faults on an unmapped page.
+    int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 };
+
+    PositionToTileIndex(const array_t<int*>&         i1_,
+                        const array_t<int*>&         i2_,
+                        const array_t<int*>&         i3_,
+                        const array_t<short*>&       tag_,
+                        array_t<ncells_t*>&          tile_indices_,
                         const std::vector<ncells_t>& ncells,
-                        ncells_t                     tile_size = 1u)
-      : i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , tag { tag }
-      , tile_indices { tile_indices }
-      , tile_size { tile_size }
+                        ncells_t                     tile_size_ = 1u,
+                        const array_t<npart_t*>& num_ppt_ = { "num_ppt", 0u },
+                        const array_t<int*>& i1_prev_ = {},
+                        const array_t<int*>& i2_prev_ = {},
+                        const array_t<int*>& i3_prev_ = {})
+      : i1 { i1_ }
+      , i2 { i2_ }
+      , i3 { i3_ }
+      , i1_prev { i1_prev_ }
+      , i2_prev { i2_prev_ }
+      , i3_prev { i3_prev_ }
+      , tag { tag_ }
+      , tile_indices { tile_indices_ }
+      , tile_size { tile_size_ }
+      , num_ppt { num_ppt_ }
       , ntx2 { 1u }
       , ntx3 { 1u }
       , total_tiles { 1u } {
@@ -93,49 +129,122 @@ namespace sort {
                      "ncells size must match D",
                      HERE);
       if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells1       = static_cast<int>(ncells[0]);
         npart_t ntx1  = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[0]) / static_cast<double>(tile_size)));
         total_tiles  *= ntx1;
       }
       if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells2      = static_cast<int>(ncells[1]);
         ntx2         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[1]) / static_cast<double>(tile_size)));
         total_tiles *= ntx2;
       }
       if constexpr (D == Dim::_3D) {
+        ncells3      = static_cast<int>(ncells[2]);
         ntx3         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[2]) / static_cast<double>(tile_size)));
         total_tiles *= ntx3;
       }
       if constexpr (Count) {
-        num_ppt = array_t<npart_t*> { "num_ppt", total_tiles };
+        raise::ErrorIf(num_ppt.extent(0) != total_tiles,
+                       "num_ppt must have extent equal to total tiles",
+                       HERE);
+      }
+      if constexpr (UsePrev) {
+        raise::ErrorIf(
+          i1_prev.extent(0) == 0u,
+          "PositionToTileIndex<UsePrev=true> requires i1_prev to be set",
+          HERE);
+        if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+          raise::ErrorIf(
+            i2_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i2_prev to be set",
+            HERE);
+        }
+        if constexpr (D == Dim::_3D) {
+          raise::ErrorIf(
+            i3_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i3_prev to be set",
+            HERE);
+        }
       }
     }
 
-    Inline auto operator()(prtldx_t p) const {
+    Inline auto operator()(prtlidx_t p) const {
       if (tag(p) != ntt::ParticleTag::alive) {
         tile_indices(p) = total_tiles + 1u;
       } else {
+        // bin key per-axis: use min(i, i_prev) when UsePrev so that a
+        // particle straddling a boundary lands in the lower tile.
+        // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative
+        // (after the pusher's periodic-wrap path: `i_prev -= ni`) or
+        // out-of-range (after MPI receive without frame translation).
+        // Without the clamp, signed-to-unsigned promotion in
+        // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far
+        // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)`
+        // faults on an unmapped page.
+        const auto clamp_axis = [](int v, int ncells) -> int {
+          return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v);
+        };
+        const auto key1 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i1(p) < i1_prev(p)) ? i1(p) : i1_prev(p);
+            return clamp_axis(raw, ncells1);
+          } else {
+            return i1(p);
+          }
+        }();
+        const auto key2 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p);
+            return clamp_axis(raw, ncells2);
+          } else {
+            return i2(p);
+          }
+        }();
+        const auto key3 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p);
+            return clamp_axis(raw, ncells3);
+          } else {
+            return i3(p);
+          }
+        }();
         if constexpr (D == Dim::_1D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size);
         } else if constexpr (D == Dim::_2D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                            static_cast<ncells_t>(i2(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                            static_cast<ncells_t>(key2 / tile_size);
         } else if constexpr (D == Dim::_3D) {
-          tile_indices(p) = (static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                             static_cast<ncells_t>(i2(p) / tile_size)) *
+          tile_indices(p) = (static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                             static_cast<ncells_t>(key2 / tile_size)) *
                               ntx3 +
-                            static_cast<ncells_t>(i3(p) / tile_size);
+                            static_cast<ncells_t>(key3 / tile_size);
         } else {
           raise::KernelError(HERE, "Wrong D in SortSpatially");
         }
         if constexpr (Count) {
-          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u);
+          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1);
         }
       }
     }
   };
 
+  // -------------------- Backend dispatch for sort_by_key ------------------- //
+  // Compile-time tags for tag-dispatch into backend-specific
+  // sort_by_key implementations. Selection is fully compile-time: the
+  // backend that resolves depends on the active Kokkos device and the
+  // availability of the corresponding vendor library.
+  namespace backend {
+    struct OneDPL {};
+    struct Thrust {};
+    struct Rocthrust {};
+    struct StdSort {};
+    // Always-available legacy fallback using Kokkos::BinSort.
+    struct BinSort {};
+  } // namespace backend
+
 } // namespace sort
 
 #endif // GLOBAL_UTILS_SORTING_H
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 268187af2..5eb7ff2b6 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -1,10 +1,24 @@
 /**
  * @file kernels/currents_deposit.hpp
- * @brief Covariant algorithms for the current deposition
+ * @brief Covariant algorithms for the current deposition.
+ *
+ * Two kernels share the same per-particle body
+ * (`kernel::deposit::deposit_one_particle`):
+ *   - `kernel::DepositCurrents_kernel<S, M, O>` flat (RangePolicy over particles,
+ *     writes into a `Kokkos::Experimental::ScatterView`). Always available.
+ *   - `kernel::DepositCurrents_kernel_tiled<S, M, O, T_TILE>` team-policy
+ *     (one team per spatial tile, accumulates into team SLM scratch with
+ *     atomic adds, then flushes to global J). Available when `team_policy=ON`
+ *     (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan.
+ *
  * @implements
+ *   - kernel::deposit::PrtlPack<>
+ *   - kernel::deposit::deposit_one_particle<>
  *   - kernel::DepositCurrents_kernel<>
+ *   - kernel::DepositCurrents_kernel_tiled<>   (TEAM_POLICY only)
  * @namespaces:
  *   - kernel::
+ *   - kernel::deposit::
  */
 
 #ifndef KERNELS_CURRENTS_DEPOSIT_HPP
@@ -18,100 +32,99 @@
 #include "utils/error.h"
 #include "utils/numeric.h"
 
-#include "particle_shapes.hpp"
+#include "kernels/particle_shapes.hpp"
 
 #include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
 
 #define i_di_to_Xi(I, DI) (static_cast<real_t>((I)) + static_cast<real_t>((DI)))
 
 namespace kernel {
   using namespace ntt;
 
-  /**
-   * @brief Algorithm for the current deposition
-   */
-  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
-  class DepositCurrents_kernel {
-    static_assert(O <= 11u, "Shape function order O must be <= 11");
-    static constexpr auto D = M::Dim;
-
-    scatter_ndfield_t<D, 3>  J;
-    const array_t<int*>      i1, i2, i3;
-    const array_t<int*>      i1_prev, i2_prev, i3_prev;
-    const array_t<prtldx_t*> dx1, dx2, dx3;
-    const array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
-    const array_t<real_t*>   ux1, ux2, ux3;
-    const array_t<real_t*>   phi;
-    const array_t<real_t*>   weight;
-    const array_t<short*>    tag;
-    const M                  metric;
-    const real_t             charge, inv_dt;
+  namespace deposit {
 
-  public:
     /**
-     * @brief explicit constructor.
+     * @brief Per-particle reference pack consumed by both the flat and tiled
+     *        deposit kernels. The same set of SoA references is captured by
+     *        each kernel; bundling them here keeps the helper's argument
+     *        list manageable and ensures every consumer reads the same
+     *        view aliases.
      */
-    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
-                           const array_t<int*>&           i1,
-                           const array_t<int*>&           i2,
-                           const array_t<int*>&           i3,
-                           const array_t<int*>&           i1_prev,
-                           const array_t<int*>&           i2_prev,
-                           const array_t<int*>&           i3_prev,
-                           const array_t<prtldx_t*>&      dx1,
-                           const array_t<prtldx_t*>&      dx2,
-                           const array_t<prtldx_t*>&      dx3,
-                           const array_t<prtldx_t*>&      dx1_prev,
-                           const array_t<prtldx_t*>&      dx2_prev,
-                           const array_t<prtldx_t*>&      dx3_prev,
-                           const array_t<real_t*>&        ux1,
-                           const array_t<real_t*>&        ux2,
-                           const array_t<real_t*>&        ux3,
-                           const array_t<real_t*>&        phi,
-                           const array_t<real_t*>&        weight,
-                           const array_t<short*>&         tag,
-                           const M&                       metric,
-                           real_t                         charge,
-                           const real_t                   dt)
-      : J { scatter_cur }
-      , i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , i1_prev { i1_prev }
-      , i2_prev { i2_prev }
-      , i3_prev { i3_prev }
-      , dx1 { dx1 }
-      , dx2 { dx2 }
-      , dx3 { dx3 }
-      , dx1_prev { dx1_prev }
-      , dx2_prev { dx2_prev }
-      , dx3_prev { dx3_prev }
-      , ux1 { ux1 }
-      , ux2 { ux2 }
-      , ux3 { ux3 }
-      , phi { phi }
-      , weight { weight }
-      , tag { tag }
-      , metric { metric }
-      , charge { charge }
-      , inv_dt { ONE / dt } {
-      raise::ErrorIf(
-        (O == 2u and N_GHOSTS < 2),
-        "Order of interpolation is 2, but number of ghost cells is < 2",
-        HERE);
-    }
+    template <Dimension D>
+    struct PrtlPack {
+      array_t<int*>      i1, i2, i3;
+      array_t<int*>      i1_prev, i2_prev, i3_prev;
+      array_t<prtldx_t*> dx1, dx2, dx3;
+      array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
+      array_t<real_t*>   ux1, ux2, ux3;
+      array_t<real_t*>   phi;
+      array_t<real_t*>   weight;
+      array_t<short*>    tag;
+    };
 
     /**
-     * @brief Iteration of the loop over particles.
-     * @param p index.
+     * @brief Per-particle deposit body, shared between the flat and tiled
+     *        kernels.
+     *
+     * The caller supplies a `deposit_at(idx..., comp, val)` callback that
+     * applies the contribution `val` to the J component `comp` at the
+     * **global** J cell index `idx...` (already includes the `N_GHOSTS`
+     * offset). The flat kernel's callback simply does
+     * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled
+     * kernel's callback translates `idx...` into per-tile scratch
+     * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this
+     * function is identical numerically and contains the only deposit math
+     * in the codebase.
+     *
+     * Dead particles return early. The callback is invoked once per cell
+     * write, with the dimension-appropriate signature:
+     *   - 1D: `deposit_at(int g_i1, int comp, real_t val)`
+     *   - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)`
+     *   - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)`
      */
-    Inline auto operator()(prtlidx_t p) const -> void {
+    template <SimEngine::type S, MetricClass M, unsigned short O,
+              typename DepositFn>
+    Inline void deposit_one_particle(prtlidx_t       p,
+                                     const PrtlPack<M::Dim>& prtls,
+                                     const M&        metric,
+                                     real_t          charge,
+                                     real_t          inv_dt,
+                                     DepositFn       deposit_at) {
+      static_assert(O <= 11u, "Shape function order O must be <= 11");
+      constexpr auto D = M::Dim;
+
+      const auto& i1       = prtls.i1;
+      const auto& i2       = prtls.i2;
+      const auto& i3       = prtls.i3;
+      const auto& i1_prev  = prtls.i1_prev;
+      const auto& i2_prev  = prtls.i2_prev;
+      const auto& i3_prev  = prtls.i3_prev;
+      const auto& dx1      = prtls.dx1;
+      const auto& dx2      = prtls.dx2;
+      const auto& dx3      = prtls.dx3;
+      const auto& dx1_prev = prtls.dx1_prev;
+      const auto& dx2_prev = prtls.dx2_prev;
+      const auto& dx3_prev = prtls.dx3_prev;
+      const auto& ux1      = prtls.ux1;
+      const auto& ux2      = prtls.ux2;
+      const auto& ux3      = prtls.ux3;
+      const auto& phi      = prtls.phi;
+      const auto& weight   = prtls.weight;
+      const auto& tag      = prtls.tag;
+
       if (tag(p) == ParticleTag::dead) {
         return;
       }
+
       // recover particle velocity to deposit in unsimulated direction
-      vec_t<Dim::_3D> vp { ZERO };
-      {
+      [[maybe_unused]] vec_t<Dim::_3D> vp { ZERO };
+      // `vp` only feeds the unsimulated-direction current in the 1D
+      // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes
+      // from the Esirkepov/zigzag charge motion and `vp` is never read,
+      // so the metric transform + 1/sqrt + NaN/Inf guard below is pure
+      // dead work there — skip it (also frees xp/inv_energy registers).
+      if constexpr (D != Dim::_3D) {
         coord_t<M::PrtlDim> xp { ZERO };
         if constexpr (D == Dim::_1D) {
           xp[0] = i_di_to_Xi(i1(p), dx1(p));
@@ -167,7 +180,6 @@ namespace kernel {
 
       const real_t coeff { weight(p) * charge };
 
-      // ToDo: interpolation_order as parameter
       if constexpr (O == 0u) {
         /*
           Zig-zag deposit
@@ -191,8 +203,6 @@ namespace kernel {
                               dx1(p) - dxp_r_1) *
                              coeff * inv_dt };
 
-        auto J_acc = J.access();
-
         if constexpr (D == Dim::_1D) {
           const real_t Fx2_1 { HALF * vp[1] * coeff };
           const real_t Fx2_2 { HALF * vp[1] * coeff };
@@ -200,18 +210,18 @@ namespace kernel {
           const real_t Fx3_1 { HALF * vp[2] * coeff };
           const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx1)      += Fx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2);
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx2)     += Fx2_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx2)          += Fx2_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx2)      += Fx2_2 * Wx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1));
+          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2));
+          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2);
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx3)     += Fx3_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx3)          += Fx3_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx3)      += Fx3_2 * Wx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1));
+          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2));
+          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2);
         } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
           const auto dxp_r_2 { static_cast<prtldx_t>(i2(p) == i2_prev(p)) *
                                (dx2(p) + dx2_prev(p)) *
@@ -236,51 +246,73 @@ namespace kernel {
             const real_t Fx3_1 { HALF * vp[2] * coeff };
             const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 *
-                                                                   (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 *
-                                                                   (ONE - Wx1_2);
-            J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * Wx2_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * Wx2_2);
           } else {
             const auto   dxp_r_3 { static_cast<prtldx_t>(i3(p) == i3_prev(p)) *
                                  (dx3(p) + dx3_prev(p)) *
@@ -300,107 +332,131 @@ namespace kernel {
                                   dx3(p) - dxp_r_3) *
                                  coeff * inv_dt };
 
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * Wx2_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1 * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1) * Wx3_1);
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1 * Wx3_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2 * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2) * Wx3_2);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2 * Wx3_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1 * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1) * Wx3_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1 * Wx3_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2 * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2) * Wx3_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2 * Wx3_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * Wx2_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * Wx2_2);
           }
         }
       } else if constexpr ((O >= 1u) and (O <= 11u)) {
@@ -421,33 +477,14 @@ namespace kernel {
                                    fS_x1);
 
         if constexpr (D == Dim::_1D) {
-          // define weight vectors
-          real_t Wx1[O + 2];
-          real_t Wx23[O + 2];
-
-          // Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            // Esirkepov 2001, Eq. 38 for 1D case
-            Wx1[i]  = fS_x1[i] - iS_x1[i];
-            Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]);
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2];
-
-          // prefactors for j update
+          // (1D): fused Esirkepov, no [O+2] temporaries.
+          //   jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i'])
+          //          = -Qdx1dt * P1[i]                  (Eq. 38, 1D)
+          //   Wx23[i] = HALF * (fS_x1[i] + iS_x1[i])     (computed inline)
           const real_t Qdx1dt = coeff * inv_dt;
           const real_t QVx2   = coeff * vp[1];
           const real_t QVx3   = coeff * vp[2];
 
-          // Calculate current contribution
-          jx1[0] = -Qdx1dt * Wx1[0];
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-            jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i];
-          }
-
           // account for ghost cells
           i1_min += N_GHOSTS;
           i1_max += N_GHOSTS;
@@ -455,21 +492,18 @@ namespace kernel {
           // get number of update indices for asymmetric movement
           const int di_x1 = i1_max - i1_min;
 
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx1) += jx1[i];
-          }
-
+          // Current update — fused over the union line so the J cell
+          // stays L1-resident across the 3 component atomic_adds.
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i];
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i];
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi   = i1_min + i;
+            const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]);
+            if (i < di_x1) {
+              deposit_at(gi, cur::jx1, -Qdx1dt * P1);
+            }
+            deposit_at(gi, cur::jx2, QVx2 * Wx23);
+            deposit_at(gi, cur::jx3, QVx3 * Wx23);
           }
 
         } else if constexpr (D == Dim::_2D) {
@@ -489,63 +523,23 @@ namespace kernel {
                                      iS_x2,
                                      fS_x2);
 
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              // Esirkepov 2001, Eq. 38 (simplified)
-              Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]);
-
-              Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]);
-
-              Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) +
-                                   iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i]));
-            }
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2];
-
-          // prefactors for j update
-          const real_t Qdx1dt = coeff * inv_dt;
-          const real_t Qdx2dt = coeff * inv_dt;
-          const real_t QVx3   = coeff * vp[2];
-
-          // Calculate current contribution
-
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-            jx1[0][j] = -Qdx1dt * Wx1[0][j];
-          }
-
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j];
-            }
-          }
-
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            jx2[i][0] = -Qdx2dt * Wx2[i][0];
-          }
-
-#pragma unroll
-          for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-            for (int i = 0; i < O + 2; ++i) {
-              jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j];
-            }
-          }
+          // (2D): fused Esirkepov, no [O+2]^2 temporaries.
+          //
+          // Esirkepov 2001 Eq. 38 (simplified) is separable: with
+          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and
+          // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']),
+          //   jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j])
+          //   jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i])
+          //   Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i])
+          //                     + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) )
+          // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the
+          // old explicit Wx/jx tensors up to FP reassociation;
+          // charge-conserving by construction. Prefix sums carried as
+          // running scalars, so the only per-thread state is the 
+          // existing 1D shape arrays.
+          const real_t QVx3 = coeff * vp[2];
+          // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt)
+          const real_t cf = -(coeff * inv_dt) * HALF;
 
           // account for ghost cells
           i1_min += N_GHOSTS;
@@ -557,26 +551,30 @@ namespace kernel {
           const int di_x1 = i1_max - i1_min;
           const int di_x2 = i2_max - i2_min;
 
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j];
-            }
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j];
-            }
-          }
-
+          // Current update — fused over the union plane so the J cell
+          // line stays L1-resident across the 3 component atomic_adds.
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi   = i1_min + i;
+            const real_t iSx1 = iS_x1[i];
+            const real_t fSx1 = fS_x1[i];
+            const real_t A1   = fSx1 + iSx1;     // jx2 cross-factor
+            real_t       P2   = ZERO;
             for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j];
+              P2 += fS_x2[j] - iS_x2[j];
+              const int    gj   = i2_min + j;
+              const real_t iSx2 = iS_x2[j];
+              const real_t fSx2 = fS_x2[j];
+              if (i < di_x1) {
+                deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2));
+              }
+              if (j < di_x2) {
+                deposit_at(gi, gj, cur::jx2, cf * P2 * A1);
+              }
+              const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) +
+                                          iSx2 * (HALF * fSx1 + iSx1));
+              deposit_at(gi, gj, cur::jx3, QVx3 * Wx3);
             }
           }
 
@@ -610,104 +608,33 @@ namespace kernel {
                                      iS_x3,
                                      fS_x3);
 
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                // Esirkepov 2001, Eq. 31
-                Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) *
-                               ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) +
-                                HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k]));
-
-                Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) *
-                               (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] +
-                                HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k]));
-
-                Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) *
-                               (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] +
-                                HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i]));
-              }
-            }
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2][O + 2], jx2[O + 2][O + 2][O + 2],
-            jx3[O + 2][O + 2][O + 2];
-
-          // prefactors to j update
-          const real_t Qdxdt = coeff * inv_dt;
-          const real_t Qdydt = coeff * inv_dt;
-          const real_t Qdzdt = coeff * inv_dt;
-
-          // Calculate current contribution
-
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx1[0][j][k] = -Qdxdt * Wx1[0][j][k];
-            }
-          }
-
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k];
-              }
-            }
-          }
-
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx2[i][0][k] = -Qdydt * Wx2[i][0][k];
-            }
-          }
-
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k];
-              }
-            }
-          }
-
-          // jx3
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx3[i][j][0] = -Qdydt * Wx3[i][j][0];
-            }
-          }
-
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 1; k < O + 2; ++k) {
-                jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k];
-              }
-            }
-          }
+          // fused Esirkepov, no (O+2)^3 temporaries.
+          //
+          // The Esirkepov 3D current (2001, Eq. 31) is separable: with
+          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise
+          // P2[j], P3[k]) the cumulative-sum currents collapse to
+          //
+          //   jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k)
+          //   jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k)
+          //   jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j)
+          //
+          // with the 1D-shape cross-factors
+          //
+          //   G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k]
+          //            + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k])
+          //   H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k]
+          //            + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k])
+          //   F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j]
+          //            + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i])
+          //
+          // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the
+          // same value as the old explicit Wx/jx tensors up to
+          // floating-point reassociation: charge-conserving by
+          // construction (the Esirkepov decomposition is exact). The
+          // prefix sums are carried as running scalars in the deposit
+          // loop, so the only per-thread state is the existing 1D shape
+          // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs
+          // and no private-memory tensor traffic).
 
           // account for ghost cells
           i1_min += N_GHOSTS;
@@ -722,31 +649,50 @@ namespace kernel {
           const int di_x2 = i2_max - i2_min;
           const int di_x3 = i3_max - i3_min;
 
+          // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt)
+          const real_t cf = -(coeff * inv_dt) * THIRD;
+
           /*
-            Current update
+            Current update — fused over the union cube so the J cell
+            line stays L1-resident across the 3 component atomic_adds.
+            Per-cell branches on (i<di_x1), (j<di_x2), (k<di_x3) skip
+            the trailing slab where each component's stencil ends one
+            cell short of the union; particles within a tile share
+            di_x* so the branch predicates cleanly.
           */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx1) += jx1[i][j][k];
-              }
-            }
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx2) += jx2[i][j][k];
-              }
-            }
-          }
-
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi    = i1_min + i;
+            const real_t iSx1i = iS_x1[i];
+            const real_t fSx1i = fS_x1[i];
+            real_t       P2    = ZERO;
             for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k < di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx3) += jx3[i][j][k];
+              P2 += fS_x2[j] - iS_x2[j];
+              const int    gj    = i2_min + j;
+              const real_t iSx2j = iS_x2[j];
+              const real_t fSx2j = fS_x2[j];
+              const real_t F12   = iSx1i * iSx2j + fSx1i * fSx2j +
+                                 HALF * (iSx1i * fSx2j + iSx2j * fSx1i);
+              real_t P3 = ZERO;
+              for (int k = 0; k <= di_x3; ++k) {
+                P3 += fS_x3[k] - iS_x3[k];
+                const int    gk    = i3_min + k;
+                const real_t iSx3k = iS_x3[k];
+                const real_t fSx3k = fS_x3[k];
+                if (i < di_x1) {
+                  const real_t G23 = iSx2j * iSx3k + fSx2j * fSx3k +
+                                     HALF * (iSx3k * fSx2j + iSx2j * fSx3k);
+                  deposit_at(gi, gj, gk, cur::jx1, cf * P1 * G23);
+                }
+                if (j < di_x2) {
+                  const real_t H13 = iSx1i * iSx3k + fSx1i * fSx3k +
+                                     HALF * (iSx3k * fSx1i + iSx1i * fSx3k);
+                  deposit_at(gi, gj, gk, cur::jx2, cf * P2 * H13);
+                }
+                if (k < di_x3) {
+                  deposit_at(gi, gj, gk, cur::jx3, cf * P3 * F12);
+                }
               }
             }
           }
@@ -759,7 +705,522 @@ namespace kernel {
           "What are you even doing here? Entity already goes to 11!");
       }
     }
+
+  } // namespace deposit
+
+  /**
+   * @brief Flat current-deposition kernel.
+   *
+   * One thread per particle (RangePolicy). Writes are coalesced through a
+   * `Kokkos::Experimental::ScatterView` to avoid per-thread atomics on
+   * global J. Constructor signature is unchanged from prior versions —
+   * `engines/srpic/currents.h` continues to call it identically.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
+  class DepositCurrents_kernel {
+    static_assert(O <= 11u, "Shape function order O must be <= 11");
+    static constexpr auto D = M::Dim;
+
+    scatter_ndfield_t<D, 3>     J;
+    deposit::PrtlPack<D>        prtls;
+    const M                     metric;
+    const real_t                charge, inv_dt;
+
+  public:
+    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
+                           const array_t<int*>&           i1,
+                           const array_t<int*>&           i2,
+                           const array_t<int*>&           i3,
+                           const array_t<int*>&           i1_prev,
+                           const array_t<int*>&           i2_prev,
+                           const array_t<int*>&           i3_prev,
+                           const array_t<prtldx_t*>&      dx1,
+                           const array_t<prtldx_t*>&      dx2,
+                           const array_t<prtldx_t*>&      dx3,
+                           const array_t<prtldx_t*>&      dx1_prev,
+                           const array_t<prtldx_t*>&      dx2_prev,
+                           const array_t<prtldx_t*>&      dx3_prev,
+                           const array_t<real_t*>&        ux1,
+                           const array_t<real_t*>&        ux2,
+                           const array_t<real_t*>&        ux3,
+                           const array_t<real_t*>&        phi,
+                           const array_t<real_t*>&        weight,
+                           const array_t<short*>&         tag,
+                           const M&                       metric,
+                           real_t                         charge,
+                           const real_t                   dt)
+      : J { scatter_cur }
+      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
+                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
+                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt } {
+      raise::ErrorIf(
+        (O == 2u and N_GHOSTS < 2),
+        "Order of interpolation is 2, but number of ghost cells is < 2",
+        HERE);
+    }
+
+    Inline auto operator()(prtlidx_t p) const -> void {
+      auto J_acc = J.access();
+      if constexpr (D == Dim::_1D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int comp, real_t v) {
+            J_acc(g_i1, comp) += v;
+          });
+      } else if constexpr (D == Dim::_2D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int g_i2, int comp, real_t v) {
+            J_acc(g_i1, g_i2, comp) += v;
+          });
+      } else if constexpr (D == Dim::_3D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
+            J_acc(g_i1, g_i2, g_i3, comp) += v;
+          });
+      }
+    }
   };
+
+#if defined(TEAM_POLICY)
+
+  /**
+   * @brief Tiled current-deposition kernel.
+   *
+   * One team per spatial tile (`league_size = ntiles_total`). Each team
+   * accumulates particle contributions into a per-team scratch buffer of
+   * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = O + 1` cells per
+   * side. Scratch atomics live in SLM (PVC: ~5–10 cycles per
+   * `atomic_add`); the global J is touched only once per scratch cell at
+   * flush time. Compared with the flat scatter-view kernel:
+   *   - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile
+   *     instead of (stencil writes per particle × particles)
+   *   - per-particle stencil writes are tile-local (SLM) instead of
+   *     scattering through global HBM
+   *
+   * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for
+   * A/B benchmarking against the flat scatter-view kernel — its narrow
+   * stencil typically makes scratch alloc/zero/flush overhead a
+   * regression there, but it's good to be able to measure the
+   * crossover. To revert and use flat for zigzag-only builds, change
+   * the dispatch in `engines/srpic/currents.h` from
+   * `#if defined(TEAM_POLICY)` to
+   * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`.
+   *
+   * Particle iteration order is governed by `tile_offsets`: tile `t`
+   * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort.
+   * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping
+   * the SoA arrays consistent with that.
+   *
+   * **Halo sizing and escape valve.** Sort runs at the end of the
+   * previous step (see `srpic.hpp`), so at deposit time the particle
+   * has already been pushed once — its `min(i, i_prev)` may differ
+   * from the bin key by one cell of drift per step elapsed since the
+   * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where
+   * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i,
+   * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for
+   * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one
+   * guaranteed post-sort pusher step.
+   *
+   * HALO is sized for the *common* (every-step-sorted) case, not for
+   * a worst-case sort cadence: correctness does **not** depend on it.
+   * Any particle whose stencil escapes the scratch tile — because it
+   * drifted further than `DRIFT` (e.g. a large runtime
+   * `spatial_sorting_interval`), or because the halo is otherwise
+   * undersized — silently falls back to a direct, bounds-clipped
+   * `Kokkos::atomic_add` on the global J view. That path is
+   * charge-conserving (each particle's stencil is deposited exactly
+   * once, partly to private SLM scratch and partly to global J, and
+   * scratch is flushed once via `atomic_add`); it is merely slower
+   * per write. Sorting less often than every step therefore costs
+   * escape-valve traffic, never accuracy.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O,
+            unsigned short T_TILE>
+  class DepositCurrents_kernel_tiled {
+    static_assert(O <= 11u, "Shape order O must be <= 11");
+    static_assert(T_TILE > 0u, "T_TILE must be positive");
+    static constexpr auto D = M::Dim;
+
+    // Per-side scratch halo, derived from first principles.
+    //
+    //   total halo = stencil_reach(O) + drift_between_sort_and_deposit
+    //
+    // stencil_reach(O) — maximum cells the deposit writes ABOVE
+    // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2:
+    //   - O == 0 (zigzag):  writes {i_prev, i_prev+1, i, i+1}  ⇒ +2
+    //   - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide
+    //     array but only O+1 entries are non-zero, and the union
+    //     window satisfies `i_max - i_min <= O+1` (see
+    //     particle_shapes.hpp::for_deposit). The genuine one-sided
+    //     reach above min(i, i_prev) is therefore O, not O+1 — the
+    //     old `O+1` carried one extra cell of conservative padding
+    //     on top of the already-conservative drift term below.
+    //
+    // drift — sort runs at end-of-step (see srpic.hpp), so a particle
+    // sees exactly one pusher step before the *next* step's deposit
+    // when sorted every step (the common case). DRIFT is therefore a
+    // fixed constant of 1, NOT a compile-time function of the runtime
+    // sort cadence. Sizing the halo for the common case (rather than a
+    // worst-case sort interval) is what keeps the scratch small enough
+    // for good occupancy; a species sorted less often than
+    // every step just drifts past the halo and takes the global-J
+    // escape valve more often — correct, only slower (see the class
+    // doc-comment for why this is charge-conserving).
+    //
+    static constexpr int STENCIL_REACH = (O == 0u)
+                                           ? 2
+                                           : static_cast<int>(O);
+    static constexpr int DRIFT = 1;
+    static constexpr int HALO  = STENCIL_REACH + DRIFT;
+    static constexpr int TE    = static_cast<int>(T_TILE) + 2 * HALO;
+
+    using exec_space   = Kokkos::DefaultExecutionSpace;
+    using team_policy  = Kokkos::TeamPolicy<exec_space>;
+    using member_t     = typename team_policy::member_type;
+    using scratch_mem  = typename exec_space::scratch_memory_space;
+
+    // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components)
+    // is fixed by a runtime extent so we don't need a separate dimension
+    // template per component count.
+    using scratch_1d_t = Kokkos::View<real_t**,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    using scratch_2d_t = Kokkos::View<real_t***,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    using scratch_3d_t = Kokkos::View<real_t****,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+    ndfield_t<D, 3>           J;
+    deposit::PrtlPack<D>      prtls;
+    const M                   metric;
+    const real_t              charge, inv_dt;
+
+    // Tile metadata produced by SortSpatially.
+    array_t<npart_t*>         tile_offsets;
+    ncells_t                  ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
+    ncells_t                  total_tiles { 0u };
+
+    // J's full storage extent including all ghost cells. Used to clip
+    // the cooperative flush so that a partial tile at the high end of
+    // the domain does not over-write past the J view.
+    int                       j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 };
+
+  public:
+    DepositCurrents_kernel_tiled(const ndfield_t<D, 3>&    cur,
+                                 const array_t<int*>&      i1,
+                                 const array_t<int*>&      i2,
+                                 const array_t<int*>&      i3,
+                                 const array_t<int*>&      i1_prev,
+                                 const array_t<int*>&      i2_prev,
+                                 const array_t<int*>&      i3_prev,
+                                 const array_t<prtldx_t*>& dx1,
+                                 const array_t<prtldx_t*>& dx2,
+                                 const array_t<prtldx_t*>& dx3,
+                                 const array_t<prtldx_t*>& dx1_prev,
+                                 const array_t<prtldx_t*>& dx2_prev,
+                                 const array_t<prtldx_t*>& dx3_prev,
+                                 const array_t<real_t*>&   ux1,
+                                 const array_t<real_t*>&   ux2,
+                                 const array_t<real_t*>&   ux3,
+                                 const array_t<real_t*>&   phi,
+                                 const array_t<real_t*>&   weight,
+                                 const array_t<short*>&    tag,
+                                 const M&                  metric,
+                                 real_t                    charge,
+                                 const real_t              dt,
+                                 const TileLayout<D>&      layout)
+      : J { cur }
+      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
+                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
+                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt }
+      , tile_offsets { layout.tile_offsets }
+      , ntx1 { layout.ntiles_per_axis[0] }
+      , ntx2 { layout.ntiles_per_axis[1] }
+      , ntx3 { layout.ntiles_per_axis[2] }
+      , total_tiles { layout.ntiles_total } {
+      raise::ErrorIf(
+        layout.tile_size != T_TILE,
+        "Tiled deposit launched with mismatched T_TILE and runtime tile_size",
+        HERE);
+      // Note: HALO is allowed to exceed N_GHOSTS. The cooperative
+      // scratch→J flush and the per-particle escape valve both bounds-clip
+      // their writes against `j_ext*` so writes that would land past J's
+      // ghost stripe are silently dropped (they only ever come from a
+      // particle whose stencil reaches into the domain ghost region, where
+      // CommunicateFields will re-supply the contribution).
+      if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) {
+        j_ext1 = static_cast<int>(cur.extent(0));
+      }
+      if constexpr (D == Dim::_2D || D == Dim::_3D) {
+        j_ext2 = static_cast<int>(cur.extent(1));
+      }
+      if constexpr (D == Dim::_3D) {
+        j_ext3 = static_cast<int>(cur.extent(2));
+      }
+    }
+
+    /**
+     * @brief Per-team scratch size in bytes. Used by the launcher to set
+     *        `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`.
+     */
+    static constexpr std::size_t scratch_bytes() {
+      if constexpr (D == Dim::_1D) {
+        return scratch_1d_t::shmem_size(TE, 3);
+      } else if constexpr (D == Dim::_2D) {
+        return scratch_2d_t::shmem_size(TE, TE, 3);
+      } else {
+        return scratch_3d_t::shmem_size(TE, TE, TE, 3);
+      }
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const member_t& team) const {
+      const auto tile_id = static_cast<ncells_t>(team.league_rank());
+      // Tile coordinates (tile-grid indices) → tile origin in **active**
+      // cell coords (no ghost offset). Using ncells_t to match the linearised
+      // tile index produced by SortSpatially.
+      ncells_t tx1 = 0, tx2 = 0, tx3 = 0;
+      if constexpr (D == Dim::_1D) {
+        tx1 = tile_id;
+      } else if constexpr (D == Dim::_2D) {
+        tx1 = tile_id / ntx2;
+        tx2 = tile_id - tx1 * ntx2;
+      } else {
+        const auto plane = ntx2 * ntx3;
+        tx1              = tile_id / plane;
+        const auto rem   = tile_id - tx1 * plane;
+        tx2              = rem / ntx3;
+        tx3              = rem - tx2 * ntx3;
+      }
+      // origin_active = lowest active-cell index in the tile (no ghost).
+      // origin_J      = same value translated into J's storage coordinate
+      //                 (i.e. plus N_GHOSTS).
+      // origin_J_low  = J coordinate of scratch index 0 (i.e. origin_J - HALO).
+      // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`.
+      const int origin_J1_low = static_cast<int>(tx1 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J2_low = static_cast<int>(tx2 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J3_low = static_cast<int>(tx3 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+
+      // Allocate scratch and cooperatively zero-fill it.
+      if constexpr (D == Dim::_1D) {
+        scratch_1d_t scr(team.team_scratch(0), TE, 3);
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * 3),
+          [&](const int idx) {
+            const int li = idx / 3;
+            const int c  = idx - li * 3;
+            scr(li, c)   = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // Escape valve: a particle whose stencil reaches past the
+              // tile's scratch (e.g. exceeded the compile-time
+              // STENCIL_REACH + DRIFT budget) falls back to a direct
+              // atomic_add on the global J view. Bounds-clipped against
+              // J's storage extent so writes past the domain ghost stripe
+              // are dropped (matches the cooperative flush below; those
+              // contributions are re-supplied by SynchronizeFields(J)).
+              [&](int g_i1, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                if (li >= 0 && li < TE) {
+                  Kokkos::atomic_add(&scr(li, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d) {
+                  Kokkos::atomic_add(&J(g_i1, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        // Cooperative flush of scratch to global J. Bounds-clip against
+        // the J view extent in case a partial high-end tile (or non-zero
+        // halo at domain edges) would otherwise write past J.
+        const int e1 = j_ext1;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * 3),
+          [&](const int idx) {
+            const int li = idx / 3;
+            const int c  = idx - li * 3;
+            const int gi = li + origin_J1_low;
+            if (gi < 0 || gi >= e1) {
+              return;
+            }
+            const real_t v = scr(li, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, c), v);
+            }
+          });
+      } else if constexpr (D == Dim::_2D) {
+        scratch_2d_t scr(team.team_scratch(0), TE, TE, 3);
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * TE * 3),
+          [&](const int idx) {
+            const int lij  = idx / 3;
+            const int c    = idx - lij * 3;
+            const int li   = lij / TE;
+            const int lj   = lij - li * TE;
+            scr(li, lj, c) = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        const int  e2_d    = j_ext2;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // See 1D branch for rationale.
+              [&](int g_i1, int g_i2, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                const int lj = g_i2 - origin_J2_low;
+                if (li >= 0 && li < TE && lj >= 0 && lj < TE) {
+                  Kokkos::atomic_add(&scr(li, lj, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
+                           g_i2 < e2_d) {
+                  Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        const int e1 = j_ext1;
+        const int e2 = j_ext2;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * TE * 3),
+          [&](const int idx) {
+            const int lij = idx / 3;
+            const int c   = idx - lij * 3;
+            const int li  = lij / TE;
+            const int lj  = lij - li * TE;
+            const int gi  = li + origin_J1_low;
+            const int gj  = lj + origin_J2_low;
+            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) {
+              return;
+            }
+            const real_t v = scr(li, lj, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, gj, c), v);
+            }
+          });
+      } else if constexpr (D == Dim::_3D) {
+        scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3);
+        const int    cells = TE * TE * TE;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, cells * 3),
+          [&](const int idx) {
+            const int lijk     = idx / 3;
+            const int c        = idx - lijk * 3;
+            const int li       = lijk / (TE * TE);
+            const int rem      = lijk - li * TE * TE;
+            const int lj       = rem / TE;
+            const int lk       = rem - lj * TE;
+            scr(li, lj, lk, c) = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        const int  e2_d    = j_ext2;
+        const int  e3_d    = j_ext3;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // See 1D branch for rationale.
+              [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                const int lj = g_i2 - origin_J2_low;
+                const int lk = g_i3 - origin_J3_low;
+                if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 &&
+                    lk < TE) {
+                  Kokkos::atomic_add(&scr(li, lj, lk, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
+                           g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) {
+                  Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        const int e1 = j_ext1;
+        const int e2 = j_ext2;
+        const int e3 = j_ext3;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, cells * 3),
+          [&](const int idx) {
+            const int lijk = idx / 3;
+            const int c    = idx - lijk * 3;
+            const int li   = lijk / (TE * TE);
+            const int rem  = lijk - li * TE * TE;
+            const int lj   = rem / TE;
+            const int lk   = rem - lj * TE;
+            const int gi   = li + origin_J1_low;
+            const int gj   = lj + origin_J2_low;
+            const int gk   = lk + origin_J3_low;
+            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 ||
+                gk >= e3) {
+              return;
+            }
+            const real_t v = scr(li, lj, lk, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, gj, gk, c), v);
+            }
+          });
+      }
+    }
+  };
+
+#endif // TEAM_POLICY
+
 } // namespace kernel
 
 #undef i_di_to_Xi
diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt
index d6dc295af..9a2e2865e 100644
--- a/tests/framework/CMakeLists.txt
+++ b/tests/framework/CMakeLists.txt
@@ -44,6 +44,12 @@ else()
   gen_test(particles_sort false)
 endif()
 
+# team_policy X-3: per-backend sort_by_key permutation test (only built
+# when the compile-time team_policy toggle is on).
+if(${team_policy})
+  gen_test(sort_by_key false)
+endif()
+
 gen_test(fields false)
 gen_test(grid_mesh false)
 if(${DEBUG})
diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp
new file mode 100644
index 000000000..94dc51d0e
--- /dev/null
+++ b/tests/framework/sort_by_key.cpp
@@ -0,0 +1,110 @@
+/**
+ * @brief X-3 (team_policy) — sort_by_key permutation test.
+ *
+ * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch`
+ * that is compiled in for the current Kokkos device. For each backend:
+ *   1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised).
+ *   2. Call sort_by_key_dispatch.
+ *   3. Verify that keys[perm[i]] is sorted in non-decreasing order.
+ *
+ * Stability is verified for the BinSort and StdSort backends (the others
+ * promise stability per their documentation but we don't bake that into
+ * the test).
+ *
+ * Built only when `team_policy=ON` at CMake time.
+ */
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/error.h"
+#include "utils/sort_dispatch.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+
+#include <iostream>
+#include <vector>
+
+namespace {
+
+  using namespace ntt;
+
+  template <typename Backend>
+  void test_one_backend(const char* label, Backend tag) {
+    const std::vector<ncells_t> keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u };
+    const npart_t               n     = keys_host_init.size();
+    const ncells_t              n_max = 6u; // bin range [0, n_max)
+
+    array_t<ncells_t*> keys { "keys", n };
+    auto               keys_h = Kokkos::create_mirror_view(keys);
+    for (npart_t i = 0u; i < n; ++i) {
+      keys_h(i) = keys_host_init[i];
+    }
+    Kokkos::deep_copy(keys, keys_h);
+
+    prtl_perm_t perm { "perm", n };
+
+    sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag);
+
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    Kokkos::deep_copy(perm_h, perm);
+
+    // Validate: keys[perm[0]] <= keys[perm[1]] <= ...
+    for (npart_t i = 1u; i < n; ++i) {
+      const auto a = keys_host_init[perm_h(i - 1u)];
+      const auto b = keys_host_init[perm_h(i)];
+      raise::ErrorIf(
+        a > b,
+        std::string("sort_by_key_dispatch produced non-sorted permutation "
+                    "for backend ") +
+          label,
+        HERE);
+    }
+
+    // Validate: perm is a permutation of [0, n).
+    std::vector<int> seen(n, 0);
+    for (npart_t i = 0u; i < n; ++i) {
+      const auto idx = perm_h(i);
+      raise::ErrorIf(idx >= n,
+                     std::string("permutation index out of range for "
+                                 "backend ") +
+                       label,
+                     HERE);
+      seen[idx] += 1;
+    }
+    for (npart_t i = 0u; i < n; ++i) {
+      raise::ErrorIf(seen[i] != 1,
+                     std::string("permutation not a bijection for backend ") +
+                       label,
+                     HERE);
+    }
+
+    std::cout << "[OK] sort_by_key_dispatch<" << label << ">: "
+              << "keys[perm] sorted, perm is a bijection." << std::endl;
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  ntt::GlobalInitialize(argc, argv);
+  try {
+    // Always-available backends.
+    test_one_backend("BinSort", ::sort::backend::BinSort {});
+#if !defined(DEVICE_ENABLED)
+    test_one_backend("StdSort", ::sort::backend::StdSort {});
+#endif
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    test_one_backend("OneDPL", ::sort::backend::OneDPL {});
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+    test_one_backend("Thrust", ::sort::backend::Thrust {});
+#endif
+  } catch (std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    ntt::GlobalFinalize();
+    return 1;
+  }
+  ntt::GlobalFinalize();
+  return 0;
+}
diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp
index d0f0a412c..2f926ce28 100644
--- a/tests/global/tiling.cpp
+++ b/tests/global/tiling.cpp
@@ -1,6 +1,5 @@
 #include "arch/kokkos_aliases.h"
 #include "utils/error.h"
-#include "utils/formatting.h"
 #include "utils/numeric.h"
 #include "utils/sorting.h"
 
@@ -122,37 +121,27 @@ void test_tiling(const array_t<int*>&         i1,
 
     const auto ntiles = nt1 * nt2 * nt3;
 
-    auto position_to_tile_kernel = sort::PositionToTileIndex<D, true> {
-      i1, i2, i3, tag, tile_indices, ncells, ts
-    };
-    Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel);
-    const auto num_ppt = position_to_tile_kernel.num_ppt;
+    array_t<npart_t*> num_ppt { "num_ppt", ntiles };
+    Kokkos::parallel_for(
+      "Tiling",
+      npart,
+      sort::PositionToTileIndex<D, true> { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt });
     Kokkos::parallel_for(
       "Checking",
       npart,
       Lambda(prtlidx_t p) {
         CheckValue<D>(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts);
       });
-    raise::ErrorIf(
-      num_ppt.extent(0) != ntiles,
-      fmt::format("num_ppt size does not match number of tiles %u != %u",
-                  num_ppt.extent(0),
-                  ntiles),
-      HERE);
 
     npart_t tot_alive = 0u;
     Kokkos::parallel_reduce(
       "CountAliveInTiles",
       ntiles,
-      Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); },
+      Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); },
       tot_alive);
-    raise::ErrorIf(
-      tot_alive != npart - ndead,
-      fmt::format("Error in counting particles per tile: %u != %u - %u",
-                  tot_alive,
-                  npart,
-                  ndead),
-      HERE);
+    raise::ErrorIf(tot_alive != npart - ndead,
+                   "Error in counting particles per tile",
+                   HERE);
   }
 }
 
diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt
index f1438e108..0fe23f578 100644
--- a/tests/kernels/CMakeLists.txt
+++ b/tests/kernels/CMakeLists.txt
@@ -26,6 +26,9 @@ endfunction()
 gen_test(faraday_mink)
 gen_test(ampere_mink)
 gen_test(deposit)
+if(${team_policy})
+  gen_test(deposit_tiled)
+endif()
 gen_test(digital_filter)
 gen_test(particle_moments)
 gen_test(fields_to_phys)
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
new file mode 100644
index 000000000..87f4f3f4d
--- /dev/null
+++ b/tests/kernels/deposit_tiled.cpp
@@ -0,0 +1,262 @@
+/**
+ * @file tests/kernels/deposit_tiled.cpp
+ * @brief X-1 numerical-equivalence test for the tiled deposit kernel.
+ *
+ * Runs the flat (`DepositCurrents_kernel`) and tiled
+ * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs
+ * for shape orders O = 1..11 and asserts that the resulting J array is
+ * identical cell-by-cell within a small floating-point tolerance.
+ *
+ * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test
+ * matches the per-particle setup used in `deposit.cpp` so that any
+ * regression in the shared `kernel::deposit::deposit_one_particle` body
+ * is caught by both tests.
+ */
+
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/comparators.h"
+
+#include "metrics/minkowski.h"
+
+#include "kernels/currents_deposit.hpp"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace {
+
+  using namespace ntt;
+
+  void errorIf(bool condition, const std::string& msg) {
+    if (condition) {
+      throw std::runtime_error(msg);
+    }
+  }
+
+  template <typename T>
+  void put_value(const array_t<T*>& arr, T value, int i) {
+    auto h = Kokkos::create_mirror_view(arr);
+    h(i)   = value;
+    Kokkos::deep_copy(arr, h);
+  }
+
+  // Builds tile_offsets for a single-particle test. Particle 0 is alive
+  // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead
+  // sentinel and are never referenced by tile_offsets — so the tiled
+  // kernel never iterates over them.
+  array_t<npart_t*> build_tile_offsets_single_particle(ncells_t ntx1,
+                                                       ncells_t ntx2,
+                                                       ncells_t tx1,
+                                                       ncells_t tx2) {
+    const ncells_t total_tiles = ntx1 * ntx2;
+    const ncells_t hot_tile    = tx1 * ntx2 + tx2;
+    array_t<npart_t*> offsets("tile_offsets", total_tiles + 1u);
+    auto h = Kokkos::create_mirror_view(offsets);
+    for (ncells_t t = 0; t <= total_tiles; ++t) {
+      h(t) = (t <= hot_tile) ? static_cast<npart_t>(0)
+                             : static_cast<npart_t>(1);
+    }
+    Kokkos::deep_copy(offsets, h);
+    return offsets;
+  }
+
+  template <unsigned short O, unsigned short T_TILE>
+  void run_one_case() {
+    using metric_t = metric::Minkowski<Dim::_2D>;
+    constexpr unsigned short nx1 = 50u, nx2 = 50u;
+    metric_t metric { { nx1, nx2 },
+                      { { 0.0, 55.0 }, { 0.0, 55.0 } },
+                      {} };
+
+    // Particle setup (mirrors deposit.cpp).
+    const int      i0 = 25, j0 = 21, i0f = 24, j0f = 20;
+    const real_t   uz = 2.5;
+    const prtldx_t dxi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dxf = static_cast<prtldx_t>(0.99);
+    const prtldx_t dyi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dyf = static_cast<prtldx_t>(0.80);
+
+    array_t<int*>      i1 { "i1", 10 };
+    array_t<int*>      i2 { "i2", 10 };
+    array_t<int*>      i3 { "i3", 10 };
+    array_t<int*>      i1_prev { "i1_prev", 10 };
+    array_t<int*>      i2_prev { "i2_prev", 10 };
+    array_t<int*>      i3_prev { "i3_prev", 10 };
+    array_t<prtldx_t*> dx1 { "dx1", 10 };
+    array_t<prtldx_t*> dx2 { "dx2", 10 };
+    array_t<prtldx_t*> dx3 { "dx3", 10 };
+    array_t<prtldx_t*> dx1_prev { "dx1_prev", 10 };
+    array_t<prtldx_t*> dx2_prev { "dx2_prev", 10 };
+    array_t<prtldx_t*> dx3_prev { "dx3_prev", 10 };
+    array_t<real_t*>   ux1 { "ux1", 10 };
+    array_t<real_t*>   ux2 { "ux2", 10 };
+    array_t<real_t*>   ux3 { "ux3", 10 };
+    array_t<real_t*>   phi { "phi", 10 };
+    array_t<real_t*>   weight { "weight", 10 };
+    array_t<short*>    tag { "tag", 10 };
+    const real_t       charge = 1.0, dt = 1.0;
+
+    put_value<int>(i1, i0f, 0);
+    put_value<int>(i2, j0f, 0);
+    put_value<int>(i1_prev, i0, 0);
+    put_value<int>(i2_prev, j0, 0);
+    put_value<prtldx_t>(dx1, dxf, 0);
+    put_value<prtldx_t>(dx2, dyf, 0);
+    put_value<prtldx_t>(dx1_prev, dxi, 0);
+    put_value<prtldx_t>(dx2_prev, dyi, 0);
+    put_value<real_t>(ux1, ZERO, 0);
+    put_value<real_t>(ux2, ZERO, 0);
+    put_value<real_t>(ux3, uz, 0);
+    put_value<real_t>(weight, 1.0, 0);
+    put_value<short>(tag, ParticleTag::alive, 0);
+
+    // Run the flat kernel.
+    ndfield_t<Dim::_2D, 3> J_flat { "J_flat",
+                                    nx1 + 2u * N_GHOSTS,
+                                    nx2 + 2u * N_GHOSTS };
+    {
+      auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat);
+      Kokkos::parallel_for(
+        "FlatDeposit",
+        10,
+        kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
+          J_scat,
+          i1, i2, i3,
+          i1_prev, i2_prev, i3_prev,
+          dx1, dx2, dx3,
+          dx1_prev, dx2_prev, dx3_prev,
+          ux1, ux2, ux3,
+          phi, weight, tag,
+          metric, charge, dt));
+      Kokkos::Experimental::contribute(J_flat, J_scat);
+      Kokkos::fence("flat deposit done");
+    }
+
+    // Run the tiled kernel. Build a TileLayout with one alive particle
+    // landing in its expected tile (sort key = min(i, i_prev) / T_TILE).
+    ndfield_t<Dim::_2D, 3> J_tiled { "J_tiled",
+                                     nx1 + 2u * N_GHOSTS,
+                                     nx2 + 2u * N_GHOSTS };
+    {
+      const auto sort_i1 = static_cast<int>(
+        (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp
+      const auto sort_i2 = static_cast<int>((j0 < j0f) ? j0 : j0f);
+      const auto ntx1    = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx1) / static_cast<double>(T_TILE)));
+      const auto ntx2 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx2) / static_cast<double>(T_TILE)));
+      const auto tx1 = static_cast<ncells_t>(sort_i1) / T_TILE;
+      const auto tx2 = static_cast<ncells_t>(sort_i2) / T_TILE;
+
+      TileLayout<Dim::_2D> layout;
+      layout.ntiles_per_axis[0] = ntx1;
+      layout.ntiles_per_axis[1] = ntx2;
+      layout.ntiles_per_axis[2] = 1u;
+      layout.ntiles_total       = ntx1 * ntx2;
+      layout.tile_size          = T_TILE;
+      layout.tile_offsets       = build_tile_offsets_single_particle(ntx1,
+                                                                      ntx2,
+                                                                      tx1,
+                                                                      tx2);
+
+      using kernel_t =
+        kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      kernel_t kern { J_tiled,
+                      i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag,
+                      metric, charge, dt, layout };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0,
+                              Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("TiledDeposit", policy, kern);
+      Kokkos::fence("tiled deposit done");
+    }
+
+    // Compare J_flat vs J_tiled cell-by-cell.
+    auto h_flat  = Kokkos::create_mirror_view(J_flat);
+    auto h_tiled = Kokkos::create_mirror_view(J_tiled);
+    Kokkos::deep_copy(h_flat, J_flat);
+    Kokkos::deep_copy(h_tiled, J_tiled);
+
+    const real_t eps      = static_cast<real_t>(1.0e-5);
+    real_t       max_diff = ZERO;
+    int          fail_count = 0;
+    for (ncells_t i = 0; i < h_flat.extent(0); ++i) {
+      for (ncells_t j = 0; j < h_flat.extent(1); ++j) {
+        for (int c = 0; c < 3; ++c) {
+          const real_t a    = h_flat(i, j, c);
+          const real_t b    = h_tiled(i, j, c);
+          const real_t diff = math::fabs(a - b);
+          const real_t mag  = math::max(math::fabs(a), math::fabs(b));
+          if (diff > max_diff) {
+            max_diff = diff;
+          }
+          if (diff > eps * math::max(mag, static_cast<real_t>(1.0))) {
+            if (fail_count < 5) {
+              std::cerr << "  J(" << i << "," << j << ",c=" << c
+                        << ") flat=" << a << " tiled=" << b
+                        << " diff=" << diff << '\n';
+            }
+            ++fail_count;
+          }
+        }
+      }
+    }
+    if (fail_count > 0) {
+      std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O
+                << " T_TILE=" << T_TILE
+                << " : " << fail_count << " mismatches; max_diff=" << max_diff
+                << '\n';
+      throw std::logic_error("DepositCurrents_kernel_tiled mismatch");
+    }
+    std::cerr << "X-1 deposit_tiled OK  O=" << O << " T_TILE=" << T_TILE
+              << "  max_diff=" << max_diff << '\n';
+  }
+
+  template <unsigned short T_TILE>
+  void run_all_orders() {
+    run_one_case<0u, T_TILE>();
+    run_one_case<1u, T_TILE>();
+    run_one_case<2u, T_TILE>();
+    run_one_case<3u, T_TILE>();
+    run_one_case<4u, T_TILE>();
+    run_one_case<5u, T_TILE>();
+    run_one_case<6u, T_TILE>();
+    run_one_case<7u, T_TILE>();
+    run_one_case<8u, T_TILE>();
+    run_one_case<9u, T_TILE>();
+    run_one_case<10u, T_TILE>();
+    run_one_case<11u, T_TILE>();
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  Kokkos::initialize(argc, argv);
+  try {
+    // Run with each tile-size choice from the validated CMake list.
+    run_all_orders<4u>();
+    run_all_orders<8u>();
+    run_all_orders<12u>();
+  } catch (std::exception& e) {
+    std::cerr << e.what() << '\n';
+    Kokkos::finalize();
+    return 1;
+  }
+  Kokkos::finalize();
+  return 0;
+}

From 04d403c606147e0d651cac28f930d5592f808c84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:25:38 +0200
Subject: [PATCH 03/33] removed redundant inner loop over species

---
 src/framework/domain/metadomain_sort.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp
index 791bf31a8..a2b08f8be 100644
--- a/src/framework/domain/metadomain_sort.cpp
+++ b/src/framework/domain/metadomain_sort.cpp
@@ -21,9 +21,7 @@ namespace ntt {
       const auto clearing_interval = species.clearing_interval();
       if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and
           (step > 0u)) {
-        for (auto& species : domain.species) {
-          species.RemoveDead();
-        }
+        species.RemoveDead();
       }
       const auto spatial_sorting_interval = species.spatial_sorting_interval();
       if ((spatial_sorting_interval > 0u) and

From 5830c0740e43851f5d26afb8e35438160be8a007 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:26:11 +0200
Subject: [PATCH 04/33] frontier-specific memory pool allocation

---
 src/global/global.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/global/global.cpp b/src/global/global.cpp
index ec22fd2f3..c3aa1bcdc 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -6,8 +6,51 @@
   #include <mpi.h>
 #endif // MPI_ENABLED
 
+#if defined(HIP_ENABLED)
+  #include <hip/hip_runtime.h>
+
+  #include <cstdint>
+#endif // HIP_ENABLED
+
+namespace {
+#if defined(HIP_ENABLED)
+  // Turn the ROCm stream-ordered allocator into a caching arena.
+  //
+  // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option
+  // IMPL_HIP_MALLOC_ASYNC). The default memory pool has a release
+  // threshold of 0, so every freed block is handed back to the driver
+  // at the next stream sync. With ~50 GB of particle SoA permanently
+  // pinned and only ~14 GB free, the per-step churn of dozens of
+  // large, differently-sized sort/comm scratch buffers fragments that
+  // free space: allocation cost grows monotonically (ParticleSort
+  // slowdown) until no contiguous mid-size block remains and BinSort's
+  // `sorted_values` allocation fails (OOM). Raising the release
+  // threshold to "unlimited" makes the pool retain and recycle freed
+  // blocks instead, which stabilizes the working set and removes both
+  // the slowdown and the OOM.
+  void ConfigureHipMemPool() {
+    int device = 0;
+    if (hipGetDevice(&device) != hipSuccess) {
+      return;
+    }
+    hipMemPool_t pool = nullptr;
+    if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or
+        pool == nullptr) {
+      return;
+    }
+    uint64_t threshold = UINT64_MAX;
+    (void)hipMemPoolSetAttribute(pool,
+                                 hipMemPoolAttrReleaseThreshold,
+                                 &threshold);
+  }
+#endif // HIP_ENABLED
+} // namespace
+
 void ntt::GlobalInitialize(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
+#if defined(HIP_ENABLED)
+  ConfigureHipMemPool();
+#endif // HIP_ENABLED
 #if defined(MPI_ENABLED)
   MPI_Init(&argc, &argv);
 #endif // MPI_ENABLED

From cf14e9f11891afc732bfb9d289b2c16c53af770c Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 26 May 2026 17:06:27 +0000
Subject: [PATCH 05/33] support more tile sizes

---
 CMakeLists.txt                  | 2 +-
 tests/kernels/deposit_tiled.cpp | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 248a34e7b..06494a7db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ set(team_policy_tile_size
   ${default_team_policy_tile_size}
   CACHE STRING "team_policy tile edge length in cells")
 set(team_policy_tile_sizes
-  "4;6;8;10;12"
+  "4;6;8;10;12;14;16"
   CACHE STRING "team_policy tile-size choices")
 
 # -------------------------- Compilation settings -------------------------- #
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
index 87f4f3f4d..504cc3818 100644
--- a/tests/kernels/deposit_tiled.cpp
+++ b/tests/kernels/deposit_tiled.cpp
@@ -250,8 +250,12 @@ auto main(int argc, char* argv[]) -> int {
   try {
     // Run with each tile-size choice from the validated CMake list.
     run_all_orders<4u>();
+    run_all_orders<6u>();
     run_all_orders<8u>();
+    run_all_orders<10u>();
     run_all_orders<12u>();
+    run_all_orders<14u>();
+    run_all_orders<16u>();
   } catch (std::exception& e) {
     std::cerr << e.what() << '\n';
     Kokkos::finalize();

From 787aa045750fa2c582b54d02c5e8d3f4265de1e0 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 26 May 2026 17:12:06 +0000
Subject: [PATCH 06/33] removed persistent sort scratch to reduce memory
 overhead

---
 src/framework/containers/particles.h        |  16 ---
 src/framework/containers/particles_sort.cpp | 111 +++++++-------------
 2 files changed, 38 insertions(+), 89 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 0a15cb5d9..4f0770729 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -99,22 +99,6 @@ namespace ntt {
     // vendor libraries detected by CMake.
     TileLayout<D> m_tile_layout {};
 
-#if defined(TEAM_POLICY) &&                                                    \
-  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
-   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
-   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
-    // Persistent byte scratch reused by every SoA-member gather in
-    // `apply_permutation_to_soa`, across all members and all timesteps.
-    // Without this each member would allocate (and free) its own
-    // transient buffer every sort; recycling one persistent buffer
-    // removes that allocation churn entirely — the structural fix for
-    // the ROCm sort slowdown / fragmentation. Grown monotonically to
-    // the largest required size, never shrunk. Kokkos device
-    // allocations are over-aligned (>= 8 B), so reinterpreting the
-    // bytes as any SoA element type (<= 8 B PODs) is well-defined.
-    array_t<char*> m_perm_scratch {};
-#endif
-
   public:
     // for empty allocation
     Particles() {}
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index c04813e12..0317e0b5f 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -454,28 +454,20 @@ namespace ntt {
 #if defined(TEAM_POLICY_USE_VENDOR_SORT)
   namespace permute_helpers {
 
-    // Permute a 1D SoA member array `arr` in place by `perm`, gathering
-    // through `scratch` — a persistent byte buffer reused by every
-    // member and every timestep (no per-call allocation). An unmanaged
-    // typed view aliases the scratch bytes; the caller guarantees
-    // `scratch` is large enough and that Kokkos' device over-alignment
-    // covers the element type.
+    // Permute a 1D SoA member array `arr` in place by `perm`, using a
+    // single transient buffer of size `n`. Buffer is freed at scope
+    // exit; the explicit fence right before that drains queued GPU
+    // work referencing it.
     template <typename V>
-    inline void permute_1d_inplace(V&                    arr,
-                                   const prtl_perm_t&    perm,
-                                   npart_t               n,
-                                   const array_t<char*>& scratch) {
+    inline void permute_1d_inplace(V&                 arr,
+                                   const prtl_perm_t& perm,
+                                   npart_t            n) {
       if (n == 0u) {
         return;
       }
-      using value_t = typename V::non_const_value_type;
-      using buf_t   = Kokkos::View<value_t*,
-                                 typename V::array_layout,
-                                 typename V::memory_space,
-                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n);
-      auto  perm_v = perm;
-      auto  arr_v  = arr;
+      V    buf(std::string(arr.label()) + "_perm_buf", n);
+      auto perm_v = perm;
+      auto arr_v  = arr;
       Kokkos::parallel_for(
         "Permute1D",
         n,
@@ -486,22 +478,16 @@ namespace ntt {
 
     // 2D analogue for `pld_r` / `pld_i`.
     template <typename V>
-    inline void permute_2d_inplace(V&                    arr,
-                                   const prtl_perm_t&    perm,
-                                   npart_t               n,
-                                   npart_t               ncols,
-                                   const array_t<char*>& scratch) {
+    inline void permute_2d_inplace(V&                 arr,
+                                   const prtl_perm_t& perm,
+                                   npart_t            n,
+                                   npart_t            ncols) {
       if (n == 0u or ncols == 0u) {
         return;
       }
-      using value_t = typename V::non_const_value_type;
-      using buf_t   = Kokkos::View<value_t**,
-                                 typename V::array_layout,
-                                 typename V::memory_space,
-                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n, ncols);
-      auto  perm_v = perm;
-      auto  arr_v  = arr;
+      V    buf(std::string(arr.label()) + "_perm_buf", n, ncols);
+      auto perm_v = perm;
+      auto arr_v  = arr;
       Kokkos::parallel_for(
         "Permute2D",
         CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
@@ -522,61 +508,40 @@ namespace ntt {
       return;
     }
 
-    // Size the persistent scratch once to the largest gather any member
-    // needs this call: 1D members need n * sizeof(real_t) bytes (the
-    // widest element); the 2D payloads need n * ncols * elem bytes.
-    // Grown monotonically, never shrunk — so after warmup this incurs
-    // no allocation at all.
-    std::size_t need = static_cast<std::size_t>(n) * sizeof(real_t);
-    if (npld_r() > 0) {
-      need = std::max(need,
-                      static_cast<std::size_t>(n) *
-                        static_cast<std::size_t>(npld_r()) * sizeof(real_t));
-    }
-    if (npld_i() > 0) {
-      need = std::max(need,
-                      static_cast<std::size_t>(n) *
-                        static_cast<std::size_t>(npld_i()) * sizeof(npart_t));
-    }
-    if (m_perm_scratch.extent(0) < need) {
-      m_perm_scratch = array_t<char*> { "perm_scratch", need };
-    }
-    const auto& scratch = m_perm_scratch;
-
     using permute_helpers::permute_1d_inplace;
     using permute_helpers::permute_2d_inplace;
 
     if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i1, perm, n, scratch);
-      permute_1d_inplace(dx1, perm, n, scratch);
-      permute_1d_inplace(i1_prev, perm, n, scratch);
-      permute_1d_inplace(dx1_prev, perm, n, scratch);
+      permute_1d_inplace(i1, perm, n);
+      permute_1d_inplace(dx1, perm, n);
+      permute_1d_inplace(i1_prev, perm, n);
+      permute_1d_inplace(dx1_prev, perm, n);
     }
     if constexpr (D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i2, perm, n, scratch);
-      permute_1d_inplace(dx2, perm, n, scratch);
-      permute_1d_inplace(i2_prev, perm, n, scratch);
-      permute_1d_inplace(dx2_prev, perm, n, scratch);
+      permute_1d_inplace(i2, perm, n);
+      permute_1d_inplace(dx2, perm, n);
+      permute_1d_inplace(i2_prev, perm, n);
+      permute_1d_inplace(dx2_prev, perm, n);
     }
     if constexpr (D == Dim::_3D) {
-      permute_1d_inplace(i3, perm, n, scratch);
-      permute_1d_inplace(dx3, perm, n, scratch);
-      permute_1d_inplace(i3_prev, perm, n, scratch);
-      permute_1d_inplace(dx3_prev, perm, n, scratch);
-    }
-    permute_1d_inplace(ux1, perm, n, scratch);
-    permute_1d_inplace(ux2, perm, n, scratch);
-    permute_1d_inplace(ux3, perm, n, scratch);
-    permute_1d_inplace(weight, perm, n, scratch);
-    permute_1d_inplace(tag, perm, n, scratch);
+      permute_1d_inplace(i3, perm, n);
+      permute_1d_inplace(dx3, perm, n);
+      permute_1d_inplace(i3_prev, perm, n);
+      permute_1d_inplace(dx3_prev, perm, n);
+    }
+    permute_1d_inplace(ux1, perm, n);
+    permute_1d_inplace(ux2, perm, n);
+    permute_1d_inplace(ux3, perm, n);
+    permute_1d_inplace(weight, perm, n);
+    permute_1d_inplace(tag, perm, n);
     if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
-      permute_1d_inplace(phi, perm, n, scratch);
+      permute_1d_inplace(phi, perm, n);
     }
     if (npld_r() > 0) {
-      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()), scratch);
+      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()));
     }
     if (npld_i() > 0) {
-      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()), scratch);
+      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()));
     }
   }
 #endif // TEAM_POLICY_USE_VENDOR_SORT

From 827cf264c2903b3179252c88f19c0a7bb090357d Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 2 Jun 2026 18:20:33 +0000
Subject: [PATCH 07/33] added team policy reporting

---
 src/engines/reporter.cpp      | 3 +++
 src/global/utils/reporter.cpp | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp
index f56874d23..3caeda5de 100644
--- a/src/engines/reporter.cpp
+++ b/src/engines/reporter.cpp
@@ -32,6 +32,9 @@ namespace ntt {
                        "%s",
                        params.template get<std::string>("simulation.name").c_str());
     reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
+#if defined(TEAM_POLICY)
+    reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
+#endif
     reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
 #if SHAPE_ORDER == 0
     reporter::AddParam(report, 4, "Deposit", "%s", "zigzag");
diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp
index 77117c4b9..a4b10eee6 100644
--- a/src/global/utils/reporter.cpp
+++ b/src/global/utils/reporter.cpp
@@ -250,6 +250,12 @@ namespace reporter {
 #else
     AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF");
 #endif
+
+#if defined(TEAM_POLICY)
+    AddParam(report, 4, "TEAM_POLICY", "%s", "ON");
+#else
+    AddParam(report, 4, "TEAM_POLICY", "%s", "OFF");
+#endif
     report += "\n";
     return report;
   }

From 4b81914aae5cbc15cea19217b284113f617750f3 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 2 Jun 2026 19:08:47 +0000
Subject: [PATCH 08/33] explicitly bind GPU Transport Layer for GPU aware MPI
 on Frontier

---
 CMakeLists.txt | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06494a7db..0137671a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,6 +211,62 @@ if(${mpi})
   if(${DEVICE_ENABLED})
     if(${gpu_aware_mpi})
       add_compile_options("-D GPU_AWARE_MPI")
+
+      # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only
+      # handle device pointers if the GPU Transport Layer (GTL) library
+      # is linked. The Cray compiler wrappers (cc/CC) inject this
+      # automatically, but we build with hipcc/nvcc directly, so
+      # find_package(MPI) only finds base libmpi and the GTL is left
+      # out -> MPI_Sendrecv on a device pointer fails with
+      # "OFI ... Bad address". Add it explicitly here.
+      #
+      # Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
+      # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is
+      # not a Cray MPICH build, in which case nothing extra is needed.
+      if("${Kokkos_DEVICES}" MATCHES "HIP")
+        set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
+      elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
+        set(_gtl_accels nvidia90 nvidia80 nvidia70)
+      elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
+        set(_gtl_accels ponteVecchio)
+      else()
+        set(_gtl_accels "")
+      endif()
+
+      set(_gtl_dir "")
+      set(_gtl_libflag "")
+      foreach(_accel ${_gtl_accels})
+        if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}}))
+          # strip the leading "-L" from the Cray-provided value
+          string(REGEX REPLACE "^-L" ""
+                 _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}")
+          string(REGEX REPLACE "^-l" ""
+                 _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}")
+        endif()
+      endforeach()
+
+      if(_gtl_dir AND _gtl_libflag)
+        find_library(MPI_GTL_LIBRARY
+          NAMES ${_gtl_libflag}
+          HINTS "${_gtl_dir}"
+          NO_DEFAULT_PATH)
+        if(MPI_GTL_LIBRARY)
+          message(STATUS
+            "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
+          set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
+        else()
+          message(FATAL_ERROR
+            "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
+            "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
+            "GPU-aware MPI will crash at runtime without it. Make sure the "
+            "craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
+            "${ColorReset}")
+        endif()
+      else()
+        message(STATUS
+          "GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
+          "implementation is GPU-aware without an extra transport library.")
+      endif()
     endif()
   else()
     set(gpu_aware_mpi

From a603ecb0b11662282cfd613bf8bff085c5aead70 Mon Sep 17 00:00:00 2001
From: haykh <haykh.astro@gmail.com>
Date: Wed, 3 Jun 2026 10:38:29 -0400
Subject: [PATCH 09/33] minor refactor

---
 src/engines/srpic/currents.h     |   92 +-
 src/global/arch/kokkos_aliases.h |   73 ++
 src/kernels/currents_deposit.hpp | 1760 ++++++++++++++----------------
 3 files changed, 934 insertions(+), 991 deletions(-)

diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index faf0bb3ad..63746d101 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -39,30 +39,12 @@ namespace ntt {
                            species.rangeActiveParticles(),
                            kernel::DepositCurrents_kernel<SimEngine::SRPIC, M, O>(
                              scatter_cur,
-                             species.i1,
-                             species.i2,
-                             species.i3,
-                             species.i1_prev,
-                             species.i2_prev,
-                             species.i3_prev,
-                             species.dx1,
-                             species.dx2,
-                             species.dx3,
-                             species.dx1_prev,
-                             species.dx2_prev,
-                             species.dx3_prev,
-                             species.ux1,
-                             species.ux2,
-                             species.ux3,
-                             species.phi,
-                             species.weight,
-                             species.tag,
+                             species,
                              local_metric,
                              (real_t)(species.charge()),
                              dt));
     }
 
-#if defined(TEAM_POLICY)
     /**
      * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
      *
@@ -78,11 +60,10 @@ namespace ntt {
      * passed-in `scatter_cur` so the caller still composes correctly.
      */
     template <SRMetricClass M, unsigned short O>
-    void CallDepositKernelTiled(
-      const Particles<M::Dim, M::CoordType>& species,
-      const M&                               local_metric,
-      const ndfield_t<M::Dim, 3>&            cur,
-      real_t                                 dt) {
+    void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
+                                const M&                    local_metric,
+                                const ndfield_t<M::Dim, 3>& cur,
+                                real_t                      dt) {
       static_assert(O <= 11u, "Shape order must be <= 11");
       constexpr unsigned short T = static_cast<unsigned short>(
         TEAM_POLICY_TILE_SIZE);
@@ -96,40 +77,18 @@ namespace ntt {
                      "with ntiles_total",
                      HERE);
 
-      using kernel_t = kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC,
-                                                            M,
-                                                            O,
-                                                            T>;
-      kernel_t kern { cur,
-                      species.i1,
-                      species.i2,
-                      species.i3,
-                      species.i1_prev,
-                      species.i2_prev,
-                      species.i3_prev,
-                      species.dx1,
-                      species.dx2,
-                      species.dx3,
-                      species.dx1_prev,
-                      species.dx2_prev,
-                      species.dx3_prev,
-                      species.ux1,
-                      species.ux2,
-                      species.ux3,
-                      species.phi,
-                      species.weight,
-                      species.tag,
-                      local_metric,
-                      (real_t)(species.charge()),
-                      dt,
-                      layout };
+      auto deposit_kernel =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, M, O, T> {
+          cur, species, local_metric, (real_t)(species.charge()), dt, layout
+        };
 
       Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
                                   Kokkos::AUTO);
-      policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes()));
-      Kokkos::parallel_for("CurrentsDepositTiled", policy, kern);
+      policy.set_scratch_size(
+        0,
+        Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes()));
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
     }
-#endif // TEAM_POLICY
 
     template <SRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
@@ -164,17 +123,18 @@ namespace ntt {
             continue;
           }
           logger::Checkpoint(
-            fmt::format("Launching currents deposit (flat fallback, no sort yet) "
-                        "for %d [%s] : %lu %f",
-                        species.index(),
-                        species.label().c_str(),
-                        species.npart(),
-                        (double)species.charge()),
+            fmt::format(
+              "Launching currents deposit (flat fallback, no sort yet) "
+              "for %d [%s] : %lu %f",
+              species.index(),
+              species.label().c_str(),
+              species.npart(),
+              (double)species.charge()),
             HERE);
           CallDepositKernel<M, SHAPE_ORDER>(species,
-                                             domain.mesh.metric,
-                                             scatter_cur,
-                                             dt);
+                                            domain.mesh.metric,
+                                            scatter_cur,
+                                            dt);
         }
         Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
       } else {
@@ -192,9 +152,9 @@ namespace ntt {
             HERE);
 
           CallDepositKernelTiled<M, SHAPE_ORDER>(species,
-                                                  domain.mesh.metric,
-                                                  domain.fields.cur,
-                                                  dt);
+                                                 domain.mesh.metric,
+                                                 domain.fields.cur,
+                                                 dt);
         }
       }
 #else
diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h
index 43f21ce47..4fe88cdf7 100644
--- a/src/global/arch/kokkos_aliases.h
+++ b/src/global/arch/kokkos_aliases.h
@@ -226,6 +226,79 @@ namespace kokkos_aliases_hidden {
 template <Dimension D>
 using range_h_t = typename kokkos_aliases_hidden::range_h_impl<D>::type;
 
+// Array aliases of arbitrary type and dimensions (up to 4)
+namespace kokkos_aliases_hidden {
+  // c++ magic
+  template <unsigned short D, typename T>
+  struct scratch_nddata_impl {
+    using type = void;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<1, T> {
+    using type = Kokkos::View<T*,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<2, T> {
+    using type = Kokkos::View<T**,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<3, T> {
+    using type = Kokkos::View<T***,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<4, T> {
+    using type = Kokkos::View<T****,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+} // namespace kokkos_aliases_hidden
+
+template <unsigned short D, typename T>
+using scratch_nddata_t = typename kokkos_aliases_hidden::scratch_nddata_impl<D, T>::type;
+
+// Defining aliases for Scratch memory ndfield
+namespace kokkos_aliases_hidden {
+  template <Dimension D, unsigned short N, typename T>
+  struct scratch_ndfield_impl {
+    using type = void;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_1D, N, T> {
+    using type = Kokkos::View<T* [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_2D, N, T> {
+    using type = Kokkos::View<T** [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_3D, N, T> {
+    using type = Kokkos::View<T*** [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+} // namespace kokkos_aliases_hidden
+
+template <Dimension D, unsigned short N, typename T>
+using scratch_ndfield_t =
+  typename kokkos_aliases_hidden::scratch_ndfield_impl<D, N, T>::type;
+
 /**
  * @brief Function template for generating 1D Kokkos range policy for particles.
  * @tparam D Dimension
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 5eb7ff2b6..79252621a 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -3,7 +3,7 @@
  * @brief Covariant algorithms for the current deposition.
  *
  * Two kernels share the same per-particle body
- * (`kernel::deposit::deposit_one_particle`):
+ * (`kernel::DepositOneParticle`):
  *   - `kernel::DepositCurrents_kernel<S, M, O>` flat (RangePolicy over particles,
  *     writes into a `Kokkos::Experimental::ScatterView`). Always available.
  *   - `kernel::DepositCurrents_kernel_tiled<S, M, O, T_TILE>` team-policy
@@ -13,7 +13,7 @@
  *
  * @implements
  *   - kernel::deposit::PrtlPack<>
- *   - kernel::deposit::deposit_one_particle<>
+ *   - kernel::DepositOneParticle<>
  *   - kernel::DepositCurrents_kernel<>
  *   - kernel::DepositCurrents_kernel_tiled<>   (TEAM_POLICY only)
  * @namespaces:
@@ -32,6 +32,7 @@
 #include "utils/error.h"
 #include "utils/numeric.h"
 
+#include "framework/containers/particles.h"
 #include "kernels/particle_shapes.hpp"
 
 #include <Kokkos_Core.hpp>
@@ -42,671 +43,645 @@
 namespace kernel {
   using namespace ntt;
 
-  namespace deposit {
-
-    /**
-     * @brief Per-particle reference pack consumed by both the flat and tiled
-     *        deposit kernels. The same set of SoA references is captured by
-     *        each kernel; bundling them here keeps the helper's argument
-     *        list manageable and ensures every consumer reads the same
-     *        view aliases.
-     */
-    template <Dimension D>
-    struct PrtlPack {
-      array_t<int*>      i1, i2, i3;
-      array_t<int*>      i1_prev, i2_prev, i3_prev;
-      array_t<prtldx_t*> dx1, dx2, dx3;
-      array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
-      array_t<real_t*>   ux1, ux2, ux3;
-      array_t<real_t*>   phi;
-      array_t<real_t*>   weight;
-      array_t<short*>    tag;
-    };
+  /**
+   * @brief Per-particle deposit body, shared between the flat and tiled
+   *        kernels.
+   *
+   * The caller supplies a `deposit_at(idx..., comp, val)` callback that
+   * applies the contribution `val` to the J component `comp` at the
+   * **global** J cell index `idx...` (already includes the `N_GHOSTS`
+   * offset). The flat kernel's callback simply does
+   * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled
+   * kernel's callback translates `idx...` into per-tile scratch
+   * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this
+   * function is identical numerically and contains the only deposit math
+   * in the codebase.
+   *
+   * Dead particles return early. The callback is invoked once per cell
+   * write, with the dimension-appropriate signature:
+   *   - 1D: `deposit_at(int g_i1, int comp, real_t val)`
+   *   - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)`
+   *   - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)`
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O, typename DepositFn>
+  Inline void DepositOneParticle(prtlidx_t             p,
+                                 const ParticleArrays& prtls,
+                                 const M&              metric,
+                                 real_t                charge,
+                                 real_t                inv_dt,
+                                 DepositFn             deposit_at) {
+    static_assert(O <= 11u, "Shape function order O must be <= 11");
+    constexpr auto D = M::Dim;
 
-    /**
-     * @brief Per-particle deposit body, shared between the flat and tiled
-     *        kernels.
-     *
-     * The caller supplies a `deposit_at(idx..., comp, val)` callback that
-     * applies the contribution `val` to the J component `comp` at the
-     * **global** J cell index `idx...` (already includes the `N_GHOSTS`
-     * offset). The flat kernel's callback simply does
-     * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled
-     * kernel's callback translates `idx...` into per-tile scratch
-     * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this
-     * function is identical numerically and contains the only deposit math
-     * in the codebase.
-     *
-     * Dead particles return early. The callback is invoked once per cell
-     * write, with the dimension-appropriate signature:
-     *   - 1D: `deposit_at(int g_i1, int comp, real_t val)`
-     *   - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)`
-     *   - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)`
-     */
-    template <SimEngine::type S, MetricClass M, unsigned short O,
-              typename DepositFn>
-    Inline void deposit_one_particle(prtlidx_t       p,
-                                     const PrtlPack<M::Dim>& prtls,
-                                     const M&        metric,
-                                     real_t          charge,
-                                     real_t          inv_dt,
-                                     DepositFn       deposit_at) {
-      static_assert(O <= 11u, "Shape function order O must be <= 11");
-      constexpr auto D = M::Dim;
-
-      const auto& i1       = prtls.i1;
-      const auto& i2       = prtls.i2;
-      const auto& i3       = prtls.i3;
-      const auto& i1_prev  = prtls.i1_prev;
-      const auto& i2_prev  = prtls.i2_prev;
-      const auto& i3_prev  = prtls.i3_prev;
-      const auto& dx1      = prtls.dx1;
-      const auto& dx2      = prtls.dx2;
-      const auto& dx3      = prtls.dx3;
-      const auto& dx1_prev = prtls.dx1_prev;
-      const auto& dx2_prev = prtls.dx2_prev;
-      const auto& dx3_prev = prtls.dx3_prev;
-      const auto& ux1      = prtls.ux1;
-      const auto& ux2      = prtls.ux2;
-      const auto& ux3      = prtls.ux3;
-      const auto& phi      = prtls.phi;
-      const auto& weight   = prtls.weight;
-      const auto& tag      = prtls.tag;
-
-      if (tag(p) == ParticleTag::dead) {
-        return;
-      }
+    if (prtls.tag(p) == ParticleTag::dead) {
+      return;
+    }
 
-      // recover particle velocity to deposit in unsimulated direction
-      [[maybe_unused]] vec_t<Dim::_3D> vp { ZERO };
-      // `vp` only feeds the unsimulated-direction current in the 1D
-      // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes
-      // from the Esirkepov/zigzag charge motion and `vp` is never read,
-      // so the metric transform + 1/sqrt + NaN/Inf guard below is pure
-      // dead work there — skip it (also frees xp/inv_energy registers).
-      if constexpr (D != Dim::_3D) {
-        coord_t<M::PrtlDim> xp { ZERO };
-        if constexpr (D == Dim::_1D) {
-          xp[0] = i_di_to_Xi(i1(p), dx1(p));
-        } else if constexpr (D == Dim::_2D) {
-          if constexpr (M::PrtlDim == Dim::_3D) {
-            xp[0] = i_di_to_Xi(i1(p), dx1(p));
-            xp[1] = i_di_to_Xi(i2(p), dx2(p));
-            xp[2] = phi(p);
-          } else {
-            xp[0] = i_di_to_Xi(i1(p), dx1(p));
-            xp[1] = i_di_to_Xi(i2(p), dx2(p));
-          }
-        } else {
-          xp[0] = i_di_to_Xi(i1(p), dx1(p));
-          xp[1] = i_di_to_Xi(i2(p), dx2(p));
-          xp[2] = i_di_to_Xi(i3(p), dx3(p));
-        }
-        auto inv_energy { ZERO };
-        if constexpr (S == SimEngine::SRPIC) {
-          metric.template transform_xyz<Idx::XYZ, Idx::U>(xp,
-                                                          { ux1(p), ux2(p), ux3(p) },
-                                                          vp);
-          inv_energy = ONE / math::sqrt(ONE + NORM_SQR(ux1(p), ux2(p), ux3(p)));
+    // recover particle velocity to deposit in unsimulated direction
+    [[maybe_unused]]
+    vec_t<Dim::_3D> vp { ZERO };
+    // `vp` only feeds the unsimulated-direction current in the 1D
+    // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes
+    // from the Esirkepov/zigzag charge motion and `vp` is never read,
+    // so the metric transform + 1/sqrt + NaN/Inf guard below is pure
+    // dead work there — skip it (also frees xp/inv_energy registers).
+    if constexpr (D != Dim::_3D) {
+      coord_t<M::PrtlDim> xp { ZERO };
+      if constexpr (D == Dim::_1D) {
+        xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+      } else if constexpr (D == Dim::_2D) {
+        if constexpr (M::PrtlDim == Dim::_3D) {
+          xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+          xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
+          xp[2] = prtls.phi(p);
         } else {
-          coord_t<Dim::_2D> xp_ { ZERO };
-          xp_[0] = xp[0];
-          real_t     theta_Cd { xp[1] };
-          const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>(
-            theta_Cd) };
-          const auto small_angle { static_cast<real_t>(constant::SMALL_ANGLE_GR) };
-          const auto large_angle { static_cast<real_t>(
-            constant::PI - constant::SMALL_ANGLE_GR) };
-          if (theta_Ph < small_angle) {
-            theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle);
-          } else if (theta_Ph >= large_angle) {
-            theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle);
-          }
-          xp_[1] = theta_Cd;
-          metric.template transform<Idx::D, Idx::U>(xp_,
-                                                    { ux1(p), ux2(p), ux3(p) },
-                                                    vp);
-          inv_energy = metric.alpha(xp_) /
-                       math::sqrt(ONE + ux1(p) * vp[0] + ux2(p) * vp[1] +
-                                  ux3(p) * vp[2]);
+          xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+          xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
         }
-        if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) {
-          vp[2] = ZERO;
+      } else {
+        xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+        xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
+        xp[2] = i_di_to_Xi(prtls.i3(p), prtls.dx3(p));
+      }
+      auto inv_energy { ZERO };
+      if constexpr (S == SimEngine::SRPIC) {
+        metric.template transform_xyz<Idx::XYZ, Idx::U>(
+          xp,
+          { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) },
+          vp);
+        inv_energy = ONE / U2GAMMA(prtls.ux1(p), prtls.ux2(p), prtls.ux3(p));
+      } else {
+        coord_t<Dim::_2D> xp_ { ZERO };
+        xp_[0] = xp[0];
+        real_t     theta_Cd { xp[1] };
+        const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>(
+          theta_Cd) };
+        const auto small_angle { static_cast<real_t>(constant::SMALL_ANGLE_GR) };
+        const auto large_angle { static_cast<real_t>(
+          constant::PI - constant::SMALL_ANGLE_GR) };
+        if (theta_Ph < small_angle) {
+          theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle);
+        } else if (theta_Ph >= large_angle) {
+          theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle);
         }
-        vp[0] *= inv_energy;
-        vp[1] *= inv_energy;
-        vp[2] *= inv_energy;
+        xp_[1] = theta_Cd;
+        metric.template transform<Idx::D, Idx::U>(
+          xp_,
+          { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) },
+          vp);
+        inv_energy = metric.alpha(xp_) /
+                     math::sqrt(ONE + prtls.ux1(p) * vp[0] +
+                                prtls.ux2(p) * vp[1] + prtls.ux3(p) * vp[2]);
+      }
+      if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) {
+        vp[2] = ZERO;
       }
+      vp[0] *= inv_energy;
+      vp[1] *= inv_energy;
+      vp[2] *= inv_energy;
+    }
 
-      const real_t coeff { weight(p) * charge };
+    const real_t coeff { prtls.weight(p) * charge };
+
+    if constexpr (O == 0u) {
+      /*
+        Zig-zag deposit
+      */
+      const auto dxp_r_1 { static_cast<prtldx_t>(prtls.i1(p) == prtls.i1_prev(p)) *
+                           (prtls.dx1(p) + prtls.dx1_prev(p)) *
+                           static_cast<prtldx_t>(INV_2) };
+
+      const real_t Wx1_1 { INV_2 *
+                           (dxp_r_1 + prtls.dx1_prev(p) +
+                            static_cast<real_t>(prtls.i1(p) > prtls.i1_prev(p))) };
+      const real_t Wx1_2 { INV_2 *
+                           (prtls.dx1(p) + dxp_r_1 +
+                            static_cast<real_t>(
+                              static_cast<int>(prtls.i1(p) > prtls.i1_prev(p)) +
+                              prtls.i1_prev(p) - prtls.i1(p))) };
+      const real_t Fx1_1 { (static_cast<real_t>(prtls.i1(p) > prtls.i1_prev(p)) +
+                            dxp_r_1 - prtls.dx1_prev(p)) *
+                           coeff * inv_dt };
+      const real_t Fx1_2 { (static_cast<real_t>(
+                              prtls.i1(p) - prtls.i1_prev(p) -
+                              static_cast<int>(prtls.i1(p) > prtls.i1_prev(p))) +
+                            prtls.dx1(p) - dxp_r_1) *
+                           coeff * inv_dt };
 
-      if constexpr (O == 0u) {
-        /*
-          Zig-zag deposit
-        */
-        const auto dxp_r_1 { static_cast<prtldx_t>(i1(p) == i1_prev(p)) *
-                             (dx1(p) + dx1_prev(p)) *
+      if constexpr (D == Dim::_1D) {
+        const real_t Fx2_1 { HALF * vp[1] * coeff };
+        const real_t Fx2_2 { HALF * vp[1] * coeff };
+
+        const real_t Fx3_1 { HALF * vp[2] * coeff };
+        const real_t Fx3_2 { HALF * vp[2] * coeff };
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx1, Fx1_2);
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1));
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2));
+        deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2);
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1));
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2));
+        deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2);
+      } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
+        const auto dxp_r_2 { static_cast<prtldx_t>(prtls.i2(p) == prtls.i2_prev(p)) *
+                             (prtls.dx2(p) + prtls.dx2_prev(p)) *
                              static_cast<prtldx_t>(INV_2) };
 
-        const real_t Wx1_1 { INV_2 * (dxp_r_1 + dx1_prev(p) +
-                                      static_cast<real_t>(i1(p) > i1_prev(p))) };
-        const real_t Wx1_2 { INV_2 * (dx1(p) + dxp_r_1 +
-                                      static_cast<real_t>(
-                                        static_cast<int>(i1(p) > i1_prev(p)) +
-                                        i1_prev(p) - i1(p))) };
-        const real_t Fx1_1 { (static_cast<real_t>(i1(p) > i1_prev(p)) +
-                              dxp_r_1 - dx1_prev(p)) *
-                             coeff * inv_dt };
-        const real_t Fx1_2 { (static_cast<real_t>(
-                                i1(p) - i1_prev(p) -
-                                static_cast<int>(i1(p) > i1_prev(p))) +
-                              dx1(p) - dxp_r_1) *
+        const real_t Wx2_1 { INV_2 * (dxp_r_2 + prtls.dx2_prev(p) +
+                                      static_cast<real_t>(prtls.i2(p) >
+                                                          prtls.i2_prev(p))) };
+        const real_t Wx2_2 { INV_2 *
+                             (prtls.dx2(p) + dxp_r_2 +
+                              static_cast<real_t>(
+                                static_cast<int>(prtls.i2(p) > prtls.i2_prev(p)) +
+                                prtls.i2_prev(p) - prtls.i2(p))) };
+        const real_t Fx2_1 { (static_cast<real_t>(prtls.i2(p) > prtls.i2_prev(p)) +
+                              dxp_r_2 - prtls.dx2_prev(p)) *
                              coeff * inv_dt };
-
-        if constexpr (D == Dim::_1D) {
-          const real_t Fx2_1 { HALF * vp[1] * coeff };
-          const real_t Fx2_2 { HALF * vp[1] * coeff };
-
+        const real_t Fx2_2 {
+          (static_cast<real_t>(prtls.i2(p) - prtls.i2_prev(p) -
+                               static_cast<int>(prtls.i2(p) > prtls.i2_prev(p))) +
+           prtls.dx2(p) - dxp_r_2) *
+          coeff * inv_dt
+        };
+
+        if constexpr (D == Dim::_2D) {
           const real_t Fx3_1 { HALF * vp[2] * coeff };
           const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1);
-          deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2);
-
-          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1));
-          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1);
-          deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2));
-          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2);
-
-          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1));
-          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1);
-          deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2));
-          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2);
-        } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
-          const auto dxp_r_2 { static_cast<prtldx_t>(i2(p) == i2_prev(p)) *
-                               (dx2(p) + dx2_prev(p)) *
-                               static_cast<prtldx_t>(INV_2) };
-
-          const real_t Wx2_1 { INV_2 * (dxp_r_2 + dx2_prev(p) +
-                                        static_cast<real_t>(i2(p) > i2_prev(p))) };
-          const real_t Wx2_2 { INV_2 * (dx2(p) + dxp_r_2 +
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * Wx2_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * Wx2_2);
+        } else {
+          const auto dxp_r_3 {
+            static_cast<prtldx_t>(prtls.i3(p) == prtls.i3_prev(p)) *
+            (prtls.dx3(p) + prtls.dx3_prev(p)) * static_cast<prtldx_t>(INV_2)
+          };
+          const real_t Wx3_1 { INV_2 * (dxp_r_3 + prtls.dx3_prev(p) +
                                         static_cast<real_t>(
-                                          static_cast<int>(i2(p) > i2_prev(p)) +
-                                          i2_prev(p) - i2(p))) };
-          const real_t Fx2_1 { (static_cast<real_t>(i2(p) > i2_prev(p)) +
-                                dxp_r_2 - dx2_prev(p)) *
-                               coeff * inv_dt };
-          const real_t Fx2_2 { (static_cast<real_t>(
-                                  i2(p) - i2_prev(p) -
-                                  static_cast<int>(i2(p) > i2_prev(p))) +
-                                dx2(p) - dxp_r_2) *
+                                          prtls.i3(p) > prtls.i3_prev(p))) };
+          const real_t Wx3_2 {
+            INV_2 *
+            (prtls.dx3(p) + dxp_r_3 +
+             static_cast<real_t>(static_cast<int>(prtls.i3(p) > prtls.i3_prev(p)) +
+                                 prtls.i3_prev(p) - prtls.i3(p)))
+          };
+          const real_t Fx3_1 { (static_cast<real_t>(prtls.i3(p) > prtls.i3_prev(p)) +
+                                dxp_r_3 - prtls.dx3_prev(p)) *
                                coeff * inv_dt };
+          const real_t Fx3_2 {
+            (static_cast<real_t>(prtls.i3(p) - prtls.i3_prev(p) -
+                                 static_cast<int>(prtls.i3(p) > prtls.i3_prev(p))) +
+             prtls.dx3(p) - dxp_r_3) *
+            coeff * inv_dt
+          };
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1 * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1) * Wx3_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1 * Wx3_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2 * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2) * Wx3_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2 * Wx3_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1 * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1) * Wx3_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1 * Wx3_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2 * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2) * Wx3_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2 * Wx3_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * Wx2_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * Wx2_2);
+        }
+      }
+    } else if constexpr ((O >= 1u) and (O <= 11u)) {
+
+      // shape function in dim1 -> always required
+      real_t iS_x1[O + 2], fS_x1[O + 2];
+      // indices of the shape function
+      int    i1_min, i1_max;
+
+      // call shape function
+      prtl_shape::for_deposit<O>(prtls.i1_prev(p),
+                                 static_cast<real_t>(prtls.dx1_prev(p)),
+                                 prtls.i1(p),
+                                 static_cast<real_t>(prtls.dx1(p)),
+                                 i1_min,
+                                 i1_max,
+                                 iS_x1,
+                                 fS_x1);
 
-          if constexpr (D == Dim::_2D) {
-            const real_t Fx3_1 { HALF * vp[2] * coeff };
-            const real_t Fx3_2 { HALF * vp[2] * coeff };
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_1 * (ONE - Wx2_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_1 * Wx2_1);
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_2 * (ONE - Wx2_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_2 * Wx2_2);
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_1 * (ONE - Wx1_1));
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_1 * Wx1_1);
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_2 * (ONE - Wx1_2));
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_2 * Wx1_2);
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       cur::jx3,
-                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       cur::jx3,
-                       Fx3_1 * Wx1_1 * Wx2_1);
-
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS + 1,
-                       cur::jx3,
-                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS + 1,
-                       cur::jx3,
-                       Fx3_2 * Wx1_2 * Wx2_2);
-          } else {
-            const auto   dxp_r_3 { static_cast<prtldx_t>(i3(p) == i3_prev(p)) *
-                                 (dx3(p) + dx3_prev(p)) *
-                                 static_cast<prtldx_t>(INV_2) };
-            const real_t Wx3_1 { INV_2 * (dxp_r_3 + dx3_prev(p) +
-                                          static_cast<real_t>(i3(p) > i3_prev(p))) };
-            const real_t Wx3_2 { INV_2 * (dx3(p) + dxp_r_3 +
-                                          static_cast<real_t>(
-                                            static_cast<int>(i3(p) > i3_prev(p)) +
-                                            i3_prev(p) - i3(p))) };
-            const real_t Fx3_1 { (static_cast<real_t>(i3(p) > i3_prev(p)) +
-                                  dxp_r_3 - dx3_prev(p)) *
-                                 coeff * inv_dt };
-            const real_t Fx3_2 { (static_cast<real_t>(
-                                    i3(p) - i3_prev(p) -
-                                    static_cast<int>(i3(p) > i3_prev(p))) +
-                                  dx3(p) - dxp_r_3) *
-                                 coeff * inv_dt };
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_1 * Wx2_1 * (ONE - Wx3_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_1 * (ONE - Wx2_1) * Wx3_1);
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       i3_prev(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_1 * Wx2_1 * Wx3_1);
-
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS + 1,
-                       i3(p) + N_GHOSTS,
-                       cur::jx1,
-                       Fx1_2 * Wx2_2 * (ONE - Wx3_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_2 * (ONE - Wx2_2) * Wx3_2);
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS + 1,
-                       i3(p) + N_GHOSTS + 1,
-                       cur::jx1,
-                       Fx1_2 * Wx2_2 * Wx3_2);
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1));
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_1 * Wx1_1 * (ONE - Wx3_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS + 1,
-                       cur::jx2,
-                       Fx2_1 * (ONE - Wx1_1) * Wx3_1);
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS + 1,
-                       cur::jx2,
-                       Fx2_1 * Wx1_1 * Wx3_1);
-
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2));
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS,
-                       cur::jx2,
-                       Fx2_2 * Wx1_2 * (ONE - Wx3_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS + 1,
-                       cur::jx2,
-                       Fx2_2 * (ONE - Wx1_2) * Wx3_2);
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS + 1,
-                       cur::jx2,
-                       Fx2_2 * Wx1_2 * Wx3_2);
-
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
-            deposit_at(i1_prev(p) + N_GHOSTS,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
-            deposit_at(i1_prev(p) + N_GHOSTS + 1,
-                       i2_prev(p) + N_GHOSTS + 1,
-                       i3_prev(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_1 * Wx1_1 * Wx2_1);
-
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS,
-                       i3(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
-            deposit_at(i1(p) + N_GHOSTS,
-                       i2(p) + N_GHOSTS + 1,
-                       i3(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
-            deposit_at(i1(p) + N_GHOSTS + 1,
-                       i2(p) + N_GHOSTS + 1,
-                       i3(p) + N_GHOSTS,
-                       cur::jx3,
-                       Fx3_2 * Wx1_2 * Wx2_2);
+      if constexpr (D == Dim::_1D) {
+        // (1D): fused Esirkepov, no [O+2] temporaries.
+        //   jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i'])
+        //          = -Qdx1dt * P1[i]                  (Eq. 38, 1D)
+        //   Wx23[i] = HALF * (fS_x1[i] + iS_x1[i])     (computed inline)
+        const real_t Qdx1dt = coeff * inv_dt;
+        const real_t QVx2   = coeff * vp[1];
+        const real_t QVx3   = coeff * vp[2];
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+
+        // Current update — fused over the union line so the J cell
+        // stays L1-resident across the 3 component atomic_adds.
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                += fS_x1[i] - iS_x1[i];
+          const int    gi    = i1_min + i;
+          const real_t Wx23  = HALF * (fS_x1[i] + iS_x1[i]);
+          if (i < di_x1) {
+            deposit_at(gi, cur::jx1, -Qdx1dt * P1);
           }
+          deposit_at(gi, cur::jx2, QVx2 * Wx23);
+          deposit_at(gi, cur::jx3, QVx3 * Wx23);
         }
-      } else if constexpr ((O >= 1u) and (O <= 11u)) {
+
+      } else if constexpr (D == Dim::_2D) {
 
         // shape function in dim1 -> always required
-        real_t iS_x1[O + 2], fS_x1[O + 2];
+        real_t iS_x2[O + 2], fS_x2[O + 2];
         // indices of the shape function
-        int    i1_min, i1_max;
+        int    i2_min, i2_max;
 
         // call shape function
-        prtl_shape::for_deposit<O>(i1_prev(p),
-                                   static_cast<real_t>(dx1_prev(p)),
-                                   i1(p),
-                                   static_cast<real_t>(dx1(p)),
-                                   i1_min,
-                                   i1_max,
-                                   iS_x1,
-                                   fS_x1);
-
-        if constexpr (D == Dim::_1D) {
-          // (1D): fused Esirkepov, no [O+2] temporaries.
-          //   jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i'])
-          //          = -Qdx1dt * P1[i]                  (Eq. 38, 1D)
-          //   Wx23[i] = HALF * (fS_x1[i] + iS_x1[i])     (computed inline)
-          const real_t Qdx1dt = coeff * inv_dt;
-          const real_t QVx2   = coeff * vp[1];
-          const real_t QVx3   = coeff * vp[2];
-
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-
-          // Current update — fused over the union line so the J cell
-          // stays L1-resident across the 3 component atomic_adds.
-          real_t P1 = ZERO;
-          for (int i = 0; i <= di_x1; ++i) {
-            P1 += fS_x1[i] - iS_x1[i];
-            const int    gi   = i1_min + i;
-            const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]);
+        prtl_shape::for_deposit<O>(prtls.i2_prev(p),
+                                   static_cast<real_t>(prtls.dx2_prev(p)),
+                                   prtls.i2(p),
+                                   static_cast<real_t>(prtls.dx2(p)),
+                                   i2_min,
+                                   i2_max,
+                                   iS_x2,
+                                   fS_x2);
+
+        /**
+         * (2D): fused Esirkepov, no [O+2]^2 temporaries.
+         *
+         * Esirkepov 2001 Eq. 38 (simplified) is separable: with
+         * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and
+         * P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']),
+         *   jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j])
+         *   jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i])
+         *   Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i])
+         *                     + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) )
+         * with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the
+         * old explicit Wx/jx tensors up to FP reassociation;
+         * charge-conserving by construction. Prefix sums carried as
+         * running scalars, so the only per-thread state is the
+         * existing 1D shape arrays.
+         */
+        const real_t QVx3 = coeff * vp[2];
+        // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt)
+        const real_t cf   = -(coeff * inv_dt) * HALF;
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i2_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+        i2_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+        const int di_x2 = i2_max - i2_min;
+
+        // Current update — fused over the union plane so the J cell
+        // line stays L1-resident across the 3 component atomic_adds.
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                += fS_x1[i] - iS_x1[i];
+          const int    gi    = i1_min + i;
+          const real_t iSx1  = iS_x1[i];
+          const real_t fSx1  = fS_x1[i];
+          const real_t A1    = fSx1 + iSx1; // jx2 cross-factor
+          real_t       P2    = ZERO;
+          for (int j = 0; j <= di_x2; ++j) {
+            P2                += fS_x2[j] - iS_x2[j];
+            const int    gj    = i2_min + j;
+            const real_t iSx2  = iS_x2[j];
+            const real_t fSx2  = fS_x2[j];
             if (i < di_x1) {
-              deposit_at(gi, cur::jx1, -Qdx1dt * P1);
+              deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2));
             }
-            deposit_at(gi, cur::jx2, QVx2 * Wx23);
-            deposit_at(gi, cur::jx3, QVx3 * Wx23);
+            if (j < di_x2) {
+              deposit_at(gi, gj, cur::jx2, cf * P2 * A1);
+            }
+            const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) +
+                                        iSx2 * (HALF * fSx1 + iSx1));
+            deposit_at(gi, gj, cur::jx3, QVx3 * Wx3);
           }
+        }
+
+      } else if constexpr (D == Dim::_3D) {
+        // shape function in dim2
+        real_t iS_x2[O + 2], fS_x2[O + 2];
+        // indices of the shape function
+        int    i2_min, i2_max;
+        // call shape function
+        prtl_shape::for_deposit<O>(prtls.i2_prev(p),
+                                   static_cast<real_t>(prtls.dx2_prev(p)),
+                                   prtls.i2(p),
+                                   static_cast<real_t>(prtls.dx2(p)),
+                                   i2_min,
+                                   i2_max,
+                                   iS_x2,
+                                   fS_x2);
+
+        // shape function in dim3
+        real_t iS_x3[O + 2], fS_x3[O + 2];
+        // indices of the shape function
+        int    i3_min, i3_max;
 
-        } else if constexpr (D == Dim::_2D) {
-
-          // shape function in dim1 -> always required
-          real_t iS_x2[O + 2], fS_x2[O + 2];
-          // indices of the shape function
-          int    i2_min, i2_max;
-
-          // call shape function
-          prtl_shape::for_deposit<O>(i2_prev(p),
-                                     static_cast<real_t>(dx2_prev(p)),
-                                     i2(p),
-                                     static_cast<real_t>(dx2(p)),
-                                     i2_min,
-                                     i2_max,
-                                     iS_x2,
-                                     fS_x2);
-
-          // (2D): fused Esirkepov, no [O+2]^2 temporaries.
-          //
-          // Esirkepov 2001 Eq. 38 (simplified) is separable: with
-          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and
-          // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']),
-          //   jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j])
-          //   jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i])
-          //   Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i])
-          //                     + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) )
-          // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the
-          // old explicit Wx/jx tensors up to FP reassociation;
-          // charge-conserving by construction. Prefix sums carried as
-          // running scalars, so the only per-thread state is the 
-          // existing 1D shape arrays.
-          const real_t QVx3 = coeff * vp[2];
-          // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt)
-          const real_t cf = -(coeff * inv_dt) * HALF;
-
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i2_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-          i2_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-          const int di_x2 = i2_max - i2_min;
-
-          // Current update — fused over the union plane so the J cell
-          // line stays L1-resident across the 3 component atomic_adds.
-          real_t P1 = ZERO;
-          for (int i = 0; i <= di_x1; ++i) {
-            P1 += fS_x1[i] - iS_x1[i];
-            const int    gi   = i1_min + i;
-            const real_t iSx1 = iS_x1[i];
-            const real_t fSx1 = fS_x1[i];
-            const real_t A1   = fSx1 + iSx1;     // jx2 cross-factor
-            real_t       P2   = ZERO;
-            for (int j = 0; j <= di_x2; ++j) {
-              P2 += fS_x2[j] - iS_x2[j];
-              const int    gj   = i2_min + j;
-              const real_t iSx2 = iS_x2[j];
-              const real_t fSx2 = fS_x2[j];
+        // call shape function
+        prtl_shape::for_deposit<O>(prtls.i3_prev(p),
+                                   static_cast<real_t>(prtls.dx3_prev(p)),
+                                   prtls.i3(p),
+                                   static_cast<real_t>(prtls.dx3(p)),
+                                   i3_min,
+                                   i3_max,
+                                   iS_x3,
+                                   fS_x3);
+
+        /**
+         * fused Esirkepov, no (O+2)^3 temporaries.
+         *
+         * The Esirkepov 3D current (2001, Eq. 31) is separable: with
+         * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise
+         * P2[j], P3[k]) the cumulative-sum currents collapse to
+         *
+         *   jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k)
+         *   jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k)
+         *   jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j)
+         *
+         * with the 1D-shape cross-factors
+         *
+         *   G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k]
+         *            + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k])
+         *   H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k]
+         *            + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k])
+         *   F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j]
+         *            + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i])
+         *
+         * and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the
+         * same value as the old explicit Wx/jx tensors up to
+         * floating-point reassociation: charge-conserving by
+         * construction (the Esirkepov decomposition is exact). The
+         * prefix sums are carried as running scalars in the deposit
+         * loop, so the only per-thread state is the existing 1D shape
+         * arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs
+         * and no private-memory tensor traffic).
+         */
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i2_min += N_GHOSTS;
+        i3_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+        i2_max += N_GHOSTS;
+        i3_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+        const int di_x2 = i2_max - i2_min;
+        const int di_x3 = i3_max - i3_min;
+
+        // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt)
+        const real_t cf = -(coeff * inv_dt) * THIRD;
+
+        /**
+         * Current update — fused over the union cube so the J cell
+         * line stays L1-resident across the 3 component atomic_adds.
+         * Per-cell branches on (i<di_x1), (j<di_x2), (k<di_x3) skip
+         * the trailing slab where each component's stencil ends one
+         * cell short of the union; particles within a tile share
+         * di_x* so the branch predicates cleanly.
+         */
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                 += fS_x1[i] - iS_x1[i];
+          const int    gi     = i1_min + i;
+          const real_t iSx1i  = iS_x1[i];
+          const real_t fSx1i  = fS_x1[i];
+          real_t       P2     = ZERO;
+          for (int j = 0; j <= di_x2; ++j) {
+            P2                 += fS_x2[j] - iS_x2[j];
+            const int    gj     = i2_min + j;
+            const real_t iSx2j  = iS_x2[j];
+            const real_t fSx2j  = fS_x2[j];
+            const real_t F12    = iSx1i * iSx2j + fSx1i * fSx2j +
+                               HALF * (iSx1i * fSx2j + iSx2j * fSx1i);
+            real_t P3 = ZERO;
+            for (int k = 0; k <= di_x3; ++k) {
+              P3                 += fS_x3[k] - iS_x3[k];
+              const int    gk     = i3_min + k;
+              const real_t iSx3k  = iS_x3[k];
+              const real_t fSx3k  = fS_x3[k];
               if (i < di_x1) {
-                deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2));
+                const real_t G23 = iSx2j * iSx3k + fSx2j * fSx3k +
+                                   HALF * (iSx3k * fSx2j + iSx2j * fSx3k);
+                deposit_at(gi, gj, gk, cur::jx1, cf * P1 * G23);
               }
               if (j < di_x2) {
-                deposit_at(gi, gj, cur::jx2, cf * P2 * A1);
+                const real_t H13 = iSx1i * iSx3k + fSx1i * fSx3k +
+                                   HALF * (iSx3k * fSx1i + iSx1i * fSx3k);
+                deposit_at(gi, gj, gk, cur::jx2, cf * P2 * H13);
               }
-              const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) +
-                                          iSx2 * (HALF * fSx1 + iSx1));
-              deposit_at(gi, gj, cur::jx3, QVx3 * Wx3);
-            }
-          }
-
-        } else if constexpr (D == Dim::_3D) {
-          // shape function in dim2
-          real_t iS_x2[O + 2], fS_x2[O + 2];
-          // indices of the shape function
-          int    i2_min, i2_max;
-          // call shape function
-          prtl_shape::for_deposit<O>(i2_prev(p),
-                                     static_cast<real_t>(dx2_prev(p)),
-                                     i2(p),
-                                     static_cast<real_t>(dx2(p)),
-                                     i2_min,
-                                     i2_max,
-                                     iS_x2,
-                                     fS_x2);
-
-          // shape function in dim3
-          real_t iS_x3[O + 2], fS_x3[O + 2];
-          // indices of the shape function
-          int    i3_min, i3_max;
-
-          // call shape function
-          prtl_shape::for_deposit<O>(i3_prev(p),
-                                     static_cast<real_t>(dx3_prev(p)),
-                                     i3(p),
-                                     static_cast<real_t>(dx3(p)),
-                                     i3_min,
-                                     i3_max,
-                                     iS_x3,
-                                     fS_x3);
-
-          // fused Esirkepov, no (O+2)^3 temporaries.
-          //
-          // The Esirkepov 3D current (2001, Eq. 31) is separable: with
-          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise
-          // P2[j], P3[k]) the cumulative-sum currents collapse to
-          //
-          //   jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k)
-          //   jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k)
-          //   jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j)
-          //
-          // with the 1D-shape cross-factors
-          //
-          //   G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k]
-          //            + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k])
-          //   H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k]
-          //            + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k])
-          //   F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j]
-          //            + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i])
-          //
-          // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the
-          // same value as the old explicit Wx/jx tensors up to
-          // floating-point reassociation: charge-conserving by
-          // construction (the Esirkepov decomposition is exact). The
-          // prefix sums are carried as running scalars in the deposit
-          // loop, so the only per-thread state is the existing 1D shape
-          // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs
-          // and no private-memory tensor traffic).
-
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i2_min += N_GHOSTS;
-          i3_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-          i2_max += N_GHOSTS;
-          i3_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-          const int di_x2 = i2_max - i2_min;
-          const int di_x3 = i3_max - i3_min;
-
-          // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt)
-          const real_t cf = -(coeff * inv_dt) * THIRD;
-
-          /*
-            Current update — fused over the union cube so the J cell
-            line stays L1-resident across the 3 component atomic_adds.
-            Per-cell branches on (i<di_x1), (j<di_x2), (k<di_x3) skip
-            the trailing slab where each component's stencil ends one
-            cell short of the union; particles within a tile share
-            di_x* so the branch predicates cleanly.
-          */
-          real_t P1 = ZERO;
-          for (int i = 0; i <= di_x1; ++i) {
-            P1 += fS_x1[i] - iS_x1[i];
-            const int    gi    = i1_min + i;
-            const real_t iSx1i = iS_x1[i];
-            const real_t fSx1i = fS_x1[i];
-            real_t       P2    = ZERO;
-            for (int j = 0; j <= di_x2; ++j) {
-              P2 += fS_x2[j] - iS_x2[j];
-              const int    gj    = i2_min + j;
-              const real_t iSx2j = iS_x2[j];
-              const real_t fSx2j = fS_x2[j];
-              const real_t F12   = iSx1i * iSx2j + fSx1i * fSx2j +
-                                 HALF * (iSx1i * fSx2j + iSx2j * fSx1i);
-              real_t P3 = ZERO;
-              for (int k = 0; k <= di_x3; ++k) {
-                P3 += fS_x3[k] - iS_x3[k];
-                const int    gk    = i3_min + k;
-                const real_t iSx3k = iS_x3[k];
-                const real_t fSx3k = fS_x3[k];
-                if (i < di_x1) {
-                  const real_t G23 = iSx2j * iSx3k + fSx2j * fSx3k +
-                                     HALF * (iSx3k * fSx2j + iSx2j * fSx3k);
-                  deposit_at(gi, gj, gk, cur::jx1, cf * P1 * G23);
-                }
-                if (j < di_x2) {
-                  const real_t H13 = iSx1i * iSx3k + fSx1i * fSx3k +
-                                     HALF * (iSx3k * fSx1i + iSx1i * fSx3k);
-                  deposit_at(gi, gj, gk, cur::jx2, cf * P2 * H13);
-                }
-                if (k < di_x3) {
-                  deposit_at(gi, gj, gk, cur::jx3, cf * P3 * F12);
-                }
+              if (k < di_x3) {
+                deposit_at(gi, gj, gk, cur::jx3, cf * P3 * F12);
               }
             }
           }
+        }
 
-        } // dim
-      } else { // order
-        raise::KernelError(
-          HERE,
-          "Unsupported interpolation order. O > 11 not supported. Seriously. "
-          "What are you even doing here? Entity already goes to 11!");
-      }
+      } // dim
+    } else { // order
+      raise::KernelError(
+        HERE,
+        "Unsupported interpolation order. O > 11 not supported. Seriously. "
+        "What are you even doing here? Entity already goes to 11!");
     }
-
-  } // namespace deposit
+  }
 
   /**
    * @brief Flat current-deposition kernel.
@@ -721,38 +696,19 @@ namespace kernel {
     static_assert(O <= 11u, "Shape function order O must be <= 11");
     static constexpr auto D = M::Dim;
 
-    scatter_ndfield_t<D, 3>     J;
-    deposit::PrtlPack<D>        prtls;
-    const M                     metric;
-    const real_t                charge, inv_dt;
+    scatter_ndfield_t<D, 3> J;
+    const ParticleArrays    prtls;
+    const M                 metric;
+    const real_t            charge, inv_dt;
 
   public:
     DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
-                           const array_t<int*>&           i1,
-                           const array_t<int*>&           i2,
-                           const array_t<int*>&           i3,
-                           const array_t<int*>&           i1_prev,
-                           const array_t<int*>&           i2_prev,
-                           const array_t<int*>&           i3_prev,
-                           const array_t<prtldx_t*>&      dx1,
-                           const array_t<prtldx_t*>&      dx2,
-                           const array_t<prtldx_t*>&      dx3,
-                           const array_t<prtldx_t*>&      dx1_prev,
-                           const array_t<prtldx_t*>&      dx2_prev,
-                           const array_t<prtldx_t*>&      dx3_prev,
-                           const array_t<real_t*>&        ux1,
-                           const array_t<real_t*>&        ux2,
-                           const array_t<real_t*>&        ux3,
-                           const array_t<real_t*>&        phi,
-                           const array_t<real_t*>&        weight,
-                           const array_t<short*>&         tag,
+                           const ParticleArrays&          prtls,
                            const M&                       metric,
                            real_t                         charge,
                            const real_t                   dt)
       : J { scatter_cur }
-      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
-                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
-                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , prtls { prtls }
       , metric { metric }
       , charge { charge }
       , inv_dt { ONE / dt } {
@@ -765,27 +721,25 @@ namespace kernel {
     Inline auto operator()(prtlidx_t p) const -> void {
       auto J_acc = J.access();
       if constexpr (D == Dim::_1D) {
-        deposit::deposit_one_particle<S, M, O>(
-          p,
-          prtls,
-          metric,
-          charge,
-          inv_dt,
-          [&](int g_i1, int comp, real_t v) {
-            J_acc(g_i1, comp) += v;
-          });
+        DepositOneParticle<S, M, O>(p,
+                                    prtls,
+                                    metric,
+                                    charge,
+                                    inv_dt,
+                                    [&](int g_i1, int comp, real_t v) {
+                                      J_acc(g_i1, comp) += v;
+                                    });
       } else if constexpr (D == Dim::_2D) {
-        deposit::deposit_one_particle<S, M, O>(
-          p,
-          prtls,
-          metric,
-          charge,
-          inv_dt,
-          [&](int g_i1, int g_i2, int comp, real_t v) {
-            J_acc(g_i1, g_i2, comp) += v;
-          });
+        DepositOneParticle<S, M, O>(p,
+                                    prtls,
+                                    metric,
+                                    charge,
+                                    inv_dt,
+                                    [&](int g_i1, int g_i2, int comp, real_t v) {
+                                      J_acc(g_i1, g_i2, comp) += v;
+                                    });
       } else if constexpr (D == Dim::_3D) {
-        deposit::deposit_one_particle<S, M, O>(
+        DepositOneParticle<S, M, O>(
           p,
           prtls,
           metric,
@@ -798,8 +752,6 @@ namespace kernel {
     }
   };
 
-#if defined(TEAM_POLICY)
-
   /**
    * @brief Tiled current-deposition kernel.
    *
@@ -851,107 +803,74 @@ namespace kernel {
    * per write. Sorting less often than every step therefore costs
    * escape-valve traffic, never accuracy.
    */
-  template <SimEngine::type S, MetricClass M, unsigned short O,
-            unsigned short T_TILE>
-  class DepositCurrents_kernel_tiled {
+  template <SimEngine::type S, MetricClass M, unsigned short O, unsigned short T_TILE>
+  class DepositCurrentsTiled_kernel {
     static_assert(O <= 11u, "Shape order O must be <= 11");
     static_assert(T_TILE > 0u, "T_TILE must be positive");
     static constexpr auto D = M::Dim;
 
-    // Per-side scratch halo, derived from first principles.
-    //
-    //   total halo = stencil_reach(O) + drift_between_sort_and_deposit
-    //
-    // stencil_reach(O) — maximum cells the deposit writes ABOVE
-    // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2:
-    //   - O == 0 (zigzag):  writes {i_prev, i_prev+1, i, i+1}  ⇒ +2
-    //   - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide
-    //     array but only O+1 entries are non-zero, and the union
-    //     window satisfies `i_max - i_min <= O+1` (see
-    //     particle_shapes.hpp::for_deposit). The genuine one-sided
-    //     reach above min(i, i_prev) is therefore O, not O+1 — the
-    //     old `O+1` carried one extra cell of conservative padding
-    //     on top of the already-conservative drift term below.
-    //
-    // drift — sort runs at end-of-step (see srpic.hpp), so a particle
-    // sees exactly one pusher step before the *next* step's deposit
-    // when sorted every step (the common case). DRIFT is therefore a
-    // fixed constant of 1, NOT a compile-time function of the runtime
-    // sort cadence. Sizing the halo for the common case (rather than a
-    // worst-case sort interval) is what keeps the scratch small enough
-    // for good occupancy; a species sorted less often than
-    // every step just drifts past the halo and takes the global-J
-    // escape valve more often — correct, only slower (see the class
-    // doc-comment for why this is charge-conserving).
-    //
-    static constexpr int STENCIL_REACH = (O == 0u)
-                                           ? 2
-                                           : static_cast<int>(O);
-    static constexpr int DRIFT = 1;
-    static constexpr int HALO  = STENCIL_REACH + DRIFT;
-    static constexpr int TE    = static_cast<int>(T_TILE) + 2 * HALO;
-
-    using exec_space   = Kokkos::DefaultExecutionSpace;
-    using team_policy  = Kokkos::TeamPolicy<exec_space>;
-    using member_t     = typename team_policy::member_type;
-    using scratch_mem  = typename exec_space::scratch_memory_space;
-
-    // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components)
-    // is fixed by a runtime extent so we don't need a separate dimension
-    // template per component count.
-    using scratch_1d_t = Kokkos::View<real_t**,
-                                      scratch_mem,
-                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-    using scratch_2d_t = Kokkos::View<real_t***,
-                                      scratch_mem,
-                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-    using scratch_3d_t = Kokkos::View<real_t****,
-                                      scratch_mem,
-                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-
-    ndfield_t<D, 3>           J;
-    deposit::PrtlPack<D>      prtls;
-    const M                   metric;
-    const real_t              charge, inv_dt;
+    /**
+     * Per-side scratch halo, derived from first principles.
+     *
+     *   total halo = stencil_reach(O) + drift_between_sort_and_deposit
+     *
+     * stencil_reach(O) — maximum cells the deposit writes ABOVE
+     * min(i, i_prev) under CFL |v * dt/dx| <= 1/2:
+     *   - O == 0 (zigzag):  writes { i_prev, i_prev+1, i, i+1 } => +2
+     *   - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide
+     *     array but only O+1 entries are non-zero, and the union
+     *     window satisfies `i_max - i_min <= O+1` (see
+     *     particle_shapes.hpp::for_deposit). The genuine one-sided
+     *     reach above min(i, i_prev) is therefore O, not O+1 — the
+     *     old `O+1` carried one extra cell of conservative padding
+     *     on top of the already-conservative drift term below.
+     *
+     * drift — sort runs at end-of-step (see srpic.hpp), so a particle
+     * sees exactly one pusher step before the *next* step's deposit
+     * when sorted every step (the common case). DRIFT is therefore a
+     * fixed constant of 1, NOT a compile-time function of the runtime
+     * sort cadence. Sizing the halo for the common case (rather than a
+     * worst-case sort interval) is what keeps the scratch small enough
+     * for good occupancy; a species sorted less often than
+     * every step just drifts past the halo and takes the global-J
+     * escape valve more often — correct, only slower (see the class
+     * doc-comment for why this is charge-conserving).
+     */
+    static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast<int>(O);
+    static constexpr int DRIFT         = 1;
+    static constexpr int HALO          = STENCIL_REACH + DRIFT;
+    static constexpr int TE            = static_cast<int>(T_TILE) + 2 * HALO;
+
+    using exec_space  = Kokkos::DefaultExecutionSpace;
+    using team_policy = Kokkos::TeamPolicy<exec_space>;
+    using member_t    = typename team_policy::member_type;
+
+    ndfield_t<D, 3> J;
+    ParticleArrays  prtls;
+    const M         metric;
+    const real_t    charge, inv_dt;
 
     // Tile metadata produced by SortSpatially.
-    array_t<npart_t*>         tile_offsets;
-    ncells_t                  ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
-    ncells_t                  total_tiles { 0u };
+    array_t<npart_t*> tile_offsets;
+    ncells_t          ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
+    ncells_t          total_tiles { 0u };
 
-    // J's full storage extent including all ghost cells. Used to clip
-    // the cooperative flush so that a partial tile at the high end of
-    // the domain does not over-write past the J view.
-    int                       j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 };
+    /**
+     * J's full storage extent including all ghost cells. Used to clip
+     * the cooperative flush so that a partial tile at the high end of
+     * the domain does not over-write past the J view.
+     */
+    int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 };
 
   public:
-    DepositCurrents_kernel_tiled(const ndfield_t<D, 3>&    cur,
-                                 const array_t<int*>&      i1,
-                                 const array_t<int*>&      i2,
-                                 const array_t<int*>&      i3,
-                                 const array_t<int*>&      i1_prev,
-                                 const array_t<int*>&      i2_prev,
-                                 const array_t<int*>&      i3_prev,
-                                 const array_t<prtldx_t*>& dx1,
-                                 const array_t<prtldx_t*>& dx2,
-                                 const array_t<prtldx_t*>& dx3,
-                                 const array_t<prtldx_t*>& dx1_prev,
-                                 const array_t<prtldx_t*>& dx2_prev,
-                                 const array_t<prtldx_t*>& dx3_prev,
-                                 const array_t<real_t*>&   ux1,
-                                 const array_t<real_t*>&   ux2,
-                                 const array_t<real_t*>&   ux3,
-                                 const array_t<real_t*>&   phi,
-                                 const array_t<real_t*>&   weight,
-                                 const array_t<short*>&    tag,
-                                 const M&                  metric,
-                                 real_t                    charge,
-                                 const real_t              dt,
-                                 const TileLayout<D>&      layout)
+    DepositCurrentsTiled_kernel(const ndfield_t<D, 3>& cur,
+                                const ParticleArrays&  prtls,
+                                const M&               metric,
+                                real_t                 charge,
+                                real_t                 dt,
+                                const TileLayout<D>&   layout)
       : J { cur }
-      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
-                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
-                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , prtls { prtls }
       , metric { metric }
       , charge { charge }
       , inv_dt { ONE / dt }
@@ -964,16 +883,18 @@ namespace kernel {
         layout.tile_size != T_TILE,
         "Tiled deposit launched with mismatched T_TILE and runtime tile_size",
         HERE);
-      // Note: HALO is allowed to exceed N_GHOSTS. The cooperative
-      // scratch→J flush and the per-particle escape valve both bounds-clip
-      // their writes against `j_ext*` so writes that would land past J's
-      // ghost stripe are silently dropped (they only ever come from a
-      // particle whose stencil reaches into the domain ghost region, where
-      // CommunicateFields will re-supply the contribution).
-      if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) {
+      /**
+       * @note: HALO is allowed to exceed N_GHOSTS. The cooperative
+       * scratch→J flush and the per-particle escape valve both bounds-clip
+       * their writes against `j_ext*` so writes that would land past J's
+       * ghost stripe are silently dropped (they only ever come from a
+       * particle whose stencil reaches into the domain ghost region, where
+       * CommunicateFields will re-supply the contribution).
+       */
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
         j_ext1 = static_cast<int>(cur.extent(0));
       }
-      if constexpr (D == Dim::_2D || D == Dim::_3D) {
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
         j_ext2 = static_cast<int>(cur.extent(1));
       }
       if constexpr (D == Dim::_3D) {
@@ -985,23 +906,24 @@ namespace kernel {
      * @brief Per-team scratch size in bytes. Used by the launcher to set
      *        `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`.
      */
-    static constexpr std::size_t scratch_bytes() {
+    static constexpr size_t scratch_bytes() {
       if constexpr (D == Dim::_1D) {
-        return scratch_1d_t::shmem_size(TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, 3);
       } else if constexpr (D == Dim::_2D) {
-        return scratch_2d_t::shmem_size(TE, TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, 3);
       } else {
-        return scratch_3d_t::shmem_size(TE, TE, TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, TE, 3);
       }
     }
 
-    KOKKOS_INLINE_FUNCTION
-    void operator()(const member_t& team) const {
+    Inline void operator()(const member_t& team) const {
       const auto tile_id = static_cast<ncells_t>(team.league_rank());
-      // Tile coordinates (tile-grid indices) → tile origin in **active**
-      // cell coords (no ghost offset). Using ncells_t to match the linearised
-      // tile index produced by SortSpatially.
-      ncells_t tx1 = 0, tx2 = 0, tx3 = 0;
+      /**
+       * Tile coordinates (tile-grid indices) → tile origin in **active**
+       * cell coords (no ghost offset). Using ncells_t to match the linearised
+       * tile index produced by SortSpatially.
+       */
+      ncells_t   tx1 = 0, tx2 = 0, tx3 = 0;
       if constexpr (D == Dim::_1D) {
         tx1 = tile_id;
       } else if constexpr (D == Dim::_2D) {
@@ -1014,213 +936,201 @@ namespace kernel {
         tx2              = rem / ntx3;
         tx3              = rem - tx2 * ntx3;
       }
-      // origin_active = lowest active-cell index in the tile (no ghost).
-      // origin_J      = same value translated into J's storage coordinate
-      //                 (i.e. plus N_GHOSTS).
-      // origin_J_low  = J coordinate of scratch index 0 (i.e. origin_J - HALO).
-      // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`.
-      const int origin_J1_low = static_cast<int>(tx1 * T_TILE)
-                                + static_cast<int>(N_GHOSTS) - HALO;
-      const int origin_J2_low = static_cast<int>(tx2 * T_TILE)
-                                + static_cast<int>(N_GHOSTS) - HALO;
-      const int origin_J3_low = static_cast<int>(tx3 * T_TILE)
-                                + static_cast<int>(N_GHOSTS) - HALO;
+      /**
+       * origin_active = lowest active-cell index in the tile (no ghost).
+       * origin_J      = same value translated into J's storage coordinate
+       *                 (i.e. plus N_GHOSTS).
+       * origin_J_low  = J coordinate of scratch index 0 (i.e. origin_J - HALO).
+       * local index `li` in scratch ↔ global J index `gi = li + origin_J_low`.
+       */
+      const int origin_J1_low = static_cast<int>(tx1 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J2_low = static_cast<int>(tx2 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J3_low = static_cast<int>(tx3 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
 
       // Allocate scratch and cooperatively zero-fill it.
       if constexpr (D == Dim::_1D) {
-        scratch_1d_t scr(team.team_scratch(0), TE, 3);
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, TE * 3),
-          [&](const int idx) {
-            const int li = idx / 3;
-            const int c  = idx - li * 3;
-            scr(li, c)   = ZERO;
-          });
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3),
+                             [&](ncells_t idx) {
+                               const auto li = idx / 3;
+                               const auto c  = idx - li * 3;
+                               scr(li, c)    = ZERO;
+                             });
         team.team_barrier();
 
         const auto p_begin = tile_offsets(tile_id);
         const auto p_end   = tile_offsets(tile_id + 1u);
-        const int  e1_d    = j_ext1;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
-          [&](const npart_t p) {
-            deposit::deposit_one_particle<S, M, O>(
+          [&](prtlidx_t p) {
+            DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
-              // Escape valve: a particle whose stencil reaches past the
-              // tile's scratch (e.g. exceeded the compile-time
-              // STENCIL_REACH + DRIFT budget) falls back to a direct
-              // atomic_add on the global J view. Bounds-clipped against
-              // J's storage extent so writes past the domain ghost stripe
-              // are dropped (matches the cooperative flush below; those
-              // contributions are re-supplied by SynchronizeFields(J)).
+              /**
+               * Escape valve: a particle whose stencil reaches past the
+               * tile's scratch (e.g. exceeded the compile-time
+               * STENCIL_REACH + DRIFT budget) falls back to a direct
+               * atomic_add on the global J view. Bounds-clipped against
+               * J's storage extent so writes past the domain ghost stripe
+               * are dropped (matches the cooperative flush below; those
+               * contributions are re-supplied by SynchronizeFields(J)).
+               */
               [&](int g_i1, int comp, real_t v) {
                 const int li = g_i1 - origin_J1_low;
-                if (li >= 0 && li < TE) {
+                if (li >= 0 and li < TE) {
                   Kokkos::atomic_add(&scr(li, comp), v);
-                } else if (g_i1 >= 0 && g_i1 < e1_d) {
+                } else if (g_i1 >= 0 and g_i1 < j_ext1) {
                   Kokkos::atomic_add(&J(g_i1, comp), v);
                 }
               });
           });
         team.team_barrier();
 
-        // Cooperative flush of scratch to global J. Bounds-clip against
-        // the J view extent in case a partial high-end tile (or non-zero
-        // halo at domain edges) would otherwise write past J.
-        const int e1 = j_ext1;
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, TE * 3),
-          [&](const int idx) {
-            const int li = idx / 3;
-            const int c  = idx - li * 3;
-            const int gi = li + origin_J1_low;
-            if (gi < 0 || gi >= e1) {
-              return;
-            }
-            const real_t v = scr(li, c);
-            if (v != ZERO) {
-              Kokkos::atomic_add(&J(gi, c), v);
-            }
-          });
+        /**
+         * Cooperative flush of scratch to global J. Bounds-clip against
+         * the J view extent in case a partial high-end tile (or non-zero
+         * halo at domain edges) would otherwise write past J.
+         */
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3),
+                             [&](const int idx) {
+                               const auto li = idx / 3;
+                               const auto c  = idx - li * 3;
+                               const auto gi = li + origin_J1_low;
+                               if (gi < 0 or gi >= j_ext1) {
+                                 return;
+                               }
+                               const real_t v = scr(li, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, c), v);
+                               }
+                             });
       } else if constexpr (D == Dim::_2D) {
-        scratch_2d_t scr(team.team_scratch(0), TE, TE, 3);
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, TE * TE * 3),
-          [&](const int idx) {
-            const int lij  = idx / 3;
-            const int c    = idx - lij * 3;
-            const int li   = lij / TE;
-            const int lj   = lij - li * TE;
-            scr(li, lj, c) = ZERO;
-          });
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE, TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3),
+                             [&](const int idx) {
+                               const auto lij = idx / 3;
+                               const auto c   = idx - lij * 3;
+                               const auto li  = lij / TE;
+                               const auto lj  = lij - li * TE;
+                               scr(li, lj, c) = ZERO;
+                             });
         team.team_barrier();
 
         const auto p_begin = tile_offsets(tile_id);
         const auto p_end   = tile_offsets(tile_id + 1u);
-        const int  e1_d    = j_ext1;
-        const int  e2_d    = j_ext2;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
-          [&](const npart_t p) {
-            deposit::deposit_one_particle<S, M, O>(
+          [&](prtlidx_t p) {
+            DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
               // See 1D branch for rationale.
-              [&](int g_i1, int g_i2, int comp, real_t v) {
-                const int li = g_i1 - origin_J1_low;
-                const int lj = g_i2 - origin_J2_low;
-                if (li >= 0 && li < TE && lj >= 0 && lj < TE) {
+              [&](const int g_i1, const int g_i2, int comp, real_t v) {
+                const auto li = g_i1 - origin_J1_low;
+                const auto lj = g_i2 - origin_J2_low;
+                if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE)) {
                   Kokkos::atomic_add(&scr(li, lj, comp), v);
-                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
-                           g_i2 < e2_d) {
+                } else if ((g_i1 >= 0 and g_i1 < j_ext1) and
+                           (g_i2 >= 0 and g_i2 < j_ext2)) {
                   Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
                 }
               });
           });
         team.team_barrier();
 
-        const int e1 = j_ext1;
-        const int e2 = j_ext2;
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, TE * TE * 3),
-          [&](const int idx) {
-            const int lij = idx / 3;
-            const int c   = idx - lij * 3;
-            const int li  = lij / TE;
-            const int lj  = lij - li * TE;
-            const int gi  = li + origin_J1_low;
-            const int gj  = lj + origin_J2_low;
-            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) {
-              return;
-            }
-            const real_t v = scr(li, lj, c);
-            if (v != ZERO) {
-              Kokkos::atomic_add(&J(gi, gj, c), v);
-            }
-          });
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3),
+                             [&](const int idx) {
+                               const auto lij = idx / 3;
+                               const auto c   = idx - lij * 3;
+                               const auto li  = lij / TE;
+                               const auto lj  = lij - li * TE;
+                               const auto gi  = li + origin_J1_low;
+                               const auto gj  = lj + origin_J2_low;
+                               if ((gi < 0 or gi >= j_ext1) or
+                                   (gj < 0 or gj >= j_ext2)) {
+                                 return;
+                               }
+                               const real_t v = scr(li, lj, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, gj, c), v);
+                               }
+                             });
       } else if constexpr (D == Dim::_3D) {
-        scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3);
-        const int    cells = TE * TE * TE;
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, cells * 3),
-          [&](const int idx) {
-            const int lijk     = idx / 3;
-            const int c        = idx - lijk * 3;
-            const int li       = lijk / (TE * TE);
-            const int rem      = lijk - li * TE * TE;
-            const int lj       = rem / TE;
-            const int lk       = rem - lj * TE;
-            scr(li, lj, lk, c) = ZERO;
-          });
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE, TE, TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3),
+                             [&](const int idx) {
+                               const auto lijk    = idx / 3;
+                               const auto c       = idx - lijk * 3;
+                               const auto li      = lijk / (TE * TE);
+                               const auto rem     = lijk - li * TE * TE;
+                               const auto lj      = rem / TE;
+                               const auto lk      = rem - lj * TE;
+                               scr(li, lj, lk, c) = ZERO;
+                             });
         team.team_barrier();
 
         const auto p_begin = tile_offsets(tile_id);
         const auto p_end   = tile_offsets(tile_id + 1u);
-        const int  e1_d    = j_ext1;
-        const int  e2_d    = j_ext2;
-        const int  e3_d    = j_ext3;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
-          [&](const npart_t p) {
-            deposit::deposit_one_particle<S, M, O>(
+          [&](prtlidx_t p) {
+            DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
               // See 1D branch for rationale.
-              [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
-                const int li = g_i1 - origin_J1_low;
-                const int lj = g_i2 - origin_J2_low;
-                const int lk = g_i3 - origin_J3_low;
-                if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 &&
-                    lk < TE) {
+              [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) {
+                const auto li = g_i1 - origin_J1_low;
+                const auto lj = g_i2 - origin_J2_low;
+                const auto lk = g_i3 - origin_J3_low;
+                if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE) and
+                    (lk >= 0 and lk < TE)) {
                   Kokkos::atomic_add(&scr(li, lj, lk, comp), v);
-                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
-                           g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) {
+                } else if ((g_i1 >= 0 and g_i1 < j_ext1) and
+                           (g_i2 >= 0 and g_i2 < j_ext2) and
+                           (g_i3 >= 0 and g_i3 < j_ext3)) {
                   Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
                 }
               });
           });
         team.team_barrier();
 
-        const int e1 = j_ext1;
-        const int e2 = j_ext2;
-        const int e3 = j_ext3;
-        Kokkos::parallel_for(
-          Kokkos::TeamThreadRange(team, cells * 3),
-          [&](const int idx) {
-            const int lijk = idx / 3;
-            const int c    = idx - lijk * 3;
-            const int li   = lijk / (TE * TE);
-            const int rem  = lijk - li * TE * TE;
-            const int lj   = rem / TE;
-            const int lk   = rem - lj * TE;
-            const int gi   = li + origin_J1_low;
-            const int gj   = lj + origin_J2_low;
-            const int gk   = lk + origin_J3_low;
-            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 ||
-                gk >= e3) {
-              return;
-            }
-            const real_t v = scr(li, lj, lk, c);
-            if (v != ZERO) {
-              Kokkos::atomic_add(&J(gi, gj, gk, c), v);
-            }
-          });
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3),
+                             [&](const int idx) {
+                               const int lijk = idx / 3;
+                               const int c    = idx - lijk * 3;
+                               const int li   = lijk / (TE * TE);
+                               const int rem  = lijk - li * TE * TE;
+                               const int lj   = rem / TE;
+                               const int lk   = rem - lj * TE;
+                               const int gi   = li + origin_J1_low;
+                               const int gj   = lj + origin_J2_low;
+                               const int gk   = lk + origin_J3_low;
+                               if ((gi < 0 or gi >= j_ext1) or
+                                   (gj < 0 or gj >= j_ext2) or
+                                   (gk < 0 or gk >= j_ext3)) {
+                                 return;
+                               }
+                               const real_t v = scr(li, lj, lk, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, gj, gk, c), v);
+                               }
+                             });
       }
     }
   };
 
-#endif // TEAM_POLICY
-
 } // namespace kernel
 
 #undef i_di_to_Xi

From b5e05ad7130ca88c44ecdecdb6c46690fb2e2a9e Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 19 Jun 2026 20:13:33 +0000
Subject: [PATCH 10/33] compile-time sorting interval with team policies and
 sorting speedup

---
 CMakeLists.txt                              |  13 ++
 cmake/defaults.cmake                        |   5 +
 cmake/report.cmake                          |  14 ++
 src/engines/srpic/currents.h                |  90 ++++---
 src/framework/containers/particles.h        |  13 +-
 src/framework/containers/particles_sort.cpp | 110 +++++----
 src/framework/parameters/parameters.cpp     |   7 +
 src/framework/parameters/particles.cpp      |   9 +
 src/global/arch/kokkos_aliases.h            |  25 +-
 src/kernels/currents_deposit.hpp            | 245 ++++++++++++++------
 tests/framework/particles_sort.cpp          |  12 +-
 11 files changed, 372 insertions(+), 171 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0137671a0..d7bea74f0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,10 @@ set(team_policy_tile_size
 set(team_policy_tile_sizes
   "4;6;8;10;12;14;16"
   CACHE STRING "team_policy tile-size choices")
+set(team_policy_sort
+  ${default_team_policy_sort}
+  CACHE STRING
+  "team_policy hardwired spatial sorting interval; >0 overrides the runtime spatial_sorting_interval and sizes the tiled deposit scratch halo (0 = use runtime)")
 
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
@@ -157,6 +161,15 @@ if(${team_policy})
   add_compile_options("-D TEAM_POLICY")
   add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")
 
+  # Optional compile-time hardwired sort interval. When > 0, it (a) overrides
+  # the runtime spatial_sorting_interval (see framework/parameters) and (b)
+  # sizes the tiled deposit scratch halo to DRIFT = interval, so a particle
+  # drifting over a full sort interval still deposits inside its tile scratch.
+  if(team_policy_sort GREATER 0)
+    add_compile_options(
+      "-D TEAM_POLICY_SORT_INTERVAL=${team_policy_sort}")
+  endif()
+
   # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
   # when found; falls back to Kokkos::BinSort otherwise.
   if("${Kokkos_DEVICES}" MATCHES "SYCL")
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index a85accf84..619f48dc1 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -107,3 +107,8 @@ set_property(CACHE default_team_policy PROPERTY TYPE BOOL)
 set(default_team_policy_tile_size
     8
     CACHE INTERNAL "Default tile edge length in cells for team_policy")
+
+set(default_team_policy_sort
+    0
+    CACHE INTERNAL
+      "Default hardwired spatial sorting interval for team_policy (0 = runtime)")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index 65a22a7a6..e6036d366 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -141,6 +141,17 @@ if(${team_policy})
     "${Blue}"
     TEAM_POLICY_TILE_SIZE_REPORT
     46)
+  if(team_policy_sort GREATER 0)
+    printchoices(
+      "Team Sort Interval"
+      "team_policy_sort"
+      "${team_policy_sort}"
+      ${team_policy_sort}
+      0
+      "${Blue}"
+      TEAM_POLICY_SORT_INTERVAL_REPORT
+      46)
+  endif()
 endif()
 printchoices(
   "Debug mode"
@@ -220,6 +231,9 @@ endif()
 string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
 if(${team_policy})
   string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
+  if(team_policy_sort GREATER 0)
+    string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_SORT_INTERVAL_REPORT} "\n")
+  endif()
 endif()
 
 string(
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 63746d101..eadda70b9 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -79,7 +79,8 @@ namespace ntt {
 
       auto deposit_kernel =
         kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, M, O, T> {
-          cur, species, local_metric, (real_t)(species.charge()), dt, layout
+          cur,    species, local_metric, (real_t)(species.charge()),
+          dt,     layout,  species.npart()
         };
 
       Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
@@ -88,6 +89,30 @@ namespace ntt {
         0,
         Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes()));
       Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
+
+      // Particles appended since the last sort (injection / MPI receive on a
+      // no-sort step) live past the partition and are not visited by any team
+      // above. Deposit that tail [npart_partitioned, npart) with the flat
+      // scatter-view kernel so every active particle is deposited exactly
+      // once. The range is empty when the species was just sorted (the
+      // every-step-sorted common case), so this is a no-op there.
+      if (species.npart() > layout.npart_partitioned) {
+        // `cur` is a const ref; take a non-const View handle (shallow copy,
+        // shares storage) so the scatter view can contribute back into it.
+        auto cur_nc      = cur;
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc);
+        Kokkos::parallel_for(
+          "CurrentsDepositTiledTail",
+          CreateParticleRangePolicy<Dim::_1D>({ layout.npart_partitioned },
+                                              { species.npart() }),
+          kernel::DepositCurrents_kernel<SimEngine::SRPIC, M, O>(
+            scatter_cur,
+            species,
+            local_metric,
+            (real_t)(species.charge()),
+            dt));
+        Kokkos::Experimental::contribute(cur_nc, scatter_cur);
+      }
     }
 
     template <SRMetricClass M>
@@ -98,51 +123,45 @@ namespace ntt {
 
 #if defined(TEAM_POLICY)
 
-      // First-step fallback: if any contributing species has not been
-      // sorted yet (tile_layout still empty), fall back to the flat
-      // scatter-view path for that step. Subsequent steps see populated
-      // layouts and use the tiled kernel.
-      bool any_unsorted = false;
+      // Tiled deposit. Correctness no longer depends on the SoA being in a
+      // "sorted" state at deposit time — the tiled kernel handles a stale
+      // partition per-particle:
+      //   - a particle whose full stencil has drifted out of its tile is
+      //     deposited straight to the global J view (the per-particle escape
+      //     valve); `team_policy_sort_interval` sizes the scratch halo so the
+      //     common in-tile case stays in fast SLM (see currents_deposit.hpp);
+      //   - particles dead-tagged in place since the sort are clamped out by
+      //     the kernel and skipped by the dead-tag test;
+      //   - particles appended past the partition since the sort (injection /
+      //     MPI receive on a no-sort step) are deposited by the launcher's
+      //     flat tail pass over [npart_partitioned, npart).
+      // Together these cover every active particle exactly once for any sort
+      // interval. The only case the tiled kernel cannot serve is the very
+      // first step, before any SortSpatially has populated a layout; that
+      // species takes the flat scatter-view path for that step alone.
       for (auto& species : domain.species) {
         if ((species.pusher() == ParticlePusher::NONE) or
             (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
           continue;
         }
-        if (species.tile_layout().ntiles_total == 0u or
-            species.tile_layout().tile_offsets.extent(0) == 0u) {
-          any_unsorted = true;
-          break;
-        }
-      }
-      if (any_unsorted) {
-        auto scatter_cur = Kokkos::Experimental::create_scatter_view(
-          domain.fields.cur);
-        for (auto& species : domain.species) {
-          if ((species.pusher() == ParticlePusher::NONE) or
-              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
-            continue;
-          }
+        const auto& layout = species.tile_layout();
+        if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) {
           logger::Checkpoint(
-            fmt::format(
-              "Launching currents deposit (flat fallback, no sort yet) "
-              "for %d [%s] : %lu %f",
-              species.index(),
-              species.label().c_str(),
-              species.npart(),
-              (double)species.charge()),
+            fmt::format("Launching currents deposit (flat, no sort yet) for "
+                        "%d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
             HERE);
+          auto scatter_cur = Kokkos::Experimental::create_scatter_view(
+            domain.fields.cur);
           CallDepositKernel<M, SHAPE_ORDER>(species,
                                             domain.mesh.metric,
                                             scatter_cur,
                                             dt);
-        }
-        Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
-      } else {
-        for (auto& species : domain.species) {
-          if ((species.pusher() == ParticlePusher::NONE) or
-              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
-            continue;
-          }
+          Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+        } else {
           logger::Checkpoint(
             fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
                         species.index(),
@@ -150,7 +169,6 @@ namespace ntt {
                         species.npart(),
                         (double)species.charge()),
             HERE);
-
           CallDepositKernelTiled<M, SHAPE_ORDER>(species,
                                                  domain.mesh.metric,
                                                  domain.fields.cur,
diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 4f0770729..0877efd15 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -299,11 +299,14 @@ namespace ntt {
   private:
     /**
      * @brief Apply a particle-index permutation (built by oneDPL/Thrust
-     *        sort_by_key) to every SoA member array. Sequential — one
-     *        transient buffer at a time, fenced before scope exit.
-     *        Only compiled when a vendor sort backend is enabled; the
-     *        BinSort path applies the permutation in place via
-     *        `sorter.sort(view)` instead.
+     *        sort_by_key) to the SoA member arrays. Each member is
+     *        gathered into a fresh full-capacity buffer whose handle is
+     *        then swapped in (no copy-back), one buffer at a time, fenced
+     *        before the old storage is released. The *_prev arrays are
+     *        intentionally not permuted (overwritten by the next push
+     *        before any read). Only compiled when a vendor sort backend
+     *        is enabled; the BinSort path applies the permutation in
+     *        place via `sorter.sort(view)` instead.
      */
     void apply_permutation_to_soa(const prtl_perm_t& perm);
 
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 0317e0b5f..a52b013f7 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -264,8 +264,11 @@ namespace ntt {
     const auto     slice  = prtl_slice_t(0, npart_local);
   #if defined(TEAM_POLICY_USE_VENDOR_SORT)
     // Vendor path: produce an explicit permutation via sort_by_key,
-    // then apply it to each SoA member with a sequential one-buffer
-    // gather (peak transient = one `npart × sizeof(member)` buffer.
+    // then apply it to each SoA member by gathering into a fresh
+    // full-capacity buffer and swapping the View handle in (no
+    // copy-back). The *_prev arrays are skipped — see
+    // apply_permutation_to_soa. Peak transient = one
+    // `maxnpart × sizeof(member)` buffer at a time.
     prtl_perm_t perm { "tile_perm", npart_local };
     #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
     sort_helpers::sort_by_key_dispatch(tile_indices,
@@ -376,6 +379,11 @@ namespace ntt {
       Kokkos::deep_copy(tile_offsets, h_offsets);
 
       m_tile_layout.tile_offsets = tile_offsets;
+      // tile_offsets(total_tiles) is the alive-particle count at sort time:
+      // the tiles partition exactly [0, npart_partitioned). The deposit
+      // launcher compares this against the live npart() to detect (and
+      // separately deposit) particles appended since this sort.
+      m_tile_layout.npart_partitioned = h_offsets(total_tiles);
     }
 
     // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used
@@ -454,38 +462,43 @@ namespace ntt {
 #if defined(TEAM_POLICY_USE_VENDOR_SORT)
   namespace permute_helpers {
 
-    // Permute a 1D SoA member array `arr` in place by `perm`, using a
-    // single transient buffer of size `n`. Buffer is freed at scope
-    // exit; the explicit fence right before that drains queued GPU
-    // work referencing it.
+    // Permute a 1D SoA member array `arr` by `perm`. Gathers into a
+    // fresh buffer allocated at the member's full capacity (maxnpart),
+    // then swaps the View handle in. This avoids the redundant copy-back
+    // pass of the old gather-then-deep_copy approach (~2x less HBM
+    // traffic). Allocating at full capacity preserves the member's spare
+    // room for injection; the untouched tail [n, capacity) is
+    // zero-initialized by Kokkos (cleaner than the stale values the old
+    // deep_copy left there). The fence drains the gather (which reads the
+    // old storage) before the swap drops the last reference to it.
     template <typename V>
-    inline void permute_1d_inplace(V&                 arr,
-                                   const prtl_perm_t& perm,
-                                   npart_t            n) {
+    inline void permute_1d_swap(V&                 arr,
+                                const prtl_perm_t& perm,
+                                npart_t            n) {
       if (n == 0u) {
         return;
       }
-      V    buf(std::string(arr.label()) + "_perm_buf", n);
+      V    buf(arr.label(), arr.extent(0));
       auto perm_v = perm;
       auto arr_v  = arr;
       Kokkos::parallel_for(
         "Permute1D",
         n,
         KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); });
-      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf);
-      Kokkos::fence("permute_1d_inplace: end");
+      Kokkos::fence("permute_1d_swap: end");
+      arr = buf;
     }
 
     // 2D analogue for `pld_r` / `pld_i`.
     template <typename V>
-    inline void permute_2d_inplace(V&                 arr,
-                                   const prtl_perm_t& perm,
-                                   npart_t            n,
-                                   npart_t            ncols) {
+    inline void permute_2d_swap(V&                 arr,
+                                const prtl_perm_t& perm,
+                                npart_t            n,
+                                npart_t            ncols) {
       if (n == 0u or ncols == 0u) {
         return;
       }
-      V    buf(std::string(arr.label()) + "_perm_buf", n, ncols);
+      V    buf(arr.label(), arr.extent(0), arr.extent(1));
       auto perm_v = perm;
       auto arr_v  = arr;
       Kokkos::parallel_for(
@@ -494,9 +507,8 @@ namespace ntt {
         KOKKOS_LAMBDA(const npart_t p, const npart_t l) {
           buf(p, l) = arr_v(perm_v(p), l);
         });
-      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL),
-                        buf);
-      Kokkos::fence("permute_2d_inplace: end");
+      Kokkos::fence("permute_2d_swap: end");
+      arr = buf;
     }
 
   } // namespace permute_helpers
@@ -508,40 +520,50 @@ namespace ntt {
       return;
     }
 
-    using permute_helpers::permute_1d_inplace;
-    using permute_helpers::permute_2d_inplace;
-
+    using permute_helpers::permute_1d_swap;
+    using permute_helpers::permute_2d_swap;
+
+    // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are intentionally
+    // NOT permuted. SortSpatially runs at the very end of the step loop
+    // (engine step_forward), and the first thing the next step's pusher
+    // does is overwrite prev := current (positionPush, sr.hpp / gr.hpp)
+    // for every active particle, before any consumer reads prev:
+    //   - current deposit: runs after the push, which has already
+    //     overwritten prev; species with pusher==NONE (whose prev would
+    //     stay un-permuted) are skipped by CurrentsDeposit entirely.
+    //   - pusher getParticlePrevPosition / piston: read prev only after
+    //     positionPush has rewritten it within the same call.
+    //   - checkpoint (prev is checkpoint-only, never in diagnostic
+    //     output): on restart the first push overwrites prev before it
+    //     is read, so restart results are unaffected; only the redundant
+    //     prev field saved to the checkpoint differs from the old code.
+    // Permuting prev would therefore reorder data that is overwritten
+    // before it is ever observed.
     if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i1, perm, n);
-      permute_1d_inplace(dx1, perm, n);
-      permute_1d_inplace(i1_prev, perm, n);
-      permute_1d_inplace(dx1_prev, perm, n);
+      permute_1d_swap(i1, perm, n);
+      permute_1d_swap(dx1, perm, n);
     }
     if constexpr (D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i2, perm, n);
-      permute_1d_inplace(dx2, perm, n);
-      permute_1d_inplace(i2_prev, perm, n);
-      permute_1d_inplace(dx2_prev, perm, n);
+      permute_1d_swap(i2, perm, n);
+      permute_1d_swap(dx2, perm, n);
     }
     if constexpr (D == Dim::_3D) {
-      permute_1d_inplace(i3, perm, n);
-      permute_1d_inplace(dx3, perm, n);
-      permute_1d_inplace(i3_prev, perm, n);
-      permute_1d_inplace(dx3_prev, perm, n);
-    }
-    permute_1d_inplace(ux1, perm, n);
-    permute_1d_inplace(ux2, perm, n);
-    permute_1d_inplace(ux3, perm, n);
-    permute_1d_inplace(weight, perm, n);
-    permute_1d_inplace(tag, perm, n);
+      permute_1d_swap(i3, perm, n);
+      permute_1d_swap(dx3, perm, n);
+    }
+    permute_1d_swap(ux1, perm, n);
+    permute_1d_swap(ux2, perm, n);
+    permute_1d_swap(ux3, perm, n);
+    permute_1d_swap(weight, perm, n);
+    permute_1d_swap(tag, perm, n);
     if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
-      permute_1d_inplace(phi, perm, n);
+      permute_1d_swap(phi, perm, n);
     }
     if (npld_r() > 0) {
-      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()));
+      permute_2d_swap(pld_r, perm, n, static_cast<npart_t>(npld_r()));
     }
     if (npld_i() > 0) {
-      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()));
+      permute_2d_swap(pld_i, perm, n, static_cast<npart_t>(npld_i()));
     }
   }
 #endif // TEAM_POLICY_USE_VENDOR_SORT
diff --git a/src/framework/parameters/parameters.cpp b/src/framework/parameters/parameters.cpp
index 7372da510..4116783a1 100644
--- a/src/framework/parameters/parameters.cpp
+++ b/src/framework/parameters/parameters.cpp
@@ -67,11 +67,18 @@ namespace ntt {
       "clear_interval",
       defaults::clear_interval);
     set("particles.clear_interval", global_clearing_interval);
+#if defined(TEAM_POLICY_SORT_INTERVAL)
+    // See particles.cpp: the compile-time team_policy_sort_interval overrides
+    // the runtime value (kept consistent here for the stored global param).
+    const auto global_spatial_sorting_interval = static_cast<timestep_t>(
+      TEAM_POLICY_SORT_INTERVAL);
+#else
     const auto global_spatial_sorting_interval = toml::find_or<timestep_t>(
       toml_data,
       "particles",
       "spatial_sorting_interval",
       0u);
+#endif
     set("particles.spatial_sorting_interval", global_spatial_sorting_interval);
 
     set("scales.n0", ppc0 / get<real_t>("scales.V0"));
diff --git a/src/framework/parameters/particles.cpp b/src/framework/parameters/particles.cpp
index 6f1a23c03..46192035b 100644
--- a/src/framework/parameters/particles.cpp
+++ b/src/framework/parameters/particles.cpp
@@ -121,10 +121,19 @@ namespace ntt {
         sp,
         "clear_interval",
         global_clearing_interval);
+#if defined(TEAM_POLICY_SORT_INTERVAL)
+      // Compile-time hardwired sort interval (the `team_policy_sort_interval`
+      // CMake knob). It overrides whatever the input file requested so the
+      // tiled deposit's scratch halo — sized for exactly this cadence — is
+      // guaranteed to contain every particle's drift between sorts.
+      const auto spatial_sorting_interval = static_cast<timestep_t>(
+        TEAM_POLICY_SORT_INTERVAL);
+#else
       const auto spatial_sorting_interval = toml::find_or<timestep_t>(
         sp,
         "spatial_sorting_interval",
         global_spatial_sorting_interval);
+#endif
       auto pusher_str = toml::find_or(sp, "pusher", std::string(def_pusher));
       const auto npayloads_real = toml::find_or(sp,
                                                 "n_payloads_real",
diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h
index 4fe88cdf7..7212b554e 100644
--- a/src/global/arch/kokkos_aliases.h
+++ b/src/global/arch/kokkos_aliases.h
@@ -340,20 +340,27 @@ auto CreateRangePolicyOnHost(const tuple_t<ncells_t, D>&,
 using prtl_perm_t = array_t<npart_t*>;
 
 // Tile layout metadata: the contract between Stream 1 (sort) and Streams
-// 2/3 (tiled deposit / pusher). All members are device-resident.
-//   ntiles_per_axis : number of tiles along each axis (1 for unused axes).
-//   ntiles_total    : product of ntiles_per_axis = league size for TeamPolicy.
-//   tile_size       : tile edge length in cells (compile-time CMake knob,
-//                     replicated here for runtime checks).
-//   tile_offsets    : prefix-sum of per-tile particle counts; size
-//                     ntiles_total + 1; tile t owns particles
-//                     [tile_offsets(t), tile_offsets(t+1)).
-//   tile_perm       : size npart, particle index sorted by tile.
+// 2/3 (tiled deposit / pusher). Scalars are host-resident; the views are
+// device-resident.
+//   ntiles_per_axis  : number of tiles along each axis (1 for unused axes).
+//   ntiles_total     : product of ntiles_per_axis = league size for TeamPolicy.
+//   tile_size        : tile edge length in cells (compile-time CMake knob,
+//                      replicated here for runtime checks).
+//   npart_partitioned: number of (alive) particles partitioned at the last
+//                      sort, i.e. tile_offsets(ntiles_total). The tiles cover
+//                      exactly [0, npart_partitioned); particles appended past
+//                      it (injection / MPI receive on a no-sort step) are not
+//                      partitioned and must be deposited separately.
+//   tile_offsets     : prefix-sum of per-tile particle counts; size
+//                      ntiles_total + 1; tile t owns particles
+//                      [tile_offsets(t), tile_offsets(t+1)).
+//   tile_perm        : size npart, particle index sorted by tile.
 template <Dimension D>
 struct TileLayout {
   ncells_t          ntiles_per_axis[3] { 1u, 1u, 1u };
   ncells_t          ntiles_total { 0u };
   unsigned short    tile_size { 0u };
+  npart_t           npart_partitioned { 0u };
   array_t<npart_t*> tile_offsets;
   prtl_perm_t       tile_perm;
 };
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 79252621a..8840a1046 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -780,28 +780,37 @@ namespace kernel {
    * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping
    * the SoA arrays consistent with that.
    *
-   * **Halo sizing and escape valve.** Sort runs at the end of the
-   * previous step (see `srpic.hpp`), so at deposit time the particle
-   * has already been pushed once — its `min(i, i_prev)` may differ
-   * from the bin key by one cell of drift per step elapsed since the
-   * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where
-   * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i,
-   * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for
-   * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one
-   * guaranteed post-sort pusher step.
+   * **Halo sizing and escape valve.** Sort runs at the end of a step
+   * (see `srpic.hpp`); a particle is pushed once per step thereafter, so
+   * its `min(i, i_prev)` may differ from the bin key by one cell of drift
+   * per step elapsed since the last sort. The scratch HALO is
+   * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag
+   * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with
+   * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the
+   * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL)
+   * when set — the hardwired sort interval, hence the maximum drift any
+   * particle accrues between sorts — and `1` otherwise (the
+   * every-step-sorted common case).
    *
-   * HALO is sized for the *common* (every-step-sorted) case, not for
-   * a worst-case sort cadence: correctness does **not** depend on it.
-   * Any particle whose stencil escapes the scratch tile — because it
-   * drifted further than `DRIFT` (e.g. a large runtime
-   * `spatial_sorting_interval`), or because the halo is otherwise
-   * undersized — silently falls back to a direct, bounds-clipped
-   * `Kokkos::atomic_add` on the global J view. That path is
-   * charge-conserving (each particle's stencil is deposited exactly
-   * once, partly to private SLM scratch and partly to global J, and
-   * scratch is flushed once via `atomic_add`); it is merely slower
-   * per write. Sorting less often than every step therefore costs
+   * Correctness does **not** depend on the halo size. Any particle whose
+   * full stencil escapes the scratch tile — because it drifted further
+   * than `DRIFT`, was reordered far from its tile by a no-sort-step
+   * `CommunicateParticles`, or because the halo is otherwise undersized —
+   * is deposited *as a whole* via a direct, bounds-clipped
+   * `Kokkos::atomic_add` on the global J view (the per-particle escape
+   * valve). Each particle's stencil is therefore deposited exactly once
+   * (entirely to SLM scratch when it fits, entirely to global J when it
+   * does not), so the path is charge-conserving; it is merely slower per
+   * write. Sizing `DRIFT` to the sort interval keeps the common,
+   * within-interval drift in fast SLM; sorting less often only costs
    * escape-valve traffic, never accuracy.
+   *
+   * **Partition coverage.** The team iteration covers only the particles
+   * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped
+   * to the live `npart`. Particles appended past the partition since the
+   * sort are not seen here; the launcher (`engines/srpic/currents.h`)
+   * deposits that tail with the flat kernel so every active particle is
+   * covered exactly once regardless of sort cadence.
    */
   template <SimEngine::type S, MetricClass M, unsigned short O, unsigned short T_TILE>
   class DepositCurrentsTiled_kernel {
@@ -825,21 +834,33 @@ namespace kernel {
      *     old `O+1` carried one extra cell of conservative padding
      *     on top of the already-conservative drift term below.
      *
-     * drift — sort runs at end-of-step (see srpic.hpp), so a particle
-     * sees exactly one pusher step before the *next* step's deposit
-     * when sorted every step (the common case). DRIFT is therefore a
-     * fixed constant of 1, NOT a compile-time function of the runtime
-     * sort cadence. Sizing the halo for the common case (rather than a
-     * worst-case sort interval) is what keeps the scratch small enough
-     * for good occupancy; a species sorted less often than
-     * every step just drifts past the halo and takes the global-J
-     * escape valve more often — correct, only slower (see the class
-     * doc-comment for why this is charge-conserving).
+     * drift — sort runs at end-of-step (see srpic.hpp), so a particle is
+     * pushed once per step between its last sort and a given deposit. With
+     * a sort interval of `K`, a particle therefore drifts at most `K` cells
+     * (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) before the next sort. The
+     * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL)
+     * pins that interval at compile time and feeds it here as DRIFT, sizing
+     * the halo so a fully-interval-drifted particle still deposits inside
+     * its tile scratch. When the knob is unset, DRIFT defaults to 1 (the
+     * sorted-every-step common case); any particle that drifts past the halo
+     * (e.g. a larger runtime interval, or a CFL excursion) takes the
+     * per-particle global-J escape valve below — correct, only slower (see
+     * the class doc-comment for why this is charge-conserving).
      */
-    static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast<int>(O);
-    static constexpr int DRIFT         = 1;
-    static constexpr int HALO          = STENCIL_REACH + DRIFT;
-    static constexpr int TE            = static_cast<int>(T_TILE) + 2 * HALO;
+    static constexpr int STENCIL_REACH   = (O == 0u) ? 2 : static_cast<int>(O);
+    // One-sided footprint reach for the per-particle escape valve: the
+    // deposit writes at most this many cells above max(i,i_prev) (and fewer
+    // below min), so [min - FOOTPRINT_REACH, max + FOOTPRINT_REACH] in cell
+    // coords conservatively bounds every deposited cell for any order
+    // (Esirkepov reaches max+O; O=0 zigzag reaches max+1).
+    static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast<int>(O);
+#if defined(TEAM_POLICY_SORT_INTERVAL)
+    static constexpr int DRIFT = static_cast<int>(TEAM_POLICY_SORT_INTERVAL);
+#else
+    static constexpr int DRIFT = 1;
+#endif
+    static constexpr int HALO = STENCIL_REACH + DRIFT;
+    static constexpr int TE   = static_cast<int>(T_TILE) + 2 * HALO;
 
     using exec_space  = Kokkos::DefaultExecutionSpace;
     using team_policy = Kokkos::TeamPolicy<exec_space>;
@@ -855,6 +876,17 @@ namespace kernel {
     ncells_t          ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
     ncells_t          total_tiles { 0u };
 
+    /**
+     * Current active-particle count. `tile_offsets` partitions only the
+     * particles that existed at the last sort ([0, layout.npart_partitioned));
+     * `npart` may differ if the pusher dead-tagged particles in place since.
+     * Each team clamps its `[tile_offsets(t), tile_offsets(t+1))` slice to
+     * `npart` so stale slots past the live array are never read. Particles
+     * appended *beyond* the partition (npart > npart_partitioned) are not seen
+     * by any team here — the launcher deposits that tail separately.
+     */
+    npart_t npart { 0u };
+
     /**
      * J's full storage extent including all ghost cells. Used to clip
      * the cooperative flush so that a partial tile at the high end of
@@ -868,7 +900,8 @@ namespace kernel {
                                 const M&               metric,
                                 real_t                 charge,
                                 real_t                 dt,
-                                const TileLayout<D>&   layout)
+                                const TileLayout<D>&   layout,
+                                npart_t                npart)
       : J { cur }
       , prtls { prtls }
       , metric { metric }
@@ -878,7 +911,8 @@ namespace kernel {
       , ntx1 { layout.ntiles_per_axis[0] }
       , ntx2 { layout.ntiles_per_axis[1] }
       , ntx3 { layout.ntiles_per_axis[2] }
-      , total_tiles { layout.ntiles_total } {
+      , total_tiles { layout.ntiles_total }
+      , npart { npart } {
       raise::ErrorIf(
         layout.tile_size != T_TILE,
         "Tiled deposit launched with mismatched T_TILE and runtime tile_size",
@@ -907,12 +941,17 @@ namespace kernel {
      *        `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`.
      */
     static constexpr size_t scratch_bytes() {
+      // The component count (3) is a *static* extent of scratch_ndfield_t
+      // (View<real_t*[3]> / **[3] / ***[3]), so shmem_size() takes only the
+      // dynamic spatial extents — passing 3 as well trips Kokkos'
+      // `rank_dynamic != number of arguments` abort. This matches the
+      // scratch View construction below, which also omits the 3.
       if constexpr (D == Dim::_1D) {
-        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE);
       } else if constexpr (D == Dim::_2D) {
-        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE);
       } else {
-        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, TE, 3);
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, TE);
       }
     }
 
@@ -961,31 +1000,49 @@ namespace kernel {
                              });
         team.team_barrier();
 
-        const auto p_begin = tile_offsets(tile_id);
-        const auto p_end   = tile_offsets(tile_id + 1u);
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
           [&](prtlidx_t p) {
+            /**
+             * Per-particle escape valve: route the WHOLE particle to the
+             * global J view when its Esirkepov footprint does not fit
+             * inside this tile's scratch window [0,TE); only particles
+             * fully inside the tile touch SLM scratch. A particle drifts
+             * out of its tile when sorted less often than every step.
+             *
+             * The conservative footprint bound in cell coords,
+             * [min(i,i_prev) - O, max(i,i_prev) + O], covers
+             * prtl_shape::for_deposit<O> for any order (i_min >=
+             * min-floor(O/2), i_max <= max+O), so when `to_scratch` is true
+             * every deposited cell is provably in [0,TE) and the scratch write
+             * needs no per-cell bounds test. The global path bounds-clips
+             * against J's storage extent (writes past the ghost stripe are
+             * re-supplied by SynchronizeFields(J)).
+             */
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE);
             DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
-              /**
-               * Escape valve: a particle whose stencil reaches past the
-               * tile's scratch (e.g. exceeded the compile-time
-               * STENCIL_REACH + DRIFT budget) falls back to a direct
-               * atomic_add on the global J view. Bounds-clipped against
-               * J's storage extent so writes past the domain ghost stripe
-               * are dropped (matches the cooperative flush below; those
-               * contributions are re-supplied by SynchronizeFields(J)).
-               */
               [&](int g_i1, int comp, real_t v) {
-                const int li = g_i1 - origin_J1_low;
-                if (li >= 0 and li < TE) {
-                  Kokkos::atomic_add(&scr(li, comp), v);
-                } else if (g_i1 >= 0 and g_i1 < j_ext1) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v);
+                  //} else if (g_i1 >= 0 and g_i1 < j_ext1) {
+                } else {
                   Kokkos::atomic_add(&J(g_i1, comp), v);
                 }
               });
@@ -1022,25 +1079,42 @@ namespace kernel {
                              });
         team.team_barrier();
 
-        const auto p_begin = tile_offsets(tile_id);
-        const auto p_end   = tile_offsets(tile_id + 1u);
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
           [&](prtlidx_t p) {
+            // See 1D branch for rationale: route the whole particle to the
+            // global escape valve unless its full footprint fits in scratch.
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J2_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and
+                                     hi2 < TE);
             DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
-              // See 1D branch for rationale.
               [&](const int g_i1, const int g_i2, int comp, real_t v) {
-                const auto li = g_i1 - origin_J1_low;
-                const auto lj = g_i2 - origin_J2_low;
-                if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE)) {
-                  Kokkos::atomic_add(&scr(li, lj, comp), v);
-                } else if ((g_i1 >= 0 and g_i1 < j_ext1) and
-                           (g_i2 >= 0 and g_i2 < j_ext2)) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(
+                    &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp),
+                    v);
+                } else {
                   Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
                 }
               });
@@ -1078,28 +1152,49 @@ namespace kernel {
                              });
         team.team_barrier();
 
-        const auto p_begin = tile_offsets(tile_id);
-        const auto p_end   = tile_offsets(tile_id + 1u);
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
         Kokkos::parallel_for(
           Kokkos::TeamThreadRange(team, p_begin, p_end),
           [&](prtlidx_t p) {
+            // See 1D branch for rationale: route the whole particle to the
+            // global escape valve unless its full footprint fits in scratch.
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p);
+            const int i3c = prtls.i3(p), i3p = prtls.i3_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int lo3 = (i3c < i3p ? i3c : i3p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J3_low;
+            const int hi3 = (i3c > i3p ? i3c : i3p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J3_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and
+                                     hi2 < TE and lo3 >= 0 and hi3 < TE);
             DepositOneParticle<S, M, O>(
               p,
               prtls,
               metric,
               charge,
               inv_dt,
-              // See 1D branch for rationale.
               [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) {
-                const auto li = g_i1 - origin_J1_low;
-                const auto lj = g_i2 - origin_J2_low;
-                const auto lk = g_i3 - origin_J3_low;
-                if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE) and
-                    (lk >= 0 and lk < TE)) {
-                  Kokkos::atomic_add(&scr(li, lj, lk, comp), v);
-                } else if ((g_i1 >= 0 and g_i1 < j_ext1) and
-                           (g_i2 >= 0 and g_i2 < j_ext2) and
-                           (g_i3 >= 0 and g_i3 < j_ext3)) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(&scr(g_i1 - origin_J1_low,
+                                          g_i2 - origin_J2_low,
+                                          g_i3 - origin_J3_low,
+                                          comp),
+                                     v);
+                } else {
                   Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
                 }
               });
diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp
index 1ba10d828..6945c962f 100644
--- a/tests/framework/particles_sort.cpp
+++ b/tests/framework/particles_sort.cpp
@@ -82,7 +82,11 @@ auto main(int argc, char* argv[]) -> int {
       Kokkos::deep_copy(pld_r_h, prtls.pld_r);
       Kokkos::deep_copy(pld_i_h, prtls.pld_i);
 
-      for (auto p { 0u }; p < 75u; ++p) {
+      // Only [0, npart) is defined after a sort. The swap-based gather in
+      // apply_permutation_to_soa replaces each SoA View, zero-filling the
+      // spare capacity [npart, maxnpart) (a don't-care region overwritten
+      // by injection), so the old "tail preserved" check no longer holds.
+      for (auto p { 0u }; p < 66u; ++p) {
         if (p < 16u) {
           raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE);
         } else if (p < 33u) {
@@ -180,7 +184,11 @@ auto main(int argc, char* argv[]) -> int {
 
       auto weight_h = Kokkos::create_mirror_view(prtls.weight);
       Kokkos::deep_copy(weight_h, prtls.weight);
-      for (auto p { 0u }; p < 75u; ++p) {
+      // Only [0, npart) is defined after a sort. The swap-based gather in
+      // apply_permutation_to_soa replaces each SoA View, zero-filling the
+      // spare capacity [npart, maxnpart) (a don't-care region overwritten
+      // by injection), so the old "tail preserved" check no longer holds.
+      for (auto p { 0u }; p < 66u; ++p) {
         if (p < 13u) {
           raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE);
         } else if (p < 26u) {

From 4f4409d507b942e9a7a15cc3c6a3abb69af07dd7 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 19 Jun 2026 20:13:50 +0000
Subject: [PATCH 11/33] fix test for tiled deposit

---
 tests/kernels/deposit_tiled.cpp | 289 ++++++++++++++++++++++++++++++--
 1 file changed, 273 insertions(+), 16 deletions(-)

diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
index 504cc3818..3cc2f62b0 100644
--- a/tests/kernels/deposit_tiled.cpp
+++ b/tests/kernels/deposit_tiled.cpp
@@ -3,7 +3,7 @@
  * @brief X-1 numerical-equivalence test for the tiled deposit kernel.
  *
  * Runs the flat (`DepositCurrents_kernel`) and tiled
- * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs
+ * (`DepositCurrentsTiled_kernel`) kernels on identical particle SoA inputs
  * for shape orders O = 1..11 and asserts that the resulting J array is
  * identical cell-by-cell within a small floating-point tolerance.
  *
@@ -48,6 +48,37 @@ namespace {
     Kokkos::deep_copy(arr, h);
   }
 
+  // Pack the per-test SoA arrays into a ParticleArrays — the struct both
+  // deposit kernels take. Payload (pld_*) members stay default; these are
+  // 2D Cartesian Minkowski cases, so phi/i3/dx3 are present but unread.
+  ParticleArrays pack_arrays(const array_t<int*>&      i1,
+                             const array_t<int*>&      i2,
+                             const array_t<int*>&      i3,
+                             const array_t<int*>&      i1_prev,
+                             const array_t<int*>&      i2_prev,
+                             const array_t<int*>&      i3_prev,
+                             const array_t<prtldx_t*>& dx1,
+                             const array_t<prtldx_t*>& dx2,
+                             const array_t<prtldx_t*>& dx3,
+                             const array_t<prtldx_t*>& dx1_prev,
+                             const array_t<prtldx_t*>& dx2_prev,
+                             const array_t<prtldx_t*>& dx3_prev,
+                             const array_t<real_t*>&   ux1,
+                             const array_t<real_t*>&   ux2,
+                             const array_t<real_t*>&   ux3,
+                             const array_t<real_t*>&   phi,
+                             const array_t<real_t*>&   weight,
+                             const array_t<short*>&    tag) {
+    ParticleArrays pa;
+    pa.i1 = i1, pa.i2 = i2, pa.i3 = i3;
+    pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev;
+    pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3;
+    pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev;
+    pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3;
+    pa.phi = phi, pa.weight = weight, pa.tag = tag;
+    return pa;
+  }
+
   // Builds tile_offsets for a single-particle test. Particle 0 is alive
   // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead
   // sentinel and are never referenced by tile_offsets — so the tiled
@@ -68,6 +99,69 @@ namespace {
     return offsets;
   }
 
+  // Buckets ALL `n_alive` particles (slots 0..n_alive-1) into tile 0,
+  // modelling a maximally-stale tile layout: every particle was "sorted"
+  // into tile 0 but now sits anywhere in the domain (as happens when the
+  // SoA drifts / is reordered between sorts). Every particle except the
+  // few that genuinely live near the origin must therefore take the
+  // per-particle escape valve to global J.
+  array_t<npart_t*> build_tile_offsets_all_in_tile0(ncells_t total_tiles,
+                                                    npart_t  n_alive) {
+    array_t<npart_t*> offsets("tile_offsets", total_tiles + 1u);
+    auto              h = Kokkos::create_mirror_view(offsets);
+    h(0)                = static_cast<npart_t>(0);
+    for (ncells_t t = 1; t <= total_tiles; ++t) {
+      h(t) = n_alive;
+    }
+    Kokkos::deep_copy(offsets, h);
+    return offsets;
+  }
+
+  // Cell-by-cell comparison of two J fields; throws on mismatch.
+  void compare_J_fields(const ndfield_t<Dim::_2D, 3>& J_flat,
+                        const ndfield_t<Dim::_2D, 3>& J_tiled,
+                        unsigned short                O,
+                        unsigned short                T_TILE,
+                        const char*                   label) {
+    auto h_flat  = Kokkos::create_mirror_view(J_flat);
+    auto h_tiled = Kokkos::create_mirror_view(J_tiled);
+    Kokkos::deep_copy(h_flat, J_flat);
+    Kokkos::deep_copy(h_tiled, J_tiled);
+
+    const real_t eps        = static_cast<real_t>(1.0e-5);
+    real_t       max_diff   = ZERO;
+    int          fail_count = 0;
+    for (ncells_t i = 0; i < h_flat.extent(0); ++i) {
+      for (ncells_t j = 0; j < h_flat.extent(1); ++j) {
+        for (int c = 0; c < 3; ++c) {
+          const real_t a    = h_flat(i, j, c);
+          const real_t b    = h_tiled(i, j, c);
+          const real_t diff = math::fabs(a - b);
+          const real_t mag  = math::max(math::fabs(a), math::fabs(b));
+          if (diff > max_diff) {
+            max_diff = diff;
+          }
+          if (diff > eps * math::max(mag, static_cast<real_t>(1.0))) {
+            if (fail_count < 5) {
+              std::cerr << "  [" << label << "] J(" << i << "," << j
+                        << ",c=" << c << ") flat=" << a << " tiled=" << b
+                        << " diff=" << diff << '\n';
+            }
+            ++fail_count;
+          }
+        }
+      }
+    }
+    if (fail_count > 0) {
+      std::cerr << "deposit_tiled[" << label << "] FAILED for O=" << O
+                << " T_TILE=" << T_TILE << " : " << fail_count
+                << " mismatches; max_diff=" << max_diff << '\n';
+      throw std::logic_error("DepositCurrentsTiled_kernel mismatch");
+    }
+    std::cerr << "deposit_tiled[" << label << "] OK  O=" << O
+              << " T_TILE=" << T_TILE << "  max_diff=" << max_diff << '\n';
+  }
+
   template <unsigned short O, unsigned short T_TILE>
   void run_one_case() {
     using metric_t = metric::Minkowski<Dim::_2D>;
@@ -129,12 +223,12 @@ namespace {
         10,
         kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
           J_scat,
-          i1, i2, i3,
-          i1_prev, i2_prev, i3_prev,
-          dx1, dx2, dx3,
-          dx1_prev, dx2_prev, dx3_prev,
-          ux1, ux2, ux3,
-          phi, weight, tag,
+          pack_arrays(i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag),
           metric, charge, dt));
       Kokkos::Experimental::contribute(J_flat, J_scat);
       Kokkos::fence("flat deposit done");
@@ -168,15 +262,18 @@ namespace {
                                                                       tx2);
 
       using kernel_t =
-        kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC, metric_t, O, T_TILE>;
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      // npart = full slot count (10): the lone alive particle sits in slot 0
+      // and the per-tile slice clamp keeps the (dead) tail out.
       kernel_t kern { J_tiled,
-                      i1, i2, i3,
-                      i1_prev, i2_prev, i3_prev,
-                      dx1, dx2, dx3,
-                      dx1_prev, dx2_prev, dx3_prev,
-                      ux1, ux2, ux3,
-                      phi, weight, tag,
-                      metric, charge, dt, layout };
+                      pack_arrays(i1, i2, i3,
+                                  i1_prev, i2_prev, i3_prev,
+                                  dx1, dx2, dx3,
+                                  dx1_prev, dx2_prev, dx3_prev,
+                                  ux1, ux2, ux3,
+                                  phi, weight, tag),
+                      metric, charge, dt, layout,
+                      static_cast<npart_t>(10) };
 
       Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
                                   Kokkos::AUTO);
@@ -221,12 +318,158 @@ namespace {
                 << " T_TILE=" << T_TILE
                 << " : " << fail_count << " mismatches; max_diff=" << max_diff
                 << '\n';
-      throw std::logic_error("DepositCurrents_kernel_tiled mismatch");
+      throw std::logic_error("DepositCurrentsTiled_kernel mismatch");
     }
     std::cerr << "X-1 deposit_tiled OK  O=" << O << " T_TILE=" << T_TILE
               << "  max_diff=" << max_diff << '\n';
   }
 
+  // Drift / stale-layout regression test for the per-particle escape valve.
+  //
+  // A population of alive particles is spread across the whole domain
+  // (including the near-boundary cells that deposit into the ghost stripe)
+  // but the tile layout buckets them ALL into tile 0 — i.e. the layout is
+  // maximally stale w.r.t. their real positions, exactly the situation that
+  // arises when the SoA is reordered/appended between sorts. The tiled
+  // kernel must route every out-of-tile particle to the global J view and
+  // reproduce the flat deposit cell-for-cell: no charge dropped or
+  // double-counted at any drift distance. This is the property that broke
+  // when `spatial_sorting_interval > 1` left drifted particles depositing
+  // partial stencils, producing a density/E line at the decomposition
+  // boundary.
+  template <unsigned short O, unsigned short T_TILE>
+  void run_drift_case() {
+    using metric_t = metric::Minkowski<Dim::_2D>;
+    constexpr unsigned short nx1 = 50u, nx2 = 50u;
+    metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} };
+
+    constexpr int n_slots = 64;
+    constexpr int n_base  = 5;
+    const int     bases[n_base] = { 1, 13, 25, 37, 48 };
+    const int     n_alive       = n_base * n_base; // 25
+
+    array_t<int*>      i1 { "i1", n_slots }, i2 { "i2", n_slots },
+      i3 { "i3", n_slots };
+    array_t<int*>      i1_prev { "i1_prev", n_slots },
+      i2_prev { "i2_prev", n_slots }, i3_prev { "i3_prev", n_slots };
+    array_t<prtldx_t*> dx1 { "dx1", n_slots }, dx2 { "dx2", n_slots },
+      dx3 { "dx3", n_slots };
+    array_t<prtldx_t*> dx1_prev { "dx1_prev", n_slots },
+      dx2_prev { "dx2_prev", n_slots }, dx3_prev { "dx3_prev", n_slots };
+    array_t<real_t*>   ux1 { "ux1", n_slots }, ux2 { "ux2", n_slots },
+      ux3 { "ux3", n_slots };
+    array_t<real_t*>   phi { "phi", n_slots }, weight { "weight", n_slots };
+    array_t<short*>    tag { "tag", n_slots };
+    const real_t       charge = 1.0, dt = 1.0;
+
+    // Fill alive particles on host (slots >= n_alive stay zero == dead).
+    auto h_i1  = Kokkos::create_mirror_view(i1);
+    auto h_i2  = Kokkos::create_mirror_view(i2);
+    auto h_i1p = Kokkos::create_mirror_view(i1_prev);
+    auto h_i2p = Kokkos::create_mirror_view(i2_prev);
+    auto h_dx1  = Kokkos::create_mirror_view(dx1);
+    auto h_dx2  = Kokkos::create_mirror_view(dx2);
+    auto h_dx1p = Kokkos::create_mirror_view(dx1_prev);
+    auto h_dx2p = Kokkos::create_mirror_view(dx2_prev);
+    auto h_ux3  = Kokkos::create_mirror_view(ux3);
+    auto h_w    = Kokkos::create_mirror_view(weight);
+    auto h_tag  = Kokkos::create_mirror_view(tag);
+    int  p = 0;
+    for (int a = 0; a < n_base; ++a) {
+      for (int b = 0; b < n_base; ++b, ++p) {
+        h_i1p(p) = bases[a];
+        h_i1(p)  = bases[a] - 1;
+        h_i2p(p) = bases[b];
+        h_i2(p)  = bases[b] - 1;
+        h_dx1p(p) = static_cast<prtldx_t>(0.65);
+        h_dx1(p)  = static_cast<prtldx_t>(0.99);
+        h_dx2p(p) = static_cast<prtldx_t>(0.65);
+        h_dx2(p)  = static_cast<prtldx_t>(0.80);
+        h_ux3(p)  = static_cast<real_t>(2.5);
+        h_w(p)    = static_cast<real_t>(1.0);
+        h_tag(p)  = ParticleTag::alive;
+      }
+    }
+    Kokkos::deep_copy(i1, h_i1);
+    Kokkos::deep_copy(i2, h_i2);
+    Kokkos::deep_copy(i1_prev, h_i1p);
+    Kokkos::deep_copy(i2_prev, h_i2p);
+    Kokkos::deep_copy(dx1, h_dx1);
+    Kokkos::deep_copy(dx2, h_dx2);
+    Kokkos::deep_copy(dx1_prev, h_dx1p);
+    Kokkos::deep_copy(dx2_prev, h_dx2p);
+    Kokkos::deep_copy(ux3, h_ux3);
+    Kokkos::deep_copy(weight, h_w);
+    Kokkos::deep_copy(tag, h_tag);
+
+    // Flat reference over all slots (dead slots are skipped internally).
+    ndfield_t<Dim::_2D, 3> J_flat { "J_flat",
+                                    nx1 + 2u * N_GHOSTS,
+                                    nx2 + 2u * N_GHOSTS };
+    {
+      auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat);
+      Kokkos::parallel_for(
+        "FlatDepositDrift",
+        n_slots,
+        kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
+          J_scat,
+          pack_arrays(i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag),
+          metric, charge, dt));
+      Kokkos::Experimental::contribute(J_flat, J_scat);
+      Kokkos::fence("flat drift deposit done");
+    }
+
+    // Tiled with a maximally-stale layout: all alive particles in tile 0.
+    ndfield_t<Dim::_2D, 3> J_tiled { "J_tiled",
+                                     nx1 + 2u * N_GHOSTS,
+                                     nx2 + 2u * N_GHOSTS };
+    {
+      const auto ntx1 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx1) / static_cast<double>(T_TILE)));
+      const auto ntx2 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx2) / static_cast<double>(T_TILE)));
+
+      TileLayout<Dim::_2D> layout;
+      layout.ntiles_per_axis[0] = ntx1;
+      layout.ntiles_per_axis[1] = ntx2;
+      layout.ntiles_per_axis[2] = 1u;
+      layout.ntiles_total       = ntx1 * ntx2;
+      layout.tile_size          = T_TILE;
+      layout.tile_offsets       = build_tile_offsets_all_in_tile0(
+        ntx1 * ntx2,
+        static_cast<npart_t>(n_alive));
+
+      using kernel_t =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      // npart = n_alive: the stale layout buckets all alive particles into
+      // tile 0, so the team must walk [0, n_alive) and route the drifted
+      // ones to the global-J escape valve.
+      kernel_t kern { J_tiled,
+                      pack_arrays(i1, i2, i3,
+                                  i1_prev, i2_prev, i3_prev,
+                                  dx1, dx2, dx3,
+                                  dx1_prev, dx2_prev, dx3_prev,
+                                  ux1, ux2, ux3,
+                                  phi, weight, tag),
+                      metric, charge, dt, layout,
+                      static_cast<npart_t>(n_alive) };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0,
+                              Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("TiledDepositDrift", policy, kern);
+      Kokkos::fence("tiled drift deposit done");
+    }
+
+    compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift");
+  }
+
   template <unsigned short T_TILE>
   void run_all_orders() {
     run_one_case<0u, T_TILE>();
@@ -241,6 +484,20 @@ namespace {
     run_one_case<9u, T_TILE>();
     run_one_case<10u, T_TILE>();
     run_one_case<11u, T_TILE>();
+
+    // Stale-layout / drift regression (per-particle escape valve).
+    run_drift_case<0u, T_TILE>();
+    run_drift_case<1u, T_TILE>();
+    run_drift_case<2u, T_TILE>();
+    run_drift_case<3u, T_TILE>();
+    run_drift_case<4u, T_TILE>();
+    run_drift_case<5u, T_TILE>();
+    run_drift_case<6u, T_TILE>();
+    run_drift_case<7u, T_TILE>();
+    run_drift_case<8u, T_TILE>();
+    run_drift_case<9u, T_TILE>();
+    run_drift_case<10u, T_TILE>();
+    run_drift_case<11u, T_TILE>();
   }
 
 } // namespace

From 4dfaba918443fb6dc374f2d6c661c5836e5742dc Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Fri, 19 Jun 2026 22:36:26 -0400
Subject: [PATCH 12/33] replaced compile-time sort intervale with compile-time
 drift halo size

---
 CMakeLists.txt                          | 40 ++++++++++++++-----------
 cmake/defaults.cmake                    |  6 ++--
 cmake/report.cmake                      | 24 +++++++--------
 src/engines/srpic/currents.h            |  2 +-
 src/framework/parameters/parameters.cpp |  7 -----
 src/framework/parameters/particles.cpp  |  9 ------
 src/kernels/currents_deposit.hpp        | 39 ++++++++++++------------
 7 files changed, 56 insertions(+), 71 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7bea74f0..f8409209b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,10 +67,10 @@ set(team_policy_tile_size
 set(team_policy_tile_sizes
   "4;6;8;10;12;14;16"
   CACHE STRING "team_policy tile-size choices")
-set(team_policy_sort
-  ${default_team_policy_sort}
+set(team_policy_drift
+  ${default_team_policy_drift}
   CACHE STRING
-  "team_policy hardwired spatial sorting interval; >0 overrides the runtime spatial_sorting_interval and sizes the tiled deposit scratch halo (0 = use runtime)")
+  "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.")
 
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
@@ -161,14 +161,13 @@ if(${team_policy})
   add_compile_options("-D TEAM_POLICY")
   add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")
 
-  # Optional compile-time hardwired sort interval. When > 0, it (a) overrides
-  # the runtime spatial_sorting_interval (see framework/parameters) and (b)
-  # sizes the tiled deposit scratch halo to DRIFT = interval, so a particle
-  # drifting over a full sort interval still deposits inside its tile scratch.
-  if(team_policy_sort GREATER 0)
-    add_compile_options(
-      "-D TEAM_POLICY_SORT_INTERVAL=${team_policy_sort}")
-  endif()
+  # Compile-time tiled-deposit scratch halo drift. Sizes the halo so a
+  # particle that drifts up to DRIFT cells between two sorts still deposits
+  # inside its tile scratch; particles drifting further take the
+  # per-particle global-J escape valve (correct, only slower). This is
+  # independent of the sort cadence, which is set at runtime via
+  # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case).
+  add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}")
 
   # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
   # when found; falls back to Kokkos::BinSort otherwise.
@@ -196,18 +195,23 @@ if(${team_policy})
   endif()
 
   if("${Kokkos_DEVICES}" MATCHES "HIP")
-    # rocThrust ships with ROCm and exposes the same thrust:: API. Using
-    # it lets the HIP backend build a single permutation via
-    # sort_by_key and gather all SoA members through one reused buffer,
-    # instead of the legacy per-member Kokkos::BinSort path which
-    # allocates a fresh `sorted_values` buffer for every member every
-    # step (the dominant source of allocator churn / fragmentation on
-    # ROCm).
+    # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's
+    # bounded-bit radix sort directly (rocprim is rocThrust's own
+    # dependency, so its headers come in transitively; we find it
+    # explicitly to keep the include path robust). This builds a single
+    # permutation that gathers all SoA members, instead of the legacy
+    # per-member Kokkos::BinSort path which allocates a fresh
+    # `sorted_values` buffer for every member every step (the dominant
+    # source of allocator churn / fragmentation on ROCm).
     find_package(rocthrust QUIET)
     if(rocthrust_FOUND)
       message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
       add_compile_options("-D ROCTHRUST_ENABLED")
       set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
+      find_package(rocprim QUIET)
+      if(rocprim_FOUND)
+        set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim)
+      endif()
     else()
       message(STATUS "team_policy: rocThrust not found; using BinSort "
                      "fallback for HIP sort_by_key")
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index 619f48dc1..01427921f 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -108,7 +108,7 @@ set(default_team_policy_tile_size
     8
     CACHE INTERNAL "Default tile edge length in cells for team_policy")
 
-set(default_team_policy_sort
-    0
+set(default_team_policy_drift
+    1
     CACHE INTERNAL
-      "Default hardwired spatial sorting interval for team_policy (0 = runtime)")
+      "Default tiled-deposit scratch halo drift for team_policy (cells between sorts)")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index e6036d366..d90dfb085 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -141,17 +141,15 @@ if(${team_policy})
     "${Blue}"
     TEAM_POLICY_TILE_SIZE_REPORT
     46)
-  if(team_policy_sort GREATER 0)
-    printchoices(
-      "Team Sort Interval"
-      "team_policy_sort"
-      "${team_policy_sort}"
-      ${team_policy_sort}
-      0
-      "${Blue}"
-      TEAM_POLICY_SORT_INTERVAL_REPORT
-      46)
-  endif()
+  printchoices(
+    "Team Deposit Drift"
+    "team_policy_drift"
+    "${team_policy_drift}"
+    ${team_policy_drift}
+    1
+    "${Blue}"
+    TEAM_POLICY_DRIFT_REPORT
+    46)
 endif()
 printchoices(
   "Debug mode"
@@ -231,9 +229,7 @@ endif()
 string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
 if(${team_policy})
   string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
-  if(team_policy_sort GREATER 0)
-    string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_SORT_INTERVAL_REPORT} "\n")
-  endif()
+  string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_DRIFT_REPORT} "\n")
 endif()
 
 string(
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index eadda70b9..b8cfe63ab 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -128,7 +128,7 @@ namespace ntt {
       // partition per-particle:
       //   - a particle whose full stencil has drifted out of its tile is
       //     deposited straight to the global J view (the per-particle escape
-      //     valve); `team_policy_sort_interval` sizes the scratch halo so the
+      //     valve); `team_policy_drift` sizes the scratch halo so the
       //     common in-tile case stays in fast SLM (see currents_deposit.hpp);
       //   - particles dead-tagged in place since the sort are clamped out by
       //     the kernel and skipped by the dead-tag test;
diff --git a/src/framework/parameters/parameters.cpp b/src/framework/parameters/parameters.cpp
index 4116783a1..7372da510 100644
--- a/src/framework/parameters/parameters.cpp
+++ b/src/framework/parameters/parameters.cpp
@@ -67,18 +67,11 @@ namespace ntt {
       "clear_interval",
       defaults::clear_interval);
     set("particles.clear_interval", global_clearing_interval);
-#if defined(TEAM_POLICY_SORT_INTERVAL)
-    // See particles.cpp: the compile-time team_policy_sort_interval overrides
-    // the runtime value (kept consistent here for the stored global param).
-    const auto global_spatial_sorting_interval = static_cast<timestep_t>(
-      TEAM_POLICY_SORT_INTERVAL);
-#else
     const auto global_spatial_sorting_interval = toml::find_or<timestep_t>(
       toml_data,
       "particles",
       "spatial_sorting_interval",
       0u);
-#endif
     set("particles.spatial_sorting_interval", global_spatial_sorting_interval);
 
     set("scales.n0", ppc0 / get<real_t>("scales.V0"));
diff --git a/src/framework/parameters/particles.cpp b/src/framework/parameters/particles.cpp
index 46192035b..6f1a23c03 100644
--- a/src/framework/parameters/particles.cpp
+++ b/src/framework/parameters/particles.cpp
@@ -121,19 +121,10 @@ namespace ntt {
         sp,
         "clear_interval",
         global_clearing_interval);
-#if defined(TEAM_POLICY_SORT_INTERVAL)
-      // Compile-time hardwired sort interval (the `team_policy_sort_interval`
-      // CMake knob). It overrides whatever the input file requested so the
-      // tiled deposit's scratch halo — sized for exactly this cadence — is
-      // guaranteed to contain every particle's drift between sorts.
-      const auto spatial_sorting_interval = static_cast<timestep_t>(
-        TEAM_POLICY_SORT_INTERVAL);
-#else
       const auto spatial_sorting_interval = toml::find_or<timestep_t>(
         sp,
         "spatial_sorting_interval",
         global_spatial_sorting_interval);
-#endif
       auto pusher_str = toml::find_or(sp, "pusher", std::string(def_pusher));
       const auto npayloads_real = toml::find_or(sp,
                                                 "n_payloads_real",
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 8840a1046..b92b02423 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -786,11 +786,12 @@ namespace kernel {
    * per step elapsed since the last sort. The scratch HALO is
    * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag
    * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with
-   * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the
-   * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL)
-   * when set — the hardwired sort interval, hence the maximum drift any
-   * particle accrues between sorts — and `1` otherwise (the
-   * every-step-sorted common case).
+   * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the `team_policy_drift`
+   * CMake knob (macro TEAM_POLICY_DRIFT) — the number of cells a particle
+   * may drift between two sorts that the halo is sized to absorb — and `1`
+   * by default (the every-step-sorted common case). It is independent of
+   * the sort cadence, which is set at runtime via `spatial_sorting_interval`;
+   * particles that drift past the halo take the escape valve below.
    *
    * Correctness does **not** depend on the halo size. Any particle whose
    * full stencil escapes the scratch tile — because it drifted further
@@ -801,9 +802,9 @@ namespace kernel {
    * valve). Each particle's stencil is therefore deposited exactly once
    * (entirely to SLM scratch when it fits, entirely to global J when it
    * does not), so the path is charge-conserving; it is merely slower per
-   * write. Sizing `DRIFT` to the sort interval keeps the common,
-   * within-interval drift in fast SLM; sorting less often only costs
-   * escape-valve traffic, never accuracy.
+   * write. Sizing `DRIFT` to the typical between-sort drift keeps the
+   * common case in fast SLM; sorting less often (or drifting past the
+   * halo) only costs escape-valve traffic, never accuracy.
    *
    * **Partition coverage.** The team iteration covers only the particles
    * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped
@@ -836,15 +837,15 @@ namespace kernel {
      *
      * drift — sort runs at end-of-step (see srpic.hpp), so a particle is
      * pushed once per step between its last sort and a given deposit. With
-     * a sort interval of `K`, a particle therefore drifts at most `K` cells
-     * (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) before the next sort. The
-     * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL)
-     * pins that interval at compile time and feeds it here as DRIFT, sizing
-     * the halo so a fully-interval-drifted particle still deposits inside
-     * its tile scratch. When the knob is unset, DRIFT defaults to 1 (the
-     * sorted-every-step common case); any particle that drifts past the halo
-     * (e.g. a larger runtime interval, or a CFL excursion) takes the
-     * per-particle global-J escape valve below — correct, only slower (see
+     * a runtime sort interval of `K` (spatial_sorting_interval), a particle
+     * drifts at most `K` cells (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step)
+     * before the next sort. The `team_policy_drift` CMake knob (macro
+     * TEAM_POLICY_DRIFT) sets DRIFT independently of `K`, sizing the halo so
+     * a particle that drifts up to DRIFT cells still deposits inside its
+     * tile scratch. DRIFT defaults to 1 (the sorted-every-step common case);
+     * any particle that drifts past the halo (e.g. a larger sort interval,
+     * or a CFL excursion) takes the per-particle global-J escape valve
+     * below — correct, only slower (see
      * the class doc-comment for why this is charge-conserving).
      */
     static constexpr int STENCIL_REACH   = (O == 0u) ? 2 : static_cast<int>(O);
@@ -854,8 +855,8 @@ namespace kernel {
     // coords conservatively bounds every deposited cell for any order
     // (Esirkepov reaches max+O; O=0 zigzag reaches max+1).
     static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast<int>(O);
-#if defined(TEAM_POLICY_SORT_INTERVAL)
-    static constexpr int DRIFT = static_cast<int>(TEAM_POLICY_SORT_INTERVAL);
+#if defined(TEAM_POLICY_DRIFT)
+    static constexpr int DRIFT = static_cast<int>(TEAM_POLICY_DRIFT);
 #else
     static constexpr int DRIFT = 1;
 #endif

From cd1be6b383839fb72a31e078c3da5a59f0246d71 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Fri, 19 Jun 2026 22:37:08 -0400
Subject: [PATCH 13/33] AMD-specific sorting improvements

---
 src/framework/containers/particles.h        |  12 ++
 src/framework/containers/particles_sort.cpp | 128 +++++++------
 src/global/utils/sort_dispatch.h            | 154 ++++++++++++---
 tests/framework/particles_sort.cpp          | 196 ++++++++++++++------
 tests/framework/sort_by_key.cpp             |   3 +
 5 files changed, 353 insertions(+), 140 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 0877efd15..511a07a62 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -99,6 +99,18 @@ namespace ntt {
     // vendor libraries detected by CMake.
     TileLayout<D> m_tile_layout {};
 
+#if defined(TEAM_POLICY)
+    // Build m_tile_layout.tile_offsets / npart_partitioned from the
+    // already-sorted tile-index keys. A separate member function (not a
+    // lambda local to SortSpatially) so the inner device kernel is not an
+    // extended __device__ lambda nested inside another lambda — which
+    // nvcc forbids. Lets the vendor path run the offsets pass and then
+    // release the keys before the SoA gather allocates its buffers.
+    void compute_tile_offsets(const array_t<ncells_t*>& tile_indices,
+                              ncells_t                  total_tiles,
+                              npart_t                   npart_local);
+#endif
+
   public:
     // for empty allocation
     Particles() {}
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index a52b013f7..3db6c1438 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -205,6 +205,61 @@ namespace ntt {
     m_is_sorted = true;
   }
 
+#if defined(TEAM_POLICY)
+  template <Dimension D, Coord::type C>
+  void Particles<D, C>::compute_tile_offsets(
+    const array_t<ncells_t*>& tile_indices,
+    ncells_t                  total_tiles,
+    npart_t                   npart_local) {
+    // Compute the per-tile prefix-sum `tile_offsets` for the tiled
+    // pusher from the (already sorted) `tile_indices` — monotonically
+    // non-decreasing for alive particles, with the dead sentinel
+    // `total_tiles + 1` clustered at the end. Transition-detect directly
+    // on it: the start of each non-empty tile is the only place a write
+    // happens — atomic-free in the dense branch. Empty tiles (no
+    // particles) are filled by a reverse pass on a small host mirror
+    // (`total_tiles ≈ 176K` at production scale → ~700 KB).
+    array_t<npart_t*> tile_offsets { "tile_offsets", total_tiles + 1u };
+    Kokkos::deep_copy(tile_offsets, static_cast<npart_t>(npart_local));
+
+    const auto total_tiles_v = total_tiles;
+    auto       ti_v          = tile_indices;
+    Kokkos::parallel_for(
+      "DetectTileBoundaries",
+      CreateParticleRangePolicy<Dim::_1D>({ 0u }, { npart_local }),
+      Lambda(prtlidx_t p) {
+        const auto t_curr   = ti_v(p);
+        const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr);
+        if (!boundary) {
+          return;
+        }
+        if (t_curr < total_tiles_v) {
+          tile_offsets(t_curr) = p;
+        } else {
+          // First dead particle — also marks the alive_count boundary
+          // stored at index total_tiles.
+          Kokkos::atomic_min(&tile_offsets(total_tiles_v), p);
+        }
+      });
+
+    auto h_offsets = Kokkos::create_mirror_view(tile_offsets);
+    Kokkos::deep_copy(h_offsets, tile_offsets);
+    for (auto t = static_cast<std::size_t>(total_tiles); t-- > 0u;) {
+      if (h_offsets(t) > h_offsets(t + 1u)) {
+        h_offsets(t) = h_offsets(t + 1u);
+      }
+    }
+    Kokkos::deep_copy(tile_offsets, h_offsets);
+
+    m_tile_layout.tile_offsets = tile_offsets;
+    // tile_offsets(total_tiles) is the alive-particle count at sort time:
+    // the tiles partition exactly [0, npart_partitioned). The deposit
+    // launcher compares this against the live npart() to detect (and
+    // separately deposit) particles appended since this sort.
+    m_tile_layout.npart_partitioned = h_offsets(total_tiles);
+  }
+#endif // TEAM_POLICY
+
   template <Dimension D, Coord::type C>
   void Particles<D, C>::SortSpatially(const Grid<D>& grid) {
 #if defined(TEAM_POLICY)
@@ -262,6 +317,7 @@ namespace ntt {
     //    the dead-particle sentinel bin (total_tiles + 1u).
     const ncells_t n_bins = total_tiles + 2u;
     const auto     slice  = prtl_slice_t(0, npart_local);
+
   #if defined(TEAM_POLICY_USE_VENDOR_SORT)
     // Vendor path: produce an explicit permutation via sort_by_key,
     // then apply it to each SoA member by gathering into a fresh
@@ -286,6 +342,12 @@ namespace ntt {
                                        n_bins,
                                        sort::backend::Thrust {});
     #endif
+    // `tile_indices` is sorted in place by sort_by_key. Build the tile
+    // offsets from it now, then drop it before the gather allocates its
+    // `maxnpart`-sized buffers — so the keys are not co-resident with
+    // them at the gather's peak (#2).
+    compute_tile_offsets(tile_indices, total_tiles, npart_local);
+    tile_indices = array_t<ncells_t*> {};
     Kokkos::fence("SortSpatially: pre-gather drain");
     apply_permutation_to_soa(perm);
   #else
@@ -330,67 +392,17 @@ namespace ntt {
       sorter.sort(Kokkos::subview(pld_i, slice, pldi));
     }
     // Apply the same permutation to `tile_indices` itself so it ends
-    // monotonically non-decreasing for the offsets pass below.
+    // monotonically non-decreasing for the offsets pass, then build the
+    // tile offsets from it (the in-place BinSort path has no separate
+    // gather to hoist this ahead of).
     sorter.sort(tile_indices);
+    compute_tile_offsets(tile_indices, total_tiles, npart_local);
   #endif // TEAM_POLICY_USE_VENDOR_SORT
 
-    // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled
-    //    pusher. `tile_indices` is now sorted (monotonically
-    //    non-decreasing for alive particles, dead sentinel
-    //    `total_tiles + 1` clustered at the end) — vendor sort_by_key
-    //    sorts keys in place; the BinSort path explicitly applies the
-    //    same permutation to `tile_indices` above. Transition-detect
-    //    directly on it: the start of each non-empty tile is the only
-    //    place a write happens — atomic-free in the dense branch.
-    //    Empty tiles (no particles) are filled by a reverse pass on a
-    //    small host mirror (`total_tiles ≈ 176K` at production scale →
-    //    ~700 KB).
-    {
-      array_t<npart_t*> tile_offsets { "tile_offsets", total_tiles + 1u };
-      Kokkos::deep_copy(tile_offsets, static_cast<npart_t>(npart_local));
-
-      const auto total_tiles_v = total_tiles;
-      auto       ti_v          = tile_indices;
-      Kokkos::parallel_for(
-        "DetectTileBoundaries",
-        rangeActiveParticles(),
-        Lambda(prtlidx_t p) {
-          const auto t_curr   = ti_v(p);
-          const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr);
-          if (!boundary) {
-            return;
-          }
-          if (t_curr < total_tiles_v) {
-            tile_offsets(t_curr) = p;
-          } else {
-            // First dead particle — also marks the alive_count boundary
-            // stored at index total_tiles.
-            Kokkos::atomic_min(&tile_offsets(total_tiles_v), p);
-          }
-        });
-
-      auto h_offsets = Kokkos::create_mirror_view(tile_offsets);
-      Kokkos::deep_copy(h_offsets, tile_offsets);
-      for (auto t = static_cast<std::size_t>(total_tiles); t-- > 0u;) {
-        if (h_offsets(t) > h_offsets(t + 1u)) {
-          h_offsets(t) = h_offsets(t + 1u);
-        }
-      }
-      Kokkos::deep_copy(tile_offsets, h_offsets);
-
-      m_tile_layout.tile_offsets = tile_offsets;
-      // tile_offsets(total_tiles) is the alive-particle count at sort time:
-      // the tiles partition exactly [0, npart_partitioned). The deposit
-      // launcher compares this against the live npart() to detect (and
-      // separately deposit) particles appended since this sort.
-      m_tile_layout.npart_partitioned = h_offsets(total_tiles);
-    }
-
-    // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used
-    //    in the current design — the SoA arrays are physically permuted
-    //    into tile order, so consumers iterate
-    //    `[tile_offsets(t), tile_offsets(t+1))` directly without a
-    //    separate permutation indirection.
+    // Populate `m_tile_layout` size/shape. `tile_perm` is not used in the
+    // current design — the SoA arrays are physically permuted into tile
+    // order, so consumers iterate `[tile_offsets(t), tile_offsets(t+1))`
+    // directly without a separate permutation indirection.
     m_tile_layout.ntiles_per_axis[0] = ntx[0];
     m_tile_layout.ntiles_per_axis[1] = ntx[1];
     m_tile_layout.ntiles_per_axis[2] = ntx[2];
diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
index 506a90742..cfc389464 100644
--- a/src/global/utils/sort_dispatch.h
+++ b/src/global/utils/sort_dispatch.h
@@ -14,8 +14,13 @@
  *       keys[perm[0]] <= keys[perm[1]] <= ... in stable order.
  *       Always-available overloads: BinSort (uses Kokkos::BinSort) and
  *       StdSort (host-side std::stable_sort fallback). The vendor-library
- *       overloads (OneDPL on SYCL, Thrust on CUDA) are conditional on the
- *       respective build flags.
+ *       overloads (OneDPL on SYCL, cub radix sort on CUDA, rocprim radix
+ *       sort on HIP) are conditional on the respective build flags.
+ * @note The CUDA/HIP overloads bound the radix sort to the significant
+ *       key bits (`significant_bits(n_bins)`) instead of the full 32, so
+ *       only ceil(log2(n_bins)) bits are sorted — fewer radix passes than
+ *       a full-width `thrust::sort_by_key`. Scratch is transient (freed at
+ *       scope exit); no persistent buffer is retained (cf. 787aa045).
  */
 
 #ifndef GLOBAL_UTILS_SORT_DISPATCH_H
@@ -38,22 +43,32 @@
   #include <oneapi/dpl/execution>
 #endif
 #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
-  #include <thrust/device_ptr.h>
-  #include <thrust/sequence.h>
-  #include <thrust/sort.h>
+  #include <cub/device/device_radix_sort.cuh>
 #endif
 #if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
-  #include <thrust/device_ptr.h>
-  #include <thrust/execution_policy.h>
-  #include <thrust/sequence.h>
-  #include <thrust/sort.h>
+  #include <rocprim/rocprim.hpp>
 #endif
 
 #include <algorithm>
+#include <cstdint>
 #include <numeric>
 
 namespace ntt::sort_helpers {
 
+  // Number of low-order bits needed to represent keys in [0, n_bins).
+  // The radix sort only needs to scan these bits — bounding `end_bit` to
+  // ceil(log2(n_bins)) instead of 32 cuts the number of passes (e.g. 18
+  // bits when total_tiles ~ 176K). Returns at least 1.
+  inline unsigned int significant_bits(ncells_t n_bins) {
+    unsigned int bits = 0u;
+    while (bits < 32u &&
+           (static_cast<std::uint64_t>(1u) << bits) <
+             static_cast<std::uint64_t>(n_bins)) {
+      ++bits;
+    }
+    return (bits == 0u) ? 1u : bits;
+  }
+
   // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an
   // upper bound on distinct key values.
   inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
@@ -109,39 +124,126 @@ namespace ntt::sort_helpers {
 #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
   inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
                                    prtl_perm_t&              perm,
-                                   ncells_t /*n_bins*/,
+                                   ncells_t                  n_bins,
                                    ::sort::backend::Thrust) {
     const auto n = static_cast<npart_t>(keys.extent(0));
     if (n == 0u) {
       return;
     }
-    Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort");
-    thrust::device_ptr<ncells_t> kp(keys.data());
-    thrust::device_ptr<npart_t>  pp(perm.data());
-    thrust::sequence(pp, pp + n);
-    thrust::sort_by_key(kp, kp + n, pp);
-    Kokkos::fence("sort_by_key_dispatch Thrust: post-sort");
+    auto exec   = Kokkos::DefaultExecutionSpace();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+
+    // Out-of-place radix sort bounded to the significant key bits. The
+    // _out buffers and temp storage are transient (freed at scope exit).
+    array_t<ncells_t*> keys_out("tile_keys_sorted", n);
+    prtl_perm_t        perm_out("tile_perm_sorted", n);
+    const int          end_bit = static_cast<int>(significant_bits(n_bins));
+
+    exec.fence("sort_by_key_dispatch Thrust: pre-sort");
+    auto stream = exec.cuda_stream();
+
+    std::size_t temp_bytes = 0;
+    auto        err = cub::DeviceRadixSort::SortPairs(nullptr,
+                                              temp_bytes,
+                                              keys.data(),
+                                              keys_out.data(),
+                                              perm.data(),
+                                              perm_out.data(),
+                                              n,
+                                              0,
+                                              end_bit,
+                                              stream);
+    raise::ErrorIf(err != cudaSuccess,
+                   "cub::DeviceRadixSort::SortPairs (size query) failed",
+                   HERE);
+    array_t<char*> temp("cub_radix_temp",
+                        (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
+    err = cub::DeviceRadixSort::SortPairs(temp.data(),
+                                          temp_bytes,
+                                          keys.data(),
+                                          keys_out.data(),
+                                          perm.data(),
+                                          perm_out.data(),
+                                          n,
+                                          0,
+                                          end_bit,
+                                          stream);
+    raise::ErrorIf(err != cudaSuccess,
+                   "cub::DeviceRadixSort::SortPairs failed",
+                   HERE);
+    exec.fence("sort_by_key_dispatch Thrust: post-sort");
+
+    // Publish sorted keys back in place; swap the sorted permutation in.
+    auto keys_nc = keys; // non-const handle aliasing the same storage
+    Kokkos::deep_copy(keys_nc, keys_out);
+    perm = perm_out;
   }
 #endif
 
 #if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
-  // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc
-  // device_ptr-based algorithms dispatch to the HIP backend. Mirrors
-  // the CUDA Thrust overload.
+  // HIP analogue of the CUDA cub overload, using rocprim's radix sort
+  // (which ships with rocThrust). Same bounded-bit, out-of-place,
+  // transient-scratch scheme.
   inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
                                    prtl_perm_t&              perm,
-                                   ncells_t /*n_bins*/,
+                                   ncells_t                  n_bins,
                                    ::sort::backend::Rocthrust) {
     const auto n = static_cast<npart_t>(keys.extent(0));
     if (n == 0u) {
       return;
     }
-    Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort");
-    thrust::device_ptr<ncells_t> kp(keys.data());
-    thrust::device_ptr<npart_t>  pp(perm.data());
-    thrust::sequence(pp, pp + n);
-    thrust::sort_by_key(kp, kp + n, pp);
-    Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort");
+    auto exec   = Kokkos::DefaultExecutionSpace();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+
+    array_t<ncells_t*>  keys_out("tile_keys_sorted", n);
+    prtl_perm_t         perm_out("tile_perm_sorted", n);
+    const unsigned int  end_bit = significant_bits(n_bins);
+
+    exec.fence("sort_by_key_dispatch Rocthrust: pre-sort");
+    auto stream = exec.hip_stream();
+
+    std::size_t temp_bytes = 0;
+    auto        err = rocprim::radix_sort_pairs(nullptr,
+                                         temp_bytes,
+                                         keys.data(),
+                                         keys_out.data(),
+                                         perm.data(),
+                                         perm_out.data(),
+                                         static_cast<std::size_t>(n),
+                                         0u,
+                                         end_bit,
+                                         stream);
+    raise::ErrorIf(err != hipSuccess,
+                   "rocprim::radix_sort_pairs (size query) failed",
+                   HERE);
+    array_t<char*> temp("rocprim_radix_temp",
+                        (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
+    err = rocprim::radix_sort_pairs(temp.data(),
+                                    temp_bytes,
+                                    keys.data(),
+                                    keys_out.data(),
+                                    perm.data(),
+                                    perm_out.data(),
+                                    static_cast<std::size_t>(n),
+                                    0u,
+                                    end_bit,
+                                    stream);
+    raise::ErrorIf(err != hipSuccess,
+                   "rocprim::radix_sort_pairs failed",
+                   HERE);
+    exec.fence("sort_by_key_dispatch Rocthrust: post-sort");
+
+    auto keys_nc = keys; // non-const handle aliasing the same storage
+    Kokkos::deep_copy(keys_nc, keys_out);
+    perm = perm_out;
   }
 #endif
 
diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp
index 6945c962f..4e4881d98 100644
--- a/tests/framework/particles_sort.cpp
+++ b/tests/framework/particles_sort.cpp
@@ -31,12 +31,14 @@ auto main(int argc, char* argv[]) -> int {
         ntt::EmissionType::NONE,
         2u,
         1u);
-      auto& i1_p     = prtls.i1;
-      auto& i2_p     = prtls.i2;
-      auto& tag_p    = prtls.tag;
-      auto& weight_p = prtls.weight;
-      auto& pld_r    = prtls.pld_r;
-      auto& pld_i    = prtls.pld_i;
+      auto& i1_p      = prtls.i1;
+      auto& i2_p      = prtls.i2;
+      auto& i1_prev_p = prtls.i1_prev;
+      auto& i2_prev_p = prtls.i2_prev;
+      auto& tag_p     = prtls.tag;
+      auto& weight_p  = prtls.weight;
+      auto& pld_r     = prtls.pld_r;
+      auto& pld_i     = prtls.pld_i;
       Kokkos::parallel_for(
         "InitParticles",
         prtls.maxnpart(),
@@ -61,9 +63,14 @@ auto main(int argc, char* argv[]) -> int {
               i2_p(p)     = 23u;
               weight_p(p) = 3.0;
             }
-            pld_r(p, 0) = weight_p(p) + static_cast<real_t>(0.5);
-            pld_r(p, 1) = weight_p(p) + static_cast<real_t>(10.5);
-            pld_i(p, 0) = static_cast<npart_t>(weight_p(p) + 10.0);
+            // team_policy keys on min(i, i_prev); without a meaningful
+            // i_prev every key would collapse to 0. Set i_prev = i so the
+            // tile key reduces to the particle's current cell.
+            i1_prev_p(p) = i1_p(p);
+            i2_prev_p(p) = i2_p(p);
+            pld_r(p, 0)  = weight_p(p) + static_cast<real_t>(0.5);
+            pld_r(p, 1)  = weight_p(p) + static_cast<real_t>(10.5);
+            pld_i(p, 0)  = static_cast<npart_t>(weight_p(p) + 10.0);
           } else {
             tag_p(p) = ntt::ParticleTag::dead;
           }
@@ -75,43 +82,78 @@ auto main(int argc, char* argv[]) -> int {
 
       prtls.SortSpatially(grid);
 
+      auto i1_h     = Kokkos::create_mirror_view(prtls.i1);
+      auto i2_h     = Kokkos::create_mirror_view(prtls.i2);
+      auto tag_h    = Kokkos::create_mirror_view(prtls.tag);
       auto weight_h = Kokkos::create_mirror_view(prtls.weight);
       auto pld_r_h  = Kokkos::create_mirror_view(prtls.pld_r);
       auto pld_i_h  = Kokkos::create_mirror_view(prtls.pld_i);
+      Kokkos::deep_copy(i1_h, prtls.i1);
+      Kokkos::deep_copy(i2_h, prtls.i2);
+      Kokkos::deep_copy(tag_h, prtls.tag);
       Kokkos::deep_copy(weight_h, prtls.weight);
       Kokkos::deep_copy(pld_r_h, prtls.pld_r);
       Kokkos::deep_copy(pld_i_h, prtls.pld_i);
 
-      // Only [0, npart) is defined after a sort. The swap-based gather in
-      // apply_permutation_to_soa replaces each SoA View, zero-filling the
-      // spare capacity [npart, maxnpart) (a don't-care region overwritten
-      // by injection), so the old "tail preserved" check no longer holds.
+      // Tile geometry, mirroring sort::PositionToTileIndex. T = 1 (no
+      // team_policy) reproduces the legacy per-cell ordering.
+#if defined(TEAM_POLICY)
+      const ncells_t T = static_cast<ncells_t>(TEAM_POLICY_TILE_SIZE);
+#else
+      const ncells_t T = 1u;
+#endif
+      const auto     na   = grid.n_active();
+      const ncells_t ntx2 = (na[1] + T - 1u) / T;
+      const auto     tile_of = [&](int a, int b) -> ncells_t {
+        return (static_cast<ncells_t>(a) / T) * ntx2 +
+               (static_cast<ncells_t>(b) / T);
+      };
+
+      // SortSpatially is order-by-tile, not order-by-cell: assert the
+      // invariants that hold for any tile size rather than a hardwired
+      // permutation. (1) alive particles form a prefix sorted by
+      // non-decreasing tile index; (2) dead particles (weight == -1) form
+      // the suffix; (3) every SoA member is permuted by the *same*
+      // permutation, so each alive slot still satisfies pld == f(weight);
+      // (4) no alive particle is lost. Only [0, npart) is defined after a
+      // sort — the swap-gather zero-fills [npart, maxnpart).
+      bool     seen_dead   = false;
+      bool     have_prev   = false;
+      ncells_t prev_tile   = 0u;
+      npart_t  n_alive_obs = 0u;
       for (auto p { 0u }; p < 66u; ++p) {
-        if (p < 16u) {
-          raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE);
-        } else if (p < 33u) {
-          raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE);
-        } else if (p < 46u) {
-          raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE);
-        } else if (p < 59u) {
-          raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE);
-        } else {
-          raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE);
-        }
-        if (p < 59u) {
-          raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast<real_t>(0.5),
-                         "error in sorting particle real payload 0",
-                         HERE);
-          raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast<real_t>(10.5),
-                         "error in sorting particle real payload 1",
+        if (tag_h(p) != ntt::ParticleTag::alive) {
+          seen_dead = true;
+          raise::ErrorIf(weight_h(p) != -1.0,
+                         "dead particle has unexpected weight",
                          HERE);
-          raise::ErrorIf(
-            pld_i_h(p, 0) !=
-              static_cast<npart_t>(weight_h(p) + static_cast<real_t>(10.0)),
-            "error in sorting particle integer payload 0",
-            HERE);
+          continue;
         }
+        raise::ErrorIf(seen_dead,
+                       "alive particle after a dead one (not sorted to prefix)",
+                       HERE);
+        const auto tile = tile_of(i1_h(p), i2_h(p));
+        raise::ErrorIf(have_prev && (tile < prev_tile),
+                       "alive particles not sorted by tile index",
+                       HERE);
+        prev_tile = tile;
+        have_prev = true;
+        ++n_alive_obs;
+        raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast<real_t>(0.5),
+                       "error in sorting particle real payload 0",
+                       HERE);
+        raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast<real_t>(10.5),
+                       "error in sorting particle real payload 1",
+                       HERE);
+        raise::ErrorIf(
+          pld_i_h(p, 0) !=
+            static_cast<npart_t>(weight_h(p) + static_cast<real_t>(10.0)),
+          "error in sorting particle integer payload 0",
+          HERE);
       }
+      raise::ErrorIf(n_alive_obs != 59u,
+                     "wrong number of alive particles after sort",
+                     HERE);
     }
     {
       // 3D
@@ -133,11 +175,14 @@ auto main(int argc, char* argv[]) -> int {
         ntt::EmissionType::NONE,
         0u,
         0u);
-      auto& i1_p     = prtls.i1;
-      auto& i2_p     = prtls.i2;
-      auto& i3_p     = prtls.i3;
-      auto& tag_p    = prtls.tag;
-      auto& weight_p = prtls.weight;
+      auto& i1_p      = prtls.i1;
+      auto& i2_p      = prtls.i2;
+      auto& i3_p      = prtls.i3;
+      auto& i1_prev_p = prtls.i1_prev;
+      auto& i2_prev_p = prtls.i2_prev;
+      auto& i3_prev_p = prtls.i3_prev;
+      auto& tag_p     = prtls.tag;
+      auto& weight_p  = prtls.weight;
       Kokkos::parallel_for(
         "InitParticles",
         prtls.maxnpart(),
@@ -171,6 +216,11 @@ auto main(int argc, char* argv[]) -> int {
               i3_p(p)     = 7u;
               weight_p(p) = 4.0;
             }
+            // see 2D block: i_prev = i so the team_policy tile key reduces
+            // to the particle's current cell.
+            i1_prev_p(p) = i1_p(p);
+            i2_prev_p(p) = i2_p(p);
+            i3_prev_p(p) = i3_p(p);
           } else {
             tag_p(p) = ntt::ParticleTag::dead;
           }
@@ -182,27 +232,61 @@ auto main(int argc, char* argv[]) -> int {
 
       prtls.SortSpatially(grid);
 
+      auto i1_h     = Kokkos::create_mirror_view(prtls.i1);
+      auto i2_h     = Kokkos::create_mirror_view(prtls.i2);
+      auto i3_h     = Kokkos::create_mirror_view(prtls.i3);
+      auto tag_h    = Kokkos::create_mirror_view(prtls.tag);
       auto weight_h = Kokkos::create_mirror_view(prtls.weight);
+      Kokkos::deep_copy(i1_h, prtls.i1);
+      Kokkos::deep_copy(i2_h, prtls.i2);
+      Kokkos::deep_copy(i3_h, prtls.i3);
+      Kokkos::deep_copy(tag_h, prtls.tag);
       Kokkos::deep_copy(weight_h, prtls.weight);
-      // Only [0, npart) is defined after a sort. The swap-based gather in
-      // apply_permutation_to_soa replaces each SoA View, zero-filling the
-      // spare capacity [npart, maxnpart) (a don't-care region overwritten
-      // by injection), so the old "tail preserved" check no longer holds.
+
+      // Same invariants as the 2D block (no payloads here): alive prefix
+      // sorted by non-decreasing tile index, dead (weight == -1) suffix,
+      // alive count preserved. T = 1 reproduces the legacy per-cell order.
+#if defined(TEAM_POLICY)
+      const ncells_t T = static_cast<ncells_t>(TEAM_POLICY_TILE_SIZE);
+#else
+      const ncells_t T = 1u;
+#endif
+      const auto     na   = grid.n_active();
+      const ncells_t ntx2 = (na[1] + T - 1u) / T;
+      const ncells_t ntx3 = (na[2] + T - 1u) / T;
+      const auto     tile_of = [&](int a, int b, int c) -> ncells_t {
+        return ((static_cast<ncells_t>(a) / T) * ntx2 +
+                (static_cast<ncells_t>(b) / T)) *
+                 ntx3 +
+               (static_cast<ncells_t>(c) / T);
+      };
+
+      bool     seen_dead   = false;
+      bool     have_prev   = false;
+      ncells_t prev_tile   = 0u;
+      npart_t  n_alive_obs = 0u;
       for (auto p { 0u }; p < 66u; ++p) {
-        if (p < 13u) {
-          raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE);
-        } else if (p < 26u) {
-          raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE);
-        } else if (p < 39u) {
-          raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE);
-        } else if (p < 46u) {
-          raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE);
-        } else if (p < 59u) {
-          raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE);
-        } else {
-          raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE);
+        if (tag_h(p) != ntt::ParticleTag::alive) {
+          seen_dead = true;
+          raise::ErrorIf(weight_h(p) != -1.0,
+                         "dead particle has unexpected weight",
+                         HERE);
+          continue;
         }
+        raise::ErrorIf(seen_dead,
+                       "alive particle after a dead one (not sorted to prefix)",
+                       HERE);
+        const auto tile = tile_of(i1_h(p), i2_h(p), i3_h(p));
+        raise::ErrorIf(have_prev && (tile < prev_tile),
+                       "alive particles not sorted by tile index",
+                       HERE);
+        prev_tile = tile;
+        have_prev = true;
+        ++n_alive_obs;
       }
+      raise::ErrorIf(n_alive_obs != 59u,
+                     "wrong number of alive particles after sort",
+                     HERE);
     }
 
   } catch (const std::exception& e) {
diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp
index 94dc51d0e..9ccecc732 100644
--- a/tests/framework/sort_by_key.cpp
+++ b/tests/framework/sort_by_key.cpp
@@ -99,6 +99,9 @@ auto main(int argc, char* argv[]) -> int {
 #endif
 #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
     test_one_backend("Thrust", ::sort::backend::Thrust {});
+#endif
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+    test_one_backend("Rocthrust", ::sort::backend::Rocthrust {});
 #endif
   } catch (std::exception& e) {
     std::cerr << e.what() << std::endl;

From ed70dbd24512efb3c483f9773f67fb865eb44f97 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 12:48:28 -0400
Subject: [PATCH 14/33] reduced exchange for filters

---
 src/engines/srpic/currents.h | 116 +++++++++++++++++++++++++++++++----
 1 file changed, 104 insertions(+), 12 deletions(-)

diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index b8cfe63ab..30eb9466d 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -27,6 +27,8 @@
 #include "kernels/currents_deposit.hpp"
 #include "kernels/digital_filter.hpp"
 
+#include <utility>
+
 namespace ntt {
   namespace srpic {
 
@@ -202,9 +204,11 @@ namespace ntt {
                         Domain<SimEngine::SRPIC, M>&     domain,
                         const SimulationParams&          params) {
       logger::Checkpoint("Launching currents filtering kernels", HERE);
-      auto       range   = srpic::RangeWithAxisBCs(domain);
       const auto nfilter = params.template get<unsigned short>(
         "algorithms.current_filters");
+      if (nfilter == 0u) {
+        return;
+      }
       tuple_t<ncells_t, M::Dim> size;
       if constexpr (M::Dim == Dim::_1D || M::Dim == Dim::_2D || M::Dim == Dim::_3D) {
         size[0] = domain.mesh.n_active(in::x1);
@@ -215,17 +219,105 @@ namespace ntt {
       if constexpr (M::Dim == Dim::_3D) {
         size[2] = domain.mesh.n_active(in::x3);
       }
-      // !TODO: this needs to be done more efficiently
-      for (auto i { 0u }; i < nfilter; ++i) {
-        Kokkos::deep_copy(domain.fields.buff, domain.fields.cur);
-        Kokkos::parallel_for("CurrentsFilter",
-                             range,
-                             kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
-                               domain.fields.cur,
-                               domain.fields.buff,
-                               size,
-                               domain.mesh.flds_bc()));
-        metadomain.CommunicateFields(domain, Comm::J);
+
+      // The filter ping-pongs `cur` <-> scratch `buff`: one up-front copy
+      // seeds `buff` with valid ghost cells (the kernel writes only the
+      // cells in its launch range), then each pass filters the input into
+      // the other buffer and swaps the View handles so `cur` again names the
+      // result. `buff` is pure scratch (also reused by CommunicateFields), so
+      // the permanent handle swap is transparent and `cur` always names the
+      // result. The single seeding copy preserves physical-boundary ghosts
+      // (conductor/match/atmosphere/...), which neither the kernel nor the
+      // MPI exchange refresh.
+      Kokkos::deep_copy(domain.fields.buff, domain.fields.cur);
+
+      const auto flds_bc = domain.mesh.flds_bc();
+
+      if constexpr (M::CoordType == Coord::Cartesian) {
+        // Reduced-exchange ghost-margin scheme. One halo exchange refreshes
+        // N_GHOSTS ghost layers — enough for N_GHOSTS passes of the 3-point
+        // binomial if each pass also recomputes the inner ghost layers it
+        // will need next. We therefore extend the launch range by a shrinking
+        // margin `m` into the ghost zone, but only on comm-refreshed sides
+        // (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell is
+        // interior physics. Physical-boundary ghosts are never written or
+        // refreshed, exactly as in the per-pass loop, so the result is
+        // identical for every BC — while doing one exchange per N_GHOSTS
+        // passes instead of one per pass. (Entering the loop the ghosts are
+        // valid to distance N_GHOSTS: srpic.hpp runs CommunicateFields(J)
+        // immediately before CurrentsFilter.)
+        const int  G = static_cast<int>(N_GHOSTS);
+        const auto comm_side = [](FldsBC b) {
+          return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC);
+        };
+        bool ext_lo[3] = { false, false, false };
+        bool ext_hi[3] = { false, false, false };
+        for (auto d { 0 }; d < static_cast<int>(M::Dim); ++d) {
+          ext_lo[d] = comm_side(flds_bc[d].first);
+          ext_hi[d] = comm_side(flds_bc[d].second);
+        }
+        const auto make_range = [&](int m) -> range_t<M::Dim> {
+          const auto ml = [&](int d) -> ncells_t {
+            return ext_lo[d] ? static_cast<ncells_t>(m) : 0u;
+          };
+          const auto mh = [&](int d) -> ncells_t {
+            return ext_hi[d] ? static_cast<ncells_t>(m) : 0u;
+          };
+          if constexpr (M::Dim == Dim::_1D) {
+            return CreateRangePolicy<Dim::_1D>(
+              { domain.mesh.i_min(in::x1) - ml(0) },
+              { domain.mesh.i_max(in::x1) + mh(0) });
+          } else if constexpr (M::Dim == Dim::_2D) {
+            return CreateRangePolicy<Dim::_2D>(
+              { domain.mesh.i_min(in::x1) - ml(0),
+                domain.mesh.i_min(in::x2) - ml(1) },
+              { domain.mesh.i_max(in::x1) + mh(0),
+                domain.mesh.i_max(in::x2) + mh(1) });
+          } else {
+            return CreateRangePolicy<Dim::_3D>(
+              { domain.mesh.i_min(in::x1) - ml(0),
+                domain.mesh.i_min(in::x2) - ml(1),
+                domain.mesh.i_min(in::x3) - ml(2) },
+              { domain.mesh.i_max(in::x1) + mh(0),
+                domain.mesh.i_max(in::x2) + mh(1),
+                domain.mesh.i_max(in::x3) + mh(2) });
+          }
+        };
+        int m = G - 1;
+        for (auto i { 0u }; i < nfilter; ++i) {
+          Kokkos::parallel_for(
+            "CurrentsFilter",
+            make_range(m),
+            kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
+              domain.fields.buff,
+              domain.fields.cur,
+              size,
+              flds_bc));
+          std::swap(domain.fields.cur, domain.fields.buff);
+          --m;
+          if (m < 0 or i == nfilter - 1u) {
+            // refresh ghosts to distance G (and leave them valid for the
+            // downstream field solver after the final pass)
+            metadomain.CommunicateFields(domain, Comm::J);
+            m = G - 1;
+          }
+        }
+      } else {
+        // Non-Cartesian (axis BCs need the +1 range fixup): keep the
+        // per-pass exchange cadence, still ping-ponging the buffers.
+        const auto range = srpic::RangeWithAxisBCs(domain);
+        for (auto i { 0u }; i < nfilter; ++i) {
+          Kokkos::parallel_for(
+            "CurrentsFilter",
+            range,
+            kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
+              domain.fields.buff,
+              domain.fields.cur,
+              size,
+              flds_bc));
+          std::swap(domain.fields.cur, domain.fields.buff);
+          metadomain.CommunicateFields(domain, Comm::J);
+        }
       }
     }
 

From 314d1424c2d2927a119a8918a45dfce8aa9bb501 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 12:49:15 -0400
Subject: [PATCH 15/33] moved tag_offsets_h outside of loop to avoid multiple
 device-host copies

---
 src/framework/containers/particles_comm.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp
index 0fa140f93..6ede3abf0 100644
--- a/src/framework/containers/particles_comm.cpp
+++ b/src/framework/containers/particles_comm.cpp
@@ -266,6 +266,11 @@ namespace ntt {
     auto iteration        = 0;
     auto current_received = 0;
 
+    // `tag_offsets` is the same for every direction; mirror it to the host
+    // once here instead of re-copying it inside the direction loop.
+    auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets);
+    Kokkos::deep_copy(tag_offsets_h, tag_offsets);
+
     for (const auto& direction : dirs_to_comm) {
       const auto send_rank     = send_ranks.at(direction);
       const auto recv_rank     = recv_ranks.at(direction);
@@ -291,9 +296,6 @@ namespace ntt {
                                               npart_send_in * NPLDS_I };
       }
 
-      auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets);
-      Kokkos::deep_copy(tag_offsets_h, tag_offsets);
-
       npart_t idx_offset = npart_dead;
       if (tag_send > 2) {
         idx_offset += tag_offsets_h(tag_send - 3);

From 8047a118816d27f46a577a27235ac19ac8f740b7 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 12:49:52 -0400
Subject: [PATCH 16/33] batched async communication

---
 src/framework/domain/comm_mpi.hpp        | 170 +++++++++++++++++++++++
 src/framework/domain/metadomain_comm.cpp |  66 +++++++++
 2 files changed, 236 insertions(+)

diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp
index 52103c170..668efdd75 100644
--- a/src/framework/domain/comm_mpi.hpp
+++ b/src/framework/domain/comm_mpi.hpp
@@ -365,6 +365,176 @@ namespace comm {
     }
   }
 
+  // ---- batched, non-blocking field halo exchange ------------------------ //
+
+  // Per-direction parameters, collected once by the caller (GetSendRecvParams)
+  // and reused for every field communicated in the same call.
+  template <Dimension D>
+  struct FieldCommDir {
+    int                       send_rank { -1 }, recv_rank { -1 };
+    unsigned int              send_ind { 0 }, recv_ind { 0 };
+    std::vector<cell_range_t> send_slice {}, recv_slice {};
+    int                       tag { 0 };
+  };
+
+  /**
+   * @brief Non-additive (ghost-overwrite) halo exchange of one field across
+   *        all directions at once.
+   * @details Packs every send, posts all `MPI_Irecv` then all `MPI_Isend`
+   *          (unique per-direction tags), a single `MPI_Waitall`, then unpacks
+   *          — overlapping the round-trips instead of serializing one blocking
+   *          `MPI_Sendrecv` per direction. Self-communication (periodic single
+   *          domain) is a local copy. Slicing/packing mirrors
+   *          `comm::CommunicateField`. The tag pairs A's send in a direction
+   *          with B's receive in the same direction (the matching neighbor),
+   *          so it is unique per (rank-pair, direction).
+   */
+  template <Dimension D, int N>
+  inline void CommunicateFieldBatched(unsigned int                        my_idx,
+                                      ndfield_t<D, N>&                    fld,
+                                      const std::vector<FieldCommDir<D>>& dirs,
+                                      const cell_range_t&                 comps) {
+    static constexpr unsigned short Dp1 = static_cast<unsigned short>(D) + 1;
+    using buf_t                         = ndarray_t<Dp1>;
+    const ncells_t ncomp       = comps.second - comps.first;
+    const auto     ndirs       = dirs.size();
+
+    const auto is_self = [&](const FieldCommDir<D>& dd) {
+      return (dd.send_ind == my_idx) && (dd.recv_ind == my_idx);
+    };
+    const auto ext = [](const std::vector<cell_range_t>& sl, int d) -> ncells_t {
+      return sl[d].second - sl[d].first;
+    };
+    const auto make_buf = [&](const char*                      lbl,
+                              const std::vector<cell_range_t>& sl) -> buf_t {
+      if constexpr (D == Dim::_1D) {
+        return buf_t(lbl, ext(sl, 0), ncomp);
+      } else if constexpr (D == Dim::_2D) {
+        return buf_t(lbl, ext(sl, 0), ext(sl, 1), ncomp);
+      } else {
+        return buf_t(lbl, ext(sl, 0), ext(sl, 1), ext(sl, 2), ncomp);
+      }
+    };
+    const auto count = [](const buf_t& b) -> int {
+      ncells_t n = 1;
+      for (auto d { 0 }; d < static_cast<int>(Dp1); ++d) {
+        n *= static_cast<ncells_t>(b.extent(d));
+      }
+      return static_cast<int>(n);
+    };
+    const auto pack = [&](buf_t& b, const std::vector<cell_range_t>& sl) {
+      if constexpr (D == Dim::_1D) {
+        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], comps));
+      } else if constexpr (D == Dim::_2D) {
+        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], comps));
+      } else {
+        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], sl[2], comps));
+      }
+    };
+    const auto unpack = [&](const buf_t& b, const std::vector<cell_range_t>& sl) {
+      if constexpr (D == Dim::_1D) {
+        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], comps), b);
+      } else if constexpr (D == Dim::_2D) {
+        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], comps), b);
+      } else {
+        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], sl[2], comps), b);
+      }
+    };
+    const auto self_copy = [&](const std::vector<cell_range_t>& ssl,
+                               const std::vector<cell_range_t>& rsl) {
+      if constexpr (D == Dim::_1D) {
+        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], comps),
+                          Kokkos::subview(fld, ssl[0], comps));
+      } else if constexpr (D == Dim::_2D) {
+        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], comps),
+                          Kokkos::subview(fld, ssl[0], ssl[1], comps));
+      } else {
+        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], rsl[2], comps),
+                          Kokkos::subview(fld, ssl[0], ssl[1], ssl[2], comps));
+      }
+    };
+
+    std::vector<buf_t> send_buf(ndirs), recv_buf(ndirs);
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+    using hbuf_t = typename buf_t::host_mirror_type;
+    std::vector<hbuf_t> send_h(ndirs), recv_h(ndirs);
+#endif
+
+    // phase 1: self copies, pack sends, allocate recvs
+    for (auto i { 0u }; i < ndirs; ++i) {
+      const auto& dd = dirs[i];
+      if (is_self(dd)) {
+        self_copy(dd.send_slice, dd.recv_slice);
+        continue;
+      }
+      if (dd.recv_rank >= 0) {
+        recv_buf[i] = make_buf("recv_fld", dd.recv_slice);
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+        recv_h[i] = Kokkos::create_mirror_view(recv_buf[i]);
+#endif
+      }
+      if (dd.send_rank >= 0) {
+        send_buf[i] = make_buf("send_fld", dd.send_slice);
+        pack(send_buf[i], dd.send_slice);
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+        send_h[i] = Kokkos::create_mirror_view(send_buf[i]);
+        Kokkos::deep_copy(send_h[i], send_buf[i]);
+#endif
+      }
+    }
+    // drain packs (and device->host staging) before MPI reads the buffers
+    Kokkos::fence("CommunicateFieldBatched: pre-MPI");
+
+    // phase 2: post all recvs, then all sends
+    std::vector<MPI_Request> reqs;
+    reqs.reserve(2 * ndirs);
+    for (auto i { 0u }; i < ndirs; ++i) {
+      const auto& dd = dirs[i];
+      if (is_self(dd) || dd.recv_rank < 0) {
+        continue;
+      }
+      reqs.emplace_back();
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+      MPI_Irecv(recv_h[i].data(), count(recv_buf[i]), mpi::get_type<real_t>(),
+                dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
+#else
+      MPI_Irecv(recv_buf[i].data(), count(recv_buf[i]), mpi::get_type<real_t>(),
+                dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
+#endif
+    }
+    for (auto i { 0u }; i < ndirs; ++i) {
+      const auto& dd = dirs[i];
+      if (is_self(dd) || dd.send_rank < 0) {
+        continue;
+      }
+      reqs.emplace_back();
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+      MPI_Isend(send_h[i].data(), count(send_buf[i]), mpi::get_type<real_t>(),
+                dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
+#else
+      MPI_Isend(send_buf[i].data(), count(send_buf[i]), mpi::get_type<real_t>(),
+                dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
+#endif
+    }
+
+    // phase 3: complete all transfers
+    if (not reqs.empty()) {
+      MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
+    }
+
+    // phase 4: unpack received ghosts
+    for (auto i { 0u }; i < ndirs; ++i) {
+      const auto& dd = dirs[i];
+      if (is_self(dd) || dd.recv_rank < 0) {
+        continue;
+      }
+#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
+      Kokkos::deep_copy(recv_buf[i], recv_h[i]);
+#endif
+      unpack(recv_buf[i], dd.recv_slice);
+    }
+  }
+
 } // namespace comm
 
 #endif // FRAMEWORK_DOMAIN_COMM_MPI_HPP
diff --git a/src/framework/domain/metadomain_comm.cpp b/src/framework/domain/metadomain_comm.cpp
index f8c0f60d7..d6061d7bf 100644
--- a/src/framework/domain/metadomain_comm.cpp
+++ b/src/framework/domain/metadomain_comm.cpp
@@ -273,6 +273,71 @@ namespace ntt {
       comp_range_cur = cell_range_t(cur::jx1, cur::jx3 + 1);
     }
     // traverse in all directions and send/recv the fields
+#if defined(MPI_ENABLED)
+    // Batched non-blocking exchange: collect the per-direction params once,
+    // then overlap all directions for each field (post all Irecv/Isend with
+    // per-direction tags + one Waitall) instead of a blocking Sendrecv per
+    // direction. The #else branch is the single-rank non-MPI path.
+    {
+      std::vector<comm::FieldCommDir<M::Dim>> dirs;
+      dirs.reserve(dir::Directions<M::Dim>::all.size());
+      for (auto& direction : dir::Directions<M::Dim>::all) {
+        const auto [send_params, recv_params] = GetSendRecvParams(this,
+                                                                  domain,
+                                                                  direction,
+                                                                  false);
+        const auto [send_indrank, send_slice] = send_params;
+        const auto [recv_indrank, recv_slice] = recv_params;
+        const auto [send_ind, send_rank]      = send_indrank;
+        const auto [recv_ind, recv_rank]      = recv_indrank;
+        if (send_rank < 0 and recv_rank < 0) {
+          continue;
+        }
+        comm::FieldCommDir<M::Dim> fcd;
+        fcd.send_rank  = send_rank;
+        fcd.recv_rank  = recv_rank;
+        fcd.send_ind   = send_ind;
+        fcd.recv_ind   = recv_ind;
+        fcd.send_slice = send_slice;
+        fcd.recv_slice = recv_slice;
+        fcd.tag = static_cast<int>(mpi::PrtlSendTag<M::Dim>::dir2tag(direction));
+        dirs.push_back(std::move(fcd));
+      }
+      if (comm_em) {
+        comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
+                                                 domain.fields.em,
+                                                 dirs,
+                                                 comp_range_fld);
+      }
+      if constexpr (S == SimEngine::GRPIC) {
+        if (comm_aux) {
+          comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
+                                                   domain.fields.aux,
+                                                   dirs,
+                                                   comp_range_fld);
+        }
+        if (comm_em0) {
+          comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
+                                                   domain.fields.em0,
+                                                   dirs,
+                                                   comp_range_fld);
+        }
+        if (comm_j) {
+          comm::CommunicateFieldBatched<M::Dim, 3>(domain.index(),
+                                                   domain.fields.cur0,
+                                                   dirs,
+                                                   comp_range_cur);
+        }
+      } else {
+        if (comm_j) {
+          comm::CommunicateFieldBatched<M::Dim, 3>(domain.index(),
+                                                   domain.fields.cur,
+                                                   dirs,
+                                                   comp_range_cur);
+        }
+      }
+    }
+#else
     for (auto& direction : dir::Directions<M::Dim>::all) {
       const auto [send_params,
                   recv_params] = GetSendRecvParams(this, domain, direction, false);
@@ -364,6 +429,7 @@ namespace ntt {
         }
       }
     }
+#endif // MPI_ENABLED
   }
 
   template <Dimension D, int N>

From 50369176dde282993a75a1c73bd89b097ffcc38d Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 16:45:53 -0400
Subject: [PATCH 17/33] removed async comm because of bandwidth issue

---
 src/framework/domain/comm_mpi.hpp        | 170 -----------------------
 src/framework/domain/metadomain_comm.cpp |  66 ---------
 2 files changed, 236 deletions(-)

diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp
index 668efdd75..52103c170 100644
--- a/src/framework/domain/comm_mpi.hpp
+++ b/src/framework/domain/comm_mpi.hpp
@@ -365,176 +365,6 @@ namespace comm {
     }
   }
 
-  // ---- batched, non-blocking field halo exchange ------------------------ //
-
-  // Per-direction parameters, collected once by the caller (GetSendRecvParams)
-  // and reused for every field communicated in the same call.
-  template <Dimension D>
-  struct FieldCommDir {
-    int                       send_rank { -1 }, recv_rank { -1 };
-    unsigned int              send_ind { 0 }, recv_ind { 0 };
-    std::vector<cell_range_t> send_slice {}, recv_slice {};
-    int                       tag { 0 };
-  };
-
-  /**
-   * @brief Non-additive (ghost-overwrite) halo exchange of one field across
-   *        all directions at once.
-   * @details Packs every send, posts all `MPI_Irecv` then all `MPI_Isend`
-   *          (unique per-direction tags), a single `MPI_Waitall`, then unpacks
-   *          — overlapping the round-trips instead of serializing one blocking
-   *          `MPI_Sendrecv` per direction. Self-communication (periodic single
-   *          domain) is a local copy. Slicing/packing mirrors
-   *          `comm::CommunicateField`. The tag pairs A's send in a direction
-   *          with B's receive in the same direction (the matching neighbor),
-   *          so it is unique per (rank-pair, direction).
-   */
-  template <Dimension D, int N>
-  inline void CommunicateFieldBatched(unsigned int                        my_idx,
-                                      ndfield_t<D, N>&                    fld,
-                                      const std::vector<FieldCommDir<D>>& dirs,
-                                      const cell_range_t&                 comps) {
-    static constexpr unsigned short Dp1 = static_cast<unsigned short>(D) + 1;
-    using buf_t                         = ndarray_t<Dp1>;
-    const ncells_t ncomp       = comps.second - comps.first;
-    const auto     ndirs       = dirs.size();
-
-    const auto is_self = [&](const FieldCommDir<D>& dd) {
-      return (dd.send_ind == my_idx) && (dd.recv_ind == my_idx);
-    };
-    const auto ext = [](const std::vector<cell_range_t>& sl, int d) -> ncells_t {
-      return sl[d].second - sl[d].first;
-    };
-    const auto make_buf = [&](const char*                      lbl,
-                              const std::vector<cell_range_t>& sl) -> buf_t {
-      if constexpr (D == Dim::_1D) {
-        return buf_t(lbl, ext(sl, 0), ncomp);
-      } else if constexpr (D == Dim::_2D) {
-        return buf_t(lbl, ext(sl, 0), ext(sl, 1), ncomp);
-      } else {
-        return buf_t(lbl, ext(sl, 0), ext(sl, 1), ext(sl, 2), ncomp);
-      }
-    };
-    const auto count = [](const buf_t& b) -> int {
-      ncells_t n = 1;
-      for (auto d { 0 }; d < static_cast<int>(Dp1); ++d) {
-        n *= static_cast<ncells_t>(b.extent(d));
-      }
-      return static_cast<int>(n);
-    };
-    const auto pack = [&](buf_t& b, const std::vector<cell_range_t>& sl) {
-      if constexpr (D == Dim::_1D) {
-        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], comps));
-      } else if constexpr (D == Dim::_2D) {
-        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], comps));
-      } else {
-        Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], sl[2], comps));
-      }
-    };
-    const auto unpack = [&](const buf_t& b, const std::vector<cell_range_t>& sl) {
-      if constexpr (D == Dim::_1D) {
-        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], comps), b);
-      } else if constexpr (D == Dim::_2D) {
-        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], comps), b);
-      } else {
-        Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], sl[2], comps), b);
-      }
-    };
-    const auto self_copy = [&](const std::vector<cell_range_t>& ssl,
-                               const std::vector<cell_range_t>& rsl) {
-      if constexpr (D == Dim::_1D) {
-        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], comps),
-                          Kokkos::subview(fld, ssl[0], comps));
-      } else if constexpr (D == Dim::_2D) {
-        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], comps),
-                          Kokkos::subview(fld, ssl[0], ssl[1], comps));
-      } else {
-        Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], rsl[2], comps),
-                          Kokkos::subview(fld, ssl[0], ssl[1], ssl[2], comps));
-      }
-    };
-
-    std::vector<buf_t> send_buf(ndirs), recv_buf(ndirs);
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-    using hbuf_t = typename buf_t::host_mirror_type;
-    std::vector<hbuf_t> send_h(ndirs), recv_h(ndirs);
-#endif
-
-    // phase 1: self copies, pack sends, allocate recvs
-    for (auto i { 0u }; i < ndirs; ++i) {
-      const auto& dd = dirs[i];
-      if (is_self(dd)) {
-        self_copy(dd.send_slice, dd.recv_slice);
-        continue;
-      }
-      if (dd.recv_rank >= 0) {
-        recv_buf[i] = make_buf("recv_fld", dd.recv_slice);
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-        recv_h[i] = Kokkos::create_mirror_view(recv_buf[i]);
-#endif
-      }
-      if (dd.send_rank >= 0) {
-        send_buf[i] = make_buf("send_fld", dd.send_slice);
-        pack(send_buf[i], dd.send_slice);
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-        send_h[i] = Kokkos::create_mirror_view(send_buf[i]);
-        Kokkos::deep_copy(send_h[i], send_buf[i]);
-#endif
-      }
-    }
-    // drain packs (and device->host staging) before MPI reads the buffers
-    Kokkos::fence("CommunicateFieldBatched: pre-MPI");
-
-    // phase 2: post all recvs, then all sends
-    std::vector<MPI_Request> reqs;
-    reqs.reserve(2 * ndirs);
-    for (auto i { 0u }; i < ndirs; ++i) {
-      const auto& dd = dirs[i];
-      if (is_self(dd) || dd.recv_rank < 0) {
-        continue;
-      }
-      reqs.emplace_back();
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-      MPI_Irecv(recv_h[i].data(), count(recv_buf[i]), mpi::get_type<real_t>(),
-                dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
-#else
-      MPI_Irecv(recv_buf[i].data(), count(recv_buf[i]), mpi::get_type<real_t>(),
-                dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
-#endif
-    }
-    for (auto i { 0u }; i < ndirs; ++i) {
-      const auto& dd = dirs[i];
-      if (is_self(dd) || dd.send_rank < 0) {
-        continue;
-      }
-      reqs.emplace_back();
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-      MPI_Isend(send_h[i].data(), count(send_buf[i]), mpi::get_type<real_t>(),
-                dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
-#else
-      MPI_Isend(send_buf[i].data(), count(send_buf[i]), mpi::get_type<real_t>(),
-                dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back());
-#endif
-    }
-
-    // phase 3: complete all transfers
-    if (not reqs.empty()) {
-      MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
-    }
-
-    // phase 4: unpack received ghosts
-    for (auto i { 0u }; i < ndirs; ++i) {
-      const auto& dd = dirs[i];
-      if (is_self(dd) || dd.recv_rank < 0) {
-        continue;
-      }
-#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI)
-      Kokkos::deep_copy(recv_buf[i], recv_h[i]);
-#endif
-      unpack(recv_buf[i], dd.recv_slice);
-    }
-  }
-
 } // namespace comm
 
 #endif // FRAMEWORK_DOMAIN_COMM_MPI_HPP
diff --git a/src/framework/domain/metadomain_comm.cpp b/src/framework/domain/metadomain_comm.cpp
index d6061d7bf..f8c0f60d7 100644
--- a/src/framework/domain/metadomain_comm.cpp
+++ b/src/framework/domain/metadomain_comm.cpp
@@ -273,71 +273,6 @@ namespace ntt {
       comp_range_cur = cell_range_t(cur::jx1, cur::jx3 + 1);
     }
     // traverse in all directions and send/recv the fields
-#if defined(MPI_ENABLED)
-    // Batched non-blocking exchange: collect the per-direction params once,
-    // then overlap all directions for each field (post all Irecv/Isend with
-    // per-direction tags + one Waitall) instead of a blocking Sendrecv per
-    // direction. The #else branch is the single-rank non-MPI path.
-    {
-      std::vector<comm::FieldCommDir<M::Dim>> dirs;
-      dirs.reserve(dir::Directions<M::Dim>::all.size());
-      for (auto& direction : dir::Directions<M::Dim>::all) {
-        const auto [send_params, recv_params] = GetSendRecvParams(this,
-                                                                  domain,
-                                                                  direction,
-                                                                  false);
-        const auto [send_indrank, send_slice] = send_params;
-        const auto [recv_indrank, recv_slice] = recv_params;
-        const auto [send_ind, send_rank]      = send_indrank;
-        const auto [recv_ind, recv_rank]      = recv_indrank;
-        if (send_rank < 0 and recv_rank < 0) {
-          continue;
-        }
-        comm::FieldCommDir<M::Dim> fcd;
-        fcd.send_rank  = send_rank;
-        fcd.recv_rank  = recv_rank;
-        fcd.send_ind   = send_ind;
-        fcd.recv_ind   = recv_ind;
-        fcd.send_slice = send_slice;
-        fcd.recv_slice = recv_slice;
-        fcd.tag = static_cast<int>(mpi::PrtlSendTag<M::Dim>::dir2tag(direction));
-        dirs.push_back(std::move(fcd));
-      }
-      if (comm_em) {
-        comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
-                                                 domain.fields.em,
-                                                 dirs,
-                                                 comp_range_fld);
-      }
-      if constexpr (S == SimEngine::GRPIC) {
-        if (comm_aux) {
-          comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
-                                                   domain.fields.aux,
-                                                   dirs,
-                                                   comp_range_fld);
-        }
-        if (comm_em0) {
-          comm::CommunicateFieldBatched<M::Dim, 6>(domain.index(),
-                                                   domain.fields.em0,
-                                                   dirs,
-                                                   comp_range_fld);
-        }
-        if (comm_j) {
-          comm::CommunicateFieldBatched<M::Dim, 3>(domain.index(),
-                                                   domain.fields.cur0,
-                                                   dirs,
-                                                   comp_range_cur);
-        }
-      } else {
-        if (comm_j) {
-          comm::CommunicateFieldBatched<M::Dim, 3>(domain.index(),
-                                                   domain.fields.cur,
-                                                   dirs,
-                                                   comp_range_cur);
-        }
-      }
-    }
-#else
     for (auto& direction : dir::Directions<M::Dim>::all) {
       const auto [send_params,
                   recv_params] = GetSendRecvParams(this, domain, direction, false);
@@ -429,7 +364,6 @@ namespace ntt {
         }
       }
     }
-#endif // MPI_ENABLED
   }
 
   template <Dimension D, int N>

From 88914209ae20bff930cf6bedacb0653053fa81f3 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 16:51:00 -0400
Subject: [PATCH 18/33] bugfix in current deposit

---
 src/kernels/currents_deposit.hpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index b92b02423..98fa0b661 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -1042,8 +1042,12 @@ namespace kernel {
               [&](int g_i1, int comp, real_t v) {
                 if (to_scratch) {
                   Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v);
-                  //} else if (g_i1 >= 0 and g_i1 < j_ext1) {
-                } else {
+                } else if (g_i1 >= 0 and g_i1 < j_ext1) {
+                  // Bounds-clip the escape-valve write against J's storage,
+                  // exactly as the cooperative flush does. Cells past the
+                  // ghost stripe are re-supplied by SynchronizeFields(J); an
+                  // unclipped write here faults the GPU (an escaped boundary
+                  // particle's stencil can reach past j_ext1).
                   Kokkos::atomic_add(&J(g_i1, comp), v);
                 }
               });
@@ -1115,7 +1119,11 @@ namespace kernel {
                   Kokkos::atomic_add(
                     &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp),
                     v);
-                } else {
+                } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and
+                           g_i2 < j_ext2) {
+                  // Bounds-clip as the cooperative flush does; an unclipped
+                  // escape-valve write faults the GPU when an escaped boundary
+                  // particle's stencil reaches past j_ext.
                   Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
                 }
               });
@@ -1195,7 +1203,11 @@ namespace kernel {
                                           g_i3 - origin_J3_low,
                                           comp),
                                      v);
-                } else {
+                } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and
+                           g_i2 < j_ext2 and g_i3 >= 0 and g_i3 < j_ext3) {
+                  // Bounds-clip as the cooperative flush does; an unclipped
+                  // escape-valve write faults the GPU when an escaped boundary
+                  // particle's stencil reaches past j_ext.
                   Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
                 }
               });

From 4f3a7625ab055fec718fdac32bb1f588c980777c Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Sun, 21 Jun 2026 19:38:24 -0400
Subject: [PATCH 19/33]  double-buffer radix sort to drop the N-sized temp
 (fixes device OOM at scale)

---
 src/global/utils/sort_dispatch.h | 74 +++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
index cfc389464..27a20a910 100644
--- a/src/global/utils/sort_dispatch.h
+++ b/src/global/utils/sort_dispatch.h
@@ -137,8 +137,8 @@ namespace ntt::sort_helpers {
       n,
       KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
 
-    // Out-of-place radix sort bounded to the significant key bits. The
-    // _out buffers and temp storage are transient (freed at scope exit).
+    // Radix sort bounded to the significant key bits. The _out buffers and
+    // temp storage are transient (freed at scope exit).
     array_t<ncells_t*> keys_out("tile_keys_sorted", n);
     prtl_perm_t        perm_out("tile_perm_sorted", n);
     const int          end_bit = static_cast<int>(significant_bits(n_bins));
@@ -146,13 +146,20 @@ namespace ntt::sort_helpers {
     exec.fence("sort_by_key_dispatch Thrust: pre-sort");
     auto stream = exec.cuda_stream();
 
+    // DoubleBuffer radix sort: cub ping-pongs between the supplied
+    // (current, alternate) buffer pairs, so `temp_bytes` holds only the
+    // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes).
+    // Nearly halves the sort's transient memory vs the plain out-of-place
+    // form, which matters at high npart where the N-sized temp can fail to
+    // allocate (device OOM at scale).
+    cub::DoubleBuffer<ncells_t> d_keys(keys.data(), keys_out.data());
+    cub::DoubleBuffer<npart_t>  d_perm(perm.data(), perm_out.data());
+
     std::size_t temp_bytes = 0;
     auto        err = cub::DeviceRadixSort::SortPairs(nullptr,
                                               temp_bytes,
-                                              keys.data(),
-                                              keys_out.data(),
-                                              perm.data(),
-                                              perm_out.data(),
+                                              d_keys,
+                                              d_perm,
                                               n,
                                               0,
                                               end_bit,
@@ -164,10 +171,8 @@ namespace ntt::sort_helpers {
                         (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
     err = cub::DeviceRadixSort::SortPairs(temp.data(),
                                           temp_bytes,
-                                          keys.data(),
-                                          keys_out.data(),
-                                          perm.data(),
-                                          perm_out.data(),
+                                          d_keys,
+                                          d_perm,
                                           n,
                                           0,
                                           end_bit,
@@ -177,10 +182,16 @@ namespace ntt::sort_helpers {
                    HERE);
     exec.fence("sort_by_key_dispatch Thrust: post-sort");
 
-    // Publish sorted keys back in place; swap the sorted permutation in.
-    auto keys_nc = keys; // non-const handle aliasing the same storage
-    Kokkos::deep_copy(keys_nc, keys_out);
-    perm = perm_out;
+    // Publish results from whichever buffer cub left as Current() (depends on
+    // the pass count): copy sorted keys back into `keys`' storage if they
+    // ended up in the alternate, and point `perm` at its current buffer.
+    if (d_keys.Current() != keys.data()) {
+      auto keys_nc = keys; // non-const handle aliasing the same storage
+      Kokkos::deep_copy(keys_nc, keys_out);
+    }
+    if (d_perm.Current() == perm_out.data()) {
+      perm = perm_out;
+    }
   }
 #endif
 
@@ -210,13 +221,20 @@ namespace ntt::sort_helpers {
     exec.fence("sort_by_key_dispatch Rocthrust: pre-sort");
     auto stream = exec.hip_stream();
 
+    // double_buffer radix sort: rocprim ping-pongs between the supplied
+    // (current, alternate) buffer pairs, so `temp_storage` holds only the
+    // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes,
+    // the dominant `rocprim_radix_temp`). This nearly halves the sort's
+    // transient memory vs the plain out-of-place form, which matters at high
+    // npart where that N-sized temp can fail to allocate (device OOM at scale).
+    rocprim::double_buffer<ncells_t> d_keys(keys.data(), keys_out.data());
+    rocprim::double_buffer<npart_t>  d_perm(perm.data(), perm_out.data());
+
     std::size_t temp_bytes = 0;
     auto        err = rocprim::radix_sort_pairs(nullptr,
                                          temp_bytes,
-                                         keys.data(),
-                                         keys_out.data(),
-                                         perm.data(),
-                                         perm_out.data(),
+                                         d_keys,
+                                         d_perm,
                                          static_cast<std::size_t>(n),
                                          0u,
                                          end_bit,
@@ -228,10 +246,8 @@ namespace ntt::sort_helpers {
                         (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
     err = rocprim::radix_sort_pairs(temp.data(),
                                     temp_bytes,
-                                    keys.data(),
-                                    keys_out.data(),
-                                    perm.data(),
-                                    perm_out.data(),
+                                    d_keys,
+                                    d_perm,
                                     static_cast<std::size_t>(n),
                                     0u,
                                     end_bit,
@@ -241,9 +257,17 @@ namespace ntt::sort_helpers {
                    HERE);
     exec.fence("sort_by_key_dispatch Rocthrust: post-sort");
 
-    auto keys_nc = keys; // non-const handle aliasing the same storage
-    Kokkos::deep_copy(keys_nc, keys_out);
-    perm = perm_out;
+    // Publish results from whichever buffer rocprim left as `current()`
+    // (depends on the pass count). If the sorted keys ended up in the
+    // alternate, copy them back into `keys`' storage so downstream
+    // (compute_tile_offsets) sees them; point `perm` at its current buffer.
+    if (d_keys.current() != keys.data()) {
+      auto keys_nc = keys; // non-const handle aliasing the same storage
+      Kokkos::deep_copy(keys_nc, keys_out);
+    }
+    if (d_perm.current() == perm_out.data()) {
+      perm = perm_out;
+    }
   }
 #endif
 

From 94ba9cdb3e484dbffed0a681bb1222d873b207af Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Wed, 24 Jun 2026 18:26:32 -0400
Subject: [PATCH 20/33] fix printing of total particles

---
 src/global/utils/diag.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/global/utils/diag.cpp b/src/global/utils/diag.cpp
index 4cf763faf..604a872ae 100644
--- a/src/global/utils/diag.cpp
+++ b/src/global/utils/diag.cpp
@@ -17,6 +17,7 @@
   #include <iterator>
 #endif // MPI_ENABLED
 
+#include <cstddef>
 #include <iomanip>
 #include <numeric>
 #include <sstream>
@@ -26,8 +27,8 @@
 
 namespace diag {
   auto npart_stats(npart_t npart, npart_t maxnpart)
-    -> std::vector<std::pair<npart_t, unsigned short>> {
-    auto stats = std::vector<std::pair<npart_t, unsigned short>>();
+    -> std::vector<std::pair<std::size_t, unsigned short>> {
+    auto stats = std::vector<std::pair<std::size_t, unsigned short>>();
     const auto percentage = [](npart_t part, npart_t maxpart) -> unsigned short {
       return static_cast<unsigned short>(
         100.0f * static_cast<float>(part) / static_cast<float>(maxpart));
@@ -59,9 +60,9 @@ namespace diag {
     if (rank != MPI_ROOT_RANK) {
       return stats;
     }
-    const npart_t tot_npart = std::accumulate(mpi_npart.begin(),
-                                              mpi_npart.end(),
-                                              static_cast<npart_t>(0));
+    const std::size_t tot_npart = std::accumulate(mpi_npart.begin(),
+                                                  mpi_npart.end(),
+                                                  static_cast<std::size_t>(0));
     const npart_t max_idx   = std::distance(
       mpi_npart.begin(),
       std::max_element(mpi_npart.begin(), mpi_npart.end()));

From 9a4333e1798a6965054dc80407ac64f7ff3774e8 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Wed, 24 Jun 2026 18:47:03 -0400
Subject: [PATCH 21/33] use a persistent buffer per sort to reduce memory
 overhead

---
 src/framework/containers/particles.h        |  19 +--
 src/framework/containers/particles_sort.cpp | 131 ++++++++++++++------
 2 files changed, 101 insertions(+), 49 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 511a07a62..c3fa93137 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -311,14 +311,17 @@ namespace ntt {
   private:
     /**
      * @brief Apply a particle-index permutation (built by oneDPL/Thrust
-     *        sort_by_key) to the SoA member arrays. Each member is
-     *        gathered into a fresh full-capacity buffer whose handle is
-     *        then swapped in (no copy-back), one buffer at a time, fenced
-     *        before the old storage is released. The *_prev arrays are
-     *        intentionally not permuted (overwritten by the next push
-     *        before any read). Only compiled when a vendor sort backend
-     *        is enabled; the BinSort path applies the permutation in
-     *        place via `sorter.sort(view)` instead.
+     *        sort_by_key) to the SoA member arrays. Members are gathered
+     *        through `perm` into a reusable `n`-sized scratch buffer
+     *        (one per member type, shared across members of that type)
+     *        and copied back in place, so the large persistent member
+     *        arrays keep their storage/address and the gather makes a
+     *        handful of transient allocations instead of one maxnpart
+     *        buffer per member. The *_prev arrays are intentionally not
+     *        permuted (overwritten by the next push before any read).
+     *        Only compiled when a vendor sort backend is enabled; the
+     *        BinSort path applies the permutation in place via
+     *        `sorter.sort(view)` instead.
      */
     void apply_permutation_to_soa(const prtl_perm_t& perm);
 
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 3db6c1438..8261fe5b4 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -474,53 +474,73 @@ namespace ntt {
 #if defined(TEAM_POLICY_USE_VENDOR_SORT)
   namespace permute_helpers {
 
-    // Permute a 1D SoA member array `arr` by `perm`. Gathers into a
-    // fresh buffer allocated at the member's full capacity (maxnpart),
-    // then swaps the View handle in. This avoids the redundant copy-back
-    // pass of the old gather-then-deep_copy approach (~2x less HBM
-    // traffic). Allocating at full capacity preserves the member's spare
-    // room for injection; the untouched tail [n, capacity) is
-    // zero-initialized by Kokkos (cleaner than the stale values the old
-    // deep_copy left there). The fence drains the gather (which reads the
-    // old storage) before the swap drops the last reference to it.
+    // Permute a 1D SoA member `arr` by `perm` in place, using a
+    // caller-owned reusable `scratch` buffer. Gathers the live prefix
+    // `arr[perm[0..n)]` into `scratch[0..n)`, then copies it back into
+    // `arr[0..n)`. Unlike the old swap-the-handle approach, `arr` keeps
+    // its original storage and full maxnpart capacity, so the large
+    // persistent member arrays never change address between sorts (the
+    // dominant source of allocator churn / fragmentation on ROCm), and
+    // `scratch` is shared across every member of the same type within one
+    // sort -- one transient allocation per type group instead of a fresh
+    // maxnpart buffer per member. `scratch` is sized to the live count
+    // `n` (not maxnpart), which also lowers the gather's peak transient.
+    // The tail `arr[n, capacity)` is left untouched (stale, never read:
+    // consumers iterate `[0, npart())`). Cost vs the swap: one extra
+    // copy-back pass (~2x HBM traffic), negligible when sorting is a
+    // small fraction of the step. The fences keep the shared scratch from
+    // being overwritten by the next member's gather before this member's
+    // copy-back has drained it.
     template <typename V>
-    inline void permute_1d_swap(V&                 arr,
+    inline void permute_1d_into(V&                 arr,
+                                const V&           scratch,
                                 const prtl_perm_t& perm,
                                 npart_t            n) {
       if (n == 0u) {
         return;
       }
-      V    buf(arr.label(), arr.extent(0));
       auto perm_v = perm;
       auto arr_v  = arr;
+      auto buf_v  = scratch;
       Kokkos::parallel_for(
         "Permute1D",
         n,
-        KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); });
-      Kokkos::fence("permute_1d_swap: end");
-      arr = buf;
+        KOKKOS_LAMBDA(const npart_t p) { buf_v(p) = arr_v(perm_v(p)); });
+      Kokkos::fence("permute_1d_into: gather");
+      Kokkos::deep_copy(
+        Kokkos::subview(arr, std::make_pair(static_cast<npart_t>(0), n)),
+        Kokkos::subview(scratch, std::make_pair(static_cast<npart_t>(0), n)));
+      Kokkos::fence("permute_1d_into: copy-back");
     }
 
-    // 2D analogue for `pld_r` / `pld_i`.
+    // 2D analogue for `pld_r` / `pld_i` (`scratch` is `(>= n, ncols)`).
     template <typename V>
-    inline void permute_2d_swap(V&                 arr,
+    inline void permute_2d_into(V&                 arr,
+                                const V&           scratch,
                                 const prtl_perm_t& perm,
                                 npart_t            n,
                                 npart_t            ncols) {
       if (n == 0u or ncols == 0u) {
         return;
       }
-      V    buf(arr.label(), arr.extent(0), arr.extent(1));
       auto perm_v = perm;
       auto arr_v  = arr;
+      auto buf_v  = scratch;
       Kokkos::parallel_for(
         "Permute2D",
         CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
         KOKKOS_LAMBDA(const npart_t p, const npart_t l) {
-          buf(p, l) = arr_v(perm_v(p), l);
+          buf_v(p, l) = arr_v(perm_v(p), l);
         });
-      Kokkos::fence("permute_2d_swap: end");
-      arr = buf;
+      Kokkos::fence("permute_2d_into: gather");
+      Kokkos::deep_copy(
+        Kokkos::subview(arr,
+                        std::make_pair(static_cast<npart_t>(0), n),
+                        Kokkos::ALL),
+        Kokkos::subview(scratch,
+                        std::make_pair(static_cast<npart_t>(0), n),
+                        Kokkos::ALL));
+      Kokkos::fence("permute_2d_into: copy-back");
     }
 
   } // namespace permute_helpers
@@ -532,8 +552,8 @@ namespace ntt {
       return;
     }
 
-    using permute_helpers::permute_1d_swap;
-    using permute_helpers::permute_2d_swap;
+    using permute_helpers::permute_1d_into;
+    using permute_helpers::permute_2d_into;
 
     // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are intentionally
     // NOT permuted. SortSpatially runs at the very end of the step loop
@@ -551,31 +571,60 @@ namespace ntt {
     //     prev field saved to the checkpoint differs from the old code.
     // Permuting prev would therefore reorder data that is overwritten
     // before it is ever observed.
-    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_swap(i1, perm, n);
-      permute_1d_swap(dx1, perm, n);
+    //
+    // Each block below allocates a single `n`-sized scratch buffer that
+    // is reused for every member of that type, then freed before the next
+    // block allocates its own. The whole gather thus makes one transient
+    // allocation per type group (int / prtldx_t / real_t / short / each
+    // payload) instead of one fresh maxnpart buffer per member, and peak
+    // transient is a single n-sized scratch at a time.
+    {
+      array_t<int*> scratch { "perm_scratch_int", n };
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(i1, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(i2, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_3D) {
+        permute_1d_into(i3, scratch, perm, n);
+      }
     }
-    if constexpr (D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_swap(i2, perm, n);
-      permute_1d_swap(dx2, perm, n);
+    {
+      array_t<prtldx_t*> scratch { "perm_scratch_prtldx", n };
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(dx1, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(dx2, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_3D) {
+        permute_1d_into(dx3, scratch, perm, n);
+      }
     }
-    if constexpr (D == Dim::_3D) {
-      permute_1d_swap(i3, perm, n);
-      permute_1d_swap(dx3, perm, n);
-    }
-    permute_1d_swap(ux1, perm, n);
-    permute_1d_swap(ux2, perm, n);
-    permute_1d_swap(ux3, perm, n);
-    permute_1d_swap(weight, perm, n);
-    permute_1d_swap(tag, perm, n);
-    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
-      permute_1d_swap(phi, perm, n);
+    {
+      array_t<real_t*> scratch { "perm_scratch_real", n };
+      permute_1d_into(ux1, scratch, perm, n);
+      permute_1d_into(ux2, scratch, perm, n);
+      permute_1d_into(ux3, scratch, perm, n);
+      permute_1d_into(weight, scratch, perm, n);
+      if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+        permute_1d_into(phi, scratch, perm, n);
+      }
+    }
+    {
+      array_t<short*> scratch { "perm_scratch_tag", n };
+      permute_1d_into(tag, scratch, perm, n);
     }
     if (npld_r() > 0) {
-      permute_2d_swap(pld_r, perm, n, static_cast<npart_t>(npld_r()));
+      const auto         ncols = static_cast<npart_t>(npld_r());
+      array_t<real_t**>  scratch { "perm_scratch_pld_r", n, ncols };
+      permute_2d_into(pld_r, scratch, perm, n, ncols);
     }
     if (npld_i() > 0) {
-      permute_2d_swap(pld_i, perm, n, static_cast<npart_t>(npld_i()));
+      const auto         ncols = static_cast<npart_t>(npld_i());
+      array_t<npart_t**> scratch { "perm_scratch_pld_i", n, ncols };
+      permute_2d_into(pld_i, scratch, perm, n, ncols);
     }
   }
 #endif // TEAM_POLICY_USE_VENDOR_SORT

From 216940ce14c55be31f9851ab6217fa11ca85e2d9 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Wed, 24 Jun 2026 18:47:17 -0400
Subject: [PATCH 22/33] added the option to explicitly disable vendor sort

---
 CMakeLists.txt                | 98 ++++++++++++++++++++---------------
 cmake/defaults.cmake          | 13 +++++
 cmake/report.cmake            | 10 ++++
 src/global/utils/reporter.cpp |  7 +++
 4 files changed, 87 insertions(+), 41 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8409209b..b9bbb1767 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,10 @@ set(team_policy_drift
   ${default_team_policy_drift}
   CACHE STRING
   "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.")
+set(vendor_sort
+  ${default_vendor_sort}
+  CACHE BOOL
+  "Use the vendor sort_by_key (oneDPL/Thrust/rocThrust) for the team_policy spatial sort when available. OFF forces the Kokkos::BinSort fallback, which sorts each SoA member in place (lower peak memory, no maxnpart gather buffer) at the cost of sort speed.")
 
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
@@ -169,53 +173,65 @@ if(${team_policy})
   # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case).
   add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}")
 
-  # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
-  # when found; falls back to Kokkos::BinSort otherwise.
-  if("${Kokkos_DEVICES}" MATCHES "SYCL")
-    find_package(oneDPL QUIET)
-    if(oneDPL_FOUND)
-      message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
-      add_compile_options("-D ONEDPL_ENABLED")
-      set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
-    else()
-      message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
-                     "for SYCL sort_by_key")
+  # Vendor sort: oneDPL on SYCL, Thrust on CUDA, rocThrust/rocprim on HIP.
+  # When `vendor_sort` is ON (default) the available library is detected
+  # and used; the spatial sort then builds a single permutation that
+  # gathers all SoA members. When `vendor_sort` is OFF, or no library is
+  # found, the code falls back to Kokkos::BinSort, which sorts each member
+  # in place -- lower peak memory and no maxnpart gather buffer, at the
+  # cost of sort speed (negligible when sorting is a small fraction of the
+  # step). The `vendor_sort` knob lets you force the BinSort fallback even
+  # when a vendor library is present.
+  if(${vendor_sort})
+    if("${Kokkos_DEVICES}" MATCHES "SYCL")
+      find_package(oneDPL QUIET)
+      if(oneDPL_FOUND)
+        message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
+        add_compile_options("-D ONEDPL_ENABLED")
+        set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
+      else()
+        message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
+                       "for SYCL sort_by_key")
+      endif()
     endif()
-  endif()
 
-  if("${Kokkos_DEVICES}" MATCHES "CUDA")
-    find_package(Thrust QUIET)
-    if(Thrust_FOUND)
-      message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
-      add_compile_options("-D THRUST_ENABLED")
-    else()
-      message(STATUS "team_policy: Thrust not found; using BinSort fallback "
-                     "for CUDA sort_by_key")
+    if("${Kokkos_DEVICES}" MATCHES "CUDA")
+      find_package(Thrust QUIET)
+      if(Thrust_FOUND)
+        message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
+        add_compile_options("-D THRUST_ENABLED")
+      else()
+        message(STATUS "team_policy: Thrust not found; using BinSort fallback "
+                       "for CUDA sort_by_key")
+      endif()
     endif()
-  endif()
 
-  if("${Kokkos_DEVICES}" MATCHES "HIP")
-    # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's
-    # bounded-bit radix sort directly (rocprim is rocThrust's own
-    # dependency, so its headers come in transitively; we find it
-    # explicitly to keep the include path robust). This builds a single
-    # permutation that gathers all SoA members, instead of the legacy
-    # per-member Kokkos::BinSort path which allocates a fresh
-    # `sorted_values` buffer for every member every step (the dominant
-    # source of allocator churn / fragmentation on ROCm).
-    find_package(rocthrust QUIET)
-    if(rocthrust_FOUND)
-      message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
-      add_compile_options("-D ROCTHRUST_ENABLED")
-      set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
-      find_package(rocprim QUIET)
-      if(rocprim_FOUND)
-        set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim)
+    if("${Kokkos_DEVICES}" MATCHES "HIP")
+      # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's
+      # bounded-bit radix sort directly (rocprim is rocThrust's own
+      # dependency, so its headers come in transitively; we find it
+      # explicitly to keep the include path robust). This builds a single
+      # permutation that gathers all SoA members, instead of the legacy
+      # per-member Kokkos::BinSort path which allocates a fresh
+      # `sorted_values` buffer for every member every step (the dominant
+      # source of allocator churn / fragmentation on ROCm).
+      find_package(rocthrust QUIET)
+      if(rocthrust_FOUND)
+        message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
+        add_compile_options("-D ROCTHRUST_ENABLED")
+        set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
+        find_package(rocprim QUIET)
+        if(rocprim_FOUND)
+          set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim)
+        endif()
+      else()
+        message(STATUS "team_policy: rocThrust not found; using BinSort "
+                       "fallback for HIP sort_by_key")
       endif()
-    else()
-      message(STATUS "team_policy: rocThrust not found; using BinSort "
-                     "fallback for HIP sort_by_key")
     endif()
+  else()
+    message(STATUS "team_policy: vendor_sort=OFF; forcing Kokkos::BinSort "
+                   "fallback for spatial sort_by_key")
   endif()
 endif()
 
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index 01427921f..c03ebaf81 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -104,6 +104,19 @@ else()
 endif()
 set_property(CACHE default_team_policy PROPERTY TYPE BOOL)
 
+if(DEFINED ENV{Entity_ENABLE_VENDOR_SORT})
+  set(default_vendor_sort
+      $ENV{Entity_ENABLE_VENDOR_SORT}
+      CACHE INTERNAL
+        "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)")
+else()
+  set(default_vendor_sort
+      ON
+      CACHE INTERNAL
+        "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)")
+endif()
+set_property(CACHE default_vendor_sort PROPERTY TYPE BOOL)
+
 set(default_team_policy_tile_size
     8
     CACHE INTERNAL "Default tile edge length in cells for team_policy")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index d90dfb085..d972fafc4 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -150,6 +150,15 @@ if(${team_policy})
     "${Blue}"
     TEAM_POLICY_DRIFT_REPORT
     46)
+  printchoices(
+    "Vendor sort"
+    "vendor_sort"
+    "${ON_OFF_VALUES}"
+    ${vendor_sort}
+    ON
+    "${Green}"
+    VENDOR_SORT_REPORT
+    46)
 endif()
 printchoices(
   "Debug mode"
@@ -230,6 +239,7 @@ string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
 if(${team_policy})
   string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
   string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_DRIFT_REPORT} "\n")
+  string(APPEND REPORT_TEXT "  " ${VENDOR_SORT_REPORT} "\n")
 endif()
 
 string(
diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp
index a4b10eee6..6113a89c7 100644
--- a/src/global/utils/reporter.cpp
+++ b/src/global/utils/reporter.cpp
@@ -253,6 +253,13 @@ namespace reporter {
 
 #if defined(TEAM_POLICY)
     AddParam(report, 4, "TEAM_POLICY", "%s", "ON");
+  #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                     \
+    (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+    (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))
+    AddParam(report, 4, "VENDOR_SORT", "%s", "ON");
+  #else
+    AddParam(report, 4, "VENDOR_SORT", "%s", "OFF (BinSort)");
+  #endif
 #else
     AddParam(report, 4, "TEAM_POLICY", "%s", "OFF");
 #endif

From af942c82eeda02c30e15004069caf83b57583dd4 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Wed, 24 Jun 2026 19:08:50 -0400
Subject: [PATCH 23/33] remove dead particles after every sort

---
 src/framework/containers/particles.h        |  8 ++++-
 src/framework/containers/particles_sort.cpp | 35 +++++++++++++++------
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index c3fa93137..fe4a0b17f 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -322,8 +322,14 @@ namespace ntt {
      *        Only compiled when a vendor sort backend is enabled; the
      *        BinSort path applies the permutation in place via
      *        `sorter.sort(view)` instead.
+     * @param perm Permutation: sorted position -> pre-sort slot index.
+     * @param n Number of leading (alive) particles to gather. Pass
+     *        `npart_partitioned` so only the alive set is moved into
+     *        `[0, n)` (the dead were binned to the sentinel tile and sort
+     *        to the tail); the caller then drops the dead tail via
+     *        `set_npart(n)`.
      */
-    void apply_permutation_to_soa(const prtl_perm_t& perm);
+    void apply_permutation_to_soa(const prtl_perm_t& perm, npart_t n);
 
   public:
 #endif
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 8261fe5b4..5e9ae4147 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -319,12 +319,12 @@ namespace ntt {
     const auto     slice  = prtl_slice_t(0, npart_local);
 
   #if defined(TEAM_POLICY_USE_VENDOR_SORT)
-    // Vendor path: produce an explicit permutation via sort_by_key,
-    // then apply it to each SoA member by gathering into a fresh
-    // full-capacity buffer and swapping the View handle in (no
-    // copy-back). The *_prev arrays are skipped — see
+    // Vendor path: produce an explicit permutation via sort_by_key, then
+    // apply it to each SoA member by gathering the alive prefix through a
+    // reusable scratch buffer (one per member type, sized to the alive
+    // count, copied back in place). The *_prev arrays are skipped — see
     // apply_permutation_to_soa. Peak transient = one
-    // `maxnpart × sizeof(member)` buffer at a time.
+    // `npart_partitioned × sizeof(member)` scratch at a time.
     prtl_perm_t perm { "tile_perm", npart_local };
     #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
     sort_helpers::sort_by_key_dispatch(tile_indices,
@@ -349,7 +349,14 @@ namespace ntt {
     compute_tile_offsets(tile_indices, total_tiles, npart_local);
     tile_indices = array_t<ncells_t*> {};
     Kokkos::fence("SortSpatially: pre-gather drain");
-    apply_permutation_to_soa(perm);
+    // Gather only the alive particles. The sort binned dead particles to
+    // the sentinel tile, so they occupy [npart_partitioned, npart_local)
+    // and `perm[0, npart_partitioned)` lists the alive slots in tile
+    // order. Restricting the gather to the alive count drops the dead in
+    // the same pass (no separate compaction), skips work on dead slots,
+    // and sizes the gather scratch to the alive count. The dead tail is
+    // released below via `set_npart`.
+    apply_permutation_to_soa(perm, m_tile_layout.npart_partitioned);
   #else
     // BinSort path: same mechanism as legacy SortSpatially (BinSort
     // allocates one temp View per `sorter.sort(view)` call and frees
@@ -411,6 +418,16 @@ namespace ntt {
     m_tile_layout.tile_perm          = prtl_perm_t {};
     m_is_sorted                      = true;
 
+    // Compact-on-sort: drop the dead tail now instead of waiting for the
+    // periodic RemoveDead. The sort parked dead particles in the sentinel
+    // bin at [npart_partitioned, npart()), and the alive set is exactly
+    // [0, npart_partitioned) — gathered into tile order above (vendor) or
+    // sorted in place (BinSort). Shrinking npart() here keeps it, and
+    // every subsequent sort's transient buffers, tracking the alive count
+    // instead of ratcheting up with dead slots between clearing intervals.
+    // (RemoveDead remains the compactor when spatial sorting is disabled.)
+    set_npart(m_tile_layout.npart_partitioned);
+
     Kokkos::fence("SortSpatially: end of team_policy path");
 #else  // !TEAM_POLICY — legacy in-place BinSort by global cell index
     const auto nx2         = grid.n_active(in::x2);
@@ -546,8 +563,8 @@ namespace ntt {
   } // namespace permute_helpers
 
   template <Dimension D, Coord::type C>
-  void Particles<D, C>::apply_permutation_to_soa(const prtl_perm_t& perm) {
-    const auto n = npart();
+  void Particles<D, C>::apply_permutation_to_soa(const prtl_perm_t& perm,
+                                                 npart_t            n) {
     if (n == 0u) {
       return;
     }
@@ -632,7 +649,7 @@ namespace ntt {
 #if defined(TEAM_POLICY_USE_VENDOR_SORT)
   #define APPLY_PERM_INSTANTIATE(D, C)                                         \
     template void Particles<D, C>::apply_permutation_to_soa(                   \
-      const prtl_perm_t&);
+      const prtl_perm_t&, npart_t);
 #else
   #define APPLY_PERM_INSTANTIATE(D, C)
 #endif

From e1de8ae56656b286bcfcfce474d7a17af8555a6a Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Wed, 24 Jun 2026 19:09:15 -0400
Subject: [PATCH 24/33] test update

---
 tests/framework/particles_sort.cpp | 44 ++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp
index 4e4881d98..a2b82456e 100644
--- a/tests/framework/particles_sort.cpp
+++ b/tests/framework/particles_sort.cpp
@@ -112,16 +112,30 @@ auto main(int argc, char* argv[]) -> int {
       // SortSpatially is order-by-tile, not order-by-cell: assert the
       // invariants that hold for any tile size rather than a hardwired
       // permutation. (1) alive particles form a prefix sorted by
-      // non-decreasing tile index; (2) dead particles (weight == -1) form
-      // the suffix; (3) every SoA member is permuted by the *same*
-      // permutation, so each alive slot still satisfies pld == f(weight);
-      // (4) no alive particle is lost. Only [0, npart) is defined after a
-      // sort — the swap-gather zero-fills [npart, maxnpart).
+      // non-decreasing tile index; (2) every SoA member is permuted by the
+      // *same* permutation, so each alive slot still satisfies
+      // pld == f(weight); (3) no alive particle is lost. Only [0, npart())
+      // is defined after a sort. The team_policy path compacts — it drops
+      // the dead, so npart() equals the alive count and [0, npart()) is
+      // entirely alive; the legacy (non-team) path keeps the dead as a
+      // weight == -1 suffix, leaving npart() unchanged. Iterating
+      // [0, npart()) exercises both: the prefix-sorted / no-alive-after-dead
+      // checks below hold either way.
+#if defined(TEAM_POLICY)
+      raise::ErrorIf(prtls.npart() != 59u,
+                     "team_policy sort must compact: npart() should equal "
+                     "the alive count",
+                     HERE);
+#else
+      raise::ErrorIf(prtls.npart() != 66u,
+                     "legacy sort should leave npart() unchanged",
+                     HERE);
+#endif
       bool     seen_dead   = false;
       bool     have_prev   = false;
       ncells_t prev_tile   = 0u;
       npart_t  n_alive_obs = 0u;
-      for (auto p { 0u }; p < 66u; ++p) {
+      for (auto p { 0u }; p < prtls.npart(); ++p) {
         if (tag_h(p) != ntt::ParticleTag::alive) {
           seen_dead = true;
           raise::ErrorIf(weight_h(p) != -1.0,
@@ -244,8 +258,10 @@ auto main(int argc, char* argv[]) -> int {
       Kokkos::deep_copy(weight_h, prtls.weight);
 
       // Same invariants as the 2D block (no payloads here): alive prefix
-      // sorted by non-decreasing tile index, dead (weight == -1) suffix,
-      // alive count preserved. T = 1 reproduces the legacy per-cell order.
+      // sorted by non-decreasing tile index, alive count preserved. The
+      // team_policy path compacts the dead away (npart() == alive count);
+      // the legacy path keeps them as a weight == -1 suffix. T = 1
+      // reproduces the legacy per-cell order.
 #if defined(TEAM_POLICY)
       const ncells_t T = static_cast<ncells_t>(TEAM_POLICY_TILE_SIZE);
 #else
@@ -261,11 +277,21 @@ auto main(int argc, char* argv[]) -> int {
                (static_cast<ncells_t>(c) / T);
       };
 
+#if defined(TEAM_POLICY)
+      raise::ErrorIf(prtls.npart() != 59u,
+                     "team_policy sort must compact: npart() should equal "
+                     "the alive count",
+                     HERE);
+#else
+      raise::ErrorIf(prtls.npart() != 66u,
+                     "legacy sort should leave npart() unchanged",
+                     HERE);
+#endif
       bool     seen_dead   = false;
       bool     have_prev   = false;
       ncells_t prev_tile   = 0u;
       npart_t  n_alive_obs = 0u;
-      for (auto p { 0u }; p < 66u; ++p) {
+      for (auto p { 0u }; p < prtls.npart(); ++p) {
         if (tag_h(p) != ntt::ParticleTag::alive) {
           seen_dead = true;
           raise::ErrorIf(weight_h(p) != -1.0,

From 592ff89012eaf3276bc6dc491193df9c329765e8 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Thu, 25 Jun 2026 11:11:47 -0400
Subject: [PATCH 25/33] generalized reduced exchange for current filters on any
 coordinate system

---
 src/engines/srpic/currents.h | 167 ++++++++++++++++++-----------------
 1 file changed, 88 insertions(+), 79 deletions(-)

diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 30eb9466d..dbb482e83 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -233,90 +233,99 @@ namespace ntt {
 
       const auto flds_bc = domain.mesh.flds_bc();
 
-      if constexpr (M::CoordType == Coord::Cartesian) {
-        // Reduced-exchange ghost-margin scheme. One halo exchange refreshes
-        // N_GHOSTS ghost layers — enough for N_GHOSTS passes of the 3-point
-        // binomial if each pass also recomputes the inner ghost layers it
-        // will need next. We therefore extend the launch range by a shrinking
-        // margin `m` into the ghost zone, but only on comm-refreshed sides
-        // (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell is
-        // interior physics. Physical-boundary ghosts are never written or
-        // refreshed, exactly as in the per-pass loop, so the result is
-        // identical for every BC — while doing one exchange per N_GHOSTS
-        // passes instead of one per pass. (Entering the loop the ghosts are
-        // valid to distance N_GHOSTS: srpic.hpp runs CommunicateFields(J)
-        // immediately before CurrentsFilter.)
-        const int  G = static_cast<int>(N_GHOSTS);
-        const auto comm_side = [](FldsBC b) {
-          return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC);
+      // Reduced-exchange ghost-margin scheme (all coordinate types). One halo
+      // exchange refreshes N_GHOSTS ghost layers — enough for N_GHOSTS passes
+      // of the 3-point binomial if each pass also recomputes the inner ghost
+      // layers it will need next. We therefore extend the launch range by a
+      // shrinking margin `m` into the ghost zone, but only on comm-refreshed
+      // sides (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell
+      // is interior physics. Physical-boundary ghosts are never written or
+      // refreshed, exactly as in the per-pass loop, so the result is identical
+      // for every BC — while doing one exchange per N_GHOSTS passes instead of
+      // one per pass. (Entering the loop the ghosts are valid to distance
+      // N_GHOSTS: srpic.hpp runs CommunicateFields(J) immediately before
+      // CurrentsFilter.)
+      //
+      // Non-Cartesian axis: the theta (x2) direction is self-contained in the
+      // filter kernel — the axis branches only ever read/write within
+      // [i2_min, i2_max] and never cross the axis — so the axis needs no halo
+      // exchange at all (the axis current fold is done once by
+      // SynchronizeFields(J) before this function). The single coordinate
+      // dependency is that the axis cell sits at i_max(x2), one past the active
+      // range, and must be filtered on every pass. We therefore add a fixed +1
+      // to the x2 upper bound when that side is AXIS — a physical boundary, so
+      // never the shrinking comm margin. This folds the old RangeWithAxisBCs
+      // fixup into make_range, letting the same loop serve every CoordType.
+      const int  G = static_cast<int>(N_GHOSTS);
+      const auto comm_side = [](FldsBC b) {
+        return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC);
+      };
+      bool ext_lo[3] = { false, false, false };
+      bool ext_hi[3] = { false, false, false };
+      for (auto d { 0 }; d < static_cast<int>(M::Dim); ++d) {
+        ext_lo[d] = comm_side(flds_bc[d].first);
+        ext_hi[d] = comm_side(flds_bc[d].second);
+      }
+      // AXIS at the upper x2 boundary needs the axis cell (i_max(x2)) included
+      // every pass; matches srpic::RangeWithAxisBCs. (The lower-x2 axis cell is
+      // already i_min(x2), so no fixup is needed there.)
+      bool axis_hi_x2 = false;
+      if constexpr (M::CoordType != Coord::Cartesian and
+                    (M::Dim == Dim::_2D or M::Dim == Dim::_3D)) {
+        axis_hi_x2 = (flds_bc[1].second == FldsBC::AXIS);
+      }
+      const auto make_range = [&](int m) -> range_t<M::Dim> {
+        const auto ml = [&](int d) -> ncells_t {
+          return ext_lo[d] ? static_cast<ncells_t>(m) : 0u;
         };
-        bool ext_lo[3] = { false, false, false };
-        bool ext_hi[3] = { false, false, false };
-        for (auto d { 0 }; d < static_cast<int>(M::Dim); ++d) {
-          ext_lo[d] = comm_side(flds_bc[d].first);
-          ext_hi[d] = comm_side(flds_bc[d].second);
-        }
-        const auto make_range = [&](int m) -> range_t<M::Dim> {
-          const auto ml = [&](int d) -> ncells_t {
-            return ext_lo[d] ? static_cast<ncells_t>(m) : 0u;
-          };
-          const auto mh = [&](int d) -> ncells_t {
-            return ext_hi[d] ? static_cast<ncells_t>(m) : 0u;
-          };
-          if constexpr (M::Dim == Dim::_1D) {
-            return CreateRangePolicy<Dim::_1D>(
-              { domain.mesh.i_min(in::x1) - ml(0) },
-              { domain.mesh.i_max(in::x1) + mh(0) });
-          } else if constexpr (M::Dim == Dim::_2D) {
-            return CreateRangePolicy<Dim::_2D>(
-              { domain.mesh.i_min(in::x1) - ml(0),
-                domain.mesh.i_min(in::x2) - ml(1) },
-              { domain.mesh.i_max(in::x1) + mh(0),
-                domain.mesh.i_max(in::x2) + mh(1) });
-          } else {
-            return CreateRangePolicy<Dim::_3D>(
-              { domain.mesh.i_min(in::x1) - ml(0),
-                domain.mesh.i_min(in::x2) - ml(1),
-                domain.mesh.i_min(in::x3) - ml(2) },
-              { domain.mesh.i_max(in::x1) + mh(0),
-                domain.mesh.i_max(in::x2) + mh(1),
-                domain.mesh.i_max(in::x3) + mh(2) });
+        const auto mh = [&](int d) -> ncells_t {
+          if (ext_hi[d]) {
+            return static_cast<ncells_t>(m);
           }
-        };
-        int m = G - 1;
-        for (auto i { 0u }; i < nfilter; ++i) {
-          Kokkos::parallel_for(
-            "CurrentsFilter",
-            make_range(m),
-            kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
-              domain.fields.buff,
-              domain.fields.cur,
-              size,
-              flds_bc));
-          std::swap(domain.fields.cur, domain.fields.buff);
-          --m;
-          if (m < 0 or i == nfilter - 1u) {
-            // refresh ghosts to distance G (and leave them valid for the
-            // downstream field solver after the final pass)
-            metadomain.CommunicateFields(domain, Comm::J);
-            m = G - 1;
+          // axis cell fixup (x2 == dimension index 1); mutually exclusive with
+          // the comm margin since AXIS is not a comm side
+          if (d == 1 and axis_hi_x2) {
+            return 1u;
           }
+          return 0u;
+        };
+        if constexpr (M::Dim == Dim::_1D) {
+          return CreateRangePolicy<Dim::_1D>(
+            { domain.mesh.i_min(in::x1) - ml(0) },
+            { domain.mesh.i_max(in::x1) + mh(0) });
+        } else if constexpr (M::Dim == Dim::_2D) {
+          return CreateRangePolicy<Dim::_2D>(
+            { domain.mesh.i_min(in::x1) - ml(0),
+              domain.mesh.i_min(in::x2) - ml(1) },
+            { domain.mesh.i_max(in::x1) + mh(0),
+              domain.mesh.i_max(in::x2) + mh(1) });
+        } else {
+          return CreateRangePolicy<Dim::_3D>(
+            { domain.mesh.i_min(in::x1) - ml(0),
+              domain.mesh.i_min(in::x2) - ml(1),
+              domain.mesh.i_min(in::x3) - ml(2) },
+            { domain.mesh.i_max(in::x1) + mh(0),
+              domain.mesh.i_max(in::x2) + mh(1),
+              domain.mesh.i_max(in::x3) + mh(2) });
         }
-      } else {
-        // Non-Cartesian (axis BCs need the +1 range fixup): keep the
-        // per-pass exchange cadence, still ping-ponging the buffers.
-        const auto range = srpic::RangeWithAxisBCs(domain);
-        for (auto i { 0u }; i < nfilter; ++i) {
-          Kokkos::parallel_for(
-            "CurrentsFilter",
-            range,
-            kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
-              domain.fields.buff,
-              domain.fields.cur,
-              size,
-              flds_bc));
-          std::swap(domain.fields.cur, domain.fields.buff);
+      };
+      int m = G - 1;
+      for (auto i { 0u }; i < nfilter; ++i) {
+        Kokkos::parallel_for(
+          "CurrentsFilter",
+          make_range(m),
+          kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
+            domain.fields.buff,
+            domain.fields.cur,
+            size,
+            flds_bc));
+        std::swap(domain.fields.cur, domain.fields.buff);
+        --m;
+        if (m < 0 or i == nfilter - 1u) {
+          // refresh ghosts to distance G (and leave them valid for the
+          // downstream field solver after the final pass)
           metadomain.CommunicateFields(domain, Comm::J);
+          m = G - 1;
         }
       }
     }

From 6e1e23e29b82e05a3c2593657925af239af36e30 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Thu, 25 Jun 2026 18:37:44 +0000
Subject: [PATCH 26/33] bugfix

---
 src/engines/srpic/currents.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index dbb482e83..23c57a903 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -47,6 +47,7 @@ namespace ntt {
                              dt));
     }
 
+#if defined(TEAM_POLICY)
     /**
      * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
      *
@@ -116,6 +117,7 @@ namespace ntt {
         Kokkos::Experimental::contribute(cur_nc, scatter_cur);
       }
     }
+#endif // TEAM_POLICY
 
     template <SRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,

From ff3ec19bcd463024fd5f78979d5135c120792228 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Thu, 25 Jun 2026 23:08:19 +0000
Subject: [PATCH 27/33] move definittion of `compute_tile_offsets` to `public`
 so it compiles under nvcc

---
 src/framework/containers/particles.h | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index fe4a0b17f..396b5aec8 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -99,18 +99,6 @@ namespace ntt {
     // vendor libraries detected by CMake.
     TileLayout<D> m_tile_layout {};
 
-#if defined(TEAM_POLICY)
-    // Build m_tile_layout.tile_offsets / npart_partitioned from the
-    // already-sorted tile-index keys. A separate member function (not a
-    // lambda local to SortSpatially) so the inner device kernel is not an
-    // extended __device__ lambda nested inside another lambda — which
-    // nvcc forbids. Lets the vendor path run the offsets pass and then
-    // release the keys before the SoA gather allocates its buffers.
-    void compute_tile_offsets(const array_t<ncells_t*>& tile_indices,
-                              ncells_t                  total_tiles,
-                              npart_t                   npart_local);
-#endif
-
   public:
     // for empty allocation
     Particles() {}
@@ -219,6 +207,20 @@ namespace ntt {
       return m_ntags;
     }
 
+#if defined(TEAM_POLICY)
+    // Build m_tile_layout.tile_offsets / npart_partitioned from the
+    // already-sorted tile-index keys. A separate member function (not a
+    // lambda local to SortSpatially) so the inner device kernel is not an
+    // extended __device__ lambda nested inside another lambda — which
+    // nvcc forbids. Lets the vendor path run the offsets pass and then
+    // release the keys before the SoA gather allocates its buffers.
+    // NOTE: must be public — nvcc forbids an extended __host__ __device__
+    // lambda inside a member function with private/protected access.
+    void compute_tile_offsets(const array_t<ncells_t*>& tile_indices,
+                              ncells_t                  total_tiles,
+                              npart_t                   npart_local);
+#endif
+
     [[nodiscard]]
     auto memory_footprint() const -> std::size_t {
       std::size_t footprint  = 0;

From 03ccd200e25baae4064e1a3e8e29ad5b2b0502f6 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 26 Jun 2026 01:02:05 +0000
Subject: [PATCH 28/33] added `team_policy_team_size` as a tunable runtime
 parameter

---
 input.example.toml                      |  8 +++++
 src/engines/engine.hpp                  | 13 ++++---
 src/engines/reporter.cpp                | 15 ++++++++
 src/engines/srpic/currents.h            | 48 ++++++++++++++++++++++---
 src/framework/parameters/algorithms.cpp |  7 ++++
 src/framework/parameters/algorithms.h   |  1 +
 src/global/defaults.h                   |  2 ++
 7 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/input.example.toml b/input.example.toml
index 741d5c2d2..87ca48f3c 100644
--- a/input.example.toml
+++ b/input.example.toml
@@ -301,6 +301,14 @@
     #   @type: bool
     #   @default: true
     enable = ""
+    # team_policy tiled-deposit work-group (team) size
+    #   @type: uint [>= 0]
+    #   @default: 0
+    #   @note: 0 keeps Kokkos::AUTO (backend occupancy heuristic); a positive
+    #          value overrides it, clamped to the backend/scratch maximum at
+    #          launch. Only used in `team_policy=ON` builds. Pick a multiple of
+    #          the device subgroup width for best occupancy (see ideal_tile_size.py)
+    team_policy_team_size = ""
 
     # @inferred:
     # - order
diff --git a/src/engines/engine.hpp b/src/engines/engine.hpp
index b20e163ca..057bfcb5a 100644
--- a/src/engines/engine.hpp
+++ b/src/engines/engine.hpp
@@ -78,10 +78,11 @@ namespace ntt {
     Metadomain<S, M> m_metadomain;
     user::PGen<S, M> m_pgen;
 
-    const bool       is_resuming;
-    const simtime_t  runtime;
-    const real_t     dt;
-    const timestep_t max_steps;
+    const bool        is_resuming;
+    const simtime_t   runtime;
+    const real_t      dt;
+    const std::size_t team_policy_team_size;
+    const timestep_t  max_steps;
     const timestep_t start_step;
     const simtime_t  start_time;
     simtime_t        time;
@@ -109,6 +110,8 @@ namespace ntt {
       , is_resuming { m_params.get<bool>("checkpoint.is_resuming") }
       , runtime { m_params.get<simtime_t>("simulation.runtime") }
       , dt { m_params.get<real_t>("algorithms.timestep.dt") }
+      , team_policy_team_size { m_params.get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size") }
       , max_steps { static_cast<timestep_t>(runtime / dt) }
       , start_step { m_params.get<timestep_t>("checkpoint.start_step") }
       , start_time { m_params.get<simtime_t>("checkpoint.start_time") }
@@ -127,6 +130,8 @@ namespace ntt {
       auto parameters = prm::Parameters {};
       parameters.set("dt", static_cast<real_t>(dt));
       parameters.set("time", static_cast<simtime_t>(time));
+      parameters.set("team_policy_team_size",
+                     static_cast<std::size_t>(team_policy_team_size));
       return parameters;
     }
   };
diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp
index 3caeda5de..fcd46fb81 100644
--- a/src/engines/reporter.cpp
+++ b/src/engines/reporter.cpp
@@ -34,6 +34,21 @@ namespace ntt {
     reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
 #if defined(TEAM_POLICY)
     reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
+  #if defined(TEAM_POLICY_DRIFT)
+    reporter::AddParam(report, 4, "Halo drift", "%d", TEAM_POLICY_DRIFT);
+  #endif
+    if (params.template get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size") == 0u) {
+      reporter::AddParam(report, 4, "Team size", "%s", "AUTO (Kokkos)");
+    } else {
+      reporter::AddParam(
+        report,
+        4,
+        "Team size",
+        "%d (requested; clamped to backend max at launch)",
+        static_cast<int>(params.template get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size")));
+    }
 #endif
     reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
 #if SHAPE_ORDER == 0
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 23c57a903..74946ed68 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -66,7 +66,8 @@ namespace ntt {
     void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
                                 const M&                    local_metric,
                                 const ndfield_t<M::Dim, 3>& cur,
-                                real_t                      dt) {
+                                real_t                      dt,
+                                int                         team_size_req) {
       static_assert(O <= 11u, "Shape order must be <= 11");
       constexpr unsigned short T = static_cast<unsigned short>(
         TEAM_POLICY_TILE_SIZE);
@@ -86,11 +87,41 @@ namespace ntt {
           dt,     layout,  species.npart()
         };
 
+      const auto scratch = Kokkos::PerTeam(
+        decltype(deposit_kernel)::scratch_bytes());
+
+      // Team (work-group) size. The default (team_size_req == 0) leaves
+      // Kokkos::AUTO, which sizes the team from the backend occupancy
+      // heuristic. A positive `algorithms.deposit.team_policy_team_size`
+      // overrides it, clamped to the scratch/backend-feasible maximum so an
+      // over-large request cannot abort the launch (Kokkos errors when
+      // team_size > team_size_max). No portable subgroup rounding is applied;
+      // pick a multiple of the device subgroup width (printed per arch by
+      // ideal_tile_size.py) for the best occupancy.
       Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
                                   Kokkos::AUTO);
-      policy.set_scratch_size(
-        0,
-        Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes()));
+      policy.set_scratch_size(0, scratch);
+      if (team_size_req > 0) {
+        const int ts_max = policy.team_size_max(deposit_kernel,
+                                                Kokkos::ParallelForTag {});
+        int       ts     = team_size_req;
+        if (ts > ts_max) {
+          raise::Warning(
+            fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds "
+                        "the tiled-deposit maximum %d on this backend; clamping "
+                        "to %d",
+                        team_size_req,
+                        ts_max,
+                        ts_max),
+            HERE);
+          ts = ts_max;
+        }
+        policy = Kokkos::TeamPolicy<>(static_cast<int>(layout.ntiles_total), ts);
+        policy.set_scratch_size(0, scratch);
+        logger::Checkpoint(
+          fmt::format("Tiled deposit: explicit team size %d", ts),
+          HERE);
+      }
       Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
 
       // Particles appended since the last sort (injection / MPI receive on a
@@ -126,6 +157,12 @@ namespace ntt {
       Kokkos::deep_copy(domain.fields.cur, ZERO);
 
 #if defined(TEAM_POLICY)
+      // Optional runtime override for the tiled-deposit team (work-group) size;
+      // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the
+      // launcher (see CallDepositKernelTiled).
+      const auto team_size_req = static_cast<int>(
+        engine_params.get<std::size_t>("team_policy_team_size",
+                                       std::optional<std::size_t> { 0u }));
 
       // Tiled deposit. Correctness no longer depends on the SoA being in a
       // "sorted" state at deposit time — the tiled kernel handles a stale
@@ -176,7 +213,8 @@ namespace ntt {
           CallDepositKernelTiled<M, SHAPE_ORDER>(species,
                                                  domain.mesh.metric,
                                                  domain.fields.cur,
-                                                 dt);
+                                                 dt,
+                                                 team_size_req);
         }
       }
 #else
diff --git a/src/framework/parameters/algorithms.cpp b/src/framework/parameters/algorithms.cpp
index 4766db965..87c77fbd9 100644
--- a/src/framework/parameters/algorithms.cpp
+++ b/src/framework/parameters/algorithms.cpp
@@ -33,6 +33,11 @@ namespace ntt {
 
       deposit_enable = toml::find_or(toml_data, "algorithms", "deposit", "enable", true);
       deposit_order = static_cast<unsigned short>(SHAPE_ORDER);
+      deposit_team_policy_team_size = toml::find_or(toml_data,
+                                                    "algorithms",
+                                                    "deposit",
+                                                    "team_policy_team_size",
+                                                    defaults::team_policy_team_size);
 
       fieldsolver_enable = toml::find_or(toml_data,
                                          "algorithms",
@@ -140,6 +145,8 @@ namespace ntt {
 
       params->set("algorithms.deposit.enable", deposit_enable.value());
       params->set("algorithms.deposit.order", deposit_order.value());
+      params->set("algorithms.deposit.team_policy_team_size",
+                  deposit_team_policy_team_size.value());
 
       params->set("algorithms.fieldsolver.enable", fieldsolver_enable.value());
       for (const auto& [key, value] : fieldsolver_stencil_coeffs.value()) {
diff --git a/src/framework/parameters/algorithms.h b/src/framework/parameters/algorithms.h
index 97edb244a..c46f480fe 100644
--- a/src/framework/parameters/algorithms.h
+++ b/src/framework/parameters/algorithms.h
@@ -34,6 +34,7 @@ namespace ntt {
 
       std::optional<bool>           deposit_enable;
       std::optional<unsigned short> deposit_order;
+      std::optional<std::size_t>    deposit_team_policy_team_size;
 
       std::optional<bool>                          fieldsolver_enable;
       std::optional<std::map<std::string, real_t>> fieldsolver_stencil_coeffs;
diff --git a/src/global/defaults.h b/src/global/defaults.h
index e1387677e..c4f9b67f4 100644
--- a/src/global/defaults.h
+++ b/src/global/defaults.h
@@ -22,6 +22,8 @@ namespace ntt::defaults {
 
   const unsigned short current_filters = 0;
 
+  const std::size_t team_policy_team_size = 0;
+
   const std::string em_pusher      = "Boris";
   const std::string ph_pusher      = "Photon";
   const timestep_t  clear_interval = 100;

From ca7307dd7ee3b19e0997dedbc6787131ed9556ed Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 26 Jun 2026 02:38:14 +0000
Subject: [PATCH 29/33] add script to compute ideal tile size

---
 ideal_tile_size.py | 955 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 955 insertions(+)
 create mode 100644 ideal_tile_size.py

diff --git a/ideal_tile_size.py b/ideal_tile_size.py
new file mode 100644
index 000000000..e216f5981
--- /dev/null
+++ b/ideal_tile_size.py
@@ -0,0 +1,955 @@
+#!/usr/bin/env python3
+"""Recommend the team tile size (T_TILE) for entity's tiled current-deposit kernel.
+
+Each GPU work-group (team) owns a TE^dim scratch tile in shared memory (SLM on Intel,
+LDS on AMD, shared mem on NVIDIA), accumulates its particles' currents into it, then
+flushes once to global memory.  With
+
+    TE   = T_TILE + 2*HALO
+    HALO = stencil_reach + drift   (stencil_reach = shape_order for Esirkepov, 2 for the
+                                    O==0 zigzag deposit; drift = the compile-time
+                                    `team_policy_drift` CMake knob, NOT the runtime
+                                    spatial_sorting_interval -- see currents_deposit.hpp)
+
+the tile size is squeezed by three competing pressures:
+
+  * Shared-memory capacity (HARD): TE^dim * ncomp * sizeof(real) must fit, ideally with
+    several work-groups resident per compute unit so latency is hidden.  Binds in 3D /
+    double precision / on AMD's 64 KiB LDS.
+  * Halo overhead (push LARGER): zero-fill + flush sweep the whole TE^dim tile, so a tiny
+    tile is almost all halo (1-(T/TE)^dim wasted).  Big HALO (infrequent sorts) makes this
+    worse and forces larger tiles.
+  * Particles per tile (push SMALLER): ppc*T^dim particles all atomic-add into one fixed
+    scratch tile -> SLM-atomic contention and load imbalance grow with tile size.
+
+Recommendation = the largest tile that respects the particle budget and shared-memory
+residency; if that tile would be mostly halo, it is grown (toward lower halo) up to the
+shared-memory limit.  This is a first-order model -- confirm by sweeping the entity knobs
+  -D team_policy_tile_size=<T> -D team_policy_drift=<D>  and re-profiling (see roofline/).
+The team (work-group) size defaults to Kokkos::AUTO; override it at runtime with the
+  [algorithms.deposit] team_policy_team_size = <N>   (0 = AUTO)
+toml knob -- clamped to the backend maximum at launch (engines/srpic/currents.h).
+
+Two ways to drive it:
+
+  * Interactive TUI (no arguments):
+        python ideal_tile_size.py
+
+  * Scriptable CLI (any argument):
+        python ideal_tile_size.py pvc --dim 2 --ppc 16
+        python ideal_tile_size.py amd --dim 3 --ppc 64
+        python ideal_tile_size.py all --dim 2 --ppc 16 --drift 4   # infrequent sorting
+"""
+import argparse
+import curses
+import os
+import sys
+from typing import Callable, List, Optional, Tuple
+
+# Shared-memory budget is the load-bearing number.  Per compute unit (Xe-core / SM / CU);
+# smem_wg_max is the largest single-work-group allocation.
+ARCH = {
+    "pvc":    dict(label="Intel Data Center GPU Max 1550 (PVC, Xe-HPC), per tile",
+                   smem_cu=128 * 1024, smem_wg_max=128 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=64, cu="Xe-core"),
+    "a100":   dict(label="NVIDIA A100 (Ampere)",
+                   smem_cu=164 * 1024, smem_wg_max=163 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=108, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "h100":   dict(label="NVIDIA H100 (Hopper)",
+                   smem_cu=228 * 1024, smem_wg_max=227 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=132, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "gh200":  dict(label="NVIDIA GH200 Grace Hopper (Hopper H100/H200 GPU)",
+                   smem_cu=228 * 1024, smem_wg_max=227 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=132, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "mi250x": dict(label="AMD Instinct MI250X (CDNA2), per GCD",
+                   smem_cu=64 * 1024, smem_wg_max=64 * 1024,
+                   subgroup=64, max_wg=1024, n_cu=110, cu="CU"),
+    "mi300x": dict(label="AMD Instinct MI300X (CDNA3)",
+                   smem_cu=64 * 1024, smem_wg_max=64 * 1024,
+                   subgroup=64, max_wg=1024, n_cu=304, cu="CU"),
+}
+ALIAS = {"nvidia": "h100", "amd": "mi300x", "intel": "pvc", "mi250": "mi250x", "mi300": "mi300x",
+         "gracehopper": "gh200", "grace-hopper": "gh200", "gh200x": "gh200"}
+PRECISION = {"single": 4, "double": 8}
+
+# arch choices offered in the TUI (canonical keys plus the meta-target "all")
+ARCH_CHOICES = list(ARCH.keys()) + ["all"]
+# "all" expands to one representative of each vendor (matches the CLI behaviour)
+ALL_ARCHS = ["pvc", "nvidia", "amd"]
+
+
+# ============================
+# core model (shared by TUI + CLI)
+# ============================
+
+class Settings:
+    """Tunable inputs for the tile-size model.
+
+    Attribute names match the argparse dest names so `recommend`/`report_lines` accept
+    either a Settings instance (TUI) or an argparse namespace (CLI) interchangeably.
+    """
+
+    def __init__(self):
+        self.arch = "pvc"          # an ARCH key or "all"
+        self.dim = 2               # 1 / 2 / 3
+        self.ppc = 16.0            # particles per cell (per species)
+        self.shape_order = 2       # entity shape_order
+        self.precision = "single"  # single / double
+        self.components = 3        # current-field components (J has 3)
+        self.drift = 1             # team_policy_drift: cells of drift the scratch halo absorbs
+                                   # (compile-time CMake knob, independent of spatial_sorting_interval)
+        self.target_resident = 2   # work-groups resident per compute unit
+        self.npart_cap = 1600.0    # particle-per-tile budget (contention / load-balance proxy)
+        self.halo_max = 0.70       # halo fraction above which the tile is grown
+        self.grid = 0              # cells per dim (0 disables the GPU-fill check)
+        self.balance_factor = 4    # min tiles per compute unit
+        self.min_tile = 4          # entity's team_policy_tile_sizes list starts at 4
+        self.max_tile = 64
+
+
+def resolve_arch(name):
+    key = ALIAS.get(name.lower(), name.lower())
+    if key not in ARCH:
+        raise SystemExit("unknown arch '%s'; choose from %s (or aliases %s)"
+                         % (name, ", ".join(ARCH), ", ".join(ALIAS)))
+    return key, ARCH[key]
+
+
+def recommend(hw, p):
+    """p: Settings or argparse namespace. Returns dict with rows, chosen row, binding."""
+    # Matches DepositCurrentsTiled_kernel: STENCIL_REACH = O for Esirkepov (O>=1),
+    # 2 for the O==0 zigzag deposit; HALO = STENCIL_REACH + TEAM_POLICY_DRIFT.
+    stencil_reach = 2 if p.shape_order == 0 else p.shape_order
+    halo = stencil_reach + p.drift
+    real = PRECISION[p.precision]
+    atoms_pp = p.components * (p.shape_order + 1) ** p.dim   # ~ useful atomics / particle
+
+    rows = []
+    for T in range(p.min_tile, p.max_tile + 1, 2):          # entity uses even tile sizes
+        TE = T + 2 * halo
+        scratch = TE ** p.dim * p.components * real
+        npart = p.ppc * T ** p.dim
+        halo_frac = 1.0 - (T / TE) ** p.dim
+        ovhd = (2.0 * p.components / p.ppc) * (TE / T) ** p.dim / atoms_pp  # zero+flush vs deposit
+        resident = int(hw["smem_cu"] // scratch) if scratch else 0
+        ntiles = (p.grid / T) ** p.dim if p.grid else None
+        rows.append(dict(T=T, TE=TE, scratch=scratch, npart=npart, halo_frac=halo_frac,
+                         ovhd=ovhd, resident=resident, ntiles=ntiles,
+                         ok_cap=scratch <= hw["smem_wg_max"]))
+
+    capfeas = [r for r in rows if r["ok_cap"]]
+    if not capfeas:
+        return dict(halo=halo, rows=rows, chosen=None, binding=None, grown=False)
+
+    def largest(pred, default):
+        ts = [r["T"] for r in capfeas if pred(r)]
+        return max(ts) if ts else default
+
+    T_cap = max(r["T"] for r in capfeas)
+    T_res = largest(lambda r: r["resident"] >= p.target_resident, p.min_tile)
+    T_np = largest(lambda r: r["npart"] <= p.npart_cap, p.min_tile)
+    T_bal = largest(lambda r: r["ntiles"] is None or r["ntiles"] >= p.balance_factor * hw["n_cu"], p.min_tile)
+    bounds = {"shared-memory capacity": T_cap, "shared-memory residency": T_res,
+              "GPU fill (too few tiles)": T_bal, "particle budget per tile": T_np}
+
+    chosen_T = max(min(bounds.values()), p.min_tile)
+    binding = min(bounds, key=lambda k: bounds[k])
+
+    # If the particle-budget pick is mostly halo, grow the tile to cut halo, but never
+    # past what shared memory / GPU-fill allow (that just trades halo for contention).
+    grown = False
+    cur = next(r for r in capfeas if r["T"] == chosen_T)
+    if cur["halo_frac"] > p.halo_max:
+        halo_ok = [r["T"] for r in capfeas if r["halo_frac"] <= p.halo_max]
+        ceil_T = min(T_res, T_bal, T_cap)
+        if halo_ok:
+            target = max(min(halo_ok), chosen_T)        # smallest tile that clears halo_max
+            new_T = min(max(target, chosen_T), ceil_T)
+            if new_T > chosen_T:
+                chosen_T, grown = new_T, True
+                binding = ("halo overhead" if min(halo_ok) <= ceil_T
+                           else min({k: v for k, v in bounds.items()
+                                     if k != "particle budget per tile"},
+                                    key=lambda k: bounds[k]))
+        else:
+            new_T = min(ceil_T, T_cap)                  # can't clear halo_max at all -> go as big as SLM allows
+            if new_T > chosen_T:
+                chosen_T, grown = new_T, True
+                binding = "shared-memory residency"
+
+    chosen = next(r for r in capfeas if r["T"] == chosen_T)
+    return dict(halo=halo, rows=rows, chosen=chosen, binding=binding, grown=grown, atoms_pp=atoms_pp)
+
+
+def kib(b):
+    return "%.1f" % (b / 1024.0)
+
+
+def report_lines(name, key, hw, p, res):
+    """Build the recommendation report as a list of text lines (no printing)."""
+    L = []
+    L.append("=" * 80)
+    L.append("%s   [preset: %s]" % (hw["label"], key))
+    L.append("  dim=%d  ppc=%g  shape_order=%d  precision=%s(%dB)  J-components=%d"
+             % (p.dim, p.ppc, p.shape_order, p.precision, PRECISION[p.precision], p.components))
+    reach = 2 if p.shape_order == 0 else p.shape_order
+    reach_kind = "zigzag" if p.shape_order == 0 else "Esirkepov O"
+    L.append("  HALO = stencil_reach + drift = %d + %d = %d   ->   TE = T_TILE + %d"
+             "   (reach %d = %s; drift = team_policy_drift)"
+             % (reach, p.drift, res["halo"], 2 * res["halo"], reach, reach_kind))
+    L.append("  shared mem %s KiB/%s (budget %s KiB for %d resident WGs); subgroup=%d, n_cu=%d"
+             % (kib(hw["smem_cu"]), hw["cu"], kib(hw["smem_cu"] / p.target_resident),
+                p.target_resident, hw["subgroup"], hw["n_cu"]))
+    if hw.get("note"):
+        L.append("  note: %s" % hw["note"])
+    L.append("-" * 80)
+    L.append("  T_TILE  TE   scratch  resWG  part/tile  halo%  zero+flush%")
+    for r in res["rows"]:
+        if not r["ok_cap"]:
+            continue
+        mark = "  <== recommended" if r is res["chosen"] else ""
+        L.append("   %3d   %4d  %6s K  %4d  %9.0f  %4.0f   %7.1f%s"
+                 % (r["T"], r["TE"], kib(r["scratch"]), r["resident"], r["npart"],
+                    100 * r["halo_frac"], 100 * r["ovhd"], mark))
+    L.append("-" * 80)
+
+    c = res["chosen"]
+    if c is None:
+        L.append("  INFEASIBLE: even T_TILE=%d does not fit %s KiB shared memory."
+                 % (p.min_tile, kib(hw["smem_wg_max"])))
+        L.append("  -> sort more often (smaller drift), use precision single, lower shape_order,")
+        L.append("     or use a non-tiled (global-atomic / ScatterView) deposit on this arch.")
+        return L
+    L.append("  RECOMMENDED  T_TILE = %d   (limited by: %s%s)"
+             % (c["T"], res["binding"], "; tile grown to reduce halo" if res["grown"] else ""))
+    L.append("    %.1f KiB scratch/team, %d work-groups resident/%s, %.0f particles/team, %.0f%% halo"
+             % (c["scratch"] / 1024.0, c["resident"], hw["cu"], c["npart"], 100 * c["halo_frac"]))
+    team = min(hw["max_wg"], 256 - 256 % hw["subgroup"])
+    extra = "" if c["T"] <= 16 else "   (entity's team_policy_tile_sizes list stops at 16; extend it)"
+    L.append("    entity build:  -D team_policy=ON -D team_policy_tile_size=%d -D team_policy_drift=%d%s"
+             % (min(c["T"], 16), p.drift, extra))
+    L.append("    team (work-group) size: Kokkos::AUTO by default; to override, set in the toml")
+    L.append("    [algorithms.deposit] team_policy_team_size = %d   (0 = AUTO; keep a multiple of"
+             % team)
+    L.append("    subgroup=%d), then sweep around it and re-profile" % hw["subgroup"])
+    # contextual guidance
+    if c["halo_frac"] > p.halo_max:
+        if p.drift > 1:
+            L.append("    !! %.0f%% of the tile is halo, inflated by team_policy_drift=%d; lower it "
+                     "(and sort at least that often via spatial_sorting_interval)"
+                     % (100 * c["halo_frac"], p.drift))
+        else:
+            L.append("    !! %.0f%% halo is intrinsic at this size (shared memory caps the tile here)"
+                     % (100 * c["halo_frac"]))
+    if c["npart"] > p.npart_cap:
+        L.append("    !! %.0f particles/team exceeds the %.0f budget -> watch SLM-atomic contention"
+                 % (c["npart"], p.npart_cap))
+    if c["resident"] < p.target_resident:
+        L.append("    !! only %d work-group(s) resident/%s -> limited latency hiding"
+                 % (c["resident"], hw["cu"]))
+    return L
+
+
+def archs_for(name):
+    """Expand a setting/arg value into the list of arch names to report on."""
+    return ALL_ARCHS if name.lower() == "all" else [name]
+
+
+def build_report(p):
+    """Run the model for the selected arch(es) and return the full report as lines."""
+    lines = []
+    for a in archs_for(p.arch):
+        try:
+            key, hw = resolve_arch(a)
+        except SystemExit as e:
+            lines.append(str(e))
+            continue
+        lines.extend(report_lines(a, key, hw, p, recommend(hw, p)))
+    lines.append("=" * 80)
+    return lines
+
+
+# ============================
+# colors: edit these
+# ============================
+
+COLOR_TITLE_FG = curses.COLOR_BLUE
+COLOR_TEXT_FG = curses.COLOR_WHITE
+COLOR_SELECTED_FG = curses.COLOR_WHITE
+COLOR_SELECTED_BG = curses.COLOR_BLACK
+COLOR_HINT_FG = curses.COLOR_YELLOW
+COLOR_OK_FG = curses.COLOR_GREEN
+COLOR_ERR_FG = curses.COLOR_RED
+COLOR_KEY_FG = curses.COLOR_MAGENTA
+COLOR_DIM_FG = curses.COLOR_CYAN
+
+PAIR_TITLE = 1
+PAIR_TEXT = 2
+PAIR_SELECTED = 3
+PAIR_HINT = 4
+PAIR_OK = 5
+PAIR_ERR = 6
+PAIR_KEY = 7
+PAIR_DIM = 8
+
+
+class MenuItem:
+    def __init__(self, label, hint="", right=None, on_enter=None,
+                 on_space=None, disabled=None):
+        self.label = label
+        self.hint = hint
+        self.right = right
+        self.on_enter = on_enter
+        self.on_space = on_space
+        self.disabled = disabled
+
+
+# ============================
+# TUI
+# ============================
+
+class App:
+    def __init__(self, stdscr):
+        self.stdscr = stdscr
+        self.s = Settings()
+
+        self.state = "mainmenu"
+        self.stack: List[Tuple[str, int]] = []
+        self.selected = 0
+        self.scroll = 0
+        self.message = "use arrows or j/k"
+
+        self._init_curses()
+
+    def _init_curses(self) -> None:
+        curses.curs_set(0)
+        self.stdscr.keypad(True)
+        curses.noecho()
+        curses.cbreak()
+
+        if curses.has_colors():
+            curses.start_color()
+            curses.use_default_colors()
+            curses.init_pair(PAIR_TITLE, COLOR_TITLE_FG, -1)
+            curses.init_pair(PAIR_TEXT, COLOR_TEXT_FG, -1)
+            curses.init_pair(PAIR_SELECTED, COLOR_SELECTED_FG, COLOR_SELECTED_BG)
+            curses.init_pair(PAIR_HINT, COLOR_HINT_FG, -1)
+            curses.init_pair(PAIR_OK, COLOR_OK_FG, -1)
+            curses.init_pair(PAIR_ERR, COLOR_ERR_FG, -1)
+            curses.init_pair(PAIR_KEY, COLOR_KEY_FG, -1)
+            curses.init_pair(PAIR_DIM, COLOR_DIM_FG, -1)
+
+    def cp(self, pair_id: int) -> int:
+        return curses.color_pair(pair_id) if curses.has_colors() else 0
+
+    # ----- formatting helpers -----
+
+    def arch_label(self) -> str:
+        if self.s.arch == "all":
+            return "all (%s)" % " + ".join(ALL_ARCHS)
+        try:
+            return ARCH[resolve_arch(self.s.arch)[0]]["label"]
+        except SystemExit:
+            return self.s.arch
+
+    # ----- nav stack -----
+
+    def push(self, st: str) -> None:
+        self.stack.append((self.state, self.selected))
+        self.state = st
+        self.selected = 0
+        self.scroll = 0
+        self.message = ""
+
+    def pop(self) -> None:
+        if self.stack:
+            self.state, self.selected = self.stack.pop()
+        else:
+            self.state, self.selected = "mainmenu", 0
+        self.scroll = 0
+        self.message = ""
+
+    # ----- drawing -----
+
+    def add(self, y: int, x: int, s: str, attr: int = 0) -> None:
+        try:
+            self.stdscr.addstr(y, x, s, attr)
+        except curses.error:
+            pass
+
+    def hline(self, y: int) -> None:
+        _, w = self.stdscr.getmaxyx()
+        try:
+            self.stdscr.hline(y, 0, curses.ACS_HLINE, max(0, w - 1))
+        except curses.error:
+            pass
+
+    def draw_keybar(self, y: int, x: int, pairs: List[Tuple[str, str]]) -> None:
+        cur_x = x
+        for key, action in pairs:
+            self.add(y, cur_x, key, self.cp(PAIR_KEY) | curses.A_BOLD)
+            cur_x += len(key)
+            self.add(y, cur_x, " ", self.cp(PAIR_DIM))
+            cur_x += 1
+            self.add(y, cur_x, action, self.cp(PAIR_HINT))
+            cur_x += len(action)
+            self.add(y, cur_x, "   ", self.cp(PAIR_DIM))
+            cur_x += 3
+
+    def breadcrumb(self) -> str:
+        return {
+            "mainmenu": "mainmenu",
+            "arch": "mainmenu > architecture",
+            "physics": "mainmenu > physics & particles",
+            "tuning": "mainmenu > tuning knobs",
+        }.get(self.state, "mainmenu")
+
+    def draw_menu(self, title: str, prompt: str, items: List[MenuItem]) -> None:
+        self.stdscr.erase()
+        h, w = self.stdscr.getmaxyx()
+
+        self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD)
+        bc = self.breadcrumb()
+        self.add(0, max(2, w - 2 - len(bc)), bc, self.cp(PAIR_DIM))
+
+        self.draw_keybar(
+            1,
+            2,
+            [
+                ("up/dn/j/k", "move"),
+                ("enter", "select"),
+                ("space", "toggle/cycle"),
+                ("b", "back"),
+                ("q", "quit"),
+            ],
+        )
+        self.hline(2)
+
+        status1 = ("arch: %s    dim: %d    precision: %s    ppc: %g"
+                   % (self.s.arch, self.s.dim, self.s.precision, self.s.ppc))
+        status2 = ("shape_order: %d    drift: %d    components: %d    tile range: %d-%d"
+                   % (self.s.shape_order, self.s.drift, self.s.components,
+                      self.s.min_tile, self.s.max_tile))
+        self.add(3, 2, status1[: w - 4], self.cp(PAIR_TEXT))
+        self.add(4, 2, status2[: w - 4], self.cp(PAIR_TEXT))
+        self.hline(5)
+
+        self.add(6, 2, prompt[: w - 4], self.cp(PAIR_TEXT) | curses.A_BOLD)
+
+        list_y = 8
+        footer_h = 3
+        view_h = max(1, h - list_y - footer_h)
+        n = len(items)
+
+        if n == 0:
+            self.add(list_y, 2, "(empty)", self.cp(PAIR_HINT))
+        else:
+            self.selected = max(0, min(self.selected, n - 1))
+
+            if self.selected < self.scroll:
+                self.scroll = self.selected
+            if self.selected >= self.scroll + view_h:
+                self.scroll = self.selected - view_h + 1
+            self.scroll = max(0, min(self.scroll, max(0, n - view_h)))
+
+            shown = items[self.scroll : self.scroll + view_h]
+
+            for i, it in enumerate(shown):
+                idx = self.scroll + i
+                sel = idx == self.selected
+                dis = bool(it.disabled and it.disabled())
+
+                row_attr = (
+                    self.cp(PAIR_SELECTED) | curses.A_BOLD
+                    if sel
+                    else (self.cp(PAIR_DIM) if dis else self.cp(PAIR_TEXT))
+                )
+                self.add(list_y + i, 2, ("  %s" % it.label)[: w - 4], row_attr)
+
+                if it.right:
+                    rt = (it.right() or "").strip()
+                    if rt:
+                        rt = rt[: max(0, w - 6)]
+                        x = max(2, w - 2 - len(rt))
+                        rt_attr = (
+                            row_attr
+                            if sel
+                            else (self.cp(PAIR_HINT) if not dis else self.cp(PAIR_DIM))
+                        )
+                        self.add(list_y + i, x, rt, rt_attr)
+
+                if sel and it.hint:
+                    self.add(
+                        list_y + i,
+                        min(w - 4, 30),
+                        ("    %s" % it.hint)[: w - 4],
+                        self.cp(PAIR_HINT),
+                    )
+
+        self.hline(h - 3)
+        msg = self.message or ""
+        if msg:
+            is_err = msg.startswith("error")
+            attr = (self.cp(PAIR_ERR) if is_err else self.cp(PAIR_OK)) | curses.A_BOLD
+            self.add(h - 2, 2, msg[: w - 4], attr)
+        self.stdscr.refresh()
+
+    # ----- modals -----
+
+    def input_box(self, title: str, prompt: str, initial: str) -> Optional[str]:
+        h, w = self.stdscr.getmaxyx()
+        win_h, win_w = 9, min(86, max(46, w - 6))
+        top, left = max(0, (h - win_h) // 2), max(0, (w - win_w) // 2)
+
+        win = curses.newwin(win_h, win_w, top, left)
+        win.keypad(True)
+        win.border()
+
+        win.addstr(1, 2, title[: win_w - 4], self.cp(PAIR_TITLE) | curses.A_BOLD)
+        win.addstr(2, 2, prompt[: win_w - 4], self.cp(PAIR_TEXT))
+
+        buf = list(initial)
+        curses.curs_set(1)
+
+        while True:
+            win.addstr(4, 2, " " * (win_w - 4), self.cp(PAIR_TEXT))
+            text = "".join(buf)
+            if len(text) > win_w - 4:
+                text = text[-(win_w - 4) :]
+            win.addstr(4, 2, text, self.cp(PAIR_TEXT) | curses.A_BOLD)
+            win.addstr(6, 2, "enter=ok   esc=cancel", self.cp(PAIR_DIM))
+            win.refresh()
+
+            ch = win.getch()
+            if ch == 27:
+                curses.curs_set(0)
+                return None
+            if ch in (curses.KEY_ENTER, 10, 13):
+                curses.curs_set(0)
+                return "".join(buf).strip()
+            if ch in (curses.KEY_BACKSPACE, 127, 8):
+                if buf:
+                    buf.pop()
+            elif 32 <= ch <= 126:
+                buf.append(chr(ch))
+
+    # ----- value editors -----
+
+    def cycle_attr(self, attr: str, options: list) -> None:
+        cur = getattr(self.s, attr)
+        if cur not in options:
+            setattr(self.s, attr, options[0])
+        else:
+            setattr(self.s, attr, options[(options.index(cur) + 1) % len(options)])
+
+    def edit_int(self, label: str, attr: str, minv: Optional[int] = None) -> None:
+        val = self.input_box(label, "enter an integer:", str(getattr(self.s, attr)))
+        if val is None or val == "":
+            return
+        try:
+            n = int(val)
+        except ValueError:
+            self.message = "error: '%s' is not an integer" % val
+            return
+        if minv is not None and n < minv:
+            self.message = "error: %s must be >= %d" % (label, minv)
+            return
+        setattr(self.s, attr, n)
+        self.message = "%s = %d" % (label, n)
+
+    def edit_float(self, label: str, attr: str,
+                   minv: Optional[float] = None, maxv: Optional[float] = None) -> None:
+        val = self.input_box(label, "enter a number:", str(getattr(self.s, attr)))
+        if val is None or val == "":
+            return
+        try:
+            x = float(val)
+        except ValueError:
+            self.message = "error: '%s' is not a number" % val
+            return
+        if minv is not None and x < minv:
+            self.message = "error: %s must be >= %g" % (label, minv)
+            return
+        if maxv is not None and x > maxv:
+            self.message = "error: %s must be <= %g" % (label, maxv)
+            return
+        setattr(self.s, attr, x)
+        self.message = "%s = %g" % (label, x)
+
+    # ----- report pager -----
+
+    def _pager_attr(self, ln: str) -> int:
+        s = ln.strip()
+        if "RECOMMENDED" in ln:
+            return self.cp(PAIR_OK) | curses.A_BOLD
+        if "<== recommended" in ln:
+            return self.cp(PAIR_OK)
+        if "INFEASIBLE" in ln or "unknown arch" in ln:
+            return self.cp(PAIR_ERR) | curses.A_BOLD
+        if s.startswith("!!"):
+            return self.cp(PAIR_HINT)
+        if s.startswith("T_TILE"):
+            return self.cp(PAIR_TITLE) | curses.A_BOLD
+        if set(s) <= {"=", "-"} and s:
+            return self.cp(PAIR_DIM)
+        return self.cp(PAIR_TEXT)
+
+    def pager(self, title: str, lines: List[str]) -> None:
+        top = 0
+        note = ""
+        while True:
+            self.stdscr.erase()
+            h, w = self.stdscr.getmaxyx()
+
+            self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD)
+            self.draw_keybar(
+                1,
+                2,
+                [
+                    ("up/dn/j/k", "scroll"),
+                    ("PgUp/PgDn", "page"),
+                    ("g/G", "top/end"),
+                    ("w", "save"),
+                    ("b/q", "back"),
+                ],
+            )
+            self.hline(2)
+
+            list_y = 4
+            view_h = max(1, h - list_y - 2)
+            n = len(lines)
+            top = max(0, min(top, max(0, n - view_h)))
+
+            for i, ln in enumerate(lines[top : top + view_h]):
+                self.add(list_y + i, 2, ln[: w - 3], self._pager_attr(ln))
+
+            self.hline(h - 2)
+            footer = "line %d-%d / %d" % (top + 1, min(top + view_h, n), n)
+            if note:
+                footer += "    " + note
+            self.add(h - 1, 2, footer[: w - 4], self.cp(PAIR_DIM))
+            self.stdscr.refresh()
+
+            ch = self.stdscr.getch()
+            if ch in (ord("q"), ord("Q"), ord("b"), 8, 127):
+                return
+            if ch in (curses.KEY_UP, ord("k"), ord("K")):
+                top -= 1
+            elif ch in (curses.KEY_DOWN, ord("j"), ord("J")):
+                top += 1
+            elif ch in (curses.KEY_PPAGE,):
+                top -= view_h
+            elif ch in (curses.KEY_NPAGE, ord(" ")):
+                top += view_h
+            elif ch in (ord("g"),):
+                top = 0
+            elif ch in (ord("G"),):
+                top = n
+            elif ch in (ord("w"), ord("W")):
+                note = self._save_report(lines)
+
+    def _save_report(self, lines: List[str]) -> str:
+        path = os.path.join(os.getcwd(), "ideal_tile_size_report.txt")
+        try:
+            with open(path, "w") as f:
+                f.write("\n".join(lines) + "\n")
+            return "saved to %s" % path
+        except OSError as e:
+            return "save failed: %s" % e
+
+    def do_compute(self) -> None:
+        self.pager("recommendation  [%s]" % self.s.arch, build_report(self.s))
+
+    def reset(self) -> None:
+        self.s = Settings()
+        self.message = "reset to defaults"
+
+    # ----- menus -----
+
+    def menu_main(self) -> Tuple[str, str, List[MenuItem]]:
+        return (
+            "entity tile-size advisor",
+            "main menu:",
+            [
+                MenuItem(
+                    "architecture",
+                    "choose the target GPU (or 'all')",
+                    right=self.arch_label,
+                    on_enter=lambda: self.push("arch"),
+                ),
+                MenuItem(
+                    "physics & particles",
+                    "dim, ppc, shape order, precision, ...",
+                    right=lambda: "dim %d / ppc %g / so %d / %s"
+                    % (self.s.dim, self.s.ppc, self.s.shape_order, self.s.precision),
+                    on_enter=lambda: self.push("physics"),
+                ),
+                MenuItem(
+                    "tuning knobs",
+                    "residency, budgets, tile range, ...",
+                    right=lambda: "res %d / cap %g / halo %.2f"
+                    % (self.s.target_resident, self.s.npart_cap, self.s.halo_max),
+                    on_enter=lambda: self.push("tuning"),
+                ),
+                MenuItem(
+                    "compute recommendation",
+                    "run the model and show the report",
+                    on_enter=self.do_compute,
+                ),
+                MenuItem("reset to defaults", "", on_enter=self.reset),
+                MenuItem("exit", "", on_enter=lambda: setattr(self, "state", "exit")),
+            ],
+        )
+
+    def menu_arch(self) -> Tuple[str, str, List[MenuItem]]:
+        def choose(name: str):
+            self.s.arch = name
+            self.pop()
+
+        def label(name: str) -> str:
+            mark = "(*)" if name == self.s.arch else "( )"
+            return "%s %s" % (mark, name)
+
+        def right(name: str) -> str:
+            if name == "all":
+                return " + ".join(ALL_ARCHS)
+            return ARCH[name]["label"]
+
+        items = [
+            MenuItem(
+                label(a),
+                "select target",
+                right=(lambda a=a: right(a)),
+                on_enter=(lambda a=a: choose(a)),
+            )
+            for a in ARCH_CHOICES
+        ]
+        items.append(MenuItem("back", "return", on_enter=self.pop))
+        return ("architecture", "pick the target GPU:", items)
+
+    def menu_physics(self) -> Tuple[str, str, List[MenuItem]]:
+        def cyc_dim():
+            self.cycle_attr("dim", [1, 2, 3])
+
+        def cyc_prec():
+            self.cycle_attr("precision", ["single", "double"])
+
+        return (
+            "physics & particles",
+            "set physical inputs:",
+            [
+                MenuItem(
+                    "dim",
+                    "space cycles: 1 / 2 / 3",
+                    right=lambda: str(self.s.dim),
+                    on_enter=cyc_dim,
+                    on_space=cyc_dim,
+                ),
+                MenuItem(
+                    "ppc",
+                    "particles per cell (per species)",
+                    right=lambda: "%g" % self.s.ppc,
+                    on_enter=lambda: self.edit_float("ppc", "ppc", minv=0.0),
+                ),
+                MenuItem(
+                    "shape_order",
+                    "entity particle shape order",
+                    right=lambda: str(self.s.shape_order),
+                    on_enter=lambda: self.edit_int("shape_order", "shape_order", minv=0),
+                ),
+                MenuItem(
+                    "precision",
+                    "space cycles: single / double",
+                    right=lambda: self.s.precision,
+                    on_enter=cyc_prec,
+                    on_space=cyc_prec,
+                ),
+                MenuItem(
+                    "components",
+                    "current-field components (J has 3)",
+                    right=lambda: str(self.s.components),
+                    on_enter=lambda: self.edit_int("components", "components", minv=1),
+                ),
+                MenuItem(
+                    "drift",
+                    "team_policy_drift CMake knob: cells the scratch halo absorbs (>= spatial_sorting_interval)",
+                    right=lambda: str(self.s.drift),
+                    on_enter=lambda: self.edit_int("drift", "drift", minv=0),
+                ),
+                MenuItem("back", "return", on_enter=self.pop),
+            ],
+        )
+
+    def menu_tuning(self) -> Tuple[str, str, List[MenuItem]]:
+        return (
+            "tuning knobs",
+            "set the model's budgets and ranges:",
+            [
+                MenuItem(
+                    "target_resident",
+                    "work-groups resident per compute unit",
+                    right=lambda: str(self.s.target_resident),
+                    on_enter=lambda: self.edit_int("target_resident", "target_resident", minv=1),
+                ),
+                MenuItem(
+                    "npart_cap",
+                    "particle-per-tile budget (contention proxy)",
+                    right=lambda: "%g" % self.s.npart_cap,
+                    on_enter=lambda: self.edit_float("npart_cap", "npart_cap", minv=1.0),
+                ),
+                MenuItem(
+                    "halo_max",
+                    "halo fraction above which the tile is grown (0..1)",
+                    right=lambda: "%.2f" % self.s.halo_max,
+                    on_enter=lambda: self.edit_float("halo_max", "halo_max", minv=0.0, maxv=1.0),
+                ),
+                MenuItem(
+                    "grid",
+                    "cells per dim (0 disables GPU-fill check)",
+                    right=lambda: str(self.s.grid),
+                    on_enter=lambda: self.edit_int("grid", "grid", minv=0),
+                ),
+                MenuItem(
+                    "balance_factor",
+                    "min tiles per compute unit",
+                    right=lambda: str(self.s.balance_factor),
+                    on_enter=lambda: self.edit_int("balance_factor", "balance_factor", minv=1),
+                ),
+                MenuItem(
+                    "min_tile",
+                    "smallest T_TILE to consider",
+                    right=lambda: str(self.s.min_tile),
+                    on_enter=lambda: self.edit_int("min_tile", "min_tile", minv=1),
+                ),
+                MenuItem(
+                    "max_tile",
+                    "largest T_TILE to consider",
+                    right=lambda: str(self.s.max_tile),
+                    on_enter=lambda: self.edit_int("max_tile", "max_tile", minv=1),
+                ),
+                MenuItem("back", "return", on_enter=self.pop),
+            ],
+        )
+
+    def get_menu(self) -> Tuple[str, str, List[MenuItem]]:
+        if self.state == "mainmenu":
+            return self.menu_main()
+        if self.state == "arch":
+            return self.menu_arch()
+        if self.state == "physics":
+            return self.menu_physics()
+        if self.state == "tuning":
+            return self.menu_tuning()
+        self.state = "mainmenu"
+        return self.menu_main()
+
+    # ----- navigation -----
+
+    def is_disabled(self, it: MenuItem) -> bool:
+        return bool(it.disabled and it.disabled())
+
+    def move_sel(self, items: List[MenuItem], delta: int) -> None:
+        if not items:
+            return
+        n = len(items)
+        start = self.selected
+        for _ in range(n):
+            self.selected = (self.selected + delta) % n
+            if not self.is_disabled(items[self.selected]):
+                return
+        self.selected = start
+
+    def activate(self, items: List[MenuItem], enter: bool) -> None:
+        if not items:
+            return
+        it = items[self.selected]
+        if self.is_disabled(it):
+            self.message = "error: option disabled."
+            return
+        fn = it.on_enter if enter else it.on_space
+        if fn:
+            fn()
+
+    # ----- loop -----
+
+    def run(self) -> None:
+        while True:
+            if self.state == "exit":
+                return
+
+            title, prompt, items = self.get_menu()
+            self.draw_menu(title, prompt, items)
+
+            ch = self.stdscr.getch()
+
+            if ch in (ord("q"), ord("Q")):
+                self.state = "exit"
+                continue
+            if ch in (ord("b"), 8, 127):
+                self.pop()
+                continue
+            if ch in (curses.KEY_UP, ord("k"), ord("K")):
+                self.move_sel(items, -1)
+                continue
+            if ch in (curses.KEY_DOWN, ord("j"), ord("J")):
+                self.move_sel(items, +1)
+                continue
+            if ch in (curses.KEY_ENTER, 10, 13):
+                self.activate(items, enter=True)
+                continue
+            if ch == ord(" "):
+                self.activate(items, enter=False)
+                continue
+
+
+def run_tui() -> int:
+    try:
+        curses.wrapper(lambda stdscr: App(stdscr).run())
+    except KeyboardInterrupt:
+        return 130
+    return 0
+
+
+# ============================
+# CLI (preserved for scripting / sweeps)
+# ============================
+
+def run_cli(argv) -> int:
+    ap = argparse.ArgumentParser(description="Recommend entity team tile size (T_TILE).")
+    ap.add_argument("arch", help="pvc | nvidia | amd  (or a100/h100/gh200/mi250x/mi300x, or 'all')")
+    ap.add_argument("--dim", type=int, default=2, choices=(1, 2, 3))
+    ap.add_argument("--ppc", type=float, default=16.0, help="particles per cell (per species)")
+    ap.add_argument("--shape-order", type=int, default=2, help="particle shape order (entity shape_order)")
+    ap.add_argument("--precision", choices=("single", "double"), default="single")
+    ap.add_argument("--components", type=int, default=3, help="current-field components (J has 3)")
+    ap.add_argument("--drift", type=int, default=1,
+                    help="team_policy_drift CMake knob (compile-time): cells of drift the scratch "
+                         "halo absorbs; size it >= spatial_sorting_interval")
+    ap.add_argument("--target-resident", type=int, default=2, help="work-groups resident per compute unit")
+    ap.add_argument("--npart-cap", type=float, default=1600,
+                    help="particle-per-tile budget (SLM-atomic-contention / load-balance proxy)")
+    ap.add_argument("--halo-max", type=float, default=0.70, help="halo fraction above which the tile is grown")
+    ap.add_argument("--grid", type=int, default=0, help="cells per dim (optional; enables a GPU-fill check)")
+    ap.add_argument("--balance-factor", type=int, default=4, help="min tiles per compute unit")
+    ap.add_argument("--min-tile", type=int, default=4,
+                    help="smallest T_TILE to consider (entity's team_policy_tile_sizes starts at 4)")
+    ap.add_argument("--max-tile", type=int, default=64)
+    p = ap.parse_args(argv)
+
+    for ln in build_report(p):
+        print(ln)
+    return 0
+
+
+def main() -> int:
+    # no arguments -> interactive TUI; any argument -> scriptable CLI
+    if len(sys.argv) > 1:
+        return run_cli(sys.argv[1:])
+    return run_tui()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 61ebebfffdf38c91456311087b94b96a20917d99 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 26 Jun 2026 03:03:23 +0000
Subject: [PATCH 30/33] team policy for GRPIC

---
 src/engines/grpic/currents.h | 228 ++++++++++++++++++++++++++++++-----
 src/engines/grpic/grpic.hpp  |   3 +-
 2 files changed, 200 insertions(+), 31 deletions(-)

diff --git a/src/engines/grpic/currents.h b/src/engines/grpic/currents.h
index 11c847533..cb4032f54 100644
--- a/src/engines/grpic/currents.h
+++ b/src/engines/grpic/currents.h
@@ -2,6 +2,8 @@
  * @file engines/grpic/currents.h
  * @brief Current deposition and filtering routines for the GRPIC engine
  * @implements
+ *   - ntt::grpic::CallDepositKernel<> -> void                 (flat path)
+ *   - ntt::grpic::CallDepositKernelTiled<> -> void            (TEAM_POLICY)
  *   - ntt::grpic::CurrentsDeposit<> -> void
  *   - ntt::grpic::CurrentsFilter<> -> void
  * @namespaces:
@@ -12,8 +14,11 @@
 #define ENGINES_GRPIC_CURRENTS_H
 
 #include "enums.h"
+#include "global.h"
 
+#include "arch/kokkos_aliases.h"
 #include "traits/metric.h"
+#include "utils/error.h"
 #include "utils/log.h"
 #include "utils/param_container.h"
 
@@ -26,13 +31,198 @@
 namespace ntt {
   namespace grpic {
 
+    template <GRMetricClass M, unsigned short O>
+    void CallDepositKernel(const Particles<M::Dim, M::CoordType>& species,
+                           const M&                               local_metric,
+                           const scatter_ndfield_t<M::Dim, 3>&    scatter_cur,
+                           real_t                                 dt) {
+      Kokkos::parallel_for("CurrentsDeposit",
+                           species.rangeActiveParticles(),
+                           kernel::DepositCurrents_kernel<SimEngine::GRPIC, M, O>(
+                             scatter_cur,
+                             species,
+                             local_metric,
+                             (real_t)(species.charge()),
+                             dt));
+    }
+
+#if defined(TEAM_POLICY)
+    /**
+     * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
+     *
+     * Identical in structure to the SRPIC launcher (`engines/srpic/currents.h`):
+     * iterates over `tile_layout.ntiles_total` teams; each team accumulates its
+     * tile's particle contributions in SLM scratch and atomically flushes to the
+     * global J (here `cur0`, the GRPIC half-step current). Requires the species
+     * to have been sorted with `team_policy` enabled (`tile_layout` populated by
+     * `SortSpatially`).
+     *
+     * The deposit body (`kernel::DepositOneParticle<SimEngine::GRPIC, M, O>`) is
+     * the same shared math used by the flat path — it already carries the GR
+     * velocity-recovery branch — so the only engine-specific differences from
+     * SRPIC are the `SimEngine::GRPIC` tag and the `cur0` target.
+     *
+     * Falls back to the flat kernel for the tail `[npart_partitioned, npart)`
+     * exactly as SRPIC does; see the per-step coverage note in
+     * `kernels/currents_deposit.hpp`.
+     */
+    template <GRMetricClass M, unsigned short O>
+    void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
+                                const M&                    local_metric,
+                                const ndfield_t<M::Dim, 3>& cur,
+                                real_t                      dt,
+                                int                         team_size_req) {
+      static_assert(O <= 11u, "Shape order must be <= 11");
+      constexpr unsigned short T = static_cast<unsigned short>(
+        TEAM_POLICY_TILE_SIZE);
+      const auto& layout = species.tile_layout();
+      raise::ErrorIf(layout.ntiles_total == 0u,
+                     "CallDepositKernelTiled: tile_layout has 0 tiles — call "
+                     "SortSpatially before CurrentsDeposit",
+                     HERE);
+      raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
+                     "CallDepositKernelTiled: tile_offsets size inconsistent "
+                     "with ntiles_total",
+                     HERE);
+
+      auto deposit_kernel =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::GRPIC, M, O, T> {
+          cur,    species, local_metric, (real_t)(species.charge()),
+          dt,     layout,  species.npart()
+        };
+
+      const auto scratch = Kokkos::PerTeam(
+        decltype(deposit_kernel)::scratch_bytes());
+
+      // Team (work-group) size. The default (team_size_req == 0) leaves
+      // Kokkos::AUTO, which sizes the team from the backend occupancy
+      // heuristic. A positive `algorithms.deposit.team_policy_team_size`
+      // overrides it, clamped to the scratch/backend-feasible maximum so an
+      // over-large request cannot abort the launch (Kokkos errors when
+      // team_size > team_size_max). No portable subgroup rounding is applied;
+      // pick a multiple of the device subgroup width (printed per arch by
+      // ideal_tile_size.py) for the best occupancy.
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0, scratch);
+      if (team_size_req > 0) {
+        const int ts_max = policy.team_size_max(deposit_kernel,
+                                                Kokkos::ParallelForTag {});
+        int       ts     = team_size_req;
+        if (ts > ts_max) {
+          raise::Warning(
+            fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds "
+                        "the tiled-deposit maximum %d on this backend; clamping "
+                        "to %d",
+                        team_size_req,
+                        ts_max,
+                        ts_max),
+            HERE);
+          ts = ts_max;
+        }
+        policy = Kokkos::TeamPolicy<>(static_cast<int>(layout.ntiles_total), ts);
+        policy.set_scratch_size(0, scratch);
+        logger::Checkpoint(
+          fmt::format("Tiled deposit: explicit team size %d", ts),
+          HERE);
+      }
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
+
+      // Particles appended since the last sort (injection / MPI receive on a
+      // no-sort step) live past the partition and are not visited by any team
+      // above. Deposit that tail [npart_partitioned, npart) with the flat
+      // scatter-view kernel so every active particle is deposited exactly
+      // once. The range is empty when the species was just sorted (the
+      // every-step-sorted common case), so this is a no-op there.
+      if (species.npart() > layout.npart_partitioned) {
+        // `cur` is a const ref; take a non-const View handle (shallow copy,
+        // shares storage) so the scatter view can contribute back into it.
+        auto cur_nc      = cur;
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc);
+        Kokkos::parallel_for(
+          "CurrentsDepositTiledTail",
+          CreateParticleRangePolicy<Dim::_1D>({ layout.npart_partitioned },
+                                              { species.npart() }),
+          kernel::DepositCurrents_kernel<SimEngine::GRPIC, M, O>(
+            scatter_cur,
+            species,
+            local_metric,
+            (real_t)(species.charge()),
+            dt));
+        Kokkos::Experimental::contribute(cur_nc, scatter_cur);
+      }
+    }
+#endif // TEAM_POLICY
+
     template <GRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::GRPIC, M>& domain,
                          const prm::Parameters&       engine_params) {
+      const auto dt = engine_params.get<real_t>("dt");
+      // GRPIC deposits the half-step current into `cur0` (the engine no longer
+      // pre-zeros it — this is the single source of truth, matching SRPIC).
+      Kokkos::deep_copy(domain.fields.cur0, ZERO);
+
+#if defined(TEAM_POLICY)
+      // Optional runtime override for the tiled-deposit team (work-group) size;
+      // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the
+      // launcher (see CallDepositKernelTiled).
+      const auto team_size_req = static_cast<int>(
+        engine_params.get<std::size_t>("team_policy_team_size",
+                                       std::optional<std::size_t> { 0u }));
+
+      // Tiled deposit. Correctness no longer depends on the SoA being in a
+      // "sorted" state at deposit time — the tiled kernel handles a stale
+      // partition per-particle (escape valve for drifted particles, dead-tag
+      // clamp, and the launcher's flat tail pass for appended particles). The
+      // only case the tiled kernel cannot serve is the very first step, before
+      // any SortSpatially has populated a layout; that species takes the flat
+      // scatter-view path for that step alone. See engines/srpic/currents.h and
+      // kernels/currents_deposit.hpp for the full coverage argument.
+      for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
+        const auto& layout = species.tile_layout();
+        if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) {
+          logger::Checkpoint(
+            fmt::format("Launching currents deposit (flat, no sort yet) for "
+                        "%d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          auto scatter_cur0 = Kokkos::Experimental::create_scatter_view(
+            domain.fields.cur0);
+          CallDepositKernel<M, SHAPE_ORDER>(species,
+                                            domain.mesh.metric,
+                                            scatter_cur0,
+                                            dt);
+          Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0);
+        } else {
+          logger::Checkpoint(
+            fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          CallDepositKernelTiled<M, SHAPE_ORDER>(species,
+                                                 domain.mesh.metric,
+                                                 domain.fields.cur0,
+                                                 dt,
+                                                 team_size_req);
+        }
+      }
+#else
       auto scatter_cur0 = Kokkos::Experimental::create_scatter_view(
         domain.fields.cur0);
-      const auto dt = engine_params.get<real_t>("dt");
       for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
         logger::Checkpoint(
           fmt::format("Launching currents deposit kernel for %d [%s] : %lu %f",
                       species.index(),
@@ -40,36 +230,14 @@ namespace ntt {
                       species.npart(),
                       (double)species.charge()),
           HERE);
-        if (species.npart() == 0 || cmp::AlmostZero(species.charge())) {
-          continue;
-        }
-        Kokkos::parallel_for("CurrentsDeposit",
-                             species.rangeActiveParticles(),
-                             kernel::DepositCurrents_kernel<SimEngine::GRPIC, M>(
-                               scatter_cur0,
-                               species.i1,
-                               species.i2,
-                               species.i3,
-                               species.i1_prev,
-                               species.i2_prev,
-                               species.i3_prev,
-                               species.dx1,
-                               species.dx2,
-                               species.dx3,
-                               species.dx1_prev,
-                               species.dx2_prev,
-                               species.dx3_prev,
-                               species.ux1,
-                               species.ux2,
-                               species.ux3,
-                               species.phi,
-                               species.weight,
-                               species.tag,
-                               domain.mesh.metric,
-                               (real_t)(species.charge()),
-                               dt));
+
+        CallDepositKernel<M, SHAPE_ORDER>(species,
+                                          domain.mesh.metric,
+                                          scatter_cur0,
+                                          dt);
       }
       Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0);
+#endif
     }
 
     template <GRMetricClass M>
@@ -103,4 +271,4 @@ namespace ntt {
   } // namespace grpic
 } // namespace ntt
 
-#endif // ENGINES_GRPIC_CURRENTS_H
\ No newline at end of file
+#endif // ENGINES_GRPIC_CURRENTS_H
diff --git a/src/engines/grpic/grpic.hpp b/src/engines/grpic/grpic.hpp
index fc2daa4de..621592955 100644
--- a/src/engines/grpic/grpic.hpp
+++ b/src/engines/grpic/grpic.hpp
@@ -416,7 +416,8 @@ namespace ntt {
          */
         if (deposit_enabled) {
           timers.start("CurrentDeposit");
-          Kokkos::deep_copy(dom.fields.cur0, ZERO);
+          // `cur0` is zeroed inside grpic::CurrentsDeposit (matching SRPIC),
+          // so no pre-zero is needed here.
           grpic::CurrentsDeposit(dom, this->engineParams());
           timers.stop("CurrentDeposit");
 

From 210454b150ce7674985c030076b3429ca746722e Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 26 Jun 2026 14:03:51 +0000
Subject: [PATCH 31/33] bugfix in deposit tests

---
 tests/kernels/deposit.cpp       | 18 ++++++++++++------
 tests/kernels/deposit_tiled.cpp | 16 ++++++++++++++++
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/tests/kernels/deposit.cpp b/tests/kernels/deposit.cpp
index 24f544202..10095d304 100644
--- a/tests/kernels/deposit.cpp
+++ b/tests/kernels/deposit.cpp
@@ -142,15 +142,21 @@ void testDeposit(const std::vector<ncells_t>&         res,
 
   auto J_scat = Kokkos::Experimental::create_scatter_view(J);
 
+  // The deposit kernel now takes a `ParticleArrays` SoA struct instead of
+  // the individual per-component arrays. Pack the per-test arrays into one;
+  // payload (pld_*) members stay default (unused here).
+  ParticleArrays pa;
+  pa.i1 = i1, pa.i2 = i2, pa.i3 = i3;
+  pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev;
+  pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3;
+  pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev;
+  pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3;
+  pa.phi = phi, pa.weight = weight, pa.tag = tag;
+
   // clang-format off
   Kokkos::parallel_for("CurrentsDeposit", 10,
                        kernel::DepositCurrents_kernel<S, M, O>(J_scat,
-                                                            i1, i2, i3,
-                                                            i1_prev, i2_prev, i3_prev,
-                                                            dx1, dx2, dx3,
-                                                            dx1_prev, dx2_prev, dx3_prev,
-                                                            ux1, ux2, ux3,
-                                                            phi, weight, tag,
+                                                            pa,
                                                             metric, charge, inv_dt));
   // clang-format on
 
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
index 3cc2f62b0..1b6237ae4 100644
--- a/tests/kernels/deposit_tiled.cpp
+++ b/tests/kernels/deposit_tiled.cpp
@@ -339,6 +339,22 @@ namespace {
   // boundary.
   template <unsigned short O, unsigned short T_TILE>
   void run_drift_case() {
+    // This case deposits boundary-adjacent particles (cells touching the
+    // ghost stripe), so an order-O stencil must fit inside the field's
+    // N_GHOSTS ghost layers. N_GHOSTS is a compile-time constant fixed by
+    // the build's SHAPE_ORDER ((SHAPE_ORDER+1)/2 + 1); a build whose ghost
+    // width is smaller than order O requires would deposit outside the
+    // field -- silent on GPU (no Kokkos View bounds guard; the overshoot
+    // cells carry zero shape-weight so results still match) but heap
+    // corruption on a host/SERIAL build. Skip those orders here; build at
+    // the matching SHAPE_ORDER to drift-test higher orders. The equivalence
+    // ("X-1") cases above stay interior, so they exercise all orders.
+    if constexpr ((O + 1u) / 2u + 1u > N_GHOSTS) {
+      std::cerr << "deposit_tiled[drift] SKIP O=" << O << " T_TILE=" << T_TILE
+                << " (needs N_GHOSTS>=" << ((O + 1u) / 2u + 1u)
+                << ", build has " << N_GHOSTS << ")\n";
+      return;
+    }
     using metric_t = metric::Minkowski<Dim::_2D>;
     constexpr unsigned short nx1 = 50u, nx2 = 50u;
     metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} };

From e9cfa06efbd7a975d98644a5d5cd0955c3d0bd45 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Fri, 26 Jun 2026 14:23:32 +0000
Subject: [PATCH 32/33] added explicit charge conservation test to the tiled
 deposit test

---
 tests/kernels/deposit_tiled.cpp | 53 +++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
index 1b6237ae4..f058d6f30 100644
--- a/tests/kernels/deposit_tiled.cpp
+++ b/tests/kernels/deposit_tiled.cpp
@@ -162,6 +162,55 @@ namespace {
               << " T_TILE=" << T_TILE << "  max_diff=" << max_diff << '\n';
   }
 
+  // Intrinsic charge-conservation check on a single deposited J field.
+  // Esirkepov/zigzag deposits satisfy the discrete continuity equation, so
+  // the spatial sum of the discrete divergence div.J = dJx/dx + dJy/dy
+  // vanishes whenever the summation region encloses every particle's full
+  // stencil (J == 0 on the region's outer boundary). This is evaluated on
+  // J_tiled ALONE -- it does not compare against the flat reference -- so it
+  // certifies the per-particle escape valve deposits each drifted particle's
+  // stencil as one coherent unit: no cell dropped, duplicated, or split
+  // between SLM scratch and global J. (The run_drift_case order guard keeps
+  // every stencil inside [0, j_ext), so the extreme ghost cells stay zero and
+  // the telescoping boundary flux is genuinely zero rather than clipped.)
+  // Accumulated in double regardless of build precision to keep the
+  // tolerance tight.
+  void check_charge_conservation(const ndfield_t<Dim::_2D, 3>& J,
+                                 unsigned short                O,
+                                 unsigned short                T_TILE,
+                                 const char*                   label) {
+    auto h = Kokkos::create_mirror_view(J);
+    Kokkos::deep_copy(h, J);
+
+    double sum_div = 0.0; // Sum over the field of div.J (jx1 -> dx, jx2 -> dy).
+    double abs_tot = 0.0; // Total |J|, sets the relative tolerance scale.
+    for (ncells_t i = 1; i < h.extent(0); ++i) {
+      for (ncells_t j = 1; j < h.extent(1); ++j) {
+        sum_div += (static_cast<double>(h(i, j, 0)) -
+                    static_cast<double>(h(i - 1, j, 0))) +
+                   (static_cast<double>(h(i, j, 1)) -
+                    static_cast<double>(h(i, j - 1, 1)));
+      }
+    }
+    for (ncells_t i = 0; i < h.extent(0); ++i) {
+      for (ncells_t j = 0; j < h.extent(1); ++j) {
+        abs_tot += std::fabs(static_cast<double>(h(i, j, 0))) +
+                   std::fabs(static_cast<double>(h(i, j, 1)));
+      }
+    }
+    const double tol = 1.0e-5 * (abs_tot > 1.0 ? abs_tot : 1.0);
+    if (std::fabs(sum_div) > tol) {
+      std::cerr << "deposit_tiled[" << label
+                << "] CHARGE NON-CONSERVED for O=" << O << " T_TILE=" << T_TILE
+                << " : sum(div.J)=" << sum_div << " tol=" << tol
+                << " (abs_tot=" << abs_tot << ")\n";
+      throw std::logic_error(
+        "DepositCurrentsTiled_kernel charge non-conservation");
+    }
+    std::cerr << "deposit_tiled[" << label << "] charge-conserved O=" << O
+              << " T_TILE=" << T_TILE << "  sum(div.J)=" << sum_div << '\n';
+  }
+
   template <unsigned short O, unsigned short T_TILE>
   void run_one_case() {
     using metric_t = metric::Minkowski<Dim::_2D>;
@@ -484,6 +533,10 @@ namespace {
     }
 
     compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift");
+    // Self-contained conservation check on the escape-valve output: the
+    // drifted particles all take the per-particle global-J path, so this
+    // certifies that path is charge-conserving without leaning on J_flat.
+    check_charge_conservation(J_tiled, O, T_TILE, "drift");
   }
 
   template <unsigned short T_TILE>

From ca3f162fa0bff91ffea83fcf55f6aaa50e987a50 Mon Sep 17 00:00:00 2001
From: Ludwig Boess <ludwig.boess@outlook.de>
Date: Mon, 29 Jun 2026 14:49:09 -0400
Subject: [PATCH 33/33] bugfix in for sort includes

---
 src/global/utils/sort_dispatch.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
index 27a20a910..432b1c1b9 100644
--- a/src/global/utils/sort_dispatch.h
+++ b/src/global/utils/sort_dispatch.h
@@ -38,6 +38,21 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Sort.hpp>
 
+// Entity's Kokkos alias macros (arch/kokkos_aliases.h) define bare words such
+// as `Function`, `Inline`, `Lambda` and `ClassLambda`. These collide with
+// template-parameter and member names used inside the vendor sort headers
+// (rocPRIM, cub, oneDPL) and corrupt their parsing (e.g. rocPRIM's
+// `template<class Tuple, class Function, ...>`). Suspend the aliases across the
+// vendor includes only, then restore them for the rest of the translation unit.
+#pragma push_macro("Function")
+#pragma push_macro("Inline")
+#pragma push_macro("Lambda")
+#pragma push_macro("ClassLambda")
+#undef Function
+#undef Inline
+#undef Lambda
+#undef ClassLambda
+
 #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
   #include <oneapi/dpl/algorithm>
   #include <oneapi/dpl/execution>
@@ -49,6 +64,11 @@
   #include <rocprim/rocprim.hpp>
 #endif
 
+#pragma pop_macro("ClassLambda")
+#pragma pop_macro("Lambda")
+#pragma pop_macro("Inline")
+#pragma pop_macro("Function")
+
 #include <algorithm>
 #include <cstdint>
 #include <numeric>