diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3c580020..3b82fca9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(PROJECT_NAME entity)
 
 project(
   ${PROJECT_NAME}
-  VERSION 1.4.2
+  VERSION 1.5.0
   LANGUAGES CXX C)
 add_compile_options("-D ENTITY_VERSION=\"${PROJECT_VERSION}\"")
 set(hash_cmd "git diff --quiet src/ && echo $(git rev-parse HEAD) ")
@@ -58,6 +58,24 @@ set(gpu_aware_mpi
     ${default_gpu_aware_mpi}
     CACHE BOOL "Enable GPU-aware MPI")
 
+set(team_policy
+  ${default_team_policy}
+  CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")
+set(team_policy_tile_size
+  ${default_team_policy_tile_size}
+  CACHE STRING "team_policy tile edge length in cells")
+set(team_policy_tile_sizes
+  "4;6;8;10;12;14;16"
+  CACHE STRING "team_policy tile-size choices")
+set(team_policy_drift
+  ${default_team_policy_drift}
+  CACHE STRING
+  "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.")
+set(vendor_sort
+  ${default_vendor_sort}
+  CACHE BOOL
+  "Use the vendor sort_by_key (oneDPL/Thrust/rocThrust) for the team_policy spatial sort when available. OFF forces the Kokkos::BinSort fallback, which sorts each SoA member in place (lower peak memory, no maxnpart gather buffer) at the cost of sort speed.")
+
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -136,6 +154,87 @@ else()
   set(DEVICE_ENABLED OFF)
 endif()
 
+# ------------------------------ team_policy wiring ------------------------ #
+if(${team_policy})
+  list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
+  if(_tps_idx EQUAL -1)
+    message(FATAL_ERROR
+      "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
+      "got '${team_policy_tile_size}'${ColorReset}")
+  endif()
+  add_compile_options("-D TEAM_POLICY")
+  add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")
+
+  # Compile-time tiled-deposit scratch halo drift. Sizes the halo so a
+  # particle that drifts up to DRIFT cells between two sorts still deposits
+  # inside its tile scratch; particles drifting further take the
+  # per-particle global-J escape valve (correct, only slower). This is
+  # independent of the sort cadence, which is set at runtime via
+  # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case).
+  add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}")
+
+  # Vendor sort: oneDPL on SYCL, Thrust on CUDA, rocThrust/rocprim on HIP.
+  # When `vendor_sort` is ON (default) the available library is detected
+  # and used; the spatial sort then builds a single permutation that
+  # gathers all SoA members. When `vendor_sort` is OFF, or no library is
+  # found, the code falls back to Kokkos::BinSort, which sorts each member
+  # in place -- lower peak memory and no maxnpart gather buffer, at the
+  # cost of sort speed (negligible when sorting is a small fraction of the
+  # step). The `vendor_sort` knob lets you force the BinSort fallback even
+  # when a vendor library is present.
+  if(${vendor_sort})
+    if("${Kokkos_DEVICES}" MATCHES "SYCL")
+      find_package(oneDPL QUIET)
+      if(oneDPL_FOUND)
+        message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
+        add_compile_options("-D ONEDPL_ENABLED")
+        set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
+      else()
+        message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
+                       "for SYCL sort_by_key")
+      endif()
+    endif()
+
+    if("${Kokkos_DEVICES}" MATCHES "CUDA")
+      find_package(Thrust QUIET)
+      if(Thrust_FOUND)
+        message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
+        add_compile_options("-D THRUST_ENABLED")
+      else()
+        message(STATUS "team_policy: Thrust not found; using BinSort fallback "
+                       "for CUDA sort_by_key")
+      endif()
+    endif()
+
+    if("${Kokkos_DEVICES}" MATCHES "HIP")
+      # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's
+      # bounded-bit radix sort directly (rocprim is rocThrust's own
+      # dependency, so its headers come in transitively; we find it
+      # explicitly to keep the include path robust). This builds a single
+      # permutation that gathers all SoA members, instead of the legacy
+      # per-member Kokkos::BinSort path which allocates a fresh
+      # `sorted_values` buffer for every member every step (the dominant
+      # source of allocator churn / fragmentation on ROCm).
+      find_package(rocthrust QUIET)
+      if(rocthrust_FOUND)
+        message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
+        add_compile_options("-D ROCTHRUST_ENABLED")
+        set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
+        find_package(rocprim QUIET)
+        if(rocprim_FOUND)
+          set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim)
+        endif()
+      else()
+        message(STATUS "team_policy: rocThrust not found; using BinSort "
+                       "fallback for HIP sort_by_key")
+      endif()
+    endif()
+  else()
+    message(STATUS "team_policy: vendor_sort=OFF; forcing Kokkos::BinSort "
+                   "fallback for spatial sort_by_key")
+  endif()
+endif()
+
 # MPI
 if(${mpi})
   find_or_fetch_dependency(MPI FALSE REQUIRED)
@@ -145,6 +244,62 @@ if(${mpi})
   if(${DEVICE_ENABLED})
     if(${gpu_aware_mpi})
       add_compile_options("-D GPU_AWARE_MPI")
+
+      # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only
+      # handle device pointers if the GPU Transport Layer (GTL) library
+      # is linked. The Cray compiler wrappers (cc/CC) inject this
+      # automatically, but we build with hipcc/nvcc directly, so
+      # find_package(MPI) only finds base libmpi and the GTL is left
+      # out -> MPI_Sendrecv on a device pointer fails with
+      # "OFI ... Bad address". Add it explicitly here.
+      #
+      # Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
+      # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is
+      # not a Cray MPICH build, in which case nothing extra is needed.
+      if("${Kokkos_DEVICES}" MATCHES "HIP")
+        set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
+      elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
+        set(_gtl_accels nvidia90 nvidia80 nvidia70)
+      elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
+        set(_gtl_accels ponteVecchio)
+      else()
+        set(_gtl_accels "")
+      endif()
+
+      set(_gtl_dir "")
+      set(_gtl_libflag "")
+      foreach(_accel ${_gtl_accels})
+        if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}}))
+          # strip the leading "-L" from the Cray-provided value
+          string(REGEX REPLACE "^-L" ""
+                 _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}")
+          string(REGEX REPLACE "^-l" ""
+                 _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}")
+        endif()
+      endforeach()
+
+      if(_gtl_dir AND _gtl_libflag)
+        find_library(MPI_GTL_LIBRARY
+          NAMES ${_gtl_libflag}
+          HINTS "${_gtl_dir}"
+          NO_DEFAULT_PATH)
+        if(MPI_GTL_LIBRARY)
+          message(STATUS
+            "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
+          set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
+        else()
+          message(FATAL_ERROR
+            "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
+            "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
+            "GPU-aware MPI will crash at runtime without it. Make sure the "
+            "craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
+            "${ColorReset}")
+        endif()
+      else()
+        message(STATUS
+          "GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
+          "implementation is GPU-aware without an extra transport library.")
+      endif()
     endif()
   else()
     set(gpu_aware_mpi
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index fb8790019..c03ebaf81 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -92,3 +92,36 @@ else()
 endif()
 
 set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)
+
+if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
+  set(default_team_policy
+      $ENV{Entity_ENABLE_TEAM_POLICY}
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+else()
+  set(default_team_policy
+      OFF
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+endif()
+set_property(CACHE default_team_policy PROPERTY TYPE BOOL)
+
+if(DEFINED ENV{Entity_ENABLE_VENDOR_SORT})
+  set(default_vendor_sort
+      $ENV{Entity_ENABLE_VENDOR_SORT}
+      CACHE INTERNAL
+        "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)")
+else()
+  set(default_vendor_sort
+      ON
+      CACHE INTERNAL
+        "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)")
+endif()
+set_property(CACHE default_vendor_sort PROPERTY TYPE BOOL)
+
+set(default_team_policy_tile_size
+    8
+    CACHE INTERNAL "Default tile edge length in cells for team_policy")
+
+set(default_team_policy_drift
+    1
+    CACHE INTERNAL
+      "Default tiled-deposit scratch halo drift for team_policy (cells between sorts)")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index 7b145bfc7..d972fafc4 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -122,6 +122,44 @@ if(${mpi} AND ${DEVICE_ENABLED})
     GPU_AWARE_MPI_REPORT
     46)
 endif()
+printchoices(
+  "Team Policy"
+  "team_policy"
+  "${ON_OFF_VALUES}"
+  ${team_policy}
+  OFF
+  "${Green}"
+  TEAM_POLICY_REPORT
+  46)
+if(${team_policy})
+  printchoices(
+    "Team Tile Size"
+    "team_policy_tile_size"
+    "${team_policy_tile_sizes}"
+    ${team_policy_tile_size}
+    ${default_team_policy_tile_size}
+    "${Blue}"
+    TEAM_POLICY_TILE_SIZE_REPORT
+    46)
+  printchoices(
+    "Team Deposit Drift"
+    "team_policy_drift"
+    "${team_policy_drift}"
+    ${team_policy_drift}
+    1
+    "${Blue}"
+    TEAM_POLICY_DRIFT_REPORT
+    46)
+  printchoices(
+    "Vendor sort"
+    "vendor_sort"
+    "${ON_OFF_VALUES}"
+    ${vendor_sort}
+    ON
+    "${Green}"
+    VENDOR_SORT_REPORT
+    46)
+endif()
 printchoices(
   "Debug mode"
   "DEBUG"
@@ -197,6 +235,13 @@ if(${mpi} AND ${DEVICE_ENABLED})
   string(APPEND REPORT_TEXT "  " ${GPU_AWARE_MPI_REPORT} "\n")
 endif()
 
+string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
+if(${team_policy})
+  string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
+  string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_DRIFT_REPORT} "\n")
+  string(APPEND REPORT_TEXT "  " ${VENDOR_SORT_REPORT} "\n")
+endif()
+
 string(
   APPEND
   REPORT_TEXT
diff --git a/ideal_tile_size.py b/ideal_tile_size.py
new file mode 100644
index 000000000..e216f5981
--- /dev/null
+++ b/ideal_tile_size.py
@@ -0,0 +1,955 @@
+#!/usr/bin/env python3
+"""Recommend the team tile size (T_TILE) for entity's tiled current-deposit kernel.
+
+Each GPU work-group (team) owns a TE^dim scratch tile in shared memory (SLM on Intel,
+LDS on AMD, shared mem on NVIDIA), accumulates its particles' currents into it, then
+flushes once to global memory.  With
+
+    TE   = T_TILE + 2*HALO
+    HALO = stencil_reach + drift   (stencil_reach = shape_order for Esirkepov, 2 for the
+                                    O==0 zigzag deposit; drift = the compile-time
+                                    `team_policy_drift` CMake knob, NOT the runtime
+                                    spatial_sorting_interval -- see currents_deposit.hpp)
+
+the tile size is squeezed by three competing pressures:
+
+  * Shared-memory capacity (HARD): TE^dim * ncomp * sizeof(real) must fit, ideally with
+    several work-groups resident per compute unit so latency is hidden.  Binds in 3D /
+    double precision / on AMD's 64 KiB LDS.
+  * Halo overhead (push LARGER): zero-fill + flush sweep the whole TE^dim tile, so a tiny
+    tile is almost all halo (1-(T/TE)^dim wasted).  Big HALO (infrequent sorts) makes this
+    worse and forces larger tiles.
+  * Particles per tile (push SMALLER): ppc*T^dim particles all atomic-add into one fixed
+    scratch tile -> SLM-atomic contention and load imbalance grow with tile size.
+
+Recommendation = the largest tile that respects the particle budget and shared-memory
+residency; if that tile would be mostly halo, it is grown (toward lower halo) up to the
+shared-memory limit.  This is a first-order model -- confirm by sweeping the entity knobs
+  -D team_policy_tile_size=<T> -D team_policy_drift=<D>  and re-profiling (see roofline/).
+The team (work-group) size defaults to Kokkos::AUTO; override it at runtime with the
+  [algorithms.deposit] team_policy_team_size = <N>   (0 = AUTO)
+toml knob -- clamped to the backend maximum at launch (engines/srpic/currents.h).
+
+Two ways to drive it:
+
+  * Interactive TUI (no arguments):
+        python ideal_tile_size.py
+
+  * Scriptable CLI (any argument):
+        python ideal_tile_size.py pvc --dim 2 --ppc 16
+        python ideal_tile_size.py amd --dim 3 --ppc 64
+        python ideal_tile_size.py all --dim 2 --ppc 16 --drift 4   # infrequent sorting
+"""
+import argparse
+import curses
+import os
+import sys
+from typing import Callable, List, Optional, Tuple
+
+# Shared-memory budget is the load-bearing number.  Per compute unit (Xe-core / SM / CU);
+# smem_wg_max is the largest single-work-group allocation.
+ARCH = {
+    "pvc":    dict(label="Intel Data Center GPU Max 1550 (PVC, Xe-HPC), per tile",
+                   smem_cu=128 * 1024, smem_wg_max=128 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=64, cu="Xe-core"),
+    "a100":   dict(label="NVIDIA A100 (Ampere)",
+                   smem_cu=164 * 1024, smem_wg_max=163 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=108, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "h100":   dict(label="NVIDIA H100 (Hopper)",
+                   smem_cu=228 * 1024, smem_wg_max=227 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=132, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "gh200":  dict(label="NVIDIA GH200 Grace Hopper (Hopper H100/H200 GPU)",
+                   smem_cu=228 * 1024, smem_wg_max=227 * 1024,
+                   subgroup=32, max_wg=1024, n_cu=132, cu="SM",
+                   note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"),
+    "mi250x": dict(label="AMD Instinct MI250X (CDNA2), per GCD",
+                   smem_cu=64 * 1024, smem_wg_max=64 * 1024,
+                   subgroup=64, max_wg=1024, n_cu=110, cu="CU"),
+    "mi300x": dict(label="AMD Instinct MI300X (CDNA3)",
+                   smem_cu=64 * 1024, smem_wg_max=64 * 1024,
+                   subgroup=64, max_wg=1024, n_cu=304, cu="CU"),
+}
+ALIAS = {"nvidia": "h100", "amd": "mi300x", "intel": "pvc", "mi250": "mi250x", "mi300": "mi300x",
+         "gracehopper": "gh200", "grace-hopper": "gh200", "gh200x": "gh200"}
+PRECISION = {"single": 4, "double": 8}
+
+# arch choices offered in the TUI (canonical keys plus the meta-target "all")
+ARCH_CHOICES = list(ARCH.keys()) + ["all"]
+# "all" expands to one representative of each vendor (matches the CLI behaviour)
+ALL_ARCHS = ["pvc", "nvidia", "amd"]
+
+
+# ============================
+# core model (shared by TUI + CLI)
+# ============================
+
+class Settings:
+    """Tunable inputs for the tile-size model.
+
+    Attribute names match the argparse dest names so `recommend`/`report_lines` accept
+    either a Settings instance (TUI) or an argparse namespace (CLI) interchangeably.
+    """
+
+    def __init__(self):
+        self.arch = "pvc"          # an ARCH key or "all"
+        self.dim = 2               # 1 / 2 / 3
+        self.ppc = 16.0            # particles per cell (per species)
+        self.shape_order = 2       # entity shape_order
+        self.precision = "single"  # single / double
+        self.components = 3        # current-field components (J has 3)
+        self.drift = 1             # team_policy_drift: cells of drift the scratch halo absorbs
+                                   # (compile-time CMake knob, independent of spatial_sorting_interval)
+        self.target_resident = 2   # work-groups resident per compute unit
+        self.npart_cap = 1600.0    # particle-per-tile budget (contention / load-balance proxy)
+        self.halo_max = 0.70       # halo fraction above which the tile is grown
+        self.grid = 0              # cells per dim (0 disables the GPU-fill check)
+        self.balance_factor = 4    # min tiles per compute unit
+        self.min_tile = 4          # entity's team_policy_tile_sizes list starts at 4
+        self.max_tile = 64
+
+
+def resolve_arch(name):
+    key = ALIAS.get(name.lower(), name.lower())
+    if key not in ARCH:
+        raise SystemExit("unknown arch '%s'; choose from %s (or aliases %s)"
+                         % (name, ", ".join(ARCH), ", ".join(ALIAS)))
+    return key, ARCH[key]
+
+
+def recommend(hw, p):
+    """p: Settings or argparse namespace. Returns dict with rows, chosen row, binding."""
+    # Matches DepositCurrentsTiled_kernel: STENCIL_REACH = O for Esirkepov (O>=1),
+    # 2 for the O==0 zigzag deposit; HALO = STENCIL_REACH + TEAM_POLICY_DRIFT.
+    stencil_reach = 2 if p.shape_order == 0 else p.shape_order
+    halo = stencil_reach + p.drift
+    real = PRECISION[p.precision]
+    atoms_pp = p.components * (p.shape_order + 1) ** p.dim   # ~ useful atomics / particle
+
+    rows = []
+    for T in range(p.min_tile, p.max_tile + 1, 2):          # entity uses even tile sizes
+        TE = T + 2 * halo
+        scratch = TE ** p.dim * p.components * real
+        npart = p.ppc * T ** p.dim
+        halo_frac = 1.0 - (T / TE) ** p.dim
+        ovhd = (2.0 * p.components / p.ppc) * (TE / T) ** p.dim / atoms_pp  # zero+flush vs deposit
+        resident = int(hw["smem_cu"] // scratch) if scratch else 0
+        ntiles = (p.grid / T) ** p.dim if p.grid else None
+        rows.append(dict(T=T, TE=TE, scratch=scratch, npart=npart, halo_frac=halo_frac,
+                         ovhd=ovhd, resident=resident, ntiles=ntiles,
+                         ok_cap=scratch <= hw["smem_wg_max"]))
+
+    capfeas = [r for r in rows if r["ok_cap"]]
+    if not capfeas:
+        return dict(halo=halo, rows=rows, chosen=None, binding=None, grown=False)
+
+    def largest(pred, default):
+        ts = [r["T"] for r in capfeas if pred(r)]
+        return max(ts) if ts else default
+
+    T_cap = max(r["T"] for r in capfeas)
+    T_res = largest(lambda r: r["resident"] >= p.target_resident, p.min_tile)
+    T_np = largest(lambda r: r["npart"] <= p.npart_cap, p.min_tile)
+    T_bal = largest(lambda r: r["ntiles"] is None or r["ntiles"] >= p.balance_factor * hw["n_cu"], p.min_tile)
+    bounds = {"shared-memory capacity": T_cap, "shared-memory residency": T_res,
+              "GPU fill (too few tiles)": T_bal, "particle budget per tile": T_np}
+
+    chosen_T = max(min(bounds.values()), p.min_tile)
+    binding = min(bounds, key=lambda k: bounds[k])
+
+    # If the particle-budget pick is mostly halo, grow the tile to cut halo, but never
+    # past what shared memory / GPU-fill allow (that just trades halo for contention).
+    grown = False
+    cur = next(r for r in capfeas if r["T"] == chosen_T)
+    if cur["halo_frac"] > p.halo_max:
+        halo_ok = [r["T"] for r in capfeas if r["halo_frac"] <= p.halo_max]
+        ceil_T = min(T_res, T_bal, T_cap)
+        if halo_ok:
+            target = max(min(halo_ok), chosen_T)        # smallest tile that clears halo_max
+            new_T = min(max(target, chosen_T), ceil_T)
+            if new_T > chosen_T:
+                chosen_T, grown = new_T, True
+                binding = ("halo overhead" if min(halo_ok) <= ceil_T
+                           else min({k: v for k, v in bounds.items()
+                                     if k != "particle budget per tile"},
+                                    key=lambda k: bounds[k]))
+        else:
+            new_T = min(ceil_T, T_cap)                  # can't clear halo_max at all -> go as big as SLM allows
+            if new_T > chosen_T:
+                chosen_T, grown = new_T, True
+                binding = "shared-memory residency"
+
+    chosen = next(r for r in capfeas if r["T"] == chosen_T)
+    return dict(halo=halo, rows=rows, chosen=chosen, binding=binding, grown=grown, atoms_pp=atoms_pp)
+
+
+def kib(b):
+    return "%.1f" % (b / 1024.0)
+
+
+def report_lines(name, key, hw, p, res):
+    """Build the recommendation report as a list of text lines (no printing)."""
+    L = []
+    L.append("=" * 80)
+    L.append("%s   [preset: %s]" % (hw["label"], key))
+    L.append("  dim=%d  ppc=%g  shape_order=%d  precision=%s(%dB)  J-components=%d"
+             % (p.dim, p.ppc, p.shape_order, p.precision, PRECISION[p.precision], p.components))
+    reach = 2 if p.shape_order == 0 else p.shape_order
+    reach_kind = "zigzag" if p.shape_order == 0 else "Esirkepov O"
+    L.append("  HALO = stencil_reach + drift = %d + %d = %d   ->   TE = T_TILE + %d"
+             "   (reach %d = %s; drift = team_policy_drift)"
+             % (reach, p.drift, res["halo"], 2 * res["halo"], reach, reach_kind))
+    L.append("  shared mem %s KiB/%s (budget %s KiB for %d resident WGs); subgroup=%d, n_cu=%d"
+             % (kib(hw["smem_cu"]), hw["cu"], kib(hw["smem_cu"] / p.target_resident),
+                p.target_resident, hw["subgroup"], hw["n_cu"]))
+    if hw.get("note"):
+        L.append("  note: %s" % hw["note"])
+    L.append("-" * 80)
+    L.append("  T_TILE  TE   scratch  resWG  part/tile  halo%  zero+flush%")
+    for r in res["rows"]:
+        if not r["ok_cap"]:
+            continue
+        mark = "  <== recommended" if r is res["chosen"] else ""
+        L.append("   %3d   %4d  %6s K  %4d  %9.0f  %4.0f   %7.1f%s"
+                 % (r["T"], r["TE"], kib(r["scratch"]), r["resident"], r["npart"],
+                    100 * r["halo_frac"], 100 * r["ovhd"], mark))
+    L.append("-" * 80)
+
+    c = res["chosen"]
+    if c is None:
+        L.append("  INFEASIBLE: even T_TILE=%d does not fit %s KiB shared memory."
+                 % (p.min_tile, kib(hw["smem_wg_max"])))
+        L.append("  -> sort more often (smaller drift), use precision single, lower shape_order,")
+        L.append("     or use a non-tiled (global-atomic / ScatterView) deposit on this arch.")
+        return L
+    L.append("  RECOMMENDED  T_TILE = %d   (limited by: %s%s)"
+             % (c["T"], res["binding"], "; tile grown to reduce halo" if res["grown"] else ""))
+    L.append("    %.1f KiB scratch/team, %d work-groups resident/%s, %.0f particles/team, %.0f%% halo"
+             % (c["scratch"] / 1024.0, c["resident"], hw["cu"], c["npart"], 100 * c["halo_frac"]))
+    team = min(hw["max_wg"], 256 - 256 % hw["subgroup"])
+    extra = "" if c["T"] <= 16 else "   (entity's team_policy_tile_sizes list stops at 16; extend it)"
+    L.append("    entity build:  -D team_policy=ON -D team_policy_tile_size=%d -D team_policy_drift=%d%s"
+             % (min(c["T"], 16), p.drift, extra))
+    L.append("    team (work-group) size: Kokkos::AUTO by default; to override, set in the toml")
+    L.append("    [algorithms.deposit] team_policy_team_size = %d   (0 = AUTO; keep a multiple of"
+             % team)
+    L.append("    subgroup=%d), then sweep around it and re-profile" % hw["subgroup"])
+    # contextual guidance
+    if c["halo_frac"] > p.halo_max:
+        if p.drift > 1:
+            L.append("    !! %.0f%% of the tile is halo, inflated by team_policy_drift=%d; lower it "
+                     "(and sort at least that often via spatial_sorting_interval)"
+                     % (100 * c["halo_frac"], p.drift))
+        else:
+            L.append("    !! %.0f%% halo is intrinsic at this size (shared memory caps the tile here)"
+                     % (100 * c["halo_frac"]))
+    if c["npart"] > p.npart_cap:
+        L.append("    !! %.0f particles/team exceeds the %.0f budget -> watch SLM-atomic contention"
+                 % (c["npart"], p.npart_cap))
+    if c["resident"] < p.target_resident:
+        L.append("    !! only %d work-group(s) resident/%s -> limited latency hiding"
+                 % (c["resident"], hw["cu"]))
+    return L
+
+
+def archs_for(name):
+    """Expand a setting/arg value into the list of arch names to report on."""
+    return ALL_ARCHS if name.lower() == "all" else [name]
+
+
+def build_report(p):
+    """Run the model for the selected arch(es) and return the full report as lines."""
+    lines = []
+    for a in archs_for(p.arch):
+        try:
+            key, hw = resolve_arch(a)
+        except SystemExit as e:
+            lines.append(str(e))
+            continue
+        lines.extend(report_lines(a, key, hw, p, recommend(hw, p)))
+    lines.append("=" * 80)
+    return lines
+
+
+# ============================
+# colors: edit these
+# ============================
+
+COLOR_TITLE_FG = curses.COLOR_BLUE
+COLOR_TEXT_FG = curses.COLOR_WHITE
+COLOR_SELECTED_FG = curses.COLOR_WHITE
+COLOR_SELECTED_BG = curses.COLOR_BLACK
+COLOR_HINT_FG = curses.COLOR_YELLOW
+COLOR_OK_FG = curses.COLOR_GREEN
+COLOR_ERR_FG = curses.COLOR_RED
+COLOR_KEY_FG = curses.COLOR_MAGENTA
+COLOR_DIM_FG = curses.COLOR_CYAN
+
+PAIR_TITLE = 1
+PAIR_TEXT = 2
+PAIR_SELECTED = 3
+PAIR_HINT = 4
+PAIR_OK = 5
+PAIR_ERR = 6
+PAIR_KEY = 7
+PAIR_DIM = 8
+
+
+class MenuItem:
+    def __init__(self, label, hint="", right=None, on_enter=None,
+                 on_space=None, disabled=None):
+        self.label = label
+        self.hint = hint
+        self.right = right
+        self.on_enter = on_enter
+        self.on_space = on_space
+        self.disabled = disabled
+
+
+# ============================
+# TUI
+# ============================
+
+class App:
+    def __init__(self, stdscr):
+        self.stdscr = stdscr
+        self.s = Settings()
+
+        self.state = "mainmenu"
+        self.stack: List[Tuple[str, int]] = []
+        self.selected = 0
+        self.scroll = 0
+        self.message = "use arrows or j/k"
+
+        self._init_curses()
+
+    def _init_curses(self) -> None:
+        curses.curs_set(0)
+        self.stdscr.keypad(True)
+        curses.noecho()
+        curses.cbreak()
+
+        if curses.has_colors():
+            curses.start_color()
+            curses.use_default_colors()
+            curses.init_pair(PAIR_TITLE, COLOR_TITLE_FG, -1)
+            curses.init_pair(PAIR_TEXT, COLOR_TEXT_FG, -1)
+            curses.init_pair(PAIR_SELECTED, COLOR_SELECTED_FG, COLOR_SELECTED_BG)
+            curses.init_pair(PAIR_HINT, COLOR_HINT_FG, -1)
+            curses.init_pair(PAIR_OK, COLOR_OK_FG, -1)
+            curses.init_pair(PAIR_ERR, COLOR_ERR_FG, -1)
+            curses.init_pair(PAIR_KEY, COLOR_KEY_FG, -1)
+            curses.init_pair(PAIR_DIM, COLOR_DIM_FG, -1)
+
+    def cp(self, pair_id: int) -> int:
+        return curses.color_pair(pair_id) if curses.has_colors() else 0
+
+    # ----- formatting helpers -----
+
+    def arch_label(self) -> str:
+        if self.s.arch == "all":
+            return "all (%s)" % " + ".join(ALL_ARCHS)
+        try:
+            return ARCH[resolve_arch(self.s.arch)[0]]["label"]
+        except SystemExit:
+            return self.s.arch
+
+    # ----- nav stack -----
+
+    def push(self, st: str) -> None:
+        self.stack.append((self.state, self.selected))
+        self.state = st
+        self.selected = 0
+        self.scroll = 0
+        self.message = ""
+
+    def pop(self) -> None:
+        if self.stack:
+            self.state, self.selected = self.stack.pop()
+        else:
+            self.state, self.selected = "mainmenu", 0
+        self.scroll = 0
+        self.message = ""
+
+    # ----- drawing -----
+
+    def add(self, y: int, x: int, s: str, attr: int = 0) -> None:
+        try:
+            self.stdscr.addstr(y, x, s, attr)
+        except curses.error:
+            pass
+
+    def hline(self, y: int) -> None:
+        _, w = self.stdscr.getmaxyx()
+        try:
+            self.stdscr.hline(y, 0, curses.ACS_HLINE, max(0, w - 1))
+        except curses.error:
+            pass
+
+    def draw_keybar(self, y: int, x: int, pairs: List[Tuple[str, str]]) -> None:
+        cur_x = x
+        for key, action in pairs:
+            self.add(y, cur_x, key, self.cp(PAIR_KEY) | curses.A_BOLD)
+            cur_x += len(key)
+            self.add(y, cur_x, " ", self.cp(PAIR_DIM))
+            cur_x += 1
+            self.add(y, cur_x, action, self.cp(PAIR_HINT))
+            cur_x += len(action)
+            self.add(y, cur_x, "   ", self.cp(PAIR_DIM))
+            cur_x += 3
+
+    def breadcrumb(self) -> str:
+        return {
+            "mainmenu": "mainmenu",
+            "arch": "mainmenu > architecture",
+            "physics": "mainmenu > physics & particles",
+            "tuning": "mainmenu > tuning knobs",
+        }.get(self.state, "mainmenu")
+
+    def draw_menu(self, title: str, prompt: str, items: List[MenuItem]) -> None:
+        self.stdscr.erase()
+        h, w = self.stdscr.getmaxyx()
+
+        self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD)
+        bc = self.breadcrumb()
+        self.add(0, max(2, w - 2 - len(bc)), bc, self.cp(PAIR_DIM))
+
+        self.draw_keybar(
+            1,
+            2,
+            [
+                ("up/dn/j/k", "move"),
+                ("enter", "select"),
+                ("space", "toggle/cycle"),
+                ("b", "back"),
+                ("q", "quit"),
+            ],
+        )
+        self.hline(2)
+
+        status1 = ("arch: %s    dim: %d    precision: %s    ppc: %g"
+                   % (self.s.arch, self.s.dim, self.s.precision, self.s.ppc))
+        status2 = ("shape_order: %d    drift: %d    components: %d    tile range: %d-%d"
+                   % (self.s.shape_order, self.s.drift, self.s.components,
+                      self.s.min_tile, self.s.max_tile))
+        self.add(3, 2, status1[: w - 4], self.cp(PAIR_TEXT))
+        self.add(4, 2, status2[: w - 4], self.cp(PAIR_TEXT))
+        self.hline(5)
+
+        self.add(6, 2, prompt[: w - 4], self.cp(PAIR_TEXT) | curses.A_BOLD)
+
+        list_y = 8
+        footer_h = 3
+        view_h = max(1, h - list_y - footer_h)
+        n = len(items)
+
+        if n == 0:
+            self.add(list_y, 2, "(empty)", self.cp(PAIR_HINT))
+        else:
+            self.selected = max(0, min(self.selected, n - 1))
+
+            if self.selected < self.scroll:
+                self.scroll = self.selected
+            if self.selected >= self.scroll + view_h:
+                self.scroll = self.selected - view_h + 1
+            self.scroll = max(0, min(self.scroll, max(0, n - view_h)))
+
+            shown = items[self.scroll : self.scroll + view_h]
+
+            for i, it in enumerate(shown):
+                idx = self.scroll + i
+                sel = idx == self.selected
+                dis = bool(it.disabled and it.disabled())
+
+                row_attr = (
+                    self.cp(PAIR_SELECTED) | curses.A_BOLD
+                    if sel
+                    else (self.cp(PAIR_DIM) if dis else self.cp(PAIR_TEXT))
+                )
+                self.add(list_y + i, 2, ("  %s" % it.label)[: w - 4], row_attr)
+
+                if it.right:
+                    rt = (it.right() or "").strip()
+                    if rt:
+                        rt = rt[: max(0, w - 6)]
+                        x = max(2, w - 2 - len(rt))
+                        rt_attr = (
+                            row_attr
+                            if sel
+                            else (self.cp(PAIR_HINT) if not dis else self.cp(PAIR_DIM))
+                        )
+                        self.add(list_y + i, x, rt, rt_attr)
+
+                if sel and it.hint:
+                    self.add(
+                        list_y + i,
+                        min(w - 4, 30),
+                        ("    %s" % it.hint)[: w - 4],
+                        self.cp(PAIR_HINT),
+                    )
+
+        self.hline(h - 3)
+        msg = self.message or ""
+        if msg:
+            is_err = msg.startswith("error")
+            attr = (self.cp(PAIR_ERR) if is_err else self.cp(PAIR_OK)) | curses.A_BOLD
+            self.add(h - 2, 2, msg[: w - 4], attr)
+        self.stdscr.refresh()
+
+    # ----- modals -----
+
+    def input_box(self, title: str, prompt: str, initial: str) -> Optional[str]:
+        h, w = self.stdscr.getmaxyx()
+        win_h, win_w = 9, min(86, max(46, w - 6))
+        top, left = max(0, (h - win_h) // 2), max(0, (w - win_w) // 2)
+
+        win = curses.newwin(win_h, win_w, top, left)
+        win.keypad(True)
+        win.border()
+
+        win.addstr(1, 2, title[: win_w - 4], self.cp(PAIR_TITLE) | curses.A_BOLD)
+        win.addstr(2, 2, prompt[: win_w - 4], self.cp(PAIR_TEXT))
+
+        buf = list(initial)
+        curses.curs_set(1)
+
+        while True:
+            win.addstr(4, 2, " " * (win_w - 4), self.cp(PAIR_TEXT))
+            text = "".join(buf)
+            if len(text) > win_w - 4:
+                text = text[-(win_w - 4) :]
+            win.addstr(4, 2, text, self.cp(PAIR_TEXT) | curses.A_BOLD)
+            win.addstr(6, 2, "enter=ok   esc=cancel", self.cp(PAIR_DIM))
+            win.refresh()
+
+            ch = win.getch()
+            if ch == 27:
+                curses.curs_set(0)
+                return None
+            if ch in (curses.KEY_ENTER, 10, 13):
+                curses.curs_set(0)
+                return "".join(buf).strip()
+            if ch in (curses.KEY_BACKSPACE, 127, 8):
+                if buf:
+                    buf.pop()
+            elif 32 <= ch <= 126:
+                buf.append(chr(ch))
+
+    # ----- value editors -----
+
+    def cycle_attr(self, attr: str, options: list) -> None:
+        cur = getattr(self.s, attr)
+        if cur not in options:
+            setattr(self.s, attr, options[0])
+        else:
+            setattr(self.s, attr, options[(options.index(cur) + 1) % len(options)])
+
+    def edit_int(self, label: str, attr: str, minv: Optional[int] = None) -> None:
+        val = self.input_box(label, "enter an integer:", str(getattr(self.s, attr)))
+        if val is None or val == "":
+            return
+        try:
+            n = int(val)
+        except ValueError:
+            self.message = "error: '%s' is not an integer" % val
+            return
+        if minv is not None and n < minv:
+            self.message = "error: %s must be >= %d" % (label, minv)
+            return
+        setattr(self.s, attr, n)
+        self.message = "%s = %d" % (label, n)
+
+    def edit_float(self, label: str, attr: str,
+                   minv: Optional[float] = None, maxv: Optional[float] = None) -> None:
+        val = self.input_box(label, "enter a number:", str(getattr(self.s, attr)))
+        if val is None or val == "":
+            return
+        try:
+            x = float(val)
+        except ValueError:
+            self.message = "error: '%s' is not a number" % val
+            return
+        if minv is not None and x < minv:
+            self.message = "error: %s must be >= %g" % (label, minv)
+            return
+        if maxv is not None and x > maxv:
+            self.message = "error: %s must be <= %g" % (label, maxv)
+            return
+        setattr(self.s, attr, x)
+        self.message = "%s = %g" % (label, x)
+
+    # ----- report pager -----
+
+    def _pager_attr(self, ln: str) -> int:
+        s = ln.strip()
+        if "RECOMMENDED" in ln:
+            return self.cp(PAIR_OK) | curses.A_BOLD
+        if "<== recommended" in ln:
+            return self.cp(PAIR_OK)
+        if "INFEASIBLE" in ln or "unknown arch" in ln:
+            return self.cp(PAIR_ERR) | curses.A_BOLD
+        if s.startswith("!!"):
+            return self.cp(PAIR_HINT)
+        if s.startswith("T_TILE"):
+            return self.cp(PAIR_TITLE) | curses.A_BOLD
+        if set(s) <= {"=", "-"} and s:
+            return self.cp(PAIR_DIM)
+        return self.cp(PAIR_TEXT)
+
+    def pager(self, title: str, lines: List[str]) -> None:
+        top = 0
+        note = ""
+        while True:
+            self.stdscr.erase()
+            h, w = self.stdscr.getmaxyx()
+
+            self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD)
+            self.draw_keybar(
+                1,
+                2,
+                [
+                    ("up/dn/j/k", "scroll"),
+                    ("PgUp/PgDn", "page"),
+                    ("g/G", "top/end"),
+                    ("w", "save"),
+                    ("b/q", "back"),
+                ],
+            )
+            self.hline(2)
+
+            list_y = 4
+            view_h = max(1, h - list_y - 2)
+            n = len(lines)
+            top = max(0, min(top, max(0, n - view_h)))
+
+            for i, ln in enumerate(lines[top : top + view_h]):
+                self.add(list_y + i, 2, ln[: w - 3], self._pager_attr(ln))
+
+            self.hline(h - 2)
+            footer = "line %d-%d / %d" % (top + 1, min(top + view_h, n), n)
+            if note:
+                footer += "    " + note
+            self.add(h - 1, 2, footer[: w - 4], self.cp(PAIR_DIM))
+            self.stdscr.refresh()
+
+            ch = self.stdscr.getch()
+            if ch in (ord("q"), ord("Q"), ord("b"), 8, 127):
+                return
+            if ch in (curses.KEY_UP, ord("k"), ord("K")):
+                top -= 1
+            elif ch in (curses.KEY_DOWN, ord("j"), ord("J")):
+                top += 1
+            elif ch in (curses.KEY_PPAGE,):
+                top -= view_h
+            elif ch in (curses.KEY_NPAGE, ord(" ")):
+                top += view_h
+            elif ch in (ord("g"),):
+                top = 0
+            elif ch in (ord("G"),):
+                top = n
+            elif ch in (ord("w"), ord("W")):
+                note = self._save_report(lines)
+
+    def _save_report(self, lines: List[str]) -> str:
+        path = os.path.join(os.getcwd(), "ideal_tile_size_report.txt")
+        try:
+            with open(path, "w") as f:
+                f.write("\n".join(lines) + "\n")
+            return "saved to %s" % path
+        except OSError as e:
+            return "save failed: %s" % e
+
+    def do_compute(self) -> None:
+        self.pager("recommendation  [%s]" % self.s.arch, build_report(self.s))
+
+    def reset(self) -> None:
+        self.s = Settings()
+        self.message = "reset to defaults"
+
+    # ----- menus -----
+
+    def menu_main(self) -> Tuple[str, str, List[MenuItem]]:
+        return (
+            "entity tile-size advisor",
+            "main menu:",
+            [
+                MenuItem(
+                    "architecture",
+                    "choose the target GPU (or 'all')",
+                    right=self.arch_label,
+                    on_enter=lambda: self.push("arch"),
+                ),
+                MenuItem(
+                    "physics & particles",
+                    "dim, ppc, shape order, precision, ...",
+                    right=lambda: "dim %d / ppc %g / so %d / %s"
+                    % (self.s.dim, self.s.ppc, self.s.shape_order, self.s.precision),
+                    on_enter=lambda: self.push("physics"),
+                ),
+                MenuItem(
+                    "tuning knobs",
+                    "residency, budgets, tile range, ...",
+                    right=lambda: "res %d / cap %g / halo %.2f"
+                    % (self.s.target_resident, self.s.npart_cap, self.s.halo_max),
+                    on_enter=lambda: self.push("tuning"),
+                ),
+                MenuItem(
+                    "compute recommendation",
+                    "run the model and show the report",
+                    on_enter=self.do_compute,
+                ),
+                MenuItem("reset to defaults", "", on_enter=self.reset),
+                MenuItem("exit", "", on_enter=lambda: setattr(self, "state", "exit")),
+            ],
+        )
+
+    def menu_arch(self) -> Tuple[str, str, List[MenuItem]]:
+        def choose(name: str):
+            self.s.arch = name
+            self.pop()
+
+        def label(name: str) -> str:
+            mark = "(*)" if name == self.s.arch else "( )"
+            return "%s %s" % (mark, name)
+
+        def right(name: str) -> str:
+            if name == "all":
+                return " + ".join(ALL_ARCHS)
+            return ARCH[name]["label"]
+
+        items = [
+            MenuItem(
+                label(a),
+                "select target",
+                right=(lambda a=a: right(a)),
+                on_enter=(lambda a=a: choose(a)),
+            )
+            for a in ARCH_CHOICES
+        ]
+        items.append(MenuItem("back", "return", on_enter=self.pop))
+        return ("architecture", "pick the target GPU:", items)
+
+    def menu_physics(self) -> Tuple[str, str, List[MenuItem]]:
+        def cyc_dim():
+            self.cycle_attr("dim", [1, 2, 3])
+
+        def cyc_prec():
+            self.cycle_attr("precision", ["single", "double"])
+
+        return (
+            "physics & particles",
+            "set physical inputs:",
+            [
+                MenuItem(
+                    "dim",
+                    "space cycles: 1 / 2 / 3",
+                    right=lambda: str(self.s.dim),
+                    on_enter=cyc_dim,
+                    on_space=cyc_dim,
+                ),
+                MenuItem(
+                    "ppc",
+                    "particles per cell (per species)",
+                    right=lambda: "%g" % self.s.ppc,
+                    on_enter=lambda: self.edit_float("ppc", "ppc", minv=0.0),
+                ),
+                MenuItem(
+                    "shape_order",
+                    "entity particle shape order",
+                    right=lambda: str(self.s.shape_order),
+                    on_enter=lambda: self.edit_int("shape_order", "shape_order", minv=0),
+                ),
+                MenuItem(
+                    "precision",
+                    "space cycles: single / double",
+                    right=lambda: self.s.precision,
+                    on_enter=cyc_prec,
+                    on_space=cyc_prec,
+                ),
+                MenuItem(
+                    "components",
+                    "current-field components (J has 3)",
+                    right=lambda: str(self.s.components),
+                    on_enter=lambda: self.edit_int("components", "components", minv=1),
+                ),
+                MenuItem(
+                    "drift",
+                    "team_policy_drift CMake knob: cells the scratch halo absorbs (>= spatial_sorting_interval)",
+                    right=lambda: str(self.s.drift),
+                    on_enter=lambda: self.edit_int("drift", "drift", minv=0),
+                ),
+                MenuItem("back", "return", on_enter=self.pop),
+            ],
+        )
+
+    def menu_tuning(self) -> Tuple[str, str, List[MenuItem]]:
+        return (
+            "tuning knobs",
+            "set the model's budgets and ranges:",
+            [
+                MenuItem(
+                    "target_resident",
+                    "work-groups resident per compute unit",
+                    right=lambda: str(self.s.target_resident),
+                    on_enter=lambda: self.edit_int("target_resident", "target_resident", minv=1),
+                ),
+                MenuItem(
+                    "npart_cap",
+                    "particle-per-tile budget (contention proxy)",
+                    right=lambda: "%g" % self.s.npart_cap,
+                    on_enter=lambda: self.edit_float("npart_cap", "npart_cap", minv=1.0),
+                ),
+                MenuItem(
+                    "halo_max",
+                    "halo fraction above which the tile is grown (0..1)",
+                    right=lambda: "%.2f" % self.s.halo_max,
+                    on_enter=lambda: self.edit_float("halo_max", "halo_max", minv=0.0, maxv=1.0),
+                ),
+                MenuItem(
+                    "grid",
+                    "cells per dim (0 disables GPU-fill check)",
+                    right=lambda: str(self.s.grid),
+                    on_enter=lambda: self.edit_int("grid", "grid", minv=0),
+                ),
+                MenuItem(
+                    "balance_factor",
+                    "min tiles per compute unit",
+                    right=lambda: str(self.s.balance_factor),
+                    on_enter=lambda: self.edit_int("balance_factor", "balance_factor", minv=1),
+                ),
+                MenuItem(
+                    "min_tile",
+                    "smallest T_TILE to consider",
+                    right=lambda: str(self.s.min_tile),
+                    on_enter=lambda: self.edit_int("min_tile", "min_tile", minv=1),
+                ),
+                MenuItem(
+                    "max_tile",
+                    "largest T_TILE to consider",
+                    right=lambda: str(self.s.max_tile),
+                    on_enter=lambda: self.edit_int("max_tile", "max_tile", minv=1),
+                ),
+                MenuItem("back", "return", on_enter=self.pop),
+            ],
+        )
+
+    def get_menu(self) -> Tuple[str, str, List[MenuItem]]:
+        if self.state == "mainmenu":
+            return self.menu_main()
+        if self.state == "arch":
+            return self.menu_arch()
+        if self.state == "physics":
+            return self.menu_physics()
+        if self.state == "tuning":
+            return self.menu_tuning()
+        self.state = "mainmenu"
+        return self.menu_main()
+
+    # ----- navigation -----
+
+    def is_disabled(self, it: MenuItem) -> bool:
+        return bool(it.disabled and it.disabled())
+
+    def move_sel(self, items: List[MenuItem], delta: int) -> None:
+        if not items:
+            return
+        n = len(items)
+        start = self.selected
+        for _ in range(n):
+            self.selected = (self.selected + delta) % n
+            if not self.is_disabled(items[self.selected]):
+                return
+        self.selected = start
+
+    def activate(self, items: List[MenuItem], enter: bool) -> None:
+        if not items:
+            return
+        it = items[self.selected]
+        if self.is_disabled(it):
+            self.message = "error: option disabled."
+            return
+        fn = it.on_enter if enter else it.on_space
+        if fn:
+            fn()
+
+    # ----- loop -----
+
+    def run(self) -> None:
+        while True:
+            if self.state == "exit":
+                return
+
+            title, prompt, items = self.get_menu()
+            self.draw_menu(title, prompt, items)
+
+            ch = self.stdscr.getch()
+
+            if ch in (ord("q"), ord("Q")):
+                self.state = "exit"
+                continue
+            if ch in (ord("b"), 8, 127):
+                self.pop()
+                continue
+            if ch in (curses.KEY_UP, ord("k"), ord("K")):
+                self.move_sel(items, -1)
+                continue
+            if ch in (curses.KEY_DOWN, ord("j"), ord("J")):
+                self.move_sel(items, +1)
+                continue
+            if ch in (curses.KEY_ENTER, 10, 13):
+                self.activate(items, enter=True)
+                continue
+            if ch == ord(" "):
+                self.activate(items, enter=False)
+                continue
+
+
+def run_tui() -> int:
+    try:
+        curses.wrapper(lambda stdscr: App(stdscr).run())
+    except KeyboardInterrupt:
+        return 130
+    return 0
+
+
+# ============================
+# CLI (preserved for scripting / sweeps)
+# ============================
+
+def run_cli(argv) -> int:
+    ap = argparse.ArgumentParser(description="Recommend entity team tile size (T_TILE).")
+    ap.add_argument("arch", help="pvc | nvidia | amd  (or a100/h100/gh200/mi250x/mi300x, or 'all')")
+    ap.add_argument("--dim", type=int, default=2, choices=(1, 2, 3))
+    ap.add_argument("--ppc", type=float, default=16.0, help="particles per cell (per species)")
+    ap.add_argument("--shape-order", type=int, default=2, help="particle shape order (entity shape_order)")
+    ap.add_argument("--precision", choices=("single", "double"), default="single")
+    ap.add_argument("--components", type=int, default=3, help="current-field components (J has 3)")
+    ap.add_argument("--drift", type=int, default=1,
+                    help="team_policy_drift CMake knob (compile-time): cells of drift the scratch "
+                         "halo absorbs; size it >= spatial_sorting_interval")
+    ap.add_argument("--target-resident", type=int, default=2, help="work-groups resident per compute unit")
+    ap.add_argument("--npart-cap", type=float, default=1600,
+                    help="particle-per-tile budget (SLM-atomic-contention / load-balance proxy)")
+    ap.add_argument("--halo-max", type=float, default=0.70, help="halo fraction above which the tile is grown")
+    ap.add_argument("--grid", type=int, default=0, help="cells per dim (optional; enables a GPU-fill check)")
+    ap.add_argument("--balance-factor", type=int, default=4, help="min tiles per compute unit")
+    ap.add_argument("--min-tile", type=int, default=4,
+                    help="smallest T_TILE to consider (entity's team_policy_tile_sizes starts at 4)")
+    ap.add_argument("--max-tile", type=int, default=64)
+    p = ap.parse_args(argv)
+
+    for ln in build_report(p):
+        print(ln)
+    return 0
+
+
+def main() -> int:
+    # no arguments -> interactive TUI; any argument -> scriptable CLI
+    if len(sys.argv) > 1:
+        return run_cli(sys.argv[1:])
+    return run_tui()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/input.example.toml b/input.example.toml
index 741d5c2d2..87ca48f3c 100644
--- a/input.example.toml
+++ b/input.example.toml
@@ -301,6 +301,14 @@
     #   @type: bool
     #   @default: true
     enable = ""
+    # team_policy tiled-deposit work-group (team) size
+    #   @type: uint [>= 0]
+    #   @default: 0
+    #   @note: 0 keeps Kokkos::AUTO (backend occupancy heuristic); a positive
+    #          value overrides it, clamped to the backend/scratch maximum at
+    #          launch. Only used in `team_policy=ON` builds. Pick a multiple of
+    #          the device subgroup width for best occupancy (see ideal_tile_size.py)
+    team_policy_team_size = ""
 
     # @inferred:
     # - order
diff --git a/src/engines/engine.hpp b/src/engines/engine.hpp
index b20e163ca..057bfcb5a 100644
--- a/src/engines/engine.hpp
+++ b/src/engines/engine.hpp
@@ -78,10 +78,11 @@ namespace ntt {
     Metadomain<S, M> m_metadomain;
     user::PGen<S, M> m_pgen;
 
-    const bool       is_resuming;
-    const simtime_t  runtime;
-    const real_t     dt;
-    const timestep_t max_steps;
+    const bool        is_resuming;
+    const simtime_t   runtime;
+    const real_t      dt;
+    const std::size_t team_policy_team_size;
+    const timestep_t  max_steps;
     const timestep_t start_step;
     const simtime_t  start_time;
     simtime_t        time;
@@ -109,6 +110,8 @@ namespace ntt {
       , is_resuming { m_params.get<bool>("checkpoint.is_resuming") }
       , runtime { m_params.get<simtime_t>("simulation.runtime") }
       , dt { m_params.get<real_t>("algorithms.timestep.dt") }
+      , team_policy_team_size { m_params.get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size") }
       , max_steps { static_cast<timestep_t>(runtime / dt) }
       , start_step { m_params.get<timestep_t>("checkpoint.start_step") }
       , start_time { m_params.get<simtime_t>("checkpoint.start_time") }
@@ -127,6 +130,8 @@ namespace ntt {
       auto parameters = prm::Parameters {};
       parameters.set("dt", static_cast<real_t>(dt));
       parameters.set("time", static_cast<simtime_t>(time));
+      parameters.set("team_policy_team_size",
+                     static_cast<std::size_t>(team_policy_team_size));
       return parameters;
     }
   };
diff --git a/src/engines/grpic/currents.h b/src/engines/grpic/currents.h
index 11c847533..cb4032f54 100644
--- a/src/engines/grpic/currents.h
+++ b/src/engines/grpic/currents.h
@@ -2,6 +2,8 @@
  * @file engines/grpic/currents.h
  * @brief Current deposition and filtering routines for the GRPIC engine
  * @implements
+ *   - ntt::grpic::CallDepositKernel<> -> void                 (flat path)
+ *   - ntt::grpic::CallDepositKernelTiled<> -> void            (TEAM_POLICY)
  *   - ntt::grpic::CurrentsDeposit<> -> void
  *   - ntt::grpic::CurrentsFilter<> -> void
  * @namespaces:
@@ -12,8 +14,11 @@
 #define ENGINES_GRPIC_CURRENTS_H
 
 #include "enums.h"
+#include "global.h"
 
+#include "arch/kokkos_aliases.h"
 #include "traits/metric.h"
+#include "utils/error.h"
 #include "utils/log.h"
 #include "utils/param_container.h"
 
@@ -26,13 +31,198 @@
 namespace ntt {
   namespace grpic {
 
+    template <GRMetricClass M, unsigned short O>
+    void CallDepositKernel(const Particles<M::Dim, M::CoordType>& species,
+                           const M&                               local_metric,
+                           const scatter_ndfield_t<M::Dim, 3>&    scatter_cur,
+                           real_t                                 dt) {
+      Kokkos::parallel_for("CurrentsDeposit",
+                           species.rangeActiveParticles(),
+                           kernel::DepositCurrents_kernel<SimEngine::GRPIC, M, O>(
+                             scatter_cur,
+                             species,
+                             local_metric,
+                             (real_t)(species.charge()),
+                             dt));
+    }
+
+#if defined(TEAM_POLICY)
+    /**
+     * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
+     *
+     * Identical in structure to the SRPIC launcher (`engines/srpic/currents.h`):
+     * iterates over `tile_layout.ntiles_total` teams; each team accumulates its
+     * tile's particle contributions in SLM scratch and atomically flushes to the
+     * global J (here `cur0`, the GRPIC half-step current). Requires the species
+     * to have been sorted with `team_policy` enabled (`tile_layout` populated by
+     * `SortSpatially`).
+     *
+     * The deposit body (`kernel::DepositOneParticle<SimEngine::GRPIC, M, O>`) is
+     * the same shared math used by the flat path — it already carries the GR
+     * velocity-recovery branch — so the only engine-specific differences from
+     * SRPIC are the `SimEngine::GRPIC` tag and the `cur0` target.
+     *
+     * Falls back to the flat kernel for the tail `[npart_partitioned, npart)`
+     * exactly as SRPIC does; see the per-step coverage note in
+     * `kernels/currents_deposit.hpp`.
+     */
+    template <GRMetricClass M, unsigned short O>
+    void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
+                                const M&                    local_metric,
+                                const ndfield_t<M::Dim, 3>& cur,
+                                real_t                      dt,
+                                int                         team_size_req) {
+      static_assert(O <= 11u, "Shape order must be <= 11");
+      constexpr unsigned short T = static_cast<unsigned short>(
+        TEAM_POLICY_TILE_SIZE);
+      const auto& layout = species.tile_layout();
+      raise::ErrorIf(layout.ntiles_total == 0u,
+                     "CallDepositKernelTiled: tile_layout has 0 tiles — call "
+                     "SortSpatially before CurrentsDeposit",
+                     HERE);
+      raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
+                     "CallDepositKernelTiled: tile_offsets size inconsistent "
+                     "with ntiles_total",
+                     HERE);
+
+      auto deposit_kernel =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::GRPIC, M, O, T> {
+          cur,    species, local_metric, (real_t)(species.charge()),
+          dt,     layout,  species.npart()
+        };
+
+      const auto scratch = Kokkos::PerTeam(
+        decltype(deposit_kernel)::scratch_bytes());
+
+      // Team (work-group) size. The default (team_size_req == 0) leaves
+      // Kokkos::AUTO, which sizes the team from the backend occupancy
+      // heuristic. A positive `algorithms.deposit.team_policy_team_size`
+      // overrides it, clamped to the scratch/backend-feasible maximum so an
+      // over-large request cannot abort the launch (Kokkos errors when
+      // team_size > team_size_max). No portable subgroup rounding is applied;
+      // pick a multiple of the device subgroup width (printed per arch by
+      // ideal_tile_size.py) for the best occupancy.
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0, scratch);
+      if (team_size_req > 0) {
+        const int ts_max = policy.team_size_max(deposit_kernel,
+                                                Kokkos::ParallelForTag {});
+        int       ts     = team_size_req;
+        if (ts > ts_max) {
+          raise::Warning(
+            fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds "
+                        "the tiled-deposit maximum %d on this backend; clamping "
+                        "to %d",
+                        team_size_req,
+                        ts_max,
+                        ts_max),
+            HERE);
+          ts = ts_max;
+        }
+        policy = Kokkos::TeamPolicy<>(static_cast<int>(layout.ntiles_total), ts);
+        policy.set_scratch_size(0, scratch);
+        logger::Checkpoint(
+          fmt::format("Tiled deposit: explicit team size %d", ts),
+          HERE);
+      }
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
+
+      // Particles appended since the last sort (injection / MPI receive on a
+      // no-sort step) live past the partition and are not visited by any team
+      // above. Deposit that tail [npart_partitioned, npart) with the flat
+      // scatter-view kernel so every active particle is deposited exactly
+      // once. The range is empty when the species was just sorted (the
+      // every-step-sorted common case), so this is a no-op there.
+      if (species.npart() > layout.npart_partitioned) {
+        // `cur` is a const ref; take a non-const View handle (shallow copy,
+        // shares storage) so the scatter view can contribute back into it.
+        auto cur_nc      = cur;
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc);
+        Kokkos::parallel_for(
+          "CurrentsDepositTiledTail",
+          CreateParticleRangePolicy<Dim::_1D>({ layout.npart_partitioned },
+                                              { species.npart() }),
+          kernel::DepositCurrents_kernel<SimEngine::GRPIC, M, O>(
+            scatter_cur,
+            species,
+            local_metric,
+            (real_t)(species.charge()),
+            dt));
+        Kokkos::Experimental::contribute(cur_nc, scatter_cur);
+      }
+    }
+#endif // TEAM_POLICY
+
     template <GRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::GRPIC, M>& domain,
                          const prm::Parameters&       engine_params) {
+      const auto dt = engine_params.get<real_t>("dt");
+      // GRPIC deposits the half-step current into `cur0` (the engine no longer
+      // pre-zeros it — this is the single source of truth, matching SRPIC).
+      Kokkos::deep_copy(domain.fields.cur0, ZERO);
+
+#if defined(TEAM_POLICY)
+      // Optional runtime override for the tiled-deposit team (work-group) size;
+      // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the
+      // launcher (see CallDepositKernelTiled).
+      const auto team_size_req = static_cast<int>(
+        engine_params.get<std::size_t>("team_policy_team_size",
+                                       std::optional<std::size_t> { 0u }));
+
+      // Tiled deposit. Correctness no longer depends on the SoA being in a
+      // "sorted" state at deposit time — the tiled kernel handles a stale
+      // partition per-particle (escape valve for drifted particles, dead-tag
+      // clamp, and the launcher's flat tail pass for appended particles). The
+      // only case the tiled kernel cannot serve is the very first step, before
+      // any SortSpatially has populated a layout; that species takes the flat
+      // scatter-view path for that step alone. See engines/srpic/currents.h and
+      // kernels/currents_deposit.hpp for the full coverage argument.
+      for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
+        const auto& layout = species.tile_layout();
+        if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) {
+          logger::Checkpoint(
+            fmt::format("Launching currents deposit (flat, no sort yet) for "
+                        "%d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          auto scatter_cur0 = Kokkos::Experimental::create_scatter_view(
+            domain.fields.cur0);
+          CallDepositKernel<M, SHAPE_ORDER>(species,
+                                            domain.mesh.metric,
+                                            scatter_cur0,
+                                            dt);
+          Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0);
+        } else {
+          logger::Checkpoint(
+            fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          CallDepositKernelTiled<M, SHAPE_ORDER>(species,
+                                                 domain.mesh.metric,
+                                                 domain.fields.cur0,
+                                                 dt,
+                                                 team_size_req);
+        }
+      }
+#else
       auto scatter_cur0 = Kokkos::Experimental::create_scatter_view(
         domain.fields.cur0);
-      const auto dt = engine_params.get<real_t>("dt");
       for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
         logger::Checkpoint(
           fmt::format("Launching currents deposit kernel for %d [%s] : %lu %f",
                       species.index(),
@@ -40,36 +230,14 @@ namespace ntt {
                       species.npart(),
                       (double)species.charge()),
           HERE);
-        if (species.npart() == 0 || cmp::AlmostZero(species.charge())) {
-          continue;
-        }
-        Kokkos::parallel_for("CurrentsDeposit",
-                             species.rangeActiveParticles(),
-                             kernel::DepositCurrents_kernel<SimEngine::GRPIC, M>(
-                               scatter_cur0,
-                               species.i1,
-                               species.i2,
-                               species.i3,
-                               species.i1_prev,
-                               species.i2_prev,
-                               species.i3_prev,
-                               species.dx1,
-                               species.dx2,
-                               species.dx3,
-                               species.dx1_prev,
-                               species.dx2_prev,
-                               species.dx3_prev,
-                               species.ux1,
-                               species.ux2,
-                               species.ux3,
-                               species.phi,
-                               species.weight,
-                               species.tag,
-                               domain.mesh.metric,
-                               (real_t)(species.charge()),
-                               dt));
+
+        CallDepositKernel<M, SHAPE_ORDER>(species,
+                                          domain.mesh.metric,
+                                          scatter_cur0,
+                                          dt);
       }
       Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0);
+#endif
     }
 
     template <GRMetricClass M>
@@ -103,4 +271,4 @@ namespace ntt {
   } // namespace grpic
 } // namespace ntt
 
-#endif // ENGINES_GRPIC_CURRENTS_H
\ No newline at end of file
+#endif // ENGINES_GRPIC_CURRENTS_H
diff --git a/src/engines/grpic/grpic.hpp b/src/engines/grpic/grpic.hpp
index fc2daa4de..621592955 100644
--- a/src/engines/grpic/grpic.hpp
+++ b/src/engines/grpic/grpic.hpp
@@ -416,7 +416,8 @@ namespace ntt {
          */
         if (deposit_enabled) {
           timers.start("CurrentDeposit");
-          Kokkos::deep_copy(dom.fields.cur0, ZERO);
+          // `cur0` is zeroed inside grpic::CurrentsDeposit (matching SRPIC),
+          // so no pre-zero is needed here.
           grpic::CurrentsDeposit(dom, this->engineParams());
           timers.stop("CurrentDeposit");
 
diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp
index f56874d23..fcd46fb81 100644
--- a/src/engines/reporter.cpp
+++ b/src/engines/reporter.cpp
@@ -32,6 +32,24 @@ namespace ntt {
                        "%s",
                        params.template get<std::string>("simulation.name").c_str());
     reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
+#if defined(TEAM_POLICY)
+    reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
+  #if defined(TEAM_POLICY_DRIFT)
+    reporter::AddParam(report, 4, "Halo drift", "%d", TEAM_POLICY_DRIFT);
+  #endif
+    if (params.template get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size") == 0u) {
+      reporter::AddParam(report, 4, "Team size", "%s", "AUTO (Kokkos)");
+    } else {
+      reporter::AddParam(
+        report,
+        4,
+        "Team size",
+        "%d (requested; clamped to backend max at launch)",
+        static_cast<int>(params.template get<std::size_t>(
+          "algorithms.deposit.team_policy_team_size")));
+    }
+#endif
     reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
 #if SHAPE_ORDER == 0
     reporter::AddParam(report, 4, "Deposit", "%s", "zigzag");
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 3afabea8a..74946ed68 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -2,7 +2,8 @@
  * @file engines/srpic/currents.h
  * @brief Current deposition and filtering routines for the SRPIC engine
  * @implements
- *   - ntt::srpic::CallDepositKernel<> -> void
+ *   - ntt::srpic::CallDepositKernel<> -> void                 (flat path)
+ *   - ntt::srpic::CallDepositKernelTiled<> -> void            (TEAM_POLICY)
  *   - ntt::srpic::CurrentsDeposit<> -> void
  *   - ntt::srpic::CurrentsFilter<> -> void
  * @namespaces:
@@ -26,6 +27,8 @@
 #include "kernels/currents_deposit.hpp"
 #include "kernels/digital_filter.hpp"
 
+#include <utility>
+
 namespace ntt {
   namespace srpic {
 
@@ -38,34 +41,183 @@ namespace ntt {
                            species.rangeActiveParticles(),
                            kernel::DepositCurrents_kernel<SimEngine::SRPIC, M, O>(
                              scatter_cur,
-                             species.i1,
-                             species.i2,
-                             species.i3,
-                             species.i1_prev,
-                             species.i2_prev,
-                             species.i3_prev,
-                             species.dx1,
-                             species.dx2,
-                             species.dx3,
-                             species.dx1_prev,
-                             species.dx2_prev,
-                             species.dx3_prev,
-                             species.ux1,
-                             species.ux2,
-                             species.ux3,
-                             species.phi,
-                             species.weight,
-                             species.tag,
+                             species,
                              local_metric,
                              (real_t)(species.charge()),
                              dt));
     }
 
+#if defined(TEAM_POLICY)
+    /**
+     * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
+     *
+     * Iterates over `tile_layout.ntiles_total` teams; each team accumulates
+     * its tile's particle contributions in SLM scratch and atomically
+     * flushes to the global J. Requires the species to have been sorted
+     * with `team_policy` enabled (`tile_layout` populated by
+     * `SortSpatially`).
+     *
+     * Falls back to the flat kernel if `tile_offsets` is empty — this
+     * happens on the first step before the first sort, or for very small
+     * species that exited early in `SortSpatially`. The fallback uses the
+     * passed-in `scatter_cur` so the caller still composes correctly.
+     */
+    template <SRMetricClass M, unsigned short O>
+    void CallDepositKernelTiled(const Particles<M::Dim, M::CoordType>& species,
+                                const M&                    local_metric,
+                                const ndfield_t<M::Dim, 3>& cur,
+                                real_t                      dt,
+                                int                         team_size_req) {
+      static_assert(O <= 11u, "Shape order must be <= 11");
+      constexpr unsigned short T = static_cast<unsigned short>(
+        TEAM_POLICY_TILE_SIZE);
+      const auto& layout = species.tile_layout();
+      raise::ErrorIf(layout.ntiles_total == 0u,
+                     "CallDepositKernelTiled: tile_layout has 0 tiles — call "
+                     "SortSpatially before CurrentsDeposit",
+                     HERE);
+      raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
+                     "CallDepositKernelTiled: tile_offsets size inconsistent "
+                     "with ntiles_total",
+                     HERE);
+
+      auto deposit_kernel =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, M, O, T> {
+          cur,    species, local_metric, (real_t)(species.charge()),
+          dt,     layout,  species.npart()
+        };
+
+      const auto scratch = Kokkos::PerTeam(
+        decltype(deposit_kernel)::scratch_bytes());
+
+      // Team (work-group) size. The default (team_size_req == 0) leaves
+      // Kokkos::AUTO, which sizes the team from the backend occupancy
+      // heuristic. A positive `algorithms.deposit.team_policy_team_size`
+      // overrides it, clamped to the scratch/backend-feasible maximum so an
+      // over-large request cannot abort the launch (Kokkos errors when
+      // team_size > team_size_max). No portable subgroup rounding is applied;
+      // pick a multiple of the device subgroup width (printed per arch by
+      // ideal_tile_size.py) for the best occupancy.
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0, scratch);
+      if (team_size_req > 0) {
+        const int ts_max = policy.team_size_max(deposit_kernel,
+                                                Kokkos::ParallelForTag {});
+        int       ts     = team_size_req;
+        if (ts > ts_max) {
+          raise::Warning(
+            fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds "
+                        "the tiled-deposit maximum %d on this backend; clamping "
+                        "to %d",
+                        team_size_req,
+                        ts_max,
+                        ts_max),
+            HERE);
+          ts = ts_max;
+        }
+        policy = Kokkos::TeamPolicy<>(static_cast<int>(layout.ntiles_total), ts);
+        policy.set_scratch_size(0, scratch);
+        logger::Checkpoint(
+          fmt::format("Tiled deposit: explicit team size %d", ts),
+          HERE);
+      }
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel);
+
+      // Particles appended since the last sort (injection / MPI receive on a
+      // no-sort step) live past the partition and are not visited by any team
+      // above. Deposit that tail [npart_partitioned, npart) with the flat
+      // scatter-view kernel so every active particle is deposited exactly
+      // once. The range is empty when the species was just sorted (the
+      // every-step-sorted common case), so this is a no-op there.
+      if (species.npart() > layout.npart_partitioned) {
+        // `cur` is a const ref; take a non-const View handle (shallow copy,
+        // shares storage) so the scatter view can contribute back into it.
+        auto cur_nc      = cur;
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc);
+        Kokkos::parallel_for(
+          "CurrentsDepositTiledTail",
+          CreateParticleRangePolicy<Dim::_1D>({ layout.npart_partitioned },
+                                              { species.npart() }),
+          kernel::DepositCurrents_kernel<SimEngine::SRPIC, M, O>(
+            scatter_cur,
+            species,
+            local_metric,
+            (real_t)(species.charge()),
+            dt));
+        Kokkos::Experimental::contribute(cur_nc, scatter_cur);
+      }
+    }
+#endif // TEAM_POLICY
+
     template <SRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
                          const prm::Parameters&       engine_params) {
       const auto dt = engine_params.get<real_t>("dt");
       Kokkos::deep_copy(domain.fields.cur, ZERO);
+
+#if defined(TEAM_POLICY)
+      // Optional runtime override for the tiled-deposit team (work-group) size;
+      // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the
+      // launcher (see CallDepositKernelTiled).
+      const auto team_size_req = static_cast<int>(
+        engine_params.get<std::size_t>("team_policy_team_size",
+                                       std::optional<std::size_t> { 0u }));
+
+      // Tiled deposit. Correctness no longer depends on the SoA being in a
+      // "sorted" state at deposit time — the tiled kernel handles a stale
+      // partition per-particle:
+      //   - a particle whose full stencil has drifted out of its tile is
+      //     deposited straight to the global J view (the per-particle escape
+      //     valve); `team_policy_drift` sizes the scratch halo so the
+      //     common in-tile case stays in fast SLM (see currents_deposit.hpp);
+      //   - particles dead-tagged in place since the sort are clamped out by
+      //     the kernel and skipped by the dead-tag test;
+      //   - particles appended past the partition since the sort (injection /
+      //     MPI receive on a no-sort step) are deposited by the launcher's
+      //     flat tail pass over [npart_partitioned, npart).
+      // Together these cover every active particle exactly once for any sort
+      // interval. The only case the tiled kernel cannot serve is the very
+      // first step, before any SortSpatially has populated a layout; that
+      // species takes the flat scatter-view path for that step alone.
+      for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
+        const auto& layout = species.tile_layout();
+        if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) {
+          logger::Checkpoint(
+            fmt::format("Launching currents deposit (flat, no sort yet) for "
+                        "%d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          auto scatter_cur = Kokkos::Experimental::create_scatter_view(
+            domain.fields.cur);
+          CallDepositKernel<M, SHAPE_ORDER>(species,
+                                            domain.mesh.metric,
+                                            scatter_cur,
+                                            dt);
+          Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+        } else {
+          logger::Checkpoint(
+            fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          CallDepositKernelTiled<M, SHAPE_ORDER>(species,
+                                                 domain.mesh.metric,
+                                                 domain.fields.cur,
+                                                 dt,
+                                                 team_size_req);
+        }
+      }
+#else
       auto scatter_cur = Kokkos::Experimental::create_scatter_view(
         domain.fields.cur);
       for (auto& species : domain.species) {
@@ -84,6 +236,7 @@ namespace ntt {
         CallDepositKernel<M, SHAPE_ORDER>(species, domain.mesh.metric, scatter_cur, dt);
       }
       Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+#endif
     }
 
     template <SRMetricClass M>
@@ -91,9 +244,11 @@ namespace ntt {
                         Domain<SimEngine::SRPIC, M>&     domain,
                         const SimulationParams&          params) {
       logger::Checkpoint("Launching currents filtering kernels", HERE);
-      auto       range   = srpic::RangeWithAxisBCs(domain);
       const auto nfilter = params.template get<unsigned short>(
         "algorithms.current_filters");
+      if (nfilter == 0u) {
+        return;
+      }
       tuple_t<ncells_t, M::Dim> size;
       if constexpr (M::Dim == Dim::_1D || M::Dim == Dim::_2D || M::Dim == Dim::_3D) {
         size[0] = domain.mesh.n_active(in::x1);
@@ -104,17 +259,114 @@ namespace ntt {
       if constexpr (M::Dim == Dim::_3D) {
         size[2] = domain.mesh.n_active(in::x3);
       }
-      // !TODO: this needs to be done more efficiently
+
+      // The filter ping-pongs `cur` <-> scratch `buff`: one up-front copy
+      // seeds `buff` with valid ghost cells (the kernel writes only the
+      // cells in its launch range), then each pass filters the input into
+      // the other buffer and swaps the View handles so `cur` again names the
+      // result. `buff` is pure scratch (also reused by CommunicateFields), so
+      // the permanent handle swap is transparent and `cur` always names the
+      // result. The single seeding copy preserves physical-boundary ghosts
+      // (conductor/match/atmosphere/...), which neither the kernel nor the
+      // MPI exchange refresh.
+      Kokkos::deep_copy(domain.fields.buff, domain.fields.cur);
+
+      const auto flds_bc = domain.mesh.flds_bc();
+
+      // Reduced-exchange ghost-margin scheme (all coordinate types). One halo
+      // exchange refreshes N_GHOSTS ghost layers — enough for N_GHOSTS passes
+      // of the 3-point binomial if each pass also recomputes the inner ghost
+      // layers it will need next. We therefore extend the launch range by a
+      // shrinking margin `m` into the ghost zone, but only on comm-refreshed
+      // sides (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell
+      // is interior physics. Physical-boundary ghosts are never written or
+      // refreshed, exactly as in the per-pass loop, so the result is identical
+      // for every BC — while doing one exchange per N_GHOSTS passes instead of
+      // one per pass. (Entering the loop the ghosts are valid to distance
+      // N_GHOSTS: srpic.hpp runs CommunicateFields(J) immediately before
+      // CurrentsFilter.)
+      //
+      // Non-Cartesian axis: the theta (x2) direction is self-contained in the
+      // filter kernel — the axis branches only ever read/write within
+      // [i2_min, i2_max] and never cross the axis — so the axis needs no halo
+      // exchange at all (the axis current fold is done once by
+      // SynchronizeFields(J) before this function). The single coordinate
+      // dependency is that the axis cell sits at i_max(x2), one past the active
+      // range, and must be filtered on every pass. We therefore add a fixed +1
+      // to the x2 upper bound when that side is AXIS — a physical boundary, so
+      // never the shrinking comm margin. This folds the old RangeWithAxisBCs
+      // fixup into make_range, letting the same loop serve every CoordType.
+      const int  G = static_cast<int>(N_GHOSTS);
+      const auto comm_side = [](FldsBC b) {
+        return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC);
+      };
+      bool ext_lo[3] = { false, false, false };
+      bool ext_hi[3] = { false, false, false };
+      for (auto d { 0 }; d < static_cast<int>(M::Dim); ++d) {
+        ext_lo[d] = comm_side(flds_bc[d].first);
+        ext_hi[d] = comm_side(flds_bc[d].second);
+      }
+      // AXIS at the upper x2 boundary needs the axis cell (i_max(x2)) included
+      // every pass; matches srpic::RangeWithAxisBCs. (The lower-x2 axis cell is
+      // already i_min(x2), so no fixup is needed there.)
+      bool axis_hi_x2 = false;
+      if constexpr (M::CoordType != Coord::Cartesian and
+                    (M::Dim == Dim::_2D or M::Dim == Dim::_3D)) {
+        axis_hi_x2 = (flds_bc[1].second == FldsBC::AXIS);
+      }
+      const auto make_range = [&](int m) -> range_t<M::Dim> {
+        const auto ml = [&](int d) -> ncells_t {
+          return ext_lo[d] ? static_cast<ncells_t>(m) : 0u;
+        };
+        const auto mh = [&](int d) -> ncells_t {
+          if (ext_hi[d]) {
+            return static_cast<ncells_t>(m);
+          }
+          // axis cell fixup (x2 == dimension index 1); mutually exclusive with
+          // the comm margin since AXIS is not a comm side
+          if (d == 1 and axis_hi_x2) {
+            return 1u;
+          }
+          return 0u;
+        };
+        if constexpr (M::Dim == Dim::_1D) {
+          return CreateRangePolicy<Dim::_1D>(
+            { domain.mesh.i_min(in::x1) - ml(0) },
+            { domain.mesh.i_max(in::x1) + mh(0) });
+        } else if constexpr (M::Dim == Dim::_2D) {
+          return CreateRangePolicy<Dim::_2D>(
+            { domain.mesh.i_min(in::x1) - ml(0),
+              domain.mesh.i_min(in::x2) - ml(1) },
+            { domain.mesh.i_max(in::x1) + mh(0),
+              domain.mesh.i_max(in::x2) + mh(1) });
+        } else {
+          return CreateRangePolicy<Dim::_3D>(
+            { domain.mesh.i_min(in::x1) - ml(0),
+              domain.mesh.i_min(in::x2) - ml(1),
+              domain.mesh.i_min(in::x3) - ml(2) },
+            { domain.mesh.i_max(in::x1) + mh(0),
+              domain.mesh.i_max(in::x2) + mh(1),
+              domain.mesh.i_max(in::x3) + mh(2) });
+        }
+      };
+      int m = G - 1;
       for (auto i { 0u }; i < nfilter; ++i) {
-        Kokkos::deep_copy(domain.fields.buff, domain.fields.cur);
-        Kokkos::parallel_for("CurrentsFilter",
-                             range,
-                             kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
-                               domain.fields.cur,
-                               domain.fields.buff,
-                               size,
-                               domain.mesh.flds_bc()));
-        metadomain.CommunicateFields(domain, Comm::J);
+        Kokkos::parallel_for(
+          "CurrentsFilter",
+          make_range(m),
+          kernel::DigitalFilter_kernel<M::Dim, M::CoordType>(
+            domain.fields.buff,
+            domain.fields.cur,
+            size,
+            flds_bc));
+        std::swap(domain.fields.cur, domain.fields.buff);
+        --m;
+        if (m < 0 or i == nfilter - 1u) {
+          // refresh ghosts to distance G (and leave them valid for the
+          // downstream field solver after the final pass)
+          metadomain.CommunicateFields(domain, Comm::J);
+          m = G - 1;
+        }
       }
     }
 
diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 895026552..396b5aec8 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -25,6 +25,7 @@
 #include "traits/metric.h"
 #include "utils/error.h"
 #include "utils/formatting.h"
+#include "utils/sorting.h"
 
 #include "framework/containers/species.h"
 #include "framework/domain/grid.h"
@@ -90,6 +91,14 @@ namespace ntt {
     const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) };
 #endif
 
+    // team_policy: tile metadata produced by SortSpatially
+    // and consumed by the tiled deposit / pusher kernels. Lazily
+    // allocated on first sort. The sort backend itself (oneDPL on SYCL,
+    // Thrust on CUDA, std::sort on Host, Kokkos::BinSort otherwise) is
+    // selected at compile time based on the Kokkos device and the
+    // vendor libraries detected by CMake.
+    TileLayout<D> m_tile_layout {};
+
   public:
     // for empty allocation
     Particles() {}
@@ -198,6 +207,20 @@ namespace ntt {
       return m_ntags;
     }
 
+#if defined(TEAM_POLICY)
+    // Build m_tile_layout.tile_offsets / npart_partitioned from the
+    // already-sorted tile-index keys. A separate member function (not a
+    // lambda local to SortSpatially) so the inner device kernel is not an
+    // extended __device__ lambda nested inside another lambda — which
+    // nvcc forbids. Lets the vendor path run the offsets pass and then
+    // release the keys before the SoA gather allocates its buffers.
+    // NOTE: must be public — nvcc forbids an extended __host__ __device__
+    // lambda inside a member function with private/protected access.
+    void compute_tile_offsets(const array_t<ncells_t*>& tile_indices,
+                              ncells_t                  total_tiles,
+                              npart_t                   npart_local);
+#endif
+
     [[nodiscard]]
     auto memory_footprint() const -> std::size_t {
       std::size_t footprint  = 0;
@@ -276,9 +299,54 @@ namespace ntt {
     /**
      * @brief Sort particles spatially by their cell indices
      * @param grid The grid object to get the cell information for sorting
+     * @note In team_policy mode (compile-time `team_policy=ON`), also
+     *       populates `m_tile_layout` with tile-offset and per-tile
+     *       permutation metadata that the tiled deposit/pusher kernels
+     *       consume.
      */
     void SortSpatially(const Grid<D>&);
 
+#if defined(TEAM_POLICY) &&                                                    \
+  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
+   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
+  private:
+    /**
+     * @brief Apply a particle-index permutation (built by oneDPL/Thrust
+     *        sort_by_key) to the SoA member arrays. Members are gathered
+     *        through `perm` into a reusable `n`-sized scratch buffer
+     *        (one per member type, shared across members of that type)
+     *        and copied back in place, so the large persistent member
+     *        arrays keep their storage/address and the gather makes a
+     *        handful of transient allocations instead of one maxnpart
+     *        buffer per member. The *_prev arrays are intentionally not
+     *        permuted (overwritten by the next push before any read).
+     *        Only compiled when a vendor sort backend is enabled; the
+     *        BinSort path applies the permutation in place via
+     *        `sorter.sort(view)` instead.
+     * @param perm Permutation: sorted position -> pre-sort slot index.
+     * @param n Number of leading (alive) particles to gather. Pass
+     *        `npart_partitioned` so only the alive set is moved into
+     *        `[0, n)` (the dead were binned to the sentinel tile and sort
+     *        to the tail); the caller then drops the dead tail via
+     *        `set_npart(n)`.
+     */
+    void apply_permutation_to_soa(const prtl_perm_t& perm, npart_t n);
+
+  public:
+#endif
+
+    /**
+     * @brief Read-only access to the tile layout produced by the most
+     *        recent SortSpatially call. Returns a default-constructed
+     *        layout (`ntiles_total == 0`) when the species has not yet
+     *        been sorted.
+     */
+    [[nodiscard]]
+    auto tile_layout() const -> const TileLayout<D>& {
+      return m_tile_layout;
+    }
+
     /**
      * @brief Copy particle data from device to host.
      */
diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp
index 0fa140f93..6ede3abf0 100644
--- a/src/framework/containers/particles_comm.cpp
+++ b/src/framework/containers/particles_comm.cpp
@@ -266,6 +266,11 @@ namespace ntt {
     auto iteration        = 0;
     auto current_received = 0;
 
+    // `tag_offsets` is the same for every direction; mirror it to the host
+    // once here instead of re-copying it inside the direction loop.
+    auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets);
+    Kokkos::deep_copy(tag_offsets_h, tag_offsets);
+
     for (const auto& direction : dirs_to_comm) {
       const auto send_rank     = send_ranks.at(direction);
       const auto recv_rank     = recv_ranks.at(direction);
@@ -291,9 +296,6 @@ namespace ntt {
                                               npart_send_in * NPLDS_I };
       }
 
-      auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets);
-      Kokkos::deep_copy(tag_offsets_h, tag_offsets);
-
       npart_t idx_offset = npart_dead;
       if (tag_send > 2) {
         idx_offset += tag_offsets_h(tag_send - 3);
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 904fb3fc7..5e9ae4147 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -8,10 +8,22 @@
 #include "framework/containers/particles.h"
 #include "framework/domain/grid.h"
 
+#if defined(TEAM_POLICY)
+  #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                    \
+      (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                    \
+      (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))
+    #define TEAM_POLICY_USE_VENDOR_SORT
+    #include "utils/sort_dispatch.h"
+  #endif
+#endif
+
 #include <Kokkos_Core.hpp>
 #include <Kokkos_ScatterView.hpp>
+#include <Kokkos_Sort.hpp>
 #include <Kokkos_StdAlgorithms.hpp>
 
+#include <algorithm>
+#include <cstddef>
 #include <string>
 #include <utility>
 #include <vector>
@@ -193,8 +205,231 @@ namespace ntt {
     m_is_sorted = true;
   }
 
+#if defined(TEAM_POLICY)
+  template <Dimension D, Coord::type C>
+  void Particles<D, C>::compute_tile_offsets(
+    const array_t<ncells_t*>& tile_indices,
+    ncells_t                  total_tiles,
+    npart_t                   npart_local) {
+    // Compute the per-tile prefix-sum `tile_offsets` for the tiled
+    // pusher from the (already sorted) `tile_indices` — monotonically
+    // non-decreasing for alive particles, with the dead sentinel
+    // `total_tiles + 1` clustered at the end. Transition-detect directly
+    // on it: the start of each non-empty tile is the only place a write
+    // happens — atomic-free in the dense branch. Empty tiles (no
+    // particles) are filled by a reverse pass on a small host mirror
+    // (`total_tiles ≈ 176K` at production scale → ~700 KB).
+    array_t<npart_t*> tile_offsets { "tile_offsets", total_tiles + 1u };
+    Kokkos::deep_copy(tile_offsets, static_cast<npart_t>(npart_local));
+
+    const auto total_tiles_v = total_tiles;
+    auto       ti_v          = tile_indices;
+    Kokkos::parallel_for(
+      "DetectTileBoundaries",
+      CreateParticleRangePolicy<Dim::_1D>({ 0u }, { npart_local }),
+      Lambda(prtlidx_t p) {
+        const auto t_curr   = ti_v(p);
+        const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr);
+        if (!boundary) {
+          return;
+        }
+        if (t_curr < total_tiles_v) {
+          tile_offsets(t_curr) = p;
+        } else {
+          // First dead particle — also marks the alive_count boundary
+          // stored at index total_tiles.
+          Kokkos::atomic_min(&tile_offsets(total_tiles_v), p);
+        }
+      });
+
+    auto h_offsets = Kokkos::create_mirror_view(tile_offsets);
+    Kokkos::deep_copy(h_offsets, tile_offsets);
+    for (auto t = static_cast<std::size_t>(total_tiles); t-- > 0u;) {
+      if (h_offsets(t) > h_offsets(t + 1u)) {
+        h_offsets(t) = h_offsets(t + 1u);
+      }
+    }
+    Kokkos::deep_copy(tile_offsets, h_offsets);
+
+    m_tile_layout.tile_offsets = tile_offsets;
+    // tile_offsets(total_tiles) is the alive-particle count at sort time:
+    // the tiles partition exactly [0, npart_partitioned). The deposit
+    // launcher compares this against the live npart() to detect (and
+    // separately deposit) particles appended since this sort.
+    m_tile_layout.npart_partitioned = h_offsets(total_tiles);
+  }
+#endif // TEAM_POLICY
+
   template <Dimension D, Coord::type C>
   void Particles<D, C>::SortSpatially(const Grid<D>& grid) {
+#if defined(TEAM_POLICY)
+    // ---------------------- team_policy: tile-based sort ------------------ //
+    const auto npart_local = npart();
+    if (npart_local == 0u) {
+      m_tile_layout = TileLayout<D> {};
+      m_is_sorted   = true;
+      return;
+    }
+
+    constexpr unsigned short T = static_cast<unsigned short>(
+      TEAM_POLICY_TILE_SIZE);
+    static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0");
+
+    // 1. Compute per-axis tile counts and total_tiles.
+    const auto ncells_active = grid.n_active();
+    ncells_t   ntx[3] { 1u, 1u, 1u };
+    ncells_t   total_tiles { 1u };
+    if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[0]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[0]) / static_cast<double>(T)));
+      total_tiles *= ntx[0];
+    }
+    if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[1]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[1]) / static_cast<double>(T)));
+      total_tiles *= ntx[1];
+    }
+    if constexpr (D == Dim::_3D) {
+      ntx[2]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[2]) / static_cast<double>(T)));
+      total_tiles *= ntx[2];
+    }
+
+    // 2. Compute per-particle tile key (with min(i, i_prev)).
+    array_t<ncells_t*> tile_indices { "tile_indices", npart_local };
+    Kokkos::parallel_for(
+      "FillTileIndices",
+      rangeActiveParticles(),
+      sort::PositionToTileIndex<D, false, true> { i1,
+                                                   i2,
+                                                   i3,
+                                                   tag,
+                                                   tile_indices,
+                                                   ncells_active,
+                                                   static_cast<ncells_t>(T),
+                                                   array_t<npart_t*> {},
+                                                   i1_prev,
+                                                   i2_prev,
+                                                   i3_prev });
+
+    // 3. Sort. Vendor library (oneDPL/Thrust) when compiled in;
+    //    Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers
+    //    the dead-particle sentinel bin (total_tiles + 1u).
+    const ncells_t n_bins = total_tiles + 2u;
+    const auto     slice  = prtl_slice_t(0, npart_local);
+
+  #if defined(TEAM_POLICY_USE_VENDOR_SORT)
+    // Vendor path: produce an explicit permutation via sort_by_key, then
+    // apply it to each SoA member by gathering the alive prefix through a
+    // reusable scratch buffer (one per member type, sized to the alive
+    // count, copied back in place). The *_prev arrays are skipped — see
+    // apply_permutation_to_soa. Peak transient = one
+    // `npart_partitioned × sizeof(member)` scratch at a time.
+    prtl_perm_t perm { "tile_perm", npart_local };
+    #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::OneDPL {});
+    #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Rocthrust {});
+    #else
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Thrust {});
+    #endif
+    // `tile_indices` is sorted in place by sort_by_key. Build the tile
+    // offsets from it now, then drop it before the gather allocates its
+    // `maxnpart`-sized buffers — so the keys are not co-resident with
+    // them at the gather's peak (#2).
+    compute_tile_offsets(tile_indices, total_tiles, npart_local);
+    tile_indices = array_t<ncells_t*> {};
+    Kokkos::fence("SortSpatially: pre-gather drain");
+    // Gather only the alive particles. The sort binned dead particles to
+    // the sentinel tile, so they occupy [npart_partitioned, npart_local)
+    // and `perm[0, npart_partitioned)` lists the alive slots in tile
+    // order. Restricting the gather to the alive count drops the dead in
+    // the same pass (no separate compaction), skips work on dead slots,
+    // and sizes the gather scratch to the alive count. The dead tail is
+    // released below via `set_npart`.
+    apply_permutation_to_soa(perm, m_tile_layout.npart_partitioned);
+  #else
+    // BinSort path: same mechanism as legacy SortSpatially (BinSort
+    // allocates one temp View per `sorter.sort(view)` call and frees
+    // it before the next), so peak transient memory is bounded.
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { tile_indices, bin_op, false };
+    sorter.create_permute_vector();
+    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i1, slice));
+      sorter.sort(Kokkos::subview(i1_prev, slice));
+      sorter.sort(Kokkos::subview(dx1, slice));
+      sorter.sort(Kokkos::subview(dx1_prev, slice));
+    }
+    if constexpr (D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i2, slice));
+      sorter.sort(Kokkos::subview(i2_prev, slice));
+      sorter.sort(Kokkos::subview(dx2, slice));
+      sorter.sort(Kokkos::subview(dx2_prev, slice));
+    }
+    if constexpr (D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i3, slice));
+      sorter.sort(Kokkos::subview(i3_prev, slice));
+      sorter.sort(Kokkos::subview(dx3, slice));
+      sorter.sort(Kokkos::subview(dx3_prev, slice));
+    }
+    sorter.sort(Kokkos::subview(ux1, slice));
+    sorter.sort(Kokkos::subview(ux2, slice));
+    sorter.sort(Kokkos::subview(ux3, slice));
+    sorter.sort(Kokkos::subview(weight, slice));
+    sorter.sort(Kokkos::subview(tag, slice));
+    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+      sorter.sort(Kokkos::subview(phi, slice));
+    }
+    for (auto pldr { 0u }; pldr < npld_r(); ++pldr) {
+      sorter.sort(Kokkos::subview(pld_r, slice, pldr));
+    }
+    for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
+      sorter.sort(Kokkos::subview(pld_i, slice, pldi));
+    }
+    // Apply the same permutation to `tile_indices` itself so it ends
+    // monotonically non-decreasing for the offsets pass, then build the
+    // tile offsets from it (the in-place BinSort path has no separate
+    // gather to hoist this ahead of).
+    sorter.sort(tile_indices);
+    compute_tile_offsets(tile_indices, total_tiles, npart_local);
+  #endif // TEAM_POLICY_USE_VENDOR_SORT
+
+    // Populate `m_tile_layout` size/shape. `tile_perm` is not used in the
+    // current design — the SoA arrays are physically permuted into tile
+    // order, so consumers iterate `[tile_offsets(t), tile_offsets(t+1))`
+    // directly without a separate permutation indirection.
+    m_tile_layout.ntiles_per_axis[0] = ntx[0];
+    m_tile_layout.ntiles_per_axis[1] = ntx[1];
+    m_tile_layout.ntiles_per_axis[2] = ntx[2];
+    m_tile_layout.ntiles_total       = total_tiles;
+    m_tile_layout.tile_size          = T;
+    m_tile_layout.tile_perm          = prtl_perm_t {};
+    m_is_sorted                      = true;
+
+    // Compact-on-sort: drop the dead tail now instead of waiting for the
+    // periodic RemoveDead. The sort parked dead particles in the sentinel
+    // bin at [npart_partitioned, npart()), and the alive set is exactly
+    // [0, npart_partitioned) — gathered into tile order above (vendor) or
+    // sorted in place (BinSort). Shrinking npart() here keeps it, and
+    // every subsequent sort's transient buffers, tracking the alive count
+    // instead of ratcheting up with dead slots between clearing intervals.
+    // (RemoveDead remains the compactor when spatial sorting is disabled.)
+    set_npart(m_tile_layout.npart_partitioned);
+
+    Kokkos::fence("SortSpatially: end of team_policy path");
+#else  // !TEAM_POLICY — legacy in-place BinSort by global cell index
     const auto nx2         = grid.n_active(in::x2);
     const auto nx3         = grid.n_active(in::x3);
     const auto total_cells = grid.num_active();
@@ -250,13 +485,181 @@ namespace ntt {
     for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
       sorter.sort(Kokkos::subview(pld_i, slice, pldi));
     }
+#endif // TEAM_POLICY
   }
 
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  namespace permute_helpers {
+
+    // Permute a 1D SoA member `arr` by `perm` in place, using a
+    // caller-owned reusable `scratch` buffer. Gathers the live prefix
+    // `arr[perm[0..n)]` into `scratch[0..n)`, then copies it back into
+    // `arr[0..n)`. Unlike the old swap-the-handle approach, `arr` keeps
+    // its original storage and full maxnpart capacity, so the large
+    // persistent member arrays never change address between sorts (the
+    // dominant source of allocator churn / fragmentation on ROCm), and
+    // `scratch` is shared across every member of the same type within one
+    // sort -- one transient allocation per type group instead of a fresh
+    // maxnpart buffer per member. `scratch` is sized to the live count
+    // `n` (not maxnpart), which also lowers the gather's peak transient.
+    // The tail `arr[n, capacity)` is left untouched (stale, never read:
+    // consumers iterate `[0, npart())`). Cost vs the swap: one extra
+    // copy-back pass (~2x HBM traffic), negligible when sorting is a
+    // small fraction of the step. The fences keep the shared scratch from
+    // being overwritten by the next member's gather before this member's
+    // copy-back has drained it.
+    template <typename V>
+    inline void permute_1d_into(V&                 arr,
+                                const V&           scratch,
+                                const prtl_perm_t& perm,
+                                npart_t            n) {
+      if (n == 0u) {
+        return;
+      }
+      auto perm_v = perm;
+      auto arr_v  = arr;
+      auto buf_v  = scratch;
+      Kokkos::parallel_for(
+        "Permute1D",
+        n,
+        KOKKOS_LAMBDA(const npart_t p) { buf_v(p) = arr_v(perm_v(p)); });
+      Kokkos::fence("permute_1d_into: gather");
+      Kokkos::deep_copy(
+        Kokkos::subview(arr, std::make_pair(static_cast<npart_t>(0), n)),
+        Kokkos::subview(scratch, std::make_pair(static_cast<npart_t>(0), n)));
+      Kokkos::fence("permute_1d_into: copy-back");
+    }
+
+    // 2D analogue for `pld_r` / `pld_i` (`scratch` is `(>= n, ncols)`).
+    template <typename V>
+    inline void permute_2d_into(V&                 arr,
+                                const V&           scratch,
+                                const prtl_perm_t& perm,
+                                npart_t            n,
+                                npart_t            ncols) {
+      if (n == 0u or ncols == 0u) {
+        return;
+      }
+      auto perm_v = perm;
+      auto arr_v  = arr;
+      auto buf_v  = scratch;
+      Kokkos::parallel_for(
+        "Permute2D",
+        CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
+        KOKKOS_LAMBDA(const npart_t p, const npart_t l) {
+          buf_v(p, l) = arr_v(perm_v(p), l);
+        });
+      Kokkos::fence("permute_2d_into: gather");
+      Kokkos::deep_copy(
+        Kokkos::subview(arr,
+                        std::make_pair(static_cast<npart_t>(0), n),
+                        Kokkos::ALL),
+        Kokkos::subview(scratch,
+                        std::make_pair(static_cast<npart_t>(0), n),
+                        Kokkos::ALL));
+      Kokkos::fence("permute_2d_into: copy-back");
+    }
+
+  } // namespace permute_helpers
+
+  template <Dimension D, Coord::type C>
+  void Particles<D, C>::apply_permutation_to_soa(const prtl_perm_t& perm,
+                                                 npart_t            n) {
+    if (n == 0u) {
+      return;
+    }
+
+    using permute_helpers::permute_1d_into;
+    using permute_helpers::permute_2d_into;
+
+    // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are intentionally
+    // NOT permuted. SortSpatially runs at the very end of the step loop
+    // (engine step_forward), and the first thing the next step's pusher
+    // does is overwrite prev := current (positionPush, sr.hpp / gr.hpp)
+    // for every active particle, before any consumer reads prev:
+    //   - current deposit: runs after the push, which has already
+    //     overwritten prev; species with pusher==NONE (whose prev would
+    //     stay un-permuted) are skipped by CurrentsDeposit entirely.
+    //   - pusher getParticlePrevPosition / piston: read prev only after
+    //     positionPush has rewritten it within the same call.
+    //   - checkpoint (prev is checkpoint-only, never in diagnostic
+    //     output): on restart the first push overwrites prev before it
+    //     is read, so restart results are unaffected; only the redundant
+    //     prev field saved to the checkpoint differs from the old code.
+    // Permuting prev would therefore reorder data that is overwritten
+    // before it is ever observed.
+    //
+    // Each block below allocates a single `n`-sized scratch buffer that
+    // is reused for every member of that type, then freed before the next
+    // block allocates its own. The whole gather thus makes one transient
+    // allocation per type group (int / prtldx_t / real_t / short / each
+    // payload) instead of one fresh maxnpart buffer per member, and peak
+    // transient is a single n-sized scratch at a time.
+    {
+      array_t<int*> scratch { "perm_scratch_int", n };
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(i1, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(i2, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_3D) {
+        permute_1d_into(i3, scratch, perm, n);
+      }
+    }
+    {
+      array_t<prtldx_t*> scratch { "perm_scratch_prtldx", n };
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(dx1, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
+        permute_1d_into(dx2, scratch, perm, n);
+      }
+      if constexpr (D == Dim::_3D) {
+        permute_1d_into(dx3, scratch, perm, n);
+      }
+    }
+    {
+      array_t<real_t*> scratch { "perm_scratch_real", n };
+      permute_1d_into(ux1, scratch, perm, n);
+      permute_1d_into(ux2, scratch, perm, n);
+      permute_1d_into(ux3, scratch, perm, n);
+      permute_1d_into(weight, scratch, perm, n);
+      if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+        permute_1d_into(phi, scratch, perm, n);
+      }
+    }
+    {
+      array_t<short*> scratch { "perm_scratch_tag", n };
+      permute_1d_into(tag, scratch, perm, n);
+    }
+    if (npld_r() > 0) {
+      const auto         ncols = static_cast<npart_t>(npld_r());
+      array_t<real_t**>  scratch { "perm_scratch_pld_r", n, ncols };
+      permute_2d_into(pld_r, scratch, perm, n, ncols);
+    }
+    if (npld_i() > 0) {
+      const auto         ncols = static_cast<npart_t>(npld_i());
+      array_t<npart_t**> scratch { "perm_scratch_pld_i", n, ncols };
+      permute_2d_into(pld_i, scratch, perm, n, ncols);
+    }
+  }
+#endif // TEAM_POLICY_USE_VENDOR_SORT
+
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  #define APPLY_PERM_INSTANTIATE(D, C)                                         \
+    template void Particles<D, C>::apply_permutation_to_soa(                   \
+      const prtl_perm_t&, npart_t);
+#else
+  #define APPLY_PERM_INSTANTIATE(D, C)
+#endif
+
 #define PARTICLES_SORT(D, C)                                                   \
   template auto Particles<D, C>::NpartsPerTagAndOffsets() const                \
     -> std::pair<std::vector<npart_t>, array_t<npart_t*>>;                     \
   template void Particles<D, C>::RemoveDead();                                 \
-  template void Particles<D, C>::SortSpatially(const Grid<D>&);
+  template void Particles<D, C>::SortSpatially(const Grid<D>&);                \
+  APPLY_PERM_INSTANTIATE(D, C)
 
   PARTICLES_SORT(Dim::_1D, Coord::Cartesian)
   PARTICLES_SORT(Dim::_2D, Coord::Cartesian)
@@ -266,5 +669,6 @@ namespace ntt {
   PARTICLES_SORT(Dim::_3D, Coord::Spherical)
   PARTICLES_SORT(Dim::_3D, Coord::Qspherical)
 #undef PARTICLES_SORT
+#undef APPLY_PERM_INSTANTIATE
 
 } // namespace ntt
diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp
index 791bf31a8..a2b08f8be 100644
--- a/src/framework/domain/metadomain_sort.cpp
+++ b/src/framework/domain/metadomain_sort.cpp
@@ -21,9 +21,7 @@ namespace ntt {
       const auto clearing_interval = species.clearing_interval();
       if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and
           (step > 0u)) {
-        for (auto& species : domain.species) {
-          species.RemoveDead();
-        }
+        species.RemoveDead();
       }
       const auto spatial_sorting_interval = species.spatial_sorting_interval();
       if ((spatial_sorting_interval > 0u) and
diff --git a/src/framework/parameters/algorithms.cpp b/src/framework/parameters/algorithms.cpp
index 4766db965..87c77fbd9 100644
--- a/src/framework/parameters/algorithms.cpp
+++ b/src/framework/parameters/algorithms.cpp
@@ -33,6 +33,11 @@ namespace ntt {
 
       deposit_enable = toml::find_or(toml_data, "algorithms", "deposit", "enable", true);
       deposit_order = static_cast<unsigned short>(SHAPE_ORDER);
+      deposit_team_policy_team_size = toml::find_or(toml_data,
+                                                    "algorithms",
+                                                    "deposit",
+                                                    "team_policy_team_size",
+                                                    defaults::team_policy_team_size);
 
       fieldsolver_enable = toml::find_or(toml_data,
                                          "algorithms",
@@ -140,6 +145,8 @@ namespace ntt {
 
       params->set("algorithms.deposit.enable", deposit_enable.value());
       params->set("algorithms.deposit.order", deposit_order.value());
+      params->set("algorithms.deposit.team_policy_team_size",
+                  deposit_team_policy_team_size.value());
 
       params->set("algorithms.fieldsolver.enable", fieldsolver_enable.value());
       for (const auto& [key, value] : fieldsolver_stencil_coeffs.value()) {
diff --git a/src/framework/parameters/algorithms.h b/src/framework/parameters/algorithms.h
index 97edb244a..c46f480fe 100644
--- a/src/framework/parameters/algorithms.h
+++ b/src/framework/parameters/algorithms.h
@@ -34,6 +34,7 @@ namespace ntt {
 
       std::optional<bool>           deposit_enable;
       std::optional<unsigned short> deposit_order;
+      std::optional<std::size_t>    deposit_team_policy_team_size;
 
       std::optional<bool>                          fieldsolver_enable;
       std::optional<std::map<std::string, real_t>> fieldsolver_stencil_coeffs;
diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h
index 314e78c0e..7212b554e 100644
--- a/src/global/arch/kokkos_aliases.h
+++ b/src/global/arch/kokkos_aliases.h
@@ -10,6 +10,7 @@
  *   - CreateRangePolicy, CreateRangePolicyOnHost
  *   - random_number_pool_t, random_generator_t
  *   - Random function
+ *   - prtl_perm_t, TileLayout<>
  * @cpp:
  *   - arch/kokkos_aliases.cpp
  * @namespaces:
@@ -225,6 +226,79 @@ namespace kokkos_aliases_hidden {
 template <Dimension D>
 using range_h_t = typename kokkos_aliases_hidden::range_h_impl<D>::type;
 
+// Array aliases of arbitrary type and dimensions (up to 4)
+namespace kokkos_aliases_hidden {
+  // c++ magic
+  template <unsigned short D, typename T>
+  struct scratch_nddata_impl {
+    using type = void;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<1, T> {
+    using type = Kokkos::View<T*,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<2, T> {
+    using type = Kokkos::View<T**,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<3, T> {
+    using type = Kokkos::View<T***,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <typename T>
+  struct scratch_nddata_impl<4, T> {
+    using type = Kokkos::View<T****,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+} // namespace kokkos_aliases_hidden
+
+template <unsigned short D, typename T>
+using scratch_nddata_t = typename kokkos_aliases_hidden::scratch_nddata_impl<D, T>::type;
+
+// Defining aliases for Scratch memory ndfield
+namespace kokkos_aliases_hidden {
+  template <Dimension D, unsigned short N, typename T>
+  struct scratch_ndfield_impl {
+    using type = void;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_1D, N, T> {
+    using type = Kokkos::View<T* [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_2D, N, T> {
+    using type = Kokkos::View<T** [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+
+  template <unsigned short N, typename T>
+  struct scratch_ndfield_impl<Dim::_3D, N, T> {
+    using type = Kokkos::View<T*** [N],
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+  };
+} // namespace kokkos_aliases_hidden
+
+template <Dimension D, unsigned short N, typename T>
+using scratch_ndfield_t =
+  typename kokkos_aliases_hidden::scratch_ndfield_impl<D, N, T>::type;
+
 /**
  * @brief Function template for generating 1D Kokkos range policy for particles.
  * @tparam D Dimension
@@ -258,6 +332,39 @@ template <Dimension D>
 auto CreateRangePolicyOnHost(const tuple_t<ncells_t, D>&,
                              const tuple_t<ncells_t, D>&) -> range_h_t<D>;
 
+// --------------------------- team_policy types ---------------------------- //
+// Particle permutation index: maps a sorted-position p in [0, npart) to a
+// pre-sort particle index. Produced by SortSpatially, consumed by tiled
+// pusher and deposit kernels to walk particles tile-by-tile without
+// physically re-permuting the SoA arrays in lock step every step.
+using prtl_perm_t = array_t<npart_t*>;
+
+// Tile layout metadata: the contract between Stream 1 (sort) and Streams
+// 2/3 (tiled deposit / pusher). Scalars are host-resident; the views are
+// device-resident.
+//   ntiles_per_axis  : number of tiles along each axis (1 for unused axes).
+//   ntiles_total     : product of ntiles_per_axis = league size for TeamPolicy.
+//   tile_size        : tile edge length in cells (compile-time CMake knob,
+//                      replicated here for runtime checks).
+//   npart_partitioned: number of (alive) particles partitioned at the last
+//                      sort, i.e. tile_offsets(ntiles_total). The tiles cover
+//                      exactly [0, npart_partitioned); particles appended past
+//                      it (injection / MPI receive on a no-sort step) are not
+//                      partitioned and must be deposited separately.
+//   tile_offsets     : prefix-sum of per-tile particle counts; size
+//                      ntiles_total + 1; tile t owns particles
+//                      [tile_offsets(t), tile_offsets(t+1)).
+//   tile_perm        : size npart, particle index sorted by tile.
+template <Dimension D>
+struct TileLayout {
+  ncells_t          ntiles_per_axis[3] { 1u, 1u, 1u };
+  ncells_t          ntiles_total { 0u };
+  unsigned short    tile_size { 0u };
+  npart_t           npart_partitioned { 0u };
+  array_t<npart_t*> tile_offsets;
+  prtl_perm_t       tile_perm;
+};
+
 // Random number pool/generator type alias
 // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias)
 using random_number_pool_t = math::Random_XorShift64_Pool<Kokkos::DefaultExecutionSpace>;
diff --git a/src/global/defaults.h b/src/global/defaults.h
index e1387677e..c4f9b67f4 100644
--- a/src/global/defaults.h
+++ b/src/global/defaults.h
@@ -22,6 +22,8 @@ namespace ntt::defaults {
 
   const unsigned short current_filters = 0;
 
+  const std::size_t team_policy_team_size = 0;
+
   const std::string em_pusher      = "Boris";
   const std::string ph_pusher      = "Photon";
   const timestep_t  clear_interval = 100;
diff --git a/src/global/global.cpp b/src/global/global.cpp
index ec22fd2f3..c3aa1bcdc 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -6,8 +6,51 @@
   #include <mpi.h>
 #endif // MPI_ENABLED
 
+#if defined(HIP_ENABLED)
+  #include <hip/hip_runtime.h>
+
+  #include <cstdint>
+#endif // HIP_ENABLED
+
+namespace {
+#if defined(HIP_ENABLED)
+  // Turn the ROCm stream-ordered allocator into a caching arena.
+  //
+  // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option
+  // IMPL_HIP_MALLOC_ASYNC). The default memory pool has a release
+  // threshold of 0, so every freed block is handed back to the driver
+  // at the next stream sync. With ~50 GB of particle SoA permanently
+  // pinned and only ~14 GB free, the per-step churn of dozens of
+  // large, differently-sized sort/comm scratch buffers fragments that
+  // free space: allocation cost grows monotonically (ParticleSort
+  // slowdown) until no contiguous mid-size block remains and BinSort's
+  // `sorted_values` allocation fails (OOM). Raising the release
+  // threshold to "unlimited" makes the pool retain and recycle freed
+  // blocks instead, which stabilizes the working set and removes both
+  // the slowdown and the OOM.
+  void ConfigureHipMemPool() {
+    int device = 0;
+    if (hipGetDevice(&device) != hipSuccess) {
+      return;
+    }
+    hipMemPool_t pool = nullptr;
+    if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or
+        pool == nullptr) {
+      return;
+    }
+    uint64_t threshold = UINT64_MAX;
+    (void)hipMemPoolSetAttribute(pool,
+                                 hipMemPoolAttrReleaseThreshold,
+                                 &threshold);
+  }
+#endif // HIP_ENABLED
+} // namespace
+
 void ntt::GlobalInitialize(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
+#if defined(HIP_ENABLED)
+  ConfigureHipMemPool();
+#endif // HIP_ENABLED
 #if defined(MPI_ENABLED)
   MPI_Init(&argc, &argv);
 #endif // MPI_ENABLED
diff --git a/src/global/utils/diag.cpp b/src/global/utils/diag.cpp
index 4cf763faf..604a872ae 100644
--- a/src/global/utils/diag.cpp
+++ b/src/global/utils/diag.cpp
@@ -17,6 +17,7 @@
   #include <iterator>
 #endif // MPI_ENABLED
 
+#include <cstddef>
 #include <iomanip>
 #include <numeric>
 #include <sstream>
@@ -26,8 +27,8 @@
 
 namespace diag {
   auto npart_stats(npart_t npart, npart_t maxnpart)
-    -> std::vector<std::pair<npart_t, unsigned short>> {
-    auto stats = std::vector<std::pair<npart_t, unsigned short>>();
+    -> std::vector<std::pair<std::size_t, unsigned short>> {
+    auto stats = std::vector<std::pair<std::size_t, unsigned short>>();
     const auto percentage = [](npart_t part, npart_t maxpart) -> unsigned short {
       return static_cast<unsigned short>(
         100.0f * static_cast<float>(part) / static_cast<float>(maxpart));
@@ -59,9 +60,9 @@ namespace diag {
     if (rank != MPI_ROOT_RANK) {
       return stats;
     }
-    const npart_t tot_npart = std::accumulate(mpi_npart.begin(),
-                                              mpi_npart.end(),
-                                              static_cast<npart_t>(0));
+    const std::size_t tot_npart = std::accumulate(mpi_npart.begin(),
+                                                  mpi_npart.end(),
+                                                  static_cast<std::size_t>(0));
     const npart_t max_idx   = std::distance(
       mpi_npart.begin(),
       std::max_element(mpi_npart.begin(), mpi_npart.end()));
diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp
index 77117c4b9..6113a89c7 100644
--- a/src/global/utils/reporter.cpp
+++ b/src/global/utils/reporter.cpp
@@ -250,6 +250,19 @@ namespace reporter {
 #else
     AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF");
 #endif
+
+#if defined(TEAM_POLICY)
+    AddParam(report, 4, "TEAM_POLICY", "%s", "ON");
+  #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                     \
+    (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+    (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))
+    AddParam(report, 4, "VENDOR_SORT", "%s", "ON");
+  #else
+    AddParam(report, 4, "VENDOR_SORT", "%s", "OFF (BinSort)");
+  #endif
+#else
+    AddParam(report, 4, "TEAM_POLICY", "%s", "OFF");
+#endif
     report += "\n";
     return report;
   }
diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
new file mode 100644
index 000000000..432b1c1b9
--- /dev/null
+++ b/src/global/utils/sort_dispatch.h
@@ -0,0 +1,317 @@
+/**
+ * @file utils/sort_dispatch.h
+ * @brief Backend-dispatched sort_by_key for team_policy SortSpatially.
+ * @implements
+ *   - sort_helpers::sort_by_key_dispatch -> void  (BinSort, OneDPL, Thrust, StdSort)
+ * @namespaces:
+ *   - ntt::sort_helpers::
+ * @macros:
+ *   - TEAM_POLICY
+ *   - SYCL_ENABLED, ONEDPL_ENABLED  (oneDPL overload)
+ *   - CUDA_ENABLED, THRUST_ENABLED  (Thrust overload)
+ *
+ * @note Each overload produces a permutation `perm` of size N such that
+ *       keys[perm[0]] <= keys[perm[1]] <= ... in stable order.
+ *       Always-available overloads: BinSort (uses Kokkos::BinSort) and
+ *       StdSort (host-side std::stable_sort fallback). The vendor-library
+ *       overloads (OneDPL on SYCL, cub radix sort on CUDA, rocprim radix
+ *       sort on HIP) are conditional on the respective build flags.
+ * @note The CUDA/HIP overloads bound the radix sort to the significant
+ *       key bits (`significant_bits(n_bins)`) instead of the full 32, so
+ *       only ceil(log2(n_bins)) bits are sorted — fewer radix passes than
+ *       a full-width `thrust::sort_by_key`. Scratch is transient (freed at
+ *       scope exit); no persistent buffer is retained (cf. 787aa045).
+ */
+
+#ifndef GLOBAL_UTILS_SORT_DISPATCH_H
+#define GLOBAL_UTILS_SORT_DISPATCH_H
+
+#if !defined(TEAM_POLICY)
+  #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined"
+#endif
+
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Sort.hpp>
+
+// Entity's Kokkos alias macros (arch/kokkos_aliases.h) define bare words such
+// as `Function`, `Inline`, `Lambda` and `ClassLambda`. These collide with
+// template-parameter and member names used inside the vendor sort headers
+// (rocPRIM, cub, oneDPL) and corrupt their parsing (e.g. rocPRIM's
+// `template<class Tuple, class Function, ...>`). Suspend the aliases across the
+// vendor includes only, then restore them for the rest of the translation unit.
+#pragma push_macro("Function")
+#pragma push_macro("Inline")
+#pragma push_macro("Lambda")
+#pragma push_macro("ClassLambda")
+#undef Function
+#undef Inline
+#undef Lambda
+#undef ClassLambda
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  #include <oneapi/dpl/algorithm>
+  #include <oneapi/dpl/execution>
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  #include <cub/device/device_radix_sort.cuh>
+#endif
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  #include <rocprim/rocprim.hpp>
+#endif
+
+#pragma pop_macro("ClassLambda")
+#pragma pop_macro("Lambda")
+#pragma pop_macro("Inline")
+#pragma pop_macro("Function")
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+
+namespace ntt::sort_helpers {
+
+  // Number of low-order bits needed to represent keys in [0, n_bins).
+  // The radix sort only needs to scan these bits — bounding `end_bit` to
+  // ceil(log2(n_bins)) instead of 32 cuts the number of passes (e.g. 18
+  // bits when total_tiles ~ 176K). Returns at least 1.
+  inline unsigned int significant_bits(ncells_t n_bins) {
+    unsigned int bits = 0u;
+    while (bits < 32u &&
+           (static_cast<std::uint64_t>(1u) << bits) <
+             static_cast<std::uint64_t>(n_bins)) {
+      ++bits;
+    }
+    return (bits == 0u) ? 1u : bits;
+  }
+
+  // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an
+  // upper bound on distinct key values.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t                  n_bins,
+                                   ::sort::backend::BinSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { keys, bin_op, false };
+    sorter.create_permute_vector();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort");
+    sorter.sort(perm);
+    Kokkos::fence("sort_by_key_dispatch BinSort: post-sort");
+  }
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::OneDPL) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto*  keys_ptr = keys.data();
+    auto*  perm_ptr = perm.data();
+    auto   exec     = Kokkos::DefaultExecutionSpace();
+    auto   perm_v   = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    // Drain Kokkos's queue so oneDPL's policy sees the iota'd perm even
+    // if oneDPL submits to a different SYCL queue internally.
+    exec.fence("sort_by_key_dispatch OneDPL: pre-sort");
+    auto queue  = exec.sycl_queue();
+    auto policy = oneapi::dpl::execution::make_device_policy(queue);
+    oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr);
+    exec.fence("sort_by_key_dispatch OneDPL: post-sort");
+  }
+#endif
+
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t                  n_bins,
+                                   ::sort::backend::Thrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto exec   = Kokkos::DefaultExecutionSpace();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+
+    // Radix sort bounded to the significant key bits. The _out buffers and
+    // temp storage are transient (freed at scope exit).
+    array_t<ncells_t*> keys_out("tile_keys_sorted", n);
+    prtl_perm_t        perm_out("tile_perm_sorted", n);
+    const int          end_bit = static_cast<int>(significant_bits(n_bins));
+
+    exec.fence("sort_by_key_dispatch Thrust: pre-sort");
+    auto stream = exec.cuda_stream();
+
+    // DoubleBuffer radix sort: cub ping-pongs between the supplied
+    // (current, alternate) buffer pairs, so `temp_bytes` holds only the
+    // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes).
+    // Nearly halves the sort's transient memory vs the plain out-of-place
+    // form, which matters at high npart where the N-sized temp can fail to
+    // allocate (device OOM at scale).
+    cub::DoubleBuffer<ncells_t> d_keys(keys.data(), keys_out.data());
+    cub::DoubleBuffer<npart_t>  d_perm(perm.data(), perm_out.data());
+
+    std::size_t temp_bytes = 0;
+    auto        err = cub::DeviceRadixSort::SortPairs(nullptr,
+                                              temp_bytes,
+                                              d_keys,
+                                              d_perm,
+                                              n,
+                                              0,
+                                              end_bit,
+                                              stream);
+    raise::ErrorIf(err != cudaSuccess,
+                   "cub::DeviceRadixSort::SortPairs (size query) failed",
+                   HERE);
+    array_t<char*> temp("cub_radix_temp",
+                        (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
+    err = cub::DeviceRadixSort::SortPairs(temp.data(),
+                                          temp_bytes,
+                                          d_keys,
+                                          d_perm,
+                                          n,
+                                          0,
+                                          end_bit,
+                                          stream);
+    raise::ErrorIf(err != cudaSuccess,
+                   "cub::DeviceRadixSort::SortPairs failed",
+                   HERE);
+    exec.fence("sort_by_key_dispatch Thrust: post-sort");
+
+    // Publish results from whichever buffer cub left as Current() (depends on
+    // the pass count): copy sorted keys back into `keys`' storage if they
+    // ended up in the alternate, and point `perm` at its current buffer.
+    if (d_keys.Current() != keys.data()) {
+      auto keys_nc = keys; // non-const handle aliasing the same storage
+      Kokkos::deep_copy(keys_nc, keys_out);
+    }
+    if (d_perm.Current() == perm_out.data()) {
+      perm = perm_out;
+    }
+  }
+#endif
+
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  // HIP analogue of the CUDA cub overload, using rocprim's radix sort
+  // (which ships with rocThrust). Same bounded-bit, out-of-place,
+  // transient-scratch scheme.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t                  n_bins,
+                                   ::sort::backend::Rocthrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto exec   = Kokkos::DefaultExecutionSpace();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+
+    array_t<ncells_t*>  keys_out("tile_keys_sorted", n);
+    prtl_perm_t         perm_out("tile_perm_sorted", n);
+    const unsigned int  end_bit = significant_bits(n_bins);
+
+    exec.fence("sort_by_key_dispatch Rocthrust: pre-sort");
+    auto stream = exec.hip_stream();
+
+    // double_buffer radix sort: rocprim ping-pongs between the supplied
+    // (current, alternate) buffer pairs, so `temp_storage` holds only the
+    // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes,
+    // the dominant `rocprim_radix_temp`). This nearly halves the sort's
+    // transient memory vs the plain out-of-place form, which matters at high
+    // npart where that N-sized temp can fail to allocate (device OOM at scale).
+    rocprim::double_buffer<ncells_t> d_keys(keys.data(), keys_out.data());
+    rocprim::double_buffer<npart_t>  d_perm(perm.data(), perm_out.data());
+
+    std::size_t temp_bytes = 0;
+    auto        err = rocprim::radix_sort_pairs(nullptr,
+                                         temp_bytes,
+                                         d_keys,
+                                         d_perm,
+                                         static_cast<std::size_t>(n),
+                                         0u,
+                                         end_bit,
+                                         stream);
+    raise::ErrorIf(err != hipSuccess,
+                   "rocprim::radix_sort_pairs (size query) failed",
+                   HERE);
+    array_t<char*> temp("rocprim_radix_temp",
+                        (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes);
+    err = rocprim::radix_sort_pairs(temp.data(),
+                                    temp_bytes,
+                                    d_keys,
+                                    d_perm,
+                                    static_cast<std::size_t>(n),
+                                    0u,
+                                    end_bit,
+                                    stream);
+    raise::ErrorIf(err != hipSuccess,
+                   "rocprim::radix_sort_pairs failed",
+                   HERE);
+    exec.fence("sort_by_key_dispatch Rocthrust: post-sort");
+
+    // Publish results from whichever buffer rocprim left as `current()`
+    // (depends on the pass count). If the sorted keys ended up in the
+    // alternate, copy them back into `keys`' storage so downstream
+    // (compute_tile_offsets) sees them; point `perm` at its current buffer.
+    if (d_keys.current() != keys.data()) {
+      auto keys_nc = keys; // non-const handle aliasing the same storage
+      Kokkos::deep_copy(keys_nc, keys_out);
+    }
+    if (d_perm.current() == perm_out.data()) {
+      perm = perm_out;
+    }
+  }
+#endif
+
+  // Host fallback: indirect-sort via std::stable_sort.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::StdSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                       keys);
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u });
+    std::stable_sort(perm_h.data(),
+                     perm_h.data() + n,
+                     [&](npart_t a, npart_t b) {
+                       return keys_h(a) < keys_h(b);
+                     });
+    Kokkos::deep_copy(perm, perm_h);
+  }
+
+} // namespace ntt::sort_helpers
+
+#endif // GLOBAL_UTILS_SORT_DISPATCH_H
diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h
index dbe774402..8442f5ddb 100644
--- a/src/global/utils/sorting.h
+++ b/src/global/utils/sorting.h
@@ -5,6 +5,7 @@
  *   - sort::BinBool<>
  *   - sort::BinTag<>
  *   - sort::PositionToTileIndex<>
+ *   - sort::backend tag types (compile-time tag dispatch for sort_by_key)
  * @namespaces:
  *   - sort::
  * @note BinBool sorts by boolean values "true" then "false"
@@ -62,9 +63,27 @@ namespace sort {
     const int m_max_bins;
   };
 
-  template <Dimension D, bool Count>
+  /**
+   * @brief Bin a particle into a tile of edge length `tile_size` cells.
+   * @tparam D     Dimension.
+   * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live
+   *               particle (used to populate per-tile counts in one pass).
+   * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead
+   *                 of `i_curr` alone. This guarantees that a particle whose
+   *                 Esirkepov stencil straddles a tile boundary (because it
+   *                 crossed the boundary during the pusher) lands in the
+   *                 lower-indexed of the two tiles. Combined with a halo of
+   *                 `O+1` cells in the deposit's per-tile scratch, this
+   *                 keeps every particle's stencil inside its assigned
+   *                 tile's interior+halo region. See plan §S2.4.
+   *
+   * Dead particles get the sentinel `total_tiles + 1u` so they sort to the
+   * end (or get skipped, depending on the consumer).
+   */
+  template <Dimension D, bool Count, bool UsePrev = false>
   struct PositionToTileIndex {
     const array_t<int*>   i1, i2, i3;
+    const array_t<int*>   i1_prev, i2_prev, i3_prev;
     const array_t<short*> tag;
     array_t<ncells_t*>    tile_indices;
     ncells_t              tile_size;
@@ -72,20 +91,37 @@ namespace sort {
 
     ncells_t ntx2 { 0u }, ntx3 { 0u };
     ncells_t total_tiles { 0u };
-
-    PositionToTileIndex(const array_t<int*>&         i1,
-                        const array_t<int*>&         i2,
-                        const array_t<int*>&         i3,
-                        const array_t<short*>&       tag,
-                        array_t<ncells_t*>&          tile_indices,
+    // Active-cell extents per axis. Used to clamp the bin key when
+    // UsePrev=true, since `i_prev` can be transiently negative after the
+    // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an
+    // MPI receive that hasn't translated frames. Without clamping, the
+    // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces
+    // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`,
+    // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)`
+    // faults on an unmapped page.
+    int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 };
+
+    PositionToTileIndex(const array_t<int*>&         i1_,
+                        const array_t<int*>&         i2_,
+                        const array_t<int*>&         i3_,
+                        const array_t<short*>&       tag_,
+                        array_t<ncells_t*>&          tile_indices_,
                         const std::vector<ncells_t>& ncells,
-                        ncells_t                     tile_size = 1u)
-      : i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , tag { tag }
-      , tile_indices { tile_indices }
-      , tile_size { tile_size }
+                        ncells_t                     tile_size_ = 1u,
+                        const array_t<npart_t*>& num_ppt_ = { "num_ppt", 0u },
+                        const array_t<int*>& i1_prev_ = {},
+                        const array_t<int*>& i2_prev_ = {},
+                        const array_t<int*>& i3_prev_ = {})
+      : i1 { i1_ }
+      , i2 { i2_ }
+      , i3 { i3_ }
+      , i1_prev { i1_prev_ }
+      , i2_prev { i2_prev_ }
+      , i3_prev { i3_prev_ }
+      , tag { tag_ }
+      , tile_indices { tile_indices_ }
+      , tile_size { tile_size_ }
+      , num_ppt { num_ppt_ }
       , ntx2 { 1u }
       , ntx3 { 1u }
       , total_tiles { 1u } {
@@ -93,49 +129,122 @@ namespace sort {
                      "ncells size must match D",
                      HERE);
       if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells1       = static_cast<int>(ncells[0]);
         npart_t ntx1  = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[0]) / static_cast<double>(tile_size)));
         total_tiles  *= ntx1;
       }
       if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells2      = static_cast<int>(ncells[1]);
         ntx2         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[1]) / static_cast<double>(tile_size)));
         total_tiles *= ntx2;
       }
       if constexpr (D == Dim::_3D) {
+        ncells3      = static_cast<int>(ncells[2]);
         ntx3         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[2]) / static_cast<double>(tile_size)));
         total_tiles *= ntx3;
       }
       if constexpr (Count) {
-        num_ppt = array_t<npart_t*> { "num_ppt", total_tiles };
+        raise::ErrorIf(num_ppt.extent(0) != total_tiles,
+                       "num_ppt must have extent equal to total tiles",
+                       HERE);
+      }
+      if constexpr (UsePrev) {
+        raise::ErrorIf(
+          i1_prev.extent(0) == 0u,
+          "PositionToTileIndex<UsePrev=true> requires i1_prev to be set",
+          HERE);
+        if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+          raise::ErrorIf(
+            i2_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i2_prev to be set",
+            HERE);
+        }
+        if constexpr (D == Dim::_3D) {
+          raise::ErrorIf(
+            i3_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i3_prev to be set",
+            HERE);
+        }
       }
     }
 
-    Inline auto operator()(prtldx_t p) const {
+    Inline auto operator()(prtlidx_t p) const {
       if (tag(p) != ntt::ParticleTag::alive) {
         tile_indices(p) = total_tiles + 1u;
       } else {
+        // bin key per-axis: use min(i, i_prev) when UsePrev so that a
+        // particle straddling a boundary lands in the lower tile.
+        // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative
+        // (after the pusher's periodic-wrap path: `i_prev -= ni`) or
+        // out-of-range (after MPI receive without frame translation).
+        // Without the clamp, signed-to-unsigned promotion in
+        // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far
+        // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)`
+        // faults on an unmapped page.
+        const auto clamp_axis = [](int v, int ncells) -> int {
+          return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v);
+        };
+        const auto key1 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i1(p) < i1_prev(p)) ? i1(p) : i1_prev(p);
+            return clamp_axis(raw, ncells1);
+          } else {
+            return i1(p);
+          }
+        }();
+        const auto key2 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p);
+            return clamp_axis(raw, ncells2);
+          } else {
+            return i2(p);
+          }
+        }();
+        const auto key3 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p);
+            return clamp_axis(raw, ncells3);
+          } else {
+            return i3(p);
+          }
+        }();
         if constexpr (D == Dim::_1D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size);
         } else if constexpr (D == Dim::_2D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                            static_cast<ncells_t>(i2(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                            static_cast<ncells_t>(key2 / tile_size);
         } else if constexpr (D == Dim::_3D) {
-          tile_indices(p) = (static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                             static_cast<ncells_t>(i2(p) / tile_size)) *
+          tile_indices(p) = (static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                             static_cast<ncells_t>(key2 / tile_size)) *
                               ntx3 +
-                            static_cast<ncells_t>(i3(p) / tile_size);
+                            static_cast<ncells_t>(key3 / tile_size);
         } else {
           raise::KernelError(HERE, "Wrong D in SortSpatially");
         }
         if constexpr (Count) {
-          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u);
+          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1);
         }
       }
     }
   };
 
+  // -------------------- Backend dispatch for sort_by_key ------------------- //
+  // Compile-time tags for tag-dispatch into backend-specific
+  // sort_by_key implementations. Selection is fully compile-time: the
+  // backend that resolves depends on the active Kokkos device and the
+  // availability of the corresponding vendor library.
+  namespace backend {
+    struct OneDPL {};
+    struct Thrust {};
+    struct Rocthrust {};
+    struct StdSort {};
+    // Always-available legacy fallback using Kokkos::BinSort.
+    struct BinSort {};
+  } // namespace backend
+
 } // namespace sort
 
 #endif // GLOBAL_UTILS_SORTING_H
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 268187af2..98fa0b661 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -1,10 +1,24 @@
 /**
  * @file kernels/currents_deposit.hpp
- * @brief Covariant algorithms for the current deposition
+ * @brief Covariant algorithms for the current deposition.
+ *
+ * Two kernels share the same per-particle body
+ * (`kernel::DepositOneParticle`):
+ *   - `kernel::DepositCurrents_kernel<S, M, O>` flat (RangePolicy over particles,
+ *     writes into a `Kokkos::Experimental::ScatterView`). Always available.
+ *   - `kernel::DepositCurrents_kernel_tiled<S, M, O, T_TILE>` team-policy
+ *     (one team per spatial tile, accumulates into team SLM scratch with
+ *     atomic adds, then flushes to global J). Available when `team_policy=ON`
+ *     (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan.
+ *
  * @implements
+ *   - kernel::deposit::PrtlPack<>
+ *   - kernel::DepositOneParticle<>
  *   - kernel::DepositCurrents_kernel<>
+ *   - kernel::DepositCurrents_kernel_tiled<>   (TEAM_POLICY only)
  * @namespaces:
  *   - kernel::
+ *   - kernel::deposit::
  */
 
 #ifndef KERNELS_CURRENTS_DEPOSIT_HPP
@@ -18,9 +32,11 @@
 #include "utils/error.h"
 #include "utils/numeric.h"
 
-#include "particle_shapes.hpp"
+#include "framework/containers/particles.h"
+#include "kernels/particle_shapes.hpp"
 
 #include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
 
 #define i_di_to_Xi(I, DI) (static_cast<real_t>((I)) + static_cast<real_t>((DI)))
 
@@ -28,738 +44,1201 @@ namespace kernel {
   using namespace ntt;
 
   /**
-   * @brief Algorithm for the current deposition
+   * @brief Per-particle deposit body, shared between the flat and tiled
+   *        kernels.
+   *
+   * The caller supplies a `deposit_at(idx..., comp, val)` callback that
+   * applies the contribution `val` to the J component `comp` at the
+   * **global** J cell index `idx...` (already includes the `N_GHOSTS`
+   * offset). The flat kernel's callback simply does
+   * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled
+   * kernel's callback translates `idx...` into per-tile scratch
+   * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this
+   * function is identical numerically and contains the only deposit math
+   * in the codebase.
+   *
+   * Dead particles return early. The callback is invoked once per cell
+   * write, with the dimension-appropriate signature:
+   *   - 1D: `deposit_at(int g_i1, int comp, real_t val)`
+   *   - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)`
+   *   - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)`
    */
-  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
-  class DepositCurrents_kernel {
+  template <SimEngine::type S, MetricClass M, unsigned short O, typename DepositFn>
+  Inline void DepositOneParticle(prtlidx_t             p,
+                                 const ParticleArrays& prtls,
+                                 const M&              metric,
+                                 real_t                charge,
+                                 real_t                inv_dt,
+                                 DepositFn             deposit_at) {
     static_assert(O <= 11u, "Shape function order O must be <= 11");
-    static constexpr auto D = M::Dim;
+    constexpr auto D = M::Dim;
 
-    scatter_ndfield_t<D, 3>  J;
-    const array_t<int*>      i1, i2, i3;
-    const array_t<int*>      i1_prev, i2_prev, i3_prev;
-    const array_t<prtldx_t*> dx1, dx2, dx3;
-    const array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
-    const array_t<real_t*>   ux1, ux2, ux3;
-    const array_t<real_t*>   phi;
-    const array_t<real_t*>   weight;
-    const array_t<short*>    tag;
-    const M                  metric;
-    const real_t             charge, inv_dt;
-
-  public:
-    /**
-     * @brief explicit constructor.
-     */
-    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
-                           const array_t<int*>&           i1,
-                           const array_t<int*>&           i2,
-                           const array_t<int*>&           i3,
-                           const array_t<int*>&           i1_prev,
-                           const array_t<int*>&           i2_prev,
-                           const array_t<int*>&           i3_prev,
-                           const array_t<prtldx_t*>&      dx1,
-                           const array_t<prtldx_t*>&      dx2,
-                           const array_t<prtldx_t*>&      dx3,
-                           const array_t<prtldx_t*>&      dx1_prev,
-                           const array_t<prtldx_t*>&      dx2_prev,
-                           const array_t<prtldx_t*>&      dx3_prev,
-                           const array_t<real_t*>&        ux1,
-                           const array_t<real_t*>&        ux2,
-                           const array_t<real_t*>&        ux3,
-                           const array_t<real_t*>&        phi,
-                           const array_t<real_t*>&        weight,
-                           const array_t<short*>&         tag,
-                           const M&                       metric,
-                           real_t                         charge,
-                           const real_t                   dt)
-      : J { scatter_cur }
-      , i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , i1_prev { i1_prev }
-      , i2_prev { i2_prev }
-      , i3_prev { i3_prev }
-      , dx1 { dx1 }
-      , dx2 { dx2 }
-      , dx3 { dx3 }
-      , dx1_prev { dx1_prev }
-      , dx2_prev { dx2_prev }
-      , dx3_prev { dx3_prev }
-      , ux1 { ux1 }
-      , ux2 { ux2 }
-      , ux3 { ux3 }
-      , phi { phi }
-      , weight { weight }
-      , tag { tag }
-      , metric { metric }
-      , charge { charge }
-      , inv_dt { ONE / dt } {
-      raise::ErrorIf(
-        (O == 2u and N_GHOSTS < 2),
-        "Order of interpolation is 2, but number of ghost cells is < 2",
-        HERE);
+    if (prtls.tag(p) == ParticleTag::dead) {
+      return;
     }
 
-    /**
-     * @brief Iteration of the loop over particles.
-     * @param p index.
-     */
-    Inline auto operator()(prtlidx_t p) const -> void {
-      if (tag(p) == ParticleTag::dead) {
-        return;
-      }
-      // recover particle velocity to deposit in unsimulated direction
-      vec_t<Dim::_3D> vp { ZERO };
-      {
-        coord_t<M::PrtlDim> xp { ZERO };
-        if constexpr (D == Dim::_1D) {
-          xp[0] = i_di_to_Xi(i1(p), dx1(p));
-        } else if constexpr (D == Dim::_2D) {
-          if constexpr (M::PrtlDim == Dim::_3D) {
-            xp[0] = i_di_to_Xi(i1(p), dx1(p));
-            xp[1] = i_di_to_Xi(i2(p), dx2(p));
-            xp[2] = phi(p);
-          } else {
-            xp[0] = i_di_to_Xi(i1(p), dx1(p));
-            xp[1] = i_di_to_Xi(i2(p), dx2(p));
-          }
-        } else {
-          xp[0] = i_di_to_Xi(i1(p), dx1(p));
-          xp[1] = i_di_to_Xi(i2(p), dx2(p));
-          xp[2] = i_di_to_Xi(i3(p), dx3(p));
-        }
-        auto inv_energy { ZERO };
-        if constexpr (S == SimEngine::SRPIC) {
-          metric.template transform_xyz<Idx::XYZ, Idx::U>(xp,
-                                                          { ux1(p), ux2(p), ux3(p) },
-                                                          vp);
-          inv_energy = ONE / math::sqrt(ONE + NORM_SQR(ux1(p), ux2(p), ux3(p)));
+    // recover particle velocity to deposit in unsimulated direction
+    [[maybe_unused]]
+    vec_t<Dim::_3D> vp { ZERO };
+    // `vp` only feeds the unsimulated-direction current in the 1D
+    // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes
+    // from the Esirkepov/zigzag charge motion and `vp` is never read,
+    // so the metric transform + 1/sqrt + NaN/Inf guard below is pure
+    // dead work there — skip it (also frees xp/inv_energy registers).
+    if constexpr (D != Dim::_3D) {
+      coord_t<M::PrtlDim> xp { ZERO };
+      if constexpr (D == Dim::_1D) {
+        xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+      } else if constexpr (D == Dim::_2D) {
+        if constexpr (M::PrtlDim == Dim::_3D) {
+          xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+          xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
+          xp[2] = prtls.phi(p);
         } else {
-          coord_t<Dim::_2D> xp_ { ZERO };
-          xp_[0] = xp[0];
-          real_t     theta_Cd { xp[1] };
-          const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>(
-            theta_Cd) };
-          const auto small_angle { static_cast<real_t>(constant::SMALL_ANGLE_GR) };
-          const auto large_angle { static_cast<real_t>(
-            constant::PI - constant::SMALL_ANGLE_GR) };
-          if (theta_Ph < small_angle) {
-            theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle);
-          } else if (theta_Ph >= large_angle) {
-            theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle);
-          }
-          xp_[1] = theta_Cd;
-          metric.template transform<Idx::D, Idx::U>(xp_,
-                                                    { ux1(p), ux2(p), ux3(p) },
-                                                    vp);
-          inv_energy = metric.alpha(xp_) /
-                       math::sqrt(ONE + ux1(p) * vp[0] + ux2(p) * vp[1] +
-                                  ux3(p) * vp[2]);
+          xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+          xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
         }
-        if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) {
-          vp[2] = ZERO;
+      } else {
+        xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p));
+        xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p));
+        xp[2] = i_di_to_Xi(prtls.i3(p), prtls.dx3(p));
+      }
+      auto inv_energy { ZERO };
+      if constexpr (S == SimEngine::SRPIC) {
+        metric.template transform_xyz<Idx::XYZ, Idx::U>(
+          xp,
+          { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) },
+          vp);
+        inv_energy = ONE / U2GAMMA(prtls.ux1(p), prtls.ux2(p), prtls.ux3(p));
+      } else {
+        coord_t<Dim::_2D> xp_ { ZERO };
+        xp_[0] = xp[0];
+        real_t     theta_Cd { xp[1] };
+        const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>(
+          theta_Cd) };
+        const auto small_angle { static_cast<real_t>(constant::SMALL_ANGLE_GR) };
+        const auto large_angle { static_cast<real_t>(
+          constant::PI - constant::SMALL_ANGLE_GR) };
+        if (theta_Ph < small_angle) {
+          theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle);
+        } else if (theta_Ph >= large_angle) {
+          theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle);
         }
-        vp[0] *= inv_energy;
-        vp[1] *= inv_energy;
-        vp[2] *= inv_energy;
+        xp_[1] = theta_Cd;
+        metric.template transform<Idx::D, Idx::U>(
+          xp_,
+          { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) },
+          vp);
+        inv_energy = metric.alpha(xp_) /
+                     math::sqrt(ONE + prtls.ux1(p) * vp[0] +
+                                prtls.ux2(p) * vp[1] + prtls.ux3(p) * vp[2]);
       }
+      if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) {
+        vp[2] = ZERO;
+      }
+      vp[0] *= inv_energy;
+      vp[1] *= inv_energy;
+      vp[2] *= inv_energy;
+    }
 
-      const real_t coeff { weight(p) * charge };
-
-      // ToDo: interpolation_order as parameter
-      if constexpr (O == 0u) {
-        /*
-          Zig-zag deposit
-        */
-        const auto dxp_r_1 { static_cast<prtldx_t>(i1(p) == i1_prev(p)) *
-                             (dx1(p) + dx1_prev(p)) *
+    const real_t coeff { prtls.weight(p) * charge };
+
+    if constexpr (O == 0u) {
+      /*
+        Zig-zag deposit
+      */
+      const auto dxp_r_1 { static_cast<prtldx_t>(prtls.i1(p) == prtls.i1_prev(p)) *
+                           (prtls.dx1(p) + prtls.dx1_prev(p)) *
+                           static_cast<prtldx_t>(INV_2) };
+
+      const real_t Wx1_1 { INV_2 *
+                           (dxp_r_1 + prtls.dx1_prev(p) +
+                            static_cast<real_t>(prtls.i1(p) > prtls.i1_prev(p))) };
+      const real_t Wx1_2 { INV_2 *
+                           (prtls.dx1(p) + dxp_r_1 +
+                            static_cast<real_t>(
+                              static_cast<int>(prtls.i1(p) > prtls.i1_prev(p)) +
+                              prtls.i1_prev(p) - prtls.i1(p))) };
+      const real_t Fx1_1 { (static_cast<real_t>(prtls.i1(p) > prtls.i1_prev(p)) +
+                            dxp_r_1 - prtls.dx1_prev(p)) *
+                           coeff * inv_dt };
+      const real_t Fx1_2 { (static_cast<real_t>(
+                              prtls.i1(p) - prtls.i1_prev(p) -
+                              static_cast<int>(prtls.i1(p) > prtls.i1_prev(p))) +
+                            prtls.dx1(p) - dxp_r_1) *
+                           coeff * inv_dt };
+
+      if constexpr (D == Dim::_1D) {
+        const real_t Fx2_1 { HALF * vp[1] * coeff };
+        const real_t Fx2_2 { HALF * vp[1] * coeff };
+
+        const real_t Fx3_1 { HALF * vp[2] * coeff };
+        const real_t Fx3_2 { HALF * vp[2] * coeff };
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx1, Fx1_2);
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1));
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2));
+        deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2);
+
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1));
+        deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1);
+        deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2));
+        deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2);
+      } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
+        const auto dxp_r_2 { static_cast<prtldx_t>(prtls.i2(p) == prtls.i2_prev(p)) *
+                             (prtls.dx2(p) + prtls.dx2_prev(p)) *
                              static_cast<prtldx_t>(INV_2) };
 
-        const real_t Wx1_1 { INV_2 * (dxp_r_1 + dx1_prev(p) +
-                                      static_cast<real_t>(i1(p) > i1_prev(p))) };
-        const real_t Wx1_2 { INV_2 * (dx1(p) + dxp_r_1 +
-                                      static_cast<real_t>(
-                                        static_cast<int>(i1(p) > i1_prev(p)) +
-                                        i1_prev(p) - i1(p))) };
-        const real_t Fx1_1 { (static_cast<real_t>(i1(p) > i1_prev(p)) +
-                              dxp_r_1 - dx1_prev(p)) *
+        const real_t Wx2_1 { INV_2 * (dxp_r_2 + prtls.dx2_prev(p) +
+                                      static_cast<real_t>(prtls.i2(p) >
+                                                          prtls.i2_prev(p))) };
+        const real_t Wx2_2 { INV_2 *
+                             (prtls.dx2(p) + dxp_r_2 +
+                              static_cast<real_t>(
+                                static_cast<int>(prtls.i2(p) > prtls.i2_prev(p)) +
+                                prtls.i2_prev(p) - prtls.i2(p))) };
+        const real_t Fx2_1 { (static_cast<real_t>(prtls.i2(p) > prtls.i2_prev(p)) +
+                              dxp_r_2 - prtls.dx2_prev(p)) *
                              coeff * inv_dt };
-        const real_t Fx1_2 { (static_cast<real_t>(
-                                i1(p) - i1_prev(p) -
-                                static_cast<int>(i1(p) > i1_prev(p))) +
-                              dx1(p) - dxp_r_1) *
-                             coeff * inv_dt };
-
-        auto J_acc = J.access();
-
-        if constexpr (D == Dim::_1D) {
-          const real_t Fx2_1 { HALF * vp[1] * coeff };
-          const real_t Fx2_2 { HALF * vp[1] * coeff };
-
+        const real_t Fx2_2 {
+          (static_cast<real_t>(prtls.i2(p) - prtls.i2_prev(p) -
+                               static_cast<int>(prtls.i2(p) > prtls.i2_prev(p))) +
+           prtls.dx2(p) - dxp_r_2) *
+          coeff * inv_dt
+        };
+
+        if constexpr (D == Dim::_2D) {
           const real_t Fx3_1 { HALF * vp[2] * coeff };
           const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx1)      += Fx1_2;
-
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx2)     += Fx2_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx2)          += Fx2_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx2)      += Fx2_2 * Wx1_2;
-
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx3)     += Fx3_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx3)          += Fx3_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx3)      += Fx3_2 * Wx1_2;
-        } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
-          const auto dxp_r_2 { static_cast<prtldx_t>(i2(p) == i2_prev(p)) *
-                               (dx2(p) + dx2_prev(p)) *
-                               static_cast<prtldx_t>(INV_2) };
-
-          const real_t Wx2_1 { INV_2 * (dxp_r_2 + dx2_prev(p) +
-                                        static_cast<real_t>(i2(p) > i2_prev(p))) };
-          const real_t Wx2_2 { INV_2 * (dx2(p) + dxp_r_2 +
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * Wx2_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * Wx2_2);
+        } else {
+          const auto dxp_r_3 {
+            static_cast<prtldx_t>(prtls.i3(p) == prtls.i3_prev(p)) *
+            (prtls.dx3(p) + prtls.dx3_prev(p)) * static_cast<prtldx_t>(INV_2)
+          };
+          const real_t Wx3_1 { INV_2 * (dxp_r_3 + prtls.dx3_prev(p) +
                                         static_cast<real_t>(
-                                          static_cast<int>(i2(p) > i2_prev(p)) +
-                                          i2_prev(p) - i2(p))) };
-          const real_t Fx2_1 { (static_cast<real_t>(i2(p) > i2_prev(p)) +
-                                dxp_r_2 - dx2_prev(p)) *
+                                          prtls.i3(p) > prtls.i3_prev(p))) };
+          const real_t Wx3_2 {
+            INV_2 *
+            (prtls.dx3(p) + dxp_r_3 +
+             static_cast<real_t>(static_cast<int>(prtls.i3(p) > prtls.i3_prev(p)) +
+                                 prtls.i3_prev(p) - prtls.i3(p)))
+          };
+          const real_t Fx3_1 { (static_cast<real_t>(prtls.i3(p) > prtls.i3_prev(p)) +
+                                dxp_r_3 - prtls.dx3_prev(p)) *
                                coeff * inv_dt };
-          const real_t Fx2_2 { (static_cast<real_t>(
-                                  i2(p) - i2_prev(p) -
-                                  static_cast<int>(i2(p) > i2_prev(p))) +
-                                dx2(p) - dxp_r_2) *
-                               coeff * inv_dt };
-
-          if constexpr (D == Dim::_2D) {
-            const real_t Fx3_1 { HALF * vp[2] * coeff };
-            const real_t Fx3_2 { HALF * vp[2] * coeff };
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 *
-                                                                   (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 *
-                                                                   (ONE - Wx1_2);
-            J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
-          } else {
-            const auto   dxp_r_3 { static_cast<prtldx_t>(i3(p) == i3_prev(p)) *
-                                 (dx3(p) + dx3_prev(p)) *
-                                 static_cast<prtldx_t>(INV_2) };
-            const real_t Wx3_1 { INV_2 * (dxp_r_3 + dx3_prev(p) +
-                                          static_cast<real_t>(i3(p) > i3_prev(p))) };
-            const real_t Wx3_2 { INV_2 * (dx3(p) + dxp_r_3 +
-                                          static_cast<real_t>(
-                                            static_cast<int>(i3(p) > i3_prev(p)) +
-                                            i3_prev(p) - i3(p))) };
-            const real_t Fx3_1 { (static_cast<real_t>(i3(p) > i3_prev(p)) +
-                                  dxp_r_3 - dx3_prev(p)) *
-                                 coeff * inv_dt };
-            const real_t Fx3_2 { (static_cast<real_t>(
-                                    i3(p) - i3_prev(p) -
-                                    static_cast<int>(i3(p) > i3_prev(p))) +
-                                  dx3(p) - dxp_r_3) *
-                                 coeff * inv_dt };
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * Wx2_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
+          const real_t Fx3_2 {
+            (static_cast<real_t>(prtls.i3(p) - prtls.i3_prev(p) -
+                                 static_cast<int>(prtls.i3(p) > prtls.i3_prev(p))) +
+             prtls.dx3(p) - dxp_r_3) *
+            coeff * inv_dt
+          };
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1 * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * (ONE - Wx2_1) * Wx3_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_1 * Wx2_1 * Wx3_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2 * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * (ONE - Wx2_2) * Wx3_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx1,
+                     Fx1_2 * Wx2_2 * Wx3_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1 * (ONE - Wx3_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_1 * (ONE - Wx1_1) * Wx3_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_1 * Wx1_1 * Wx3_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2 * (ONE - Wx3_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_2 * (ONE - Wx1_2) * Wx3_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS + 1,
+                     cur::jx2,
+                     Fx2_2 * Wx1_2 * Wx3_2);
+
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+          deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1,
+                     prtls.i2_prev(p) + N_GHOSTS + 1,
+                     prtls.i3_prev(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_1 * Wx1_1 * Wx2_1);
+
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+          deposit_at(prtls.i1(p) + N_GHOSTS,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+          deposit_at(prtls.i1(p) + N_GHOSTS + 1,
+                     prtls.i2(p) + N_GHOSTS + 1,
+                     prtls.i3(p) + N_GHOSTS,
+                     cur::jx3,
+                     Fx3_2 * Wx1_2 * Wx2_2);
+        }
+      }
+    } else if constexpr ((O >= 1u) and (O <= 11u)) {
+
+      // shape function in dim1 -> always required
+      real_t iS_x1[O + 2], fS_x1[O + 2];
+      // indices of the shape function
+      int    i1_min, i1_max;
+
+      // call shape function
+      prtl_shape::for_deposit<O>(prtls.i1_prev(p),
+                                 static_cast<real_t>(prtls.dx1_prev(p)),
+                                 prtls.i1(p),
+                                 static_cast<real_t>(prtls.dx1(p)),
+                                 i1_min,
+                                 i1_max,
+                                 iS_x1,
+                                 fS_x1);
+
+      if constexpr (D == Dim::_1D) {
+        // (1D): fused Esirkepov, no [O+2] temporaries.
+        //   jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i'])
+        //          = -Qdx1dt * P1[i]                  (Eq. 38, 1D)
+        //   Wx23[i] = HALF * (fS_x1[i] + iS_x1[i])     (computed inline)
+        const real_t Qdx1dt = coeff * inv_dt;
+        const real_t QVx2   = coeff * vp[1];
+        const real_t QVx3   = coeff * vp[2];
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+
+        // Current update — fused over the union line so the J cell
+        // stays L1-resident across the 3 component atomic_adds.
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                += fS_x1[i] - iS_x1[i];
+          const int    gi    = i1_min + i;
+          const real_t Wx23  = HALF * (fS_x1[i] + iS_x1[i]);
+          if (i < di_x1) {
+            deposit_at(gi, cur::jx1, -Qdx1dt * P1);
           }
+          deposit_at(gi, cur::jx2, QVx2 * Wx23);
+          deposit_at(gi, cur::jx3, QVx3 * Wx23);
         }
-      } else if constexpr ((O >= 1u) and (O <= 11u)) {
+
+      } else if constexpr (D == Dim::_2D) {
 
         // shape function in dim1 -> always required
-        real_t iS_x1[O + 2], fS_x1[O + 2];
+        real_t iS_x2[O + 2], fS_x2[O + 2];
         // indices of the shape function
-        int    i1_min, i1_max;
+        int    i2_min, i2_max;
 
         // call shape function
-        prtl_shape::for_deposit<O>(i1_prev(p),
-                                   static_cast<real_t>(dx1_prev(p)),
-                                   i1(p),
-                                   static_cast<real_t>(dx1(p)),
-                                   i1_min,
-                                   i1_max,
-                                   iS_x1,
-                                   fS_x1);
-
-        if constexpr (D == Dim::_1D) {
-          // define weight vectors
-          real_t Wx1[O + 2];
-          real_t Wx23[O + 2];
-
-          // Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            // Esirkepov 2001, Eq. 38 for 1D case
-            Wx1[i]  = fS_x1[i] - iS_x1[i];
-            Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]);
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2];
-
-          // prefactors for j update
-          const real_t Qdx1dt = coeff * inv_dt;
-          const real_t QVx2   = coeff * vp[1];
-          const real_t QVx3   = coeff * vp[2];
-
-          // Calculate current contribution
-          jx1[0] = -Qdx1dt * Wx1[0];
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-            jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i];
-          }
-
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx1) += jx1[i];
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i];
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i];
-          }
-
-        } else if constexpr (D == Dim::_2D) {
-
-          // shape function in dim1 -> always required
-          real_t iS_x2[O + 2], fS_x2[O + 2];
-          // indices of the shape function
-          int    i2_min, i2_max;
-
-          // call shape function
-          prtl_shape::for_deposit<O>(i2_prev(p),
-                                     static_cast<real_t>(dx2_prev(p)),
-                                     i2(p),
-                                     static_cast<real_t>(dx2(p)),
-                                     i2_min,
-                                     i2_max,
-                                     iS_x2,
-                                     fS_x2);
-
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              // Esirkepov 2001, Eq. 38 (simplified)
-              Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]);
-
-              Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]);
-
-              Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) +
-                                   iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i]));
-            }
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2];
-
-          // prefactors for j update
-          const real_t Qdx1dt = coeff * inv_dt;
-          const real_t Qdx2dt = coeff * inv_dt;
-          const real_t QVx3   = coeff * vp[2];
-
-          // Calculate current contribution
-
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-            jx1[0][j] = -Qdx1dt * Wx1[0][j];
-          }
-
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j];
-            }
-          }
-
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            jx2[i][0] = -Qdx2dt * Wx2[i][0];
-          }
-
-#pragma unroll
-          for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-            for (int i = 0; i < O + 2; ++i) {
-              jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j];
-            }
-          }
-
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i2_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-          i2_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-          const int di_x2 = i2_max - i2_min;
-
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j];
+        prtl_shape::for_deposit<O>(prtls.i2_prev(p),
+                                   static_cast<real_t>(prtls.dx2_prev(p)),
+                                   prtls.i2(p),
+                                   static_cast<real_t>(prtls.dx2(p)),
+                                   i2_min,
+                                   i2_max,
+                                   iS_x2,
+                                   fS_x2);
+
+        /**
+         * (2D): fused Esirkepov, no [O+2]^2 temporaries.
+         *
+         * Esirkepov 2001 Eq. 38 (simplified) is separable: with
+         * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and
+         * P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']),
+         *   jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j])
+         *   jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i])
+         *   Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i])
+         *                     + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) )
+         * with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the
+         * old explicit Wx/jx tensors up to FP reassociation;
+         * charge-conserving by construction. Prefix sums carried as
+         * running scalars, so the only per-thread state is the
+         * existing 1D shape arrays.
+         */
+        const real_t QVx3 = coeff * vp[2];
+        // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt)
+        const real_t cf   = -(coeff * inv_dt) * HALF;
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i2_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+        i2_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+        const int di_x2 = i2_max - i2_min;
+
+        // Current update — fused over the union plane so the J cell
+        // line stays L1-resident across the 3 component atomic_adds.
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                += fS_x1[i] - iS_x1[i];
+          const int    gi    = i1_min + i;
+          const real_t iSx1  = iS_x1[i];
+          const real_t fSx1  = fS_x1[i];
+          const real_t A1    = fSx1 + iSx1; // jx2 cross-factor
+          real_t       P2    = ZERO;
+          for (int j = 0; j <= di_x2; ++j) {
+            P2                += fS_x2[j] - iS_x2[j];
+            const int    gj    = i2_min + j;
+            const real_t iSx2  = iS_x2[j];
+            const real_t fSx2  = fS_x2[j];
+            if (i < di_x1) {
+              deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2));
             }
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j];
+            if (j < di_x2) {
+              deposit_at(gi, gj, cur::jx2, cf * P2 * A1);
             }
+            const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) +
+                                        iSx2 * (HALF * fSx1 + iSx1));
+            deposit_at(gi, gj, cur::jx3, QVx3 * Wx3);
           }
+        }
 
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j];
-            }
-          }
+      } else if constexpr (D == Dim::_3D) {
+        // shape function in dim2
+        real_t iS_x2[O + 2], fS_x2[O + 2];
+        // indices of the shape function
+        int    i2_min, i2_max;
+        // call shape function
+        prtl_shape::for_deposit<O>(prtls.i2_prev(p),
+                                   static_cast<real_t>(prtls.dx2_prev(p)),
+                                   prtls.i2(p),
+                                   static_cast<real_t>(prtls.dx2(p)),
+                                   i2_min,
+                                   i2_max,
+                                   iS_x2,
+                                   fS_x2);
+
+        // shape function in dim3
+        real_t iS_x3[O + 2], fS_x3[O + 2];
+        // indices of the shape function
+        int    i3_min, i3_max;
 
-        } else if constexpr (D == Dim::_3D) {
-          // shape function in dim2
-          real_t iS_x2[O + 2], fS_x2[O + 2];
-          // indices of the shape function
-          int    i2_min, i2_max;
-          // call shape function
-          prtl_shape::for_deposit<O>(i2_prev(p),
-                                     static_cast<real_t>(dx2_prev(p)),
-                                     i2(p),
-                                     static_cast<real_t>(dx2(p)),
-                                     i2_min,
-                                     i2_max,
-                                     iS_x2,
-                                     fS_x2);
-
-          // shape function in dim3
-          real_t iS_x3[O + 2], fS_x3[O + 2];
-          // indices of the shape function
-          int    i3_min, i3_max;
-
-          // call shape function
-          prtl_shape::for_deposit<O>(i3_prev(p),
-                                     static_cast<real_t>(dx3_prev(p)),
-                                     i3(p),
-                                     static_cast<real_t>(dx3(p)),
-                                     i3_min,
-                                     i3_max,
-                                     iS_x3,
-                                     fS_x3);
-
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                // Esirkepov 2001, Eq. 31
-                Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) *
-                               ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) +
-                                HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k]));
-
-                Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) *
-                               (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] +
-                                HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k]));
-
-                Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) *
-                               (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] +
-                                HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i]));
+        // call shape function
+        prtl_shape::for_deposit<O>(prtls.i3_prev(p),
+                                   static_cast<real_t>(prtls.dx3_prev(p)),
+                                   prtls.i3(p),
+                                   static_cast<real_t>(prtls.dx3(p)),
+                                   i3_min,
+                                   i3_max,
+                                   iS_x3,
+                                   fS_x3);
+
+        /**
+         * fused Esirkepov, no (O+2)^3 temporaries.
+         *
+         * The Esirkepov 3D current (2001, Eq. 31) is separable: with
+         * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise
+         * P2[j], P3[k]) the cumulative-sum currents collapse to
+         *
+         *   jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k)
+         *   jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k)
+         *   jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j)
+         *
+         * with the 1D-shape cross-factors
+         *
+         *   G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k]
+         *            + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k])
+         *   H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k]
+         *            + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k])
+         *   F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j]
+         *            + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i])
+         *
+         * and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the
+         * same value as the old explicit Wx/jx tensors up to
+         * floating-point reassociation: charge-conserving by
+         * construction (the Esirkepov decomposition is exact). The
+         * prefix sums are carried as running scalars in the deposit
+         * loop, so the only per-thread state is the existing 1D shape
+         * arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs
+         * and no private-memory tensor traffic).
+         */
+
+        // account for ghost cells
+        i1_min += N_GHOSTS;
+        i2_min += N_GHOSTS;
+        i3_min += N_GHOSTS;
+        i1_max += N_GHOSTS;
+        i2_max += N_GHOSTS;
+        i3_max += N_GHOSTS;
+
+        // get number of update indices for asymmetric movement
+        const int di_x1 = i1_max - i1_min;
+        const int di_x2 = i2_max - i2_min;
+        const int di_x3 = i3_max - i3_min;
+
+        // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt)
+        const real_t cf = -(coeff * inv_dt) * THIRD;
+
+        /**
+         * Current update — fused over the union cube so the J cell
+         * line stays L1-resident across the 3 component atomic_adds.
+         * Per-cell branches on (i<di_x1), (j<di_x2), (k<di_x3) skip
+         * the trailing slab where each component's stencil ends one
+         * cell short of the union; particles within a tile share
+         * di_x* so the branch predicates cleanly.
+         */
+        real_t P1 = ZERO;
+        for (int i = 0; i <= di_x1; ++i) {
+          P1                 += fS_x1[i] - iS_x1[i];
+          const int    gi     = i1_min + i;
+          const real_t iSx1i  = iS_x1[i];
+          const real_t fSx1i  = fS_x1[i];
+          real_t       P2     = ZERO;
+          for (int j = 0; j <= di_x2; ++j) {
+            P2                 += fS_x2[j] - iS_x2[j];
+            const int    gj     = i2_min + j;
+            const real_t iSx2j  = iS_x2[j];
+            const real_t fSx2j  = fS_x2[j];
+            const real_t F12    = iSx1i * iSx2j + fSx1i * fSx2j +
+                               HALF * (iSx1i * fSx2j + iSx2j * fSx1i);
+            real_t P3 = ZERO;
+            for (int k = 0; k <= di_x3; ++k) {
+              P3                 += fS_x3[k] - iS_x3[k];
+              const int    gk     = i3_min + k;
+              const real_t iSx3k  = iS_x3[k];
+              const real_t fSx3k  = fS_x3[k];
+              if (i < di_x1) {
+                const real_t G23 = iSx2j * iSx3k + fSx2j * fSx3k +
+                                   HALF * (iSx3k * fSx2j + iSx2j * fSx3k);
+                deposit_at(gi, gj, gk, cur::jx1, cf * P1 * G23);
+              }
+              if (j < di_x2) {
+                const real_t H13 = iSx1i * iSx3k + fSx1i * fSx3k +
+                                   HALF * (iSx3k * fSx1i + iSx1i * fSx3k);
+                deposit_at(gi, gj, gk, cur::jx2, cf * P2 * H13);
+              }
+              if (k < di_x3) {
+                deposit_at(gi, gj, gk, cur::jx3, cf * P3 * F12);
               }
             }
           }
+        }
 
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2][O + 2], jx2[O + 2][O + 2][O + 2],
-            jx3[O + 2][O + 2][O + 2];
-
-          // prefactors to j update
-          const real_t Qdxdt = coeff * inv_dt;
-          const real_t Qdydt = coeff * inv_dt;
-          const real_t Qdzdt = coeff * inv_dt;
+      } // dim
+    } else { // order
+      raise::KernelError(
+        HERE,
+        "Unsupported interpolation order. O > 11 not supported. Seriously. "
+        "What are you even doing here? Entity already goes to 11!");
+    }
+  }
 
-          // Calculate current contribution
+  /**
+   * @brief Flat current-deposition kernel.
+   *
+   * One thread per particle (RangePolicy). Writes are coalesced through a
+   * `Kokkos::Experimental::ScatterView` to avoid per-thread atomics on
+   * global J. Constructor signature is unchanged from prior versions —
+   * `engines/srpic/currents.h` continues to call it identically.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
+  class DepositCurrents_kernel {
+    static_assert(O <= 11u, "Shape function order O must be <= 11");
+    static constexpr auto D = M::Dim;
 
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx1[0][j][k] = -Qdxdt * Wx1[0][j][k];
-            }
-          }
+    scatter_ndfield_t<D, 3> J;
+    const ParticleArrays    prtls;
+    const M                 metric;
+    const real_t            charge, inv_dt;
 
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k];
-              }
-            }
-          }
+  public:
+    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
+                           const ParticleArrays&          prtls,
+                           const M&                       metric,
+                           real_t                         charge,
+                           const real_t                   dt)
+      : J { scatter_cur }
+      , prtls { prtls }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt } {
+      raise::ErrorIf(
+        (O == 2u and N_GHOSTS < 2),
+        "Order of interpolation is 2, but number of ghost cells is < 2",
+        HERE);
+    }
 
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx2[i][0][k] = -Qdydt * Wx2[i][0][k];
-            }
-          }
+    Inline auto operator()(prtlidx_t p) const -> void {
+      auto J_acc = J.access();
+      if constexpr (D == Dim::_1D) {
+        DepositOneParticle<S, M, O>(p,
+                                    prtls,
+                                    metric,
+                                    charge,
+                                    inv_dt,
+                                    [&](int g_i1, int comp, real_t v) {
+                                      J_acc(g_i1, comp) += v;
+                                    });
+      } else if constexpr (D == Dim::_2D) {
+        DepositOneParticle<S, M, O>(p,
+                                    prtls,
+                                    metric,
+                                    charge,
+                                    inv_dt,
+                                    [&](int g_i1, int g_i2, int comp, real_t v) {
+                                      J_acc(g_i1, g_i2, comp) += v;
+                                    });
+      } else if constexpr (D == Dim::_3D) {
+        DepositOneParticle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
+            J_acc(g_i1, g_i2, g_i3, comp) += v;
+          });
+      }
+    }
+  };
 
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k];
-              }
-            }
-          }
+  /**
+   * @brief Tiled current-deposition kernel.
+   *
+   * One team per spatial tile (`league_size = ntiles_total`). Each team
+   * accumulates particle contributions into a per-team scratch buffer of
+   * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = O + 1` cells per
+   * side. Scratch atomics live in SLM (PVC: ~5–10 cycles per
+   * `atomic_add`); the global J is touched only once per scratch cell at
+   * flush time. Compared with the flat scatter-view kernel:
+   *   - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile
+   *     instead of (stencil writes per particle × particles)
+   *   - per-particle stencil writes are tile-local (SLM) instead of
+   *     scattering through global HBM
+   *
+   * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for
+   * A/B benchmarking against the flat scatter-view kernel — its narrow
+   * stencil typically makes scratch alloc/zero/flush overhead a
+   * regression there, but it's good to be able to measure the
+   * crossover. To revert and use flat for zigzag-only builds, change
+   * the dispatch in `engines/srpic/currents.h` from
+   * `#if defined(TEAM_POLICY)` to
+   * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`.
+   *
+   * Particle iteration order is governed by `tile_offsets`: tile `t`
+   * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort.
+   * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping
+   * the SoA arrays consistent with that.
+   *
+   * **Halo sizing and escape valve.** Sort runs at the end of a step
+   * (see `srpic.hpp`); a particle is pushed once per step thereafter, so
+   * its `min(i, i_prev)` may differ from the bin key by one cell of drift
+   * per step elapsed since the last sort. The scratch HALO is
+   * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag
+   * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with
+   * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the `team_policy_drift`
+   * CMake knob (macro TEAM_POLICY_DRIFT) — the number of cells a particle
+   * may drift between two sorts that the halo is sized to absorb — and `1`
+   * by default (the every-step-sorted common case). It is independent of
+   * the sort cadence, which is set at runtime via `spatial_sorting_interval`;
+   * particles that drift past the halo take the escape valve below.
+   *
+   * Correctness does **not** depend on the halo size. Any particle whose
+   * full stencil escapes the scratch tile — because it drifted further
+   * than `DRIFT`, was reordered far from its tile by a no-sort-step
+   * `CommunicateParticles`, or because the halo is otherwise undersized —
+   * is deposited *as a whole* via a direct, bounds-clipped
+   * `Kokkos::atomic_add` on the global J view (the per-particle escape
+   * valve). Each particle's stencil is therefore deposited exactly once
+   * (entirely to SLM scratch when it fits, entirely to global J when it
+   * does not), so the path is charge-conserving; it is merely slower per
+   * write. Sizing `DRIFT` to the typical between-sort drift keeps the
+   * common case in fast SLM; sorting less often (or drifting past the
+   * halo) only costs escape-valve traffic, never accuracy.
+   *
+   * **Partition coverage.** The team iteration covers only the particles
+   * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped
+   * to the live `npart`. Particles appended past the partition since the
+   * sort are not seen here; the launcher (`engines/srpic/currents.h`)
+   * deposits that tail with the flat kernel so every active particle is
+   * covered exactly once regardless of sort cadence.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O, unsigned short T_TILE>
+  class DepositCurrentsTiled_kernel {
+    static_assert(O <= 11u, "Shape order O must be <= 11");
+    static_assert(T_TILE > 0u, "T_TILE must be positive");
+    static constexpr auto D = M::Dim;
 
-          // jx3
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx3[i][j][0] = -Qdydt * Wx3[i][j][0];
-            }
-          }
+    /**
+     * Per-side scratch halo, derived from first principles.
+     *
+     *   total halo = stencil_reach(O) + drift_between_sort_and_deposit
+     *
+     * stencil_reach(O) — maximum cells the deposit writes ABOVE
+     * min(i, i_prev) under CFL |v * dt/dx| <= 1/2:
+     *   - O == 0 (zigzag):  writes { i_prev, i_prev+1, i, i+1 } => +2
+     *   - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide
+     *     array but only O+1 entries are non-zero, and the union
+     *     window satisfies `i_max - i_min <= O+1` (see
+     *     particle_shapes.hpp::for_deposit). The genuine one-sided
+     *     reach above min(i, i_prev) is therefore O, not O+1 — the
+     *     old `O+1` carried one extra cell of conservative padding
+     *     on top of the already-conservative drift term below.
+     *
+     * drift — sort runs at end-of-step (see srpic.hpp), so a particle is
+     * pushed once per step between its last sort and a given deposit. With
+     * a runtime sort interval of `K` (spatial_sorting_interval), a particle
+     * drifts at most `K` cells (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step)
+     * before the next sort. The `team_policy_drift` CMake knob (macro
+     * TEAM_POLICY_DRIFT) sets DRIFT independently of `K`, sizing the halo so
+     * a particle that drifts up to DRIFT cells still deposits inside its
+     * tile scratch. DRIFT defaults to 1 (the sorted-every-step common case);
+     * any particle that drifts past the halo (e.g. a larger sort interval,
+     * or a CFL excursion) takes the per-particle global-J escape valve
+     * below — correct, only slower (see
+     * the class doc-comment for why this is charge-conserving).
+     */
+    static constexpr int STENCIL_REACH   = (O == 0u) ? 2 : static_cast<int>(O);
+    // One-sided footprint reach for the per-particle escape valve: the
+    // deposit writes at most this many cells above max(i,i_prev) (and fewer
+    // below min), so [min - FOOTPRINT_REACH, max + FOOTPRINT_REACH] in cell
+    // coords conservatively bounds every deposited cell for any order
+    // (Esirkepov reaches max+O; O=0 zigzag reaches max+1).
+    static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast<int>(O);
+#if defined(TEAM_POLICY_DRIFT)
+    static constexpr int DRIFT = static_cast<int>(TEAM_POLICY_DRIFT);
+#else
+    static constexpr int DRIFT = 1;
+#endif
+    static constexpr int HALO = STENCIL_REACH + DRIFT;
+    static constexpr int TE   = static_cast<int>(T_TILE) + 2 * HALO;
+
+    using exec_space  = Kokkos::DefaultExecutionSpace;
+    using team_policy = Kokkos::TeamPolicy<exec_space>;
+    using member_t    = typename team_policy::member_type;
+
+    ndfield_t<D, 3> J;
+    ParticleArrays  prtls;
+    const M         metric;
+    const real_t    charge, inv_dt;
+
+    // Tile metadata produced by SortSpatially.
+    array_t<npart_t*> tile_offsets;
+    ncells_t          ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
+    ncells_t          total_tiles { 0u };
 
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 1; k < O + 2; ++k) {
-                jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k];
-              }
-            }
-          }
+    /**
+     * Current active-particle count. `tile_offsets` partitions only the
+     * particles that existed at the last sort ([0, layout.npart_partitioned));
+     * `npart` may differ if the pusher dead-tagged particles in place since.
+     * Each team clamps its `[tile_offsets(t), tile_offsets(t+1))` slice to
+     * `npart` so stale slots past the live array are never read. Particles
+     * appended *beyond* the partition (npart > npart_partitioned) are not seen
+     * by any team here — the launcher deposits that tail separately.
+     */
+    npart_t npart { 0u };
 
-          // account for ghost cells
-          i1_min += N_GHOSTS;
-          i2_min += N_GHOSTS;
-          i3_min += N_GHOSTS;
-          i1_max += N_GHOSTS;
-          i2_max += N_GHOSTS;
-          i3_max += N_GHOSTS;
-
-          // get number of update indices for asymmetric movement
-          const int di_x1 = i1_max - i1_min;
-          const int di_x2 = i2_max - i2_min;
-          const int di_x3 = i3_max - i3_min;
-
-          /*
-            Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx1) += jx1[i][j][k];
-              }
-            }
-          }
+    /**
+     * J's full storage extent including all ghost cells. Used to clip
+     * the cooperative flush so that a partial tile at the high end of
+     * the domain does not over-write past the J view.
+     */
+    int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 };
 
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx2) += jx2[i][j][k];
-              }
-            }
-          }
+  public:
+    DepositCurrentsTiled_kernel(const ndfield_t<D, 3>& cur,
+                                const ParticleArrays&  prtls,
+                                const M&               metric,
+                                real_t                 charge,
+                                real_t                 dt,
+                                const TileLayout<D>&   layout,
+                                npart_t                npart)
+      : J { cur }
+      , prtls { prtls }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt }
+      , tile_offsets { layout.tile_offsets }
+      , ntx1 { layout.ntiles_per_axis[0] }
+      , ntx2 { layout.ntiles_per_axis[1] }
+      , ntx3 { layout.ntiles_per_axis[2] }
+      , total_tiles { layout.ntiles_total }
+      , npart { npart } {
+      raise::ErrorIf(
+        layout.tile_size != T_TILE,
+        "Tiled deposit launched with mismatched T_TILE and runtime tile_size",
+        HERE);
+      /**
+       * @note: HALO is allowed to exceed N_GHOSTS. The cooperative
+       * scratch→J flush and the per-particle escape valve both bounds-clip
+       * their writes against `j_ext*` so writes that would land past J's
+       * ghost stripe are silently dropped (they only ever come from a
+       * particle whose stencil reaches into the domain ghost region, where
+       * CommunicateFields will re-supply the contribution).
+       */
+      if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+        j_ext1 = static_cast<int>(cur.extent(0));
+      }
+      if constexpr (D == Dim::_2D or D == Dim::_3D) {
+        j_ext2 = static_cast<int>(cur.extent(1));
+      }
+      if constexpr (D == Dim::_3D) {
+        j_ext3 = static_cast<int>(cur.extent(2));
+      }
+    }
 
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k < di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx3) += jx3[i][j][k];
-              }
-            }
-          }
+    /**
+     * @brief Per-team scratch size in bytes. Used by the launcher to set
+     *        `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`.
+     */
+    static constexpr size_t scratch_bytes() {
+      // The component count (3) is a *static* extent of scratch_ndfield_t
+      // (View<real_t*[3]> / **[3] / ***[3]), so shmem_size() takes only the
+      // dynamic spatial extents — passing 3 as well trips Kokkos'
+      // `rank_dynamic != number of arguments` abort. This matches the
+      // scratch View construction below, which also omits the 3.
+      if constexpr (D == Dim::_1D) {
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE);
+      } else if constexpr (D == Dim::_2D) {
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE);
+      } else {
+        return scratch_ndfield_t<D, 3, real_t>::shmem_size(TE, TE, TE);
+      }
+    }
 
-        } // dim
-      } else { // order
-        raise::KernelError(
-          HERE,
-          "Unsupported interpolation order. O > 11 not supported. Seriously. "
-          "What are you even doing here? Entity already goes to 11!");
+    Inline void operator()(const member_t& team) const {
+      const auto tile_id = static_cast<ncells_t>(team.league_rank());
+      /**
+       * Tile coordinates (tile-grid indices) → tile origin in **active**
+       * cell coords (no ghost offset). Using ncells_t to match the linearised
+       * tile index produced by SortSpatially.
+       */
+      ncells_t   tx1 = 0, tx2 = 0, tx3 = 0;
+      if constexpr (D == Dim::_1D) {
+        tx1 = tile_id;
+      } else if constexpr (D == Dim::_2D) {
+        tx1 = tile_id / ntx2;
+        tx2 = tile_id - tx1 * ntx2;
+      } else {
+        const auto plane = ntx2 * ntx3;
+        tx1              = tile_id / plane;
+        const auto rem   = tile_id - tx1 * plane;
+        tx2              = rem / ntx3;
+        tx3              = rem - tx2 * ntx3;
+      }
+      /**
+       * origin_active = lowest active-cell index in the tile (no ghost).
+       * origin_J      = same value translated into J's storage coordinate
+       *                 (i.e. plus N_GHOSTS).
+       * origin_J_low  = J coordinate of scratch index 0 (i.e. origin_J - HALO).
+       * local index `li` in scratch ↔ global J index `gi = li + origin_J_low`.
+       */
+      const int origin_J1_low = static_cast<int>(tx1 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J2_low = static_cast<int>(tx2 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J3_low = static_cast<int>(tx3 * T_TILE) +
+                                static_cast<int>(N_GHOSTS) - HALO;
+
+      // Allocate scratch and cooperatively zero-fill it.
+      if constexpr (D == Dim::_1D) {
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3),
+                             [&](ncells_t idx) {
+                               const auto li = idx / 3;
+                               const auto c  = idx - li * 3;
+                               scr(li, c)    = ZERO;
+                             });
+        team.team_barrier();
+
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](prtlidx_t p) {
+            /**
+             * Per-particle escape valve: route the WHOLE particle to the
+             * global J view when its Esirkepov footprint does not fit
+             * inside this tile's scratch window [0,TE); only particles
+             * fully inside the tile touch SLM scratch. A particle drifts
+             * out of its tile when sorted less often than every step.
+             *
+             * The conservative footprint bound in cell coords,
+             * [min(i,i_prev) - O, max(i,i_prev) + O], covers
+             * prtl_shape::for_deposit<O> for any order (i_min >=
+             * min-floor(O/2), i_max <= max+O), so when `to_scratch` is true
+             * every deposited cell is provably in [0,TE) and the scratch write
+             * needs no per-cell bounds test. The global path bounds-clips
+             * against J's storage extent (writes past the ghost stripe are
+             * re-supplied by SynchronizeFields(J)).
+             */
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE);
+            DepositOneParticle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              [&](int g_i1, int comp, real_t v) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v);
+                } else if (g_i1 >= 0 and g_i1 < j_ext1) {
+                  // Bounds-clip the escape-valve write against J's storage,
+                  // exactly as the cooperative flush does. Cells past the
+                  // ghost stripe are re-supplied by SynchronizeFields(J); an
+                  // unclipped write here faults the GPU (an escaped boundary
+                  // particle's stencil can reach past j_ext1).
+                  Kokkos::atomic_add(&J(g_i1, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        /**
+         * Cooperative flush of scratch to global J. Bounds-clip against
+         * the J view extent in case a partial high-end tile (or non-zero
+         * halo at domain edges) would otherwise write past J.
+         */
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3),
+                             [&](const int idx) {
+                               const auto li = idx / 3;
+                               const auto c  = idx - li * 3;
+                               const auto gi = li + origin_J1_low;
+                               if (gi < 0 or gi >= j_ext1) {
+                                 return;
+                               }
+                               const real_t v = scr(li, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, c), v);
+                               }
+                             });
+      } else if constexpr (D == Dim::_2D) {
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE, TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3),
+                             [&](const int idx) {
+                               const auto lij = idx / 3;
+                               const auto c   = idx - lij * 3;
+                               const auto li  = lij / TE;
+                               const auto lj  = lij - li * TE;
+                               scr(li, lj, c) = ZERO;
+                             });
+        team.team_barrier();
+
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](prtlidx_t p) {
+            // See 1D branch for rationale: route the whole particle to the
+            // global escape valve unless its full footprint fits in scratch.
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J2_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and
+                                     hi2 < TE);
+            DepositOneParticle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              [&](const int g_i1, const int g_i2, int comp, real_t v) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(
+                    &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp),
+                    v);
+                } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and
+                           g_i2 < j_ext2) {
+                  // Bounds-clip as the cooperative flush does; an unclipped
+                  // escape-valve write faults the GPU when an escaped boundary
+                  // particle's stencil reaches past j_ext.
+                  Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3),
+                             [&](const int idx) {
+                               const auto lij = idx / 3;
+                               const auto c   = idx - lij * 3;
+                               const auto li  = lij / TE;
+                               const auto lj  = lij - li * TE;
+                               const auto gi  = li + origin_J1_low;
+                               const auto gj  = lj + origin_J2_low;
+                               if ((gi < 0 or gi >= j_ext1) or
+                                   (gj < 0 or gj >= j_ext2)) {
+                                 return;
+                               }
+                               const real_t v = scr(li, lj, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, gj, c), v);
+                               }
+                             });
+      } else if constexpr (D == Dim::_3D) {
+        scratch_ndfield_t<D, 3, real_t> scr { team.team_scratch(0), TE, TE, TE };
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3),
+                             [&](const int idx) {
+                               const auto lijk    = idx / 3;
+                               const auto c       = idx - lijk * 3;
+                               const auto li      = lijk / (TE * TE);
+                               const auto rem     = lijk - li * TE * TE;
+                               const auto lj      = rem / TE;
+                               const auto lk      = rem - lj * TE;
+                               scr(li, lj, lk, c) = ZERO;
+                             });
+        team.team_barrier();
+
+        // Clamp the tile's particle slice to the live array: slots past
+        // `npart` may hold stale (possibly alive-tagged) data from a prior
+        // step's compaction and must not be re-deposited.
+        const auto t_lo    = tile_offsets(tile_id);
+        const auto t_hi    = tile_offsets(tile_id + 1u);
+        const auto p_begin = (t_lo < npart) ? t_lo : npart;
+        const auto p_end   = (t_hi < npart) ? t_hi : npart;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](prtlidx_t p) {
+            // See 1D branch for rationale: route the whole particle to the
+            // global escape valve unless its full footprint fits in scratch.
+            const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p);
+            const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p);
+            const int i3c = prtls.i3(p), i3p = prtls.i3_prev(p);
+            const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J1_low;
+            const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J2_low;
+            const int lo3 = (i3c < i3p ? i3c : i3p) + static_cast<int>(N_GHOSTS) -
+                            FOOTPRINT_REACH - origin_J3_low;
+            const int hi3 = (i3c > i3p ? i3c : i3p) + static_cast<int>(N_GHOSTS) +
+                            FOOTPRINT_REACH - origin_J3_low;
+            const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and
+                                     hi2 < TE and lo3 >= 0 and hi3 < TE);
+            DepositOneParticle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) {
+                if (to_scratch) {
+                  Kokkos::atomic_add(&scr(g_i1 - origin_J1_low,
+                                          g_i2 - origin_J2_low,
+                                          g_i3 - origin_J3_low,
+                                          comp),
+                                     v);
+                } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and
+                           g_i2 < j_ext2 and g_i3 >= 0 and g_i3 < j_ext3) {
+                  // Bounds-clip as the cooperative flush does; an unclipped
+                  // escape-valve write faults the GPU when an escaped boundary
+                  // particle's stencil reaches past j_ext.
+                  Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3),
+                             [&](const int idx) {
+                               const int lijk = idx / 3;
+                               const int c    = idx - lijk * 3;
+                               const int li   = lijk / (TE * TE);
+                               const int rem  = lijk - li * TE * TE;
+                               const int lj   = rem / TE;
+                               const int lk   = rem - lj * TE;
+                               const int gi   = li + origin_J1_low;
+                               const int gj   = lj + origin_J2_low;
+                               const int gk   = lk + origin_J3_low;
+                               if ((gi < 0 or gi >= j_ext1) or
+                                   (gj < 0 or gj >= j_ext2) or
+                                   (gk < 0 or gk >= j_ext3)) {
+                                 return;
+                               }
+                               const real_t v = scr(li, lj, lk, c);
+                               if (v != ZERO) {
+                                 Kokkos::atomic_add(&J(gi, gj, gk, c), v);
+                               }
+                             });
       }
     }
   };
+
 } // namespace kernel
 
 #undef i_di_to_Xi
diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt
index d6dc295af..9a2e2865e 100644
--- a/tests/framework/CMakeLists.txt
+++ b/tests/framework/CMakeLists.txt
@@ -44,6 +44,12 @@ else()
   gen_test(particles_sort false)
 endif()
 
+# team_policy X-3: per-backend sort_by_key permutation test (only built
+# when the compile-time team_policy toggle is on).
+if(${team_policy})
+  gen_test(sort_by_key false)
+endif()
+
 gen_test(fields false)
 gen_test(grid_mesh false)
 if(${DEBUG})
diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp
index 1ba10d828..a2b82456e 100644
--- a/tests/framework/particles_sort.cpp
+++ b/tests/framework/particles_sort.cpp
@@ -31,12 +31,14 @@ auto main(int argc, char* argv[]) -> int {
         ntt::EmissionType::NONE,
         2u,
         1u);
-      auto& i1_p     = prtls.i1;
-      auto& i2_p     = prtls.i2;
-      auto& tag_p    = prtls.tag;
-      auto& weight_p = prtls.weight;
-      auto& pld_r    = prtls.pld_r;
-      auto& pld_i    = prtls.pld_i;
+      auto& i1_p      = prtls.i1;
+      auto& i2_p      = prtls.i2;
+      auto& i1_prev_p = prtls.i1_prev;
+      auto& i2_prev_p = prtls.i2_prev;
+      auto& tag_p     = prtls.tag;
+      auto& weight_p  = prtls.weight;
+      auto& pld_r     = prtls.pld_r;
+      auto& pld_i     = prtls.pld_i;
       Kokkos::parallel_for(
         "InitParticles",
         prtls.maxnpart(),
@@ -61,9 +63,14 @@ auto main(int argc, char* argv[]) -> int {
               i2_p(p)     = 23u;
               weight_p(p) = 3.0;
             }
-            pld_r(p, 0) = weight_p(p) + static_cast<real_t>(0.5);
-            pld_r(p, 1) = weight_p(p) + static_cast<real_t>(10.5);
-            pld_i(p, 0) = static_cast<npart_t>(weight_p(p) + 10.0);
+            // team_policy keys on min(i, i_prev); without a meaningful
+            // i_prev every key would collapse to 0. Set i_prev = i so the
+            // tile key reduces to the particle's current cell.
+            i1_prev_p(p) = i1_p(p);
+            i2_prev_p(p) = i2_p(p);
+            pld_r(p, 0)  = weight_p(p) + static_cast<real_t>(0.5);
+            pld_r(p, 1)  = weight_p(p) + static_cast<real_t>(10.5);
+            pld_i(p, 0)  = static_cast<npart_t>(weight_p(p) + 10.0);
           } else {
             tag_p(p) = ntt::ParticleTag::dead;
           }
@@ -75,39 +82,92 @@ auto main(int argc, char* argv[]) -> int {
 
       prtls.SortSpatially(grid);
 
+      auto i1_h     = Kokkos::create_mirror_view(prtls.i1);
+      auto i2_h     = Kokkos::create_mirror_view(prtls.i2);
+      auto tag_h    = Kokkos::create_mirror_view(prtls.tag);
       auto weight_h = Kokkos::create_mirror_view(prtls.weight);
       auto pld_r_h  = Kokkos::create_mirror_view(prtls.pld_r);
       auto pld_i_h  = Kokkos::create_mirror_view(prtls.pld_i);
+      Kokkos::deep_copy(i1_h, prtls.i1);
+      Kokkos::deep_copy(i2_h, prtls.i2);
+      Kokkos::deep_copy(tag_h, prtls.tag);
       Kokkos::deep_copy(weight_h, prtls.weight);
       Kokkos::deep_copy(pld_r_h, prtls.pld_r);
       Kokkos::deep_copy(pld_i_h, prtls.pld_i);
 
-      for (auto p { 0u }; p < 75u; ++p) {
-        if (p < 16u) {
-          raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE);
-        } else if (p < 33u) {
-          raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE);
-        } else if (p < 46u) {
-          raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE);
-        } else if (p < 59u) {
-          raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE);
-        } else {
-          raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE);
-        }
-        if (p < 59u) {
-          raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast<real_t>(0.5),
-                         "error in sorting particle real payload 0",
-                         HERE);
-          raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast<real_t>(10.5),
-                         "error in sorting particle real payload 1",
+      // Tile geometry, mirroring sort::PositionToTileIndex. T = 1 (no
+      // team_policy) reproduces the legacy per-cell ordering.
+#if defined(TEAM_POLICY)
+      const ncells_t T = static_cast<ncells_t>(TEAM_POLICY_TILE_SIZE);
+#else
+      const ncells_t T = 1u;
+#endif
+      const auto     na   = grid.n_active();
+      const ncells_t ntx2 = (na[1] + T - 1u) / T;
+      const auto     tile_of = [&](int a, int b) -> ncells_t {
+        return (static_cast<ncells_t>(a) / T) * ntx2 +
+               (static_cast<ncells_t>(b) / T);
+      };
+
+      // SortSpatially is order-by-tile, not order-by-cell: assert the
+      // invariants that hold for any tile size rather than a hardwired
+      // permutation. (1) alive particles form a prefix sorted by
+      // non-decreasing tile index; (2) every SoA member is permuted by the
+      // *same* permutation, so each alive slot still satisfies
+      // pld == f(weight); (3) no alive particle is lost. Only [0, npart())
+      // is defined after a sort. The team_policy path compacts — it drops
+      // the dead, so npart() equals the alive count and [0, npart()) is
+      // entirely alive; the legacy (non-team) path keeps the dead as a
+      // weight == -1 suffix, leaving npart() unchanged. Iterating
+      // [0, npart()) exercises both: the prefix-sorted / no-alive-after-dead
+      // checks below hold either way.
+#if defined(TEAM_POLICY)
+      raise::ErrorIf(prtls.npart() != 59u,
+                     "team_policy sort must compact: npart() should equal "
+                     "the alive count",
+                     HERE);
+#else
+      raise::ErrorIf(prtls.npart() != 66u,
+                     "legacy sort should leave npart() unchanged",
+                     HERE);
+#endif
+      bool     seen_dead   = false;
+      bool     have_prev   = false;
+      ncells_t prev_tile   = 0u;
+      npart_t  n_alive_obs = 0u;
+      for (auto p { 0u }; p < prtls.npart(); ++p) {
+        if (tag_h(p) != ntt::ParticleTag::alive) {
+          seen_dead = true;
+          raise::ErrorIf(weight_h(p) != -1.0,
+                         "dead particle has unexpected weight",
                          HERE);
-          raise::ErrorIf(
-            pld_i_h(p, 0) !=
-              static_cast<npart_t>(weight_h(p) + static_cast<real_t>(10.0)),
-            "error in sorting particle integer payload 0",
-            HERE);
+          continue;
         }
+        raise::ErrorIf(seen_dead,
+                       "alive particle after a dead one (not sorted to prefix)",
+                       HERE);
+        const auto tile = tile_of(i1_h(p), i2_h(p));
+        raise::ErrorIf(have_prev && (tile < prev_tile),
+                       "alive particles not sorted by tile index",
+                       HERE);
+        prev_tile = tile;
+        have_prev = true;
+        ++n_alive_obs;
+        raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast<real_t>(0.5),
+                       "error in sorting particle real payload 0",
+                       HERE);
+        raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast<real_t>(10.5),
+                       "error in sorting particle real payload 1",
+                       HERE);
+        raise::ErrorIf(
+          pld_i_h(p, 0) !=
+            static_cast<npart_t>(weight_h(p) + static_cast<real_t>(10.0)),
+          "error in sorting particle integer payload 0",
+          HERE);
       }
+      raise::ErrorIf(n_alive_obs != 59u,
+                     "wrong number of alive particles after sort",
+                     HERE);
     }
     {
       // 3D
@@ -129,11 +189,14 @@ auto main(int argc, char* argv[]) -> int {
         ntt::EmissionType::NONE,
         0u,
         0u);
-      auto& i1_p     = prtls.i1;
-      auto& i2_p     = prtls.i2;
-      auto& i3_p     = prtls.i3;
-      auto& tag_p    = prtls.tag;
-      auto& weight_p = prtls.weight;
+      auto& i1_p      = prtls.i1;
+      auto& i2_p      = prtls.i2;
+      auto& i3_p      = prtls.i3;
+      auto& i1_prev_p = prtls.i1_prev;
+      auto& i2_prev_p = prtls.i2_prev;
+      auto& i3_prev_p = prtls.i3_prev;
+      auto& tag_p     = prtls.tag;
+      auto& weight_p  = prtls.weight;
       Kokkos::parallel_for(
         "InitParticles",
         prtls.maxnpart(),
@@ -167,6 +230,11 @@ auto main(int argc, char* argv[]) -> int {
               i3_p(p)     = 7u;
               weight_p(p) = 4.0;
             }
+            // see 2D block: i_prev = i so the team_policy tile key reduces
+            // to the particle's current cell.
+            i1_prev_p(p) = i1_p(p);
+            i2_prev_p(p) = i2_p(p);
+            i3_prev_p(p) = i3_p(p);
           } else {
             tag_p(p) = ntt::ParticleTag::dead;
           }
@@ -178,23 +246,73 @@ auto main(int argc, char* argv[]) -> int {
 
       prtls.SortSpatially(grid);
 
+      auto i1_h     = Kokkos::create_mirror_view(prtls.i1);
+      auto i2_h     = Kokkos::create_mirror_view(prtls.i2);
+      auto i3_h     = Kokkos::create_mirror_view(prtls.i3);
+      auto tag_h    = Kokkos::create_mirror_view(prtls.tag);
       auto weight_h = Kokkos::create_mirror_view(prtls.weight);
+      Kokkos::deep_copy(i1_h, prtls.i1);
+      Kokkos::deep_copy(i2_h, prtls.i2);
+      Kokkos::deep_copy(i3_h, prtls.i3);
+      Kokkos::deep_copy(tag_h, prtls.tag);
       Kokkos::deep_copy(weight_h, prtls.weight);
-      for (auto p { 0u }; p < 75u; ++p) {
-        if (p < 13u) {
-          raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE);
-        } else if (p < 26u) {
-          raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE);
-        } else if (p < 39u) {
-          raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE);
-        } else if (p < 46u) {
-          raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE);
-        } else if (p < 59u) {
-          raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE);
-        } else {
-          raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE);
+
+      // Same invariants as the 2D block (no payloads here): alive prefix
+      // sorted by non-decreasing tile index, alive count preserved. The
+      // team_policy path compacts the dead away (npart() == alive count);
+      // the legacy path keeps them as a weight == -1 suffix. T = 1
+      // reproduces the legacy per-cell order.
+#if defined(TEAM_POLICY)
+      const ncells_t T = static_cast<ncells_t>(TEAM_POLICY_TILE_SIZE);
+#else
+      const ncells_t T = 1u;
+#endif
+      const auto     na   = grid.n_active();
+      const ncells_t ntx2 = (na[1] + T - 1u) / T;
+      const ncells_t ntx3 = (na[2] + T - 1u) / T;
+      const auto     tile_of = [&](int a, int b, int c) -> ncells_t {
+        return ((static_cast<ncells_t>(a) / T) * ntx2 +
+                (static_cast<ncells_t>(b) / T)) *
+                 ntx3 +
+               (static_cast<ncells_t>(c) / T);
+      };
+
+#if defined(TEAM_POLICY)
+      raise::ErrorIf(prtls.npart() != 59u,
+                     "team_policy sort must compact: npart() should equal "
+                     "the alive count",
+                     HERE);
+#else
+      raise::ErrorIf(prtls.npart() != 66u,
+                     "legacy sort should leave npart() unchanged",
+                     HERE);
+#endif
+      bool     seen_dead   = false;
+      bool     have_prev   = false;
+      ncells_t prev_tile   = 0u;
+      npart_t  n_alive_obs = 0u;
+      for (auto p { 0u }; p < prtls.npart(); ++p) {
+        if (tag_h(p) != ntt::ParticleTag::alive) {
+          seen_dead = true;
+          raise::ErrorIf(weight_h(p) != -1.0,
+                         "dead particle has unexpected weight",
+                         HERE);
+          continue;
         }
+        raise::ErrorIf(seen_dead,
+                       "alive particle after a dead one (not sorted to prefix)",
+                       HERE);
+        const auto tile = tile_of(i1_h(p), i2_h(p), i3_h(p));
+        raise::ErrorIf(have_prev && (tile < prev_tile),
+                       "alive particles not sorted by tile index",
+                       HERE);
+        prev_tile = tile;
+        have_prev = true;
+        ++n_alive_obs;
       }
+      raise::ErrorIf(n_alive_obs != 59u,
+                     "wrong number of alive particles after sort",
+                     HERE);
     }
 
   } catch (const std::exception& e) {
diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp
new file mode 100644
index 000000000..9ccecc732
--- /dev/null
+++ b/tests/framework/sort_by_key.cpp
@@ -0,0 +1,113 @@
+/**
+ * @brief X-3 (team_policy) — sort_by_key permutation test.
+ *
+ * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch`
+ * that is compiled in for the current Kokkos device. For each backend:
+ *   1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised).
+ *   2. Call sort_by_key_dispatch.
+ *   3. Verify that keys[perm[i]] is sorted in non-decreasing order.
+ *
+ * Stability is verified for the BinSort and StdSort backends (the others
+ * promise stability per their documentation but we don't bake that into
+ * the test).
+ *
+ * Built only when `team_policy=ON` at CMake time.
+ */
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/error.h"
+#include "utils/sort_dispatch.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+
+#include <iostream>
+#include <vector>
+
+namespace {
+
+  using namespace ntt;
+
+  template <typename Backend>
+  void test_one_backend(const char* label, Backend tag) {
+    const std::vector<ncells_t> keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u };
+    const npart_t               n     = keys_host_init.size();
+    const ncells_t              n_max = 6u; // bin range [0, n_max)
+
+    array_t<ncells_t*> keys { "keys", n };
+    auto               keys_h = Kokkos::create_mirror_view(keys);
+    for (npart_t i = 0u; i < n; ++i) {
+      keys_h(i) = keys_host_init[i];
+    }
+    Kokkos::deep_copy(keys, keys_h);
+
+    prtl_perm_t perm { "perm", n };
+
+    sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag);
+
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    Kokkos::deep_copy(perm_h, perm);
+
+    // Validate: keys[perm[0]] <= keys[perm[1]] <= ...
+    for (npart_t i = 1u; i < n; ++i) {
+      const auto a = keys_host_init[perm_h(i - 1u)];
+      const auto b = keys_host_init[perm_h(i)];
+      raise::ErrorIf(
+        a > b,
+        std::string("sort_by_key_dispatch produced non-sorted permutation "
+                    "for backend ") +
+          label,
+        HERE);
+    }
+
+    // Validate: perm is a permutation of [0, n).
+    std::vector<int> seen(n, 0);
+    for (npart_t i = 0u; i < n; ++i) {
+      const auto idx = perm_h(i);
+      raise::ErrorIf(idx >= n,
+                     std::string("permutation index out of range for "
+                                 "backend ") +
+                       label,
+                     HERE);
+      seen[idx] += 1;
+    }
+    for (npart_t i = 0u; i < n; ++i) {
+      raise::ErrorIf(seen[i] != 1,
+                     std::string("permutation not a bijection for backend ") +
+                       label,
+                     HERE);
+    }
+
+    std::cout << "[OK] sort_by_key_dispatch<" << label << ">: "
+              << "keys[perm] sorted, perm is a bijection." << std::endl;
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  ntt::GlobalInitialize(argc, argv);
+  try {
+    // Always-available backends.
+    test_one_backend("BinSort", ::sort::backend::BinSort {});
+#if !defined(DEVICE_ENABLED)
+    test_one_backend("StdSort", ::sort::backend::StdSort {});
+#endif
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    test_one_backend("OneDPL", ::sort::backend::OneDPL {});
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+    test_one_backend("Thrust", ::sort::backend::Thrust {});
+#endif
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+    test_one_backend("Rocthrust", ::sort::backend::Rocthrust {});
+#endif
+  } catch (std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    ntt::GlobalFinalize();
+    return 1;
+  }
+  ntt::GlobalFinalize();
+  return 0;
+}
diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp
index d0f0a412c..2f926ce28 100644
--- a/tests/global/tiling.cpp
+++ b/tests/global/tiling.cpp
@@ -1,6 +1,5 @@
 #include "arch/kokkos_aliases.h"
 #include "utils/error.h"
-#include "utils/formatting.h"
 #include "utils/numeric.h"
 #include "utils/sorting.h"
 
@@ -122,37 +121,27 @@ void test_tiling(const array_t<int*>&         i1,
 
     const auto ntiles = nt1 * nt2 * nt3;
 
-    auto position_to_tile_kernel = sort::PositionToTileIndex<D, true> {
-      i1, i2, i3, tag, tile_indices, ncells, ts
-    };
-    Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel);
-    const auto num_ppt = position_to_tile_kernel.num_ppt;
+    array_t<npart_t*> num_ppt { "num_ppt", ntiles };
+    Kokkos::parallel_for(
+      "Tiling",
+      npart,
+      sort::PositionToTileIndex<D, true> { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt });
     Kokkos::parallel_for(
       "Checking",
       npart,
       Lambda(prtlidx_t p) {
         CheckValue<D>(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts);
       });
-    raise::ErrorIf(
-      num_ppt.extent(0) != ntiles,
-      fmt::format("num_ppt size does not match number of tiles %u != %u",
-                  num_ppt.extent(0),
-                  ntiles),
-      HERE);
 
     npart_t tot_alive = 0u;
     Kokkos::parallel_reduce(
       "CountAliveInTiles",
       ntiles,
-      Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); },
+      Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); },
       tot_alive);
-    raise::ErrorIf(
-      tot_alive != npart - ndead,
-      fmt::format("Error in counting particles per tile: %u != %u - %u",
-                  tot_alive,
-                  npart,
-                  ndead),
-      HERE);
+    raise::ErrorIf(tot_alive != npart - ndead,
+                   "Error in counting particles per tile",
+                   HERE);
   }
 }
 
diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt
index f1438e108..0fe23f578 100644
--- a/tests/kernels/CMakeLists.txt
+++ b/tests/kernels/CMakeLists.txt
@@ -26,6 +26,9 @@ endfunction()
 gen_test(faraday_mink)
 gen_test(ampere_mink)
 gen_test(deposit)
+if(${team_policy})
+  gen_test(deposit_tiled)
+endif()
 gen_test(digital_filter)
 gen_test(particle_moments)
 gen_test(fields_to_phys)
diff --git a/tests/kernels/deposit.cpp b/tests/kernels/deposit.cpp
index 24f544202..10095d304 100644
--- a/tests/kernels/deposit.cpp
+++ b/tests/kernels/deposit.cpp
@@ -142,15 +142,21 @@ void testDeposit(const std::vector<ncells_t>&         res,
 
   auto J_scat = Kokkos::Experimental::create_scatter_view(J);
 
+  // The deposit kernel now takes a `ParticleArrays` SoA struct instead of
+  // the individual per-component arrays. Pack the per-test arrays into one;
+  // payload (pld_*) members stay default (unused here).
+  ParticleArrays pa;
+  pa.i1 = i1, pa.i2 = i2, pa.i3 = i3;
+  pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev;
+  pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3;
+  pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev;
+  pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3;
+  pa.phi = phi, pa.weight = weight, pa.tag = tag;
+
   // clang-format off
   Kokkos::parallel_for("CurrentsDeposit", 10,
                        kernel::DepositCurrents_kernel<S, M, O>(J_scat,
-                                                            i1, i2, i3,
-                                                            i1_prev, i2_prev, i3_prev,
-                                                            dx1, dx2, dx3,
-                                                            dx1_prev, dx2_prev, dx3_prev,
-                                                            ux1, ux2, ux3,
-                                                            phi, weight, tag,
+                                                            pa,
                                                             metric, charge, inv_dt));
   // clang-format on
 
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
new file mode 100644
index 000000000..f058d6f30
--- /dev/null
+++ b/tests/kernels/deposit_tiled.cpp
@@ -0,0 +1,592 @@
+/**
+ * @file tests/kernels/deposit_tiled.cpp
+ * @brief X-1 numerical-equivalence test for the tiled deposit kernel.
+ *
+ * Runs the flat (`DepositCurrents_kernel`) and tiled
+ * (`DepositCurrentsTiled_kernel`) kernels on identical particle SoA inputs
+ * for shape orders O = 1..11 and asserts that the resulting J array is
+ * identical cell-by-cell within a small floating-point tolerance.
+ *
+ * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test
+ * matches the per-particle setup used in `deposit.cpp` so that any
+ * regression in the shared `kernel::deposit::deposit_one_particle` body
+ * is caught by both tests.
+ */
+
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/comparators.h"
+
+#include "metrics/minkowski.h"
+
+#include "kernels/currents_deposit.hpp"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace {
+
+  using namespace ntt;
+
+  void errorIf(bool condition, const std::string& msg) {
+    if (condition) {
+      throw std::runtime_error(msg);
+    }
+  }
+
+  template <typename T>
+  void put_value(const array_t<T*>& arr, T value, int i) {
+    auto h = Kokkos::create_mirror_view(arr);
+    h(i)   = value;
+    Kokkos::deep_copy(arr, h);
+  }
+
+  // Pack the per-test SoA arrays into a ParticleArrays — the struct both
+  // deposit kernels take. Payload (pld_*) members stay default; these are
+  // 2D Cartesian Minkowski cases, so phi/i3/dx3 are present but unread.
+  ParticleArrays pack_arrays(const array_t<int*>&      i1,
+                             const array_t<int*>&      i2,
+                             const array_t<int*>&      i3,
+                             const array_t<int*>&      i1_prev,
+                             const array_t<int*>&      i2_prev,
+                             const array_t<int*>&      i3_prev,
+                             const array_t<prtldx_t*>& dx1,
+                             const array_t<prtldx_t*>& dx2,
+                             const array_t<prtldx_t*>& dx3,
+                             const array_t<prtldx_t*>& dx1_prev,
+                             const array_t<prtldx_t*>& dx2_prev,
+                             const array_t<prtldx_t*>& dx3_prev,
+                             const array_t<real_t*>&   ux1,
+                             const array_t<real_t*>&   ux2,
+                             const array_t<real_t*>&   ux3,
+                             const array_t<real_t*>&   phi,
+                             const array_t<real_t*>&   weight,
+                             const array_t<short*>&    tag) {
+    ParticleArrays pa;
+    pa.i1 = i1, pa.i2 = i2, pa.i3 = i3;
+    pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev;
+    pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3;
+    pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev;
+    pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3;
+    pa.phi = phi, pa.weight = weight, pa.tag = tag;
+    return pa;
+  }
+
+  // Builds tile_offsets for a single-particle test. Particle 0 is alive
+  // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead
+  // sentinel and are never referenced by tile_offsets — so the tiled
+  // kernel never iterates over them.
+  array_t<npart_t*> build_tile_offsets_single_particle(ncells_t ntx1,
+                                                       ncells_t ntx2,
+                                                       ncells_t tx1,
+                                                       ncells_t tx2) {
+    const ncells_t total_tiles = ntx1 * ntx2;
+    const ncells_t hot_tile    = tx1 * ntx2 + tx2;
+    array_t<npart_t*> offsets("tile_offsets", total_tiles + 1u);
+    auto h = Kokkos::create_mirror_view(offsets);
+    for (ncells_t t = 0; t <= total_tiles; ++t) {
+      h(t) = (t <= hot_tile) ? static_cast<npart_t>(0)
+                             : static_cast<npart_t>(1);
+    }
+    Kokkos::deep_copy(offsets, h);
+    return offsets;
+  }
+
+  // Buckets ALL `n_alive` particles (slots 0..n_alive-1) into tile 0,
+  // modelling a maximally-stale tile layout: every particle was "sorted"
+  // into tile 0 but now sits anywhere in the domain (as happens when the
+  // SoA drifts / is reordered between sorts). Every particle except the
+  // few that genuinely live near the origin must therefore take the
+  // per-particle escape valve to global J.
+  array_t<npart_t*> build_tile_offsets_all_in_tile0(ncells_t total_tiles,
+                                                    npart_t  n_alive) {
+    array_t<npart_t*> offsets("tile_offsets", total_tiles + 1u);
+    auto              h = Kokkos::create_mirror_view(offsets);
+    h(0)                = static_cast<npart_t>(0);
+    for (ncells_t t = 1; t <= total_tiles; ++t) {
+      h(t) = n_alive;
+    }
+    Kokkos::deep_copy(offsets, h);
+    return offsets;
+  }
+
+  // Cell-by-cell comparison of two J fields; throws on mismatch.
+  void compare_J_fields(const ndfield_t<Dim::_2D, 3>& J_flat,
+                        const ndfield_t<Dim::_2D, 3>& J_tiled,
+                        unsigned short                O,
+                        unsigned short                T_TILE,
+                        const char*                   label) {
+    auto h_flat  = Kokkos::create_mirror_view(J_flat);
+    auto h_tiled = Kokkos::create_mirror_view(J_tiled);
+    Kokkos::deep_copy(h_flat, J_flat);
+    Kokkos::deep_copy(h_tiled, J_tiled);
+
+    const real_t eps        = static_cast<real_t>(1.0e-5);
+    real_t       max_diff   = ZERO;
+    int          fail_count = 0;
+    for (ncells_t i = 0; i < h_flat.extent(0); ++i) {
+      for (ncells_t j = 0; j < h_flat.extent(1); ++j) {
+        for (int c = 0; c < 3; ++c) {
+          const real_t a    = h_flat(i, j, c);
+          const real_t b    = h_tiled(i, j, c);
+          const real_t diff = math::fabs(a - b);
+          const real_t mag  = math::max(math::fabs(a), math::fabs(b));
+          if (diff > max_diff) {
+            max_diff = diff;
+          }
+          if (diff > eps * math::max(mag, static_cast<real_t>(1.0))) {
+            if (fail_count < 5) {
+              std::cerr << "  [" << label << "] J(" << i << "," << j
+                        << ",c=" << c << ") flat=" << a << " tiled=" << b
+                        << " diff=" << diff << '\n';
+            }
+            ++fail_count;
+          }
+        }
+      }
+    }
+    if (fail_count > 0) {
+      std::cerr << "deposit_tiled[" << label << "] FAILED for O=" << O
+                << " T_TILE=" << T_TILE << " : " << fail_count
+                << " mismatches; max_diff=" << max_diff << '\n';
+      throw std::logic_error("DepositCurrentsTiled_kernel mismatch");
+    }
+    std::cerr << "deposit_tiled[" << label << "] OK  O=" << O
+              << " T_TILE=" << T_TILE << "  max_diff=" << max_diff << '\n';
+  }
+
+  // Intrinsic charge-conservation check on a single deposited J field.
+  // Esirkepov/zigzag deposits satisfy the discrete continuity equation, so
+  // the spatial sum of the discrete divergence div.J = dJx/dx + dJy/dy
+  // vanishes whenever the summation region encloses every particle's full
+  // stencil (J == 0 on the region's outer boundary). This is evaluated on
+  // J_tiled ALONE -- it does not compare against the flat reference -- so it
+  // certifies the per-particle escape valve deposits each drifted particle's
+  // stencil as one coherent unit: no cell dropped, duplicated, or split
+  // between SLM scratch and global J. (The run_drift_case order guard keeps
+  // every stencil inside [0, j_ext), so the extreme ghost cells stay zero and
+  // the telescoping boundary flux is genuinely zero rather than clipped.)
+  // Accumulated in double regardless of build precision to keep the
+  // tolerance tight.
+  void check_charge_conservation(const ndfield_t<Dim::_2D, 3>& J,
+                                 unsigned short                O,
+                                 unsigned short                T_TILE,
+                                 const char*                   label) {
+    auto h = Kokkos::create_mirror_view(J);
+    Kokkos::deep_copy(h, J);
+
+    double sum_div = 0.0; // Sum over the field of div.J (jx1 -> dx, jx2 -> dy).
+    double abs_tot = 0.0; // Total |J|, sets the relative tolerance scale.
+    for (ncells_t i = 1; i < h.extent(0); ++i) {
+      for (ncells_t j = 1; j < h.extent(1); ++j) {
+        sum_div += (static_cast<double>(h(i, j, 0)) -
+                    static_cast<double>(h(i - 1, j, 0))) +
+                   (static_cast<double>(h(i, j, 1)) -
+                    static_cast<double>(h(i, j - 1, 1)));
+      }
+    }
+    for (ncells_t i = 0; i < h.extent(0); ++i) {
+      for (ncells_t j = 0; j < h.extent(1); ++j) {
+        abs_tot += std::fabs(static_cast<double>(h(i, j, 0))) +
+                   std::fabs(static_cast<double>(h(i, j, 1)));
+      }
+    }
+    const double tol = 1.0e-5 * (abs_tot > 1.0 ? abs_tot : 1.0);
+    if (std::fabs(sum_div) > tol) {
+      std::cerr << "deposit_tiled[" << label
+                << "] CHARGE NON-CONSERVED for O=" << O << " T_TILE=" << T_TILE
+                << " : sum(div.J)=" << sum_div << " tol=" << tol
+                << " (abs_tot=" << abs_tot << ")\n";
+      throw std::logic_error(
+        "DepositCurrentsTiled_kernel charge non-conservation");
+    }
+    std::cerr << "deposit_tiled[" << label << "] charge-conserved O=" << O
+              << " T_TILE=" << T_TILE << "  sum(div.J)=" << sum_div << '\n';
+  }
+
+  template <unsigned short O, unsigned short T_TILE>
+  void run_one_case() {
+    using metric_t = metric::Minkowski<Dim::_2D>;
+    constexpr unsigned short nx1 = 50u, nx2 = 50u;
+    metric_t metric { { nx1, nx2 },
+                      { { 0.0, 55.0 }, { 0.0, 55.0 } },
+                      {} };
+
+    // Particle setup (mirrors deposit.cpp).
+    const int      i0 = 25, j0 = 21, i0f = 24, j0f = 20;
+    const real_t   uz = 2.5;
+    const prtldx_t dxi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dxf = static_cast<prtldx_t>(0.99);
+    const prtldx_t dyi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dyf = static_cast<prtldx_t>(0.80);
+
+    array_t<int*>      i1 { "i1", 10 };
+    array_t<int*>      i2 { "i2", 10 };
+    array_t<int*>      i3 { "i3", 10 };
+    array_t<int*>      i1_prev { "i1_prev", 10 };
+    array_t<int*>      i2_prev { "i2_prev", 10 };
+    array_t<int*>      i3_prev { "i3_prev", 10 };
+    array_t<prtldx_t*> dx1 { "dx1", 10 };
+    array_t<prtldx_t*> dx2 { "dx2", 10 };
+    array_t<prtldx_t*> dx3 { "dx3", 10 };
+    array_t<prtldx_t*> dx1_prev { "dx1_prev", 10 };
+    array_t<prtldx_t*> dx2_prev { "dx2_prev", 10 };
+    array_t<prtldx_t*> dx3_prev { "dx3_prev", 10 };
+    array_t<real_t*>   ux1 { "ux1", 10 };
+    array_t<real_t*>   ux2 { "ux2", 10 };
+    array_t<real_t*>   ux3 { "ux3", 10 };
+    array_t<real_t*>   phi { "phi", 10 };
+    array_t<real_t*>   weight { "weight", 10 };
+    array_t<short*>    tag { "tag", 10 };
+    const real_t       charge = 1.0, dt = 1.0;
+
+    put_value<int>(i1, i0f, 0);
+    put_value<int>(i2, j0f, 0);
+    put_value<int>(i1_prev, i0, 0);
+    put_value<int>(i2_prev, j0, 0);
+    put_value<prtldx_t>(dx1, dxf, 0);
+    put_value<prtldx_t>(dx2, dyf, 0);
+    put_value<prtldx_t>(dx1_prev, dxi, 0);
+    put_value<prtldx_t>(dx2_prev, dyi, 0);
+    put_value<real_t>(ux1, ZERO, 0);
+    put_value<real_t>(ux2, ZERO, 0);
+    put_value<real_t>(ux3, uz, 0);
+    put_value<real_t>(weight, 1.0, 0);
+    put_value<short>(tag, ParticleTag::alive, 0);
+
+    // Run the flat kernel.
+    ndfield_t<Dim::_2D, 3> J_flat { "J_flat",
+                                    nx1 + 2u * N_GHOSTS,
+                                    nx2 + 2u * N_GHOSTS };
+    {
+      auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat);
+      Kokkos::parallel_for(
+        "FlatDeposit",
+        10,
+        kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
+          J_scat,
+          pack_arrays(i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag),
+          metric, charge, dt));
+      Kokkos::Experimental::contribute(J_flat, J_scat);
+      Kokkos::fence("flat deposit done");
+    }
+
+    // Run the tiled kernel. Build a TileLayout with one alive particle
+    // landing in its expected tile (sort key = min(i, i_prev) / T_TILE).
+    ndfield_t<Dim::_2D, 3> J_tiled { "J_tiled",
+                                     nx1 + 2u * N_GHOSTS,
+                                     nx2 + 2u * N_GHOSTS };
+    {
+      const auto sort_i1 = static_cast<int>(
+        (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp
+      const auto sort_i2 = static_cast<int>((j0 < j0f) ? j0 : j0f);
+      const auto ntx1    = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx1) / static_cast<double>(T_TILE)));
+      const auto ntx2 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx2) / static_cast<double>(T_TILE)));
+      const auto tx1 = static_cast<ncells_t>(sort_i1) / T_TILE;
+      const auto tx2 = static_cast<ncells_t>(sort_i2) / T_TILE;
+
+      TileLayout<Dim::_2D> layout;
+      layout.ntiles_per_axis[0] = ntx1;
+      layout.ntiles_per_axis[1] = ntx2;
+      layout.ntiles_per_axis[2] = 1u;
+      layout.ntiles_total       = ntx1 * ntx2;
+      layout.tile_size          = T_TILE;
+      layout.tile_offsets       = build_tile_offsets_single_particle(ntx1,
+                                                                      ntx2,
+                                                                      tx1,
+                                                                      tx2);
+
+      using kernel_t =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      // npart = full slot count (10): the lone alive particle sits in slot 0
+      // and the per-tile slice clamp keeps the (dead) tail out.
+      kernel_t kern { J_tiled,
+                      pack_arrays(i1, i2, i3,
+                                  i1_prev, i2_prev, i3_prev,
+                                  dx1, dx2, dx3,
+                                  dx1_prev, dx2_prev, dx3_prev,
+                                  ux1, ux2, ux3,
+                                  phi, weight, tag),
+                      metric, charge, dt, layout,
+                      static_cast<npart_t>(10) };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0,
+                              Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("TiledDeposit", policy, kern);
+      Kokkos::fence("tiled deposit done");
+    }
+
+    // Compare J_flat vs J_tiled cell-by-cell.
+    auto h_flat  = Kokkos::create_mirror_view(J_flat);
+    auto h_tiled = Kokkos::create_mirror_view(J_tiled);
+    Kokkos::deep_copy(h_flat, J_flat);
+    Kokkos::deep_copy(h_tiled, J_tiled);
+
+    const real_t eps      = static_cast<real_t>(1.0e-5);
+    real_t       max_diff = ZERO;
+    int          fail_count = 0;
+    for (ncells_t i = 0; i < h_flat.extent(0); ++i) {
+      for (ncells_t j = 0; j < h_flat.extent(1); ++j) {
+        for (int c = 0; c < 3; ++c) {
+          const real_t a    = h_flat(i, j, c);
+          const real_t b    = h_tiled(i, j, c);
+          const real_t diff = math::fabs(a - b);
+          const real_t mag  = math::max(math::fabs(a), math::fabs(b));
+          if (diff > max_diff) {
+            max_diff = diff;
+          }
+          if (diff > eps * math::max(mag, static_cast<real_t>(1.0))) {
+            if (fail_count < 5) {
+              std::cerr << "  J(" << i << "," << j << ",c=" << c
+                        << ") flat=" << a << " tiled=" << b
+                        << " diff=" << diff << '\n';
+            }
+            ++fail_count;
+          }
+        }
+      }
+    }
+    if (fail_count > 0) {
+      std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O
+                << " T_TILE=" << T_TILE
+                << " : " << fail_count << " mismatches; max_diff=" << max_diff
+                << '\n';
+      throw std::logic_error("DepositCurrentsTiled_kernel mismatch");
+    }
+    std::cerr << "X-1 deposit_tiled OK  O=" << O << " T_TILE=" << T_TILE
+              << "  max_diff=" << max_diff << '\n';
+  }
+
+  // Drift / stale-layout regression test for the per-particle escape valve.
+  //
+  // A population of alive particles is spread across the whole domain
+  // (including the near-boundary cells that deposit into the ghost stripe)
+  // but the tile layout buckets them ALL into tile 0 — i.e. the layout is
+  // maximally stale w.r.t. their real positions, exactly the situation that
+  // arises when the SoA is reordered/appended between sorts. The tiled
+  // kernel must route every out-of-tile particle to the global J view and
+  // reproduce the flat deposit cell-for-cell: no charge dropped or
+  // double-counted at any drift distance. This is the property that broke
+  // when `spatial_sorting_interval > 1` left drifted particles depositing
+  // partial stencils, producing a density/E line at the decomposition
+  // boundary.
+  template <unsigned short O, unsigned short T_TILE>
+  void run_drift_case() {
+    // This case deposits boundary-adjacent particles (cells touching the
+    // ghost stripe), so an order-O stencil must fit inside the field's
+    // N_GHOSTS ghost layers. N_GHOSTS is a compile-time constant fixed by
+    // the build's SHAPE_ORDER ((SHAPE_ORDER+1)/2 + 1); a build whose ghost
+    // width is smaller than order O requires would deposit outside the
+    // field -- silent on GPU (no Kokkos View bounds guard; the overshoot
+    // cells carry zero shape-weight so results still match) but heap
+    // corruption on a host/SERIAL build. Skip those orders here; build at
+    // the matching SHAPE_ORDER to drift-test higher orders. The equivalence
+    // ("X-1") cases above stay interior, so they exercise all orders.
+    if constexpr ((O + 1u) / 2u + 1u > N_GHOSTS) {
+      std::cerr << "deposit_tiled[drift] SKIP O=" << O << " T_TILE=" << T_TILE
+                << " (needs N_GHOSTS>=" << ((O + 1u) / 2u + 1u)
+                << ", build has " << N_GHOSTS << ")\n";
+      return;
+    }
+    using metric_t = metric::Minkowski<Dim::_2D>;
+    constexpr unsigned short nx1 = 50u, nx2 = 50u;
+    metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} };
+
+    constexpr int n_slots = 64;
+    constexpr int n_base  = 5;
+    const int     bases[n_base] = { 1, 13, 25, 37, 48 };
+    const int     n_alive       = n_base * n_base; // 25
+
+    array_t<int*>      i1 { "i1", n_slots }, i2 { "i2", n_slots },
+      i3 { "i3", n_slots };
+    array_t<int*>      i1_prev { "i1_prev", n_slots },
+      i2_prev { "i2_prev", n_slots }, i3_prev { "i3_prev", n_slots };
+    array_t<prtldx_t*> dx1 { "dx1", n_slots }, dx2 { "dx2", n_slots },
+      dx3 { "dx3", n_slots };
+    array_t<prtldx_t*> dx1_prev { "dx1_prev", n_slots },
+      dx2_prev { "dx2_prev", n_slots }, dx3_prev { "dx3_prev", n_slots };
+    array_t<real_t*>   ux1 { "ux1", n_slots }, ux2 { "ux2", n_slots },
+      ux3 { "ux3", n_slots };
+    array_t<real_t*>   phi { "phi", n_slots }, weight { "weight", n_slots };
+    array_t<short*>    tag { "tag", n_slots };
+    const real_t       charge = 1.0, dt = 1.0;
+
+    // Fill alive particles on host (slots >= n_alive stay zero == dead).
+    auto h_i1  = Kokkos::create_mirror_view(i1);
+    auto h_i2  = Kokkos::create_mirror_view(i2);
+    auto h_i1p = Kokkos::create_mirror_view(i1_prev);
+    auto h_i2p = Kokkos::create_mirror_view(i2_prev);
+    auto h_dx1  = Kokkos::create_mirror_view(dx1);
+    auto h_dx2  = Kokkos::create_mirror_view(dx2);
+    auto h_dx1p = Kokkos::create_mirror_view(dx1_prev);
+    auto h_dx2p = Kokkos::create_mirror_view(dx2_prev);
+    auto h_ux3  = Kokkos::create_mirror_view(ux3);
+    auto h_w    = Kokkos::create_mirror_view(weight);
+    auto h_tag  = Kokkos::create_mirror_view(tag);
+    int  p = 0;
+    for (int a = 0; a < n_base; ++a) {
+      for (int b = 0; b < n_base; ++b, ++p) {
+        h_i1p(p) = bases[a];
+        h_i1(p)  = bases[a] - 1;
+        h_i2p(p) = bases[b];
+        h_i2(p)  = bases[b] - 1;
+        h_dx1p(p) = static_cast<prtldx_t>(0.65);
+        h_dx1(p)  = static_cast<prtldx_t>(0.99);
+        h_dx2p(p) = static_cast<prtldx_t>(0.65);
+        h_dx2(p)  = static_cast<prtldx_t>(0.80);
+        h_ux3(p)  = static_cast<real_t>(2.5);
+        h_w(p)    = static_cast<real_t>(1.0);
+        h_tag(p)  = ParticleTag::alive;
+      }
+    }
+    Kokkos::deep_copy(i1, h_i1);
+    Kokkos::deep_copy(i2, h_i2);
+    Kokkos::deep_copy(i1_prev, h_i1p);
+    Kokkos::deep_copy(i2_prev, h_i2p);
+    Kokkos::deep_copy(dx1, h_dx1);
+    Kokkos::deep_copy(dx2, h_dx2);
+    Kokkos::deep_copy(dx1_prev, h_dx1p);
+    Kokkos::deep_copy(dx2_prev, h_dx2p);
+    Kokkos::deep_copy(ux3, h_ux3);
+    Kokkos::deep_copy(weight, h_w);
+    Kokkos::deep_copy(tag, h_tag);
+
+    // Flat reference over all slots (dead slots are skipped internally).
+    ndfield_t<Dim::_2D, 3> J_flat { "J_flat",
+                                    nx1 + 2u * N_GHOSTS,
+                                    nx2 + 2u * N_GHOSTS };
+    {
+      auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat);
+      Kokkos::parallel_for(
+        "FlatDepositDrift",
+        n_slots,
+        kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
+          J_scat,
+          pack_arrays(i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag),
+          metric, charge, dt));
+      Kokkos::Experimental::contribute(J_flat, J_scat);
+      Kokkos::fence("flat drift deposit done");
+    }
+
+    // Tiled with a maximally-stale layout: all alive particles in tile 0.
+    ndfield_t<Dim::_2D, 3> J_tiled { "J_tiled",
+                                     nx1 + 2u * N_GHOSTS,
+                                     nx2 + 2u * N_GHOSTS };
+    {
+      const auto ntx1 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx1) / static_cast<double>(T_TILE)));
+      const auto ntx2 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx2) / static_cast<double>(T_TILE)));
+
+      TileLayout<Dim::_2D> layout;
+      layout.ntiles_per_axis[0] = ntx1;
+      layout.ntiles_per_axis[1] = ntx2;
+      layout.ntiles_per_axis[2] = 1u;
+      layout.ntiles_total       = ntx1 * ntx2;
+      layout.tile_size          = T_TILE;
+      layout.tile_offsets       = build_tile_offsets_all_in_tile0(
+        ntx1 * ntx2,
+        static_cast<npart_t>(n_alive));
+
+      using kernel_t =
+        kernel::DepositCurrentsTiled_kernel<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      // npart = n_alive: the stale layout buckets all alive particles into
+      // tile 0, so the team must walk [0, n_alive) and route the drifted
+      // ones to the global-J escape valve.
+      kernel_t kern { J_tiled,
+                      pack_arrays(i1, i2, i3,
+                                  i1_prev, i2_prev, i3_prev,
+                                  dx1, dx2, dx3,
+                                  dx1_prev, dx2_prev, dx3_prev,
+                                  ux1, ux2, ux3,
+                                  phi, weight, tag),
+                      metric, charge, dt, layout,
+                      static_cast<npart_t>(n_alive) };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0,
+                              Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("TiledDepositDrift", policy, kern);
+      Kokkos::fence("tiled drift deposit done");
+    }
+
+    compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift");
+    // Self-contained conservation check on the escape-valve output: the
+    // drifted particles all take the per-particle global-J path, so this
+    // certifies that path is charge-conserving without leaning on J_flat.
+    check_charge_conservation(J_tiled, O, T_TILE, "drift");
+  }
+
+  template <unsigned short T_TILE>
+  void run_all_orders() {
+    run_one_case<0u, T_TILE>();
+    run_one_case<1u, T_TILE>();
+    run_one_case<2u, T_TILE>();
+    run_one_case<3u, T_TILE>();
+    run_one_case<4u, T_TILE>();
+    run_one_case<5u, T_TILE>();
+    run_one_case<6u, T_TILE>();
+    run_one_case<7u, T_TILE>();
+    run_one_case<8u, T_TILE>();
+    run_one_case<9u, T_TILE>();
+    run_one_case<10u, T_TILE>();
+    run_one_case<11u, T_TILE>();
+
+    // Stale-layout / drift regression (per-particle escape valve).
+    run_drift_case<0u, T_TILE>();
+    run_drift_case<1u, T_TILE>();
+    run_drift_case<2u, T_TILE>();
+    run_drift_case<3u, T_TILE>();
+    run_drift_case<4u, T_TILE>();
+    run_drift_case<5u, T_TILE>();
+    run_drift_case<6u, T_TILE>();
+    run_drift_case<7u, T_TILE>();
+    run_drift_case<8u, T_TILE>();
+    run_drift_case<9u, T_TILE>();
+    run_drift_case<10u, T_TILE>();
+    run_drift_case<11u, T_TILE>();
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  Kokkos::initialize(argc, argv);
+  try {
+    // Run with each tile-size choice from the validated CMake list.
+    run_all_orders<4u>();
+    run_all_orders<6u>();
+    run_all_orders<8u>();
+    run_all_orders<10u>();
+    run_all_orders<12u>();
+    run_all_orders<14u>();
+    run_all_orders<16u>();
+  } catch (std::exception& e) {
+    std::cerr << e.what() << '\n';
+    Kokkos::finalize();
+    return 1;
+  }
+  Kokkos::finalize();
+  return 0;
+}