diff --git a/CMakeLists.txt b/CMakeLists.txt index e3c580020..3b82fca9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(PROJECT_NAME entity) project( ${PROJECT_NAME} - VERSION 1.4.2 + VERSION 1.5.0 LANGUAGES CXX C) add_compile_options("-D ENTITY_VERSION=\"${PROJECT_VERSION}\"") set(hash_cmd "git diff --quiet src/ && echo $(git rev-parse HEAD) ") @@ -58,6 +58,24 @@ set(gpu_aware_mpi ${default_gpu_aware_mpi} CACHE BOOL "Enable GPU-aware MPI") +set(team_policy + ${default_team_policy} + CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels") +set(team_policy_tile_size + ${default_team_policy_tile_size} + CACHE STRING "team_policy tile edge length in cells") +set(team_policy_tile_sizes + "4;6;8;10;12;14;16" + CACHE STRING "team_policy tile-size choices") +set(team_policy_drift + ${default_team_policy_drift} + CACHE STRING + "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.") +set(vendor_sort + ${default_vendor_sort} + CACHE BOOL + "Use the vendor sort_by_key (oneDPL/Thrust/rocThrust) for the team_policy spatial sort when available. OFF forces the Kokkos::BinSort fallback, which sorts each SoA member in place (lower peak memory, no maxnpart gather buffer) at the cost of sort speed.") + # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -136,6 +154,87 @@ else() set(DEVICE_ENABLED OFF) endif() +# ------------------------------ team_policy wiring ------------------------ # +if(${team_policy}) + list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx) + if(_tps_idx EQUAL -1) + message(FATAL_ERROR + "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, " + "got '${team_policy_tile_size}'${ColorReset}") + endif() + add_compile_options("-D TEAM_POLICY") + add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}") + + # Compile-time tiled-deposit scratch halo drift. Sizes the halo so a + # particle that drifts up to DRIFT cells between two sorts still deposits + # inside its tile scratch; particles drifting further take the + # per-particle global-J escape valve (correct, only slower). This is + # independent of the sort cadence, which is set at runtime via + # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case). + add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}") + + # Vendor sort: oneDPL on SYCL, Thrust on CUDA, rocThrust/rocprim on HIP. + # When `vendor_sort` is ON (default) the available library is detected + # and used; the spatial sort then builds a single permutation that + # gathers all SoA members. When `vendor_sort` is OFF, or no library is + # found, the code falls back to Kokkos::BinSort, which sorts each member + # in place -- lower peak memory and no maxnpart gather buffer, at the + # cost of sort speed (negligible when sorting is a small fraction of the + # step). The `vendor_sort` knob lets you force the BinSort fallback even + # when a vendor library is present. + if(${vendor_sort}) + if("${Kokkos_DEVICES}" MATCHES "SYCL") + find_package(oneDPL QUIET) + if(oneDPL_FOUND) + message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key") + add_compile_options("-D ONEDPL_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} oneDPL) + else() + message(STATUS "team_policy: oneDPL not found; using BinSort fallback " + "for SYCL sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "CUDA") + find_package(Thrust QUIET) + if(Thrust_FOUND) + message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key") + add_compile_options("-D THRUST_ENABLED") + else() + message(STATUS "team_policy: Thrust not found; using BinSort fallback " + "for CUDA sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "HIP") + # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's + # bounded-bit radix sort directly (rocprim is rocThrust's own + # dependency, so its headers come in transitively; we find it + # explicitly to keep the include path robust). This builds a single + # permutation that gathers all SoA members, instead of the legacy + # per-member Kokkos::BinSort path which allocates a fresh + # `sorted_values` buffer for every member every step (the dominant + # source of allocator churn / fragmentation on ROCm). + find_package(rocthrust QUIET) + if(rocthrust_FOUND) + message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") + add_compile_options("-D ROCTHRUST_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) + find_package(rocprim QUIET) + if(rocprim_FOUND) + set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim) + endif() + else() + message(STATUS "team_policy: rocThrust not found; using BinSort " + "fallback for HIP sort_by_key") + endif() + endif() + else() + message(STATUS "team_policy: vendor_sort=OFF; forcing Kokkos::BinSort " + "fallback for spatial sort_by_key") + endif() +endif() + # MPI if(${mpi}) find_or_fetch_dependency(MPI FALSE REQUIRED) @@ -145,6 +244,62 @@ if(${mpi}) if(${DEVICE_ENABLED}) if(${gpu_aware_mpi}) add_compile_options("-D GPU_AWARE_MPI") + + # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only + # handle device pointers if the GPU Transport Layer (GTL) library + # is linked. The Cray compiler wrappers (cc/CC) inject this + # automatically, but we build with hipcc/nvcc directly, so + # find_package(MPI) only finds base libmpi and the GTL is left + # out -> MPI_Sendrecv on a device pointer fails with + # "OFI ... Bad address". Add it explicitly here. + # + # Cray PE exports PE_MPICH_GTL_DIR_ / PE_MPICH_GTL_LIBS_ + # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is + # not a Cray MPICH build, in which case nothing extra is needed. + if("${Kokkos_DEVICES}" MATCHES "HIP") + set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906) + elseif("${Kokkos_DEVICES}" MATCHES "CUDA") + set(_gtl_accels nvidia90 nvidia80 nvidia70) + elseif("${Kokkos_DEVICES}" MATCHES "SYCL") + set(_gtl_accels ponteVecchio) + else() + set(_gtl_accels "") + endif() + + set(_gtl_dir "") + set(_gtl_libflag "") + foreach(_accel ${_gtl_accels}) + if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}})) + # strip the leading "-L" from the Cray-provided value + string(REGEX REPLACE "^-L" "" + _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}") + string(REGEX REPLACE "^-l" "" + _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}") + endif() + endforeach() + + if(_gtl_dir AND _gtl_libflag) + find_library(MPI_GTL_LIBRARY + NAMES ${_gtl_libflag} + HINTS "${_gtl_dir}" + NO_DEFAULT_PATH) + if(MPI_GTL_LIBRARY) + message(STATUS + "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}") + set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY}) + else() + message(FATAL_ERROR + "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL " + "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. " + "GPU-aware MPI will crash at runtime without it. Make sure the " + "craype-accel module is loaded, or build with gpu_aware_mpi=OFF." + "${ColorReset}") + endif() + else() + message(STATUS + "GPU-aware MPI: no Cray GTL environment found; assuming the MPI " + "implementation is GPU-aware without an extra transport library.") + endif() endif() else() set(gpu_aware_mpi diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index fb8790019..c03ebaf81 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -92,3 +92,36 @@ else() endif() set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY}) + set(default_team_policy + $ENV{Entity_ENABLE_TEAM_POLICY} + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +else() + set(default_team_policy + OFF + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +endif() +set_property(CACHE default_team_policy PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_VENDOR_SORT}) + set(default_vendor_sort + $ENV{Entity_ENABLE_VENDOR_SORT} + CACHE INTERNAL + "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)") +else() + set(default_vendor_sort + ON + CACHE INTERNAL + "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)") +endif() +set_property(CACHE default_vendor_sort PROPERTY TYPE BOOL) + +set(default_team_policy_tile_size + 8 + CACHE INTERNAL "Default tile edge length in cells for team_policy") + +set(default_team_policy_drift + 1 + CACHE INTERNAL + "Default tiled-deposit scratch halo drift for team_policy (cells between sorts)") diff --git a/cmake/report.cmake b/cmake/report.cmake index 7b145bfc7..d972fafc4 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -122,6 +122,44 @@ if(${mpi} AND ${DEVICE_ENABLED}) GPU_AWARE_MPI_REPORT 46) endif() +printchoices( + "Team Policy" + "team_policy" + "${ON_OFF_VALUES}" + ${team_policy} + OFF + "${Green}" + TEAM_POLICY_REPORT + 46) +if(${team_policy}) + printchoices( + "Team Tile Size" + "team_policy_tile_size" + "${team_policy_tile_sizes}" + ${team_policy_tile_size} + ${default_team_policy_tile_size} + "${Blue}" + TEAM_POLICY_TILE_SIZE_REPORT + 46) + printchoices( + "Team Deposit Drift" + "team_policy_drift" + "${team_policy_drift}" + ${team_policy_drift} + 1 + "${Blue}" + TEAM_POLICY_DRIFT_REPORT + 46) + printchoices( + "Vendor sort" + "vendor_sort" + "${ON_OFF_VALUES}" + ${vendor_sort} + ON + "${Green}" + VENDOR_SORT_REPORT + 46) +endif() printchoices( "Debug mode" "DEBUG" @@ -197,6 +235,13 @@ if(${mpi} AND ${DEVICE_ENABLED}) string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") endif() +string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") +if(${team_policy}) + string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n") + string(APPEND REPORT_TEXT " " ${TEAM_POLICY_DRIFT_REPORT} "\n") + string(APPEND REPORT_TEXT " " ${VENDOR_SORT_REPORT} "\n") +endif() + string( APPEND REPORT_TEXT diff --git a/ideal_tile_size.py b/ideal_tile_size.py new file mode 100644 index 000000000..e216f5981 --- /dev/null +++ b/ideal_tile_size.py @@ -0,0 +1,955 @@ +#!/usr/bin/env python3 +"""Recommend the team tile size (T_TILE) for entity's tiled current-deposit kernel. + +Each GPU work-group (team) owns a TE^dim scratch tile in shared memory (SLM on Intel, +LDS on AMD, shared mem on NVIDIA), accumulates its particles' currents into it, then +flushes once to global memory. With + + TE = T_TILE + 2*HALO + HALO = stencil_reach + drift (stencil_reach = shape_order for Esirkepov, 2 for the + O==0 zigzag deposit; drift = the compile-time + `team_policy_drift` CMake knob, NOT the runtime + spatial_sorting_interval -- see currents_deposit.hpp) + +the tile size is squeezed by three competing pressures: + + * Shared-memory capacity (HARD): TE^dim * ncomp * sizeof(real) must fit, ideally with + several work-groups resident per compute unit so latency is hidden. Binds in 3D / + double precision / on AMD's 64 KiB LDS. + * Halo overhead (push LARGER): zero-fill + flush sweep the whole TE^dim tile, so a tiny + tile is almost all halo (1-(T/TE)^dim wasted). Big HALO (infrequent sorts) makes this + worse and forces larger tiles. + * Particles per tile (push SMALLER): ppc*T^dim particles all atomic-add into one fixed + scratch tile -> SLM-atomic contention and load imbalance grow with tile size. + +Recommendation = the largest tile that respects the particle budget and shared-memory +residency; if that tile would be mostly halo, it is grown (toward lower halo) up to the +shared-memory limit. This is a first-order model -- confirm by sweeping the entity knobs + -D team_policy_tile_size= -D team_policy_drift= and re-profiling (see roofline/). +The team (work-group) size defaults to Kokkos::AUTO; override it at runtime with the + [algorithms.deposit] team_policy_team_size = (0 = AUTO) +toml knob -- clamped to the backend maximum at launch (engines/srpic/currents.h). + +Two ways to drive it: + + * Interactive TUI (no arguments): + python ideal_tile_size.py + + * Scriptable CLI (any argument): + python ideal_tile_size.py pvc --dim 2 --ppc 16 + python ideal_tile_size.py amd --dim 3 --ppc 64 + python ideal_tile_size.py all --dim 2 --ppc 16 --drift 4 # infrequent sorting +""" +import argparse +import curses +import os +import sys +from typing import Callable, List, Optional, Tuple + +# Shared-memory budget is the load-bearing number. Per compute unit (Xe-core / SM / CU); +# smem_wg_max is the largest single-work-group allocation. +ARCH = { + "pvc": dict(label="Intel Data Center GPU Max 1550 (PVC, Xe-HPC), per tile", + smem_cu=128 * 1024, smem_wg_max=128 * 1024, + subgroup=32, max_wg=1024, n_cu=64, cu="Xe-core"), + "a100": dict(label="NVIDIA A100 (Ampere)", + smem_cu=164 * 1024, smem_wg_max=163 * 1024, + subgroup=32, max_wg=1024, n_cu=108, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "h100": dict(label="NVIDIA H100 (Hopper)", + smem_cu=228 * 1024, smem_wg_max=227 * 1024, + subgroup=32, max_wg=1024, n_cu=132, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "gh200": dict(label="NVIDIA GH200 Grace Hopper (Hopper H100/H200 GPU)", + smem_cu=228 * 1024, smem_wg_max=227 * 1024, + subgroup=32, max_wg=1024, n_cu=132, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "mi250x": dict(label="AMD Instinct MI250X (CDNA2), per GCD", + smem_cu=64 * 1024, smem_wg_max=64 * 1024, + subgroup=64, max_wg=1024, n_cu=110, cu="CU"), + "mi300x": dict(label="AMD Instinct MI300X (CDNA3)", + smem_cu=64 * 1024, smem_wg_max=64 * 1024, + subgroup=64, max_wg=1024, n_cu=304, cu="CU"), +} +ALIAS = {"nvidia": "h100", "amd": "mi300x", "intel": "pvc", "mi250": "mi250x", "mi300": "mi300x", + "gracehopper": "gh200", "grace-hopper": "gh200", "gh200x": "gh200"} +PRECISION = {"single": 4, "double": 8} + +# arch choices offered in the TUI (canonical keys plus the meta-target "all") +ARCH_CHOICES = list(ARCH.keys()) + ["all"] +# "all" expands to one representative of each vendor (matches the CLI behaviour) +ALL_ARCHS = ["pvc", "nvidia", "amd"] + + +# ============================ +# core model (shared by TUI + CLI) +# ============================ + +class Settings: + """Tunable inputs for the tile-size model. + + Attribute names match the argparse dest names so `recommend`/`report_lines` accept + either a Settings instance (TUI) or an argparse namespace (CLI) interchangeably. + """ + + def __init__(self): + self.arch = "pvc" # an ARCH key or "all" + self.dim = 2 # 1 / 2 / 3 + self.ppc = 16.0 # particles per cell (per species) + self.shape_order = 2 # entity shape_order + self.precision = "single" # single / double + self.components = 3 # current-field components (J has 3) + self.drift = 1 # team_policy_drift: cells of drift the scratch halo absorbs + # (compile-time CMake knob, independent of spatial_sorting_interval) + self.target_resident = 2 # work-groups resident per compute unit + self.npart_cap = 1600.0 # particle-per-tile budget (contention / load-balance proxy) + self.halo_max = 0.70 # halo fraction above which the tile is grown + self.grid = 0 # cells per dim (0 disables the GPU-fill check) + self.balance_factor = 4 # min tiles per compute unit + self.min_tile = 4 # entity's team_policy_tile_sizes list starts at 4 + self.max_tile = 64 + + +def resolve_arch(name): + key = ALIAS.get(name.lower(), name.lower()) + if key not in ARCH: + raise SystemExit("unknown arch '%s'; choose from %s (or aliases %s)" + % (name, ", ".join(ARCH), ", ".join(ALIAS))) + return key, ARCH[key] + + +def recommend(hw, p): + """p: Settings or argparse namespace. Returns dict with rows, chosen row, binding.""" + # Matches DepositCurrentsTiled_kernel: STENCIL_REACH = O for Esirkepov (O>=1), + # 2 for the O==0 zigzag deposit; HALO = STENCIL_REACH + TEAM_POLICY_DRIFT. + stencil_reach = 2 if p.shape_order == 0 else p.shape_order + halo = stencil_reach + p.drift + real = PRECISION[p.precision] + atoms_pp = p.components * (p.shape_order + 1) ** p.dim # ~ useful atomics / particle + + rows = [] + for T in range(p.min_tile, p.max_tile + 1, 2): # entity uses even tile sizes + TE = T + 2 * halo + scratch = TE ** p.dim * p.components * real + npart = p.ppc * T ** p.dim + halo_frac = 1.0 - (T / TE) ** p.dim + ovhd = (2.0 * p.components / p.ppc) * (TE / T) ** p.dim / atoms_pp # zero+flush vs deposit + resident = int(hw["smem_cu"] // scratch) if scratch else 0 + ntiles = (p.grid / T) ** p.dim if p.grid else None + rows.append(dict(T=T, TE=TE, scratch=scratch, npart=npart, halo_frac=halo_frac, + ovhd=ovhd, resident=resident, ntiles=ntiles, + ok_cap=scratch <= hw["smem_wg_max"])) + + capfeas = [r for r in rows if r["ok_cap"]] + if not capfeas: + return dict(halo=halo, rows=rows, chosen=None, binding=None, grown=False) + + def largest(pred, default): + ts = [r["T"] for r in capfeas if pred(r)] + return max(ts) if ts else default + + T_cap = max(r["T"] for r in capfeas) + T_res = largest(lambda r: r["resident"] >= p.target_resident, p.min_tile) + T_np = largest(lambda r: r["npart"] <= p.npart_cap, p.min_tile) + T_bal = largest(lambda r: r["ntiles"] is None or r["ntiles"] >= p.balance_factor * hw["n_cu"], p.min_tile) + bounds = {"shared-memory capacity": T_cap, "shared-memory residency": T_res, + "GPU fill (too few tiles)": T_bal, "particle budget per tile": T_np} + + chosen_T = max(min(bounds.values()), p.min_tile) + binding = min(bounds, key=lambda k: bounds[k]) + + # If the particle-budget pick is mostly halo, grow the tile to cut halo, but never + # past what shared memory / GPU-fill allow (that just trades halo for contention). + grown = False + cur = next(r for r in capfeas if r["T"] == chosen_T) + if cur["halo_frac"] > p.halo_max: + halo_ok = [r["T"] for r in capfeas if r["halo_frac"] <= p.halo_max] + ceil_T = min(T_res, T_bal, T_cap) + if halo_ok: + target = max(min(halo_ok), chosen_T) # smallest tile that clears halo_max + new_T = min(max(target, chosen_T), ceil_T) + if new_T > chosen_T: + chosen_T, grown = new_T, True + binding = ("halo overhead" if min(halo_ok) <= ceil_T + else min({k: v for k, v in bounds.items() + if k != "particle budget per tile"}, + key=lambda k: bounds[k])) + else: + new_T = min(ceil_T, T_cap) # can't clear halo_max at all -> go as big as SLM allows + if new_T > chosen_T: + chosen_T, grown = new_T, True + binding = "shared-memory residency" + + chosen = next(r for r in capfeas if r["T"] == chosen_T) + return dict(halo=halo, rows=rows, chosen=chosen, binding=binding, grown=grown, atoms_pp=atoms_pp) + + +def kib(b): + return "%.1f" % (b / 1024.0) + + +def report_lines(name, key, hw, p, res): + """Build the recommendation report as a list of text lines (no printing).""" + L = [] + L.append("=" * 80) + L.append("%s [preset: %s]" % (hw["label"], key)) + L.append(" dim=%d ppc=%g shape_order=%d precision=%s(%dB) J-components=%d" + % (p.dim, p.ppc, p.shape_order, p.precision, PRECISION[p.precision], p.components)) + reach = 2 if p.shape_order == 0 else p.shape_order + reach_kind = "zigzag" if p.shape_order == 0 else "Esirkepov O" + L.append(" HALO = stencil_reach + drift = %d + %d = %d -> TE = T_TILE + %d" + " (reach %d = %s; drift = team_policy_drift)" + % (reach, p.drift, res["halo"], 2 * res["halo"], reach, reach_kind)) + L.append(" shared mem %s KiB/%s (budget %s KiB for %d resident WGs); subgroup=%d, n_cu=%d" + % (kib(hw["smem_cu"]), hw["cu"], kib(hw["smem_cu"] / p.target_resident), + p.target_resident, hw["subgroup"], hw["n_cu"])) + if hw.get("note"): + L.append(" note: %s" % hw["note"]) + L.append("-" * 80) + L.append(" T_TILE TE scratch resWG part/tile halo% zero+flush%") + for r in res["rows"]: + if not r["ok_cap"]: + continue + mark = " <== recommended" if r is res["chosen"] else "" + L.append(" %3d %4d %6s K %4d %9.0f %4.0f %7.1f%s" + % (r["T"], r["TE"], kib(r["scratch"]), r["resident"], r["npart"], + 100 * r["halo_frac"], 100 * r["ovhd"], mark)) + L.append("-" * 80) + + c = res["chosen"] + if c is None: + L.append(" INFEASIBLE: even T_TILE=%d does not fit %s KiB shared memory." + % (p.min_tile, kib(hw["smem_wg_max"]))) + L.append(" -> sort more often (smaller drift), use precision single, lower shape_order,") + L.append(" or use a non-tiled (global-atomic / ScatterView) deposit on this arch.") + return L + L.append(" RECOMMENDED T_TILE = %d (limited by: %s%s)" + % (c["T"], res["binding"], "; tile grown to reduce halo" if res["grown"] else "")) + L.append(" %.1f KiB scratch/team, %d work-groups resident/%s, %.0f particles/team, %.0f%% halo" + % (c["scratch"] / 1024.0, c["resident"], hw["cu"], c["npart"], 100 * c["halo_frac"])) + team = min(hw["max_wg"], 256 - 256 % hw["subgroup"]) + extra = "" if c["T"] <= 16 else " (entity's team_policy_tile_sizes list stops at 16; extend it)" + L.append(" entity build: -D team_policy=ON -D team_policy_tile_size=%d -D team_policy_drift=%d%s" + % (min(c["T"], 16), p.drift, extra)) + L.append(" team (work-group) size: Kokkos::AUTO by default; to override, set in the toml") + L.append(" [algorithms.deposit] team_policy_team_size = %d (0 = AUTO; keep a multiple of" + % team) + L.append(" subgroup=%d), then sweep around it and re-profile" % hw["subgroup"]) + # contextual guidance + if c["halo_frac"] > p.halo_max: + if p.drift > 1: + L.append(" !! %.0f%% of the tile is halo, inflated by team_policy_drift=%d; lower it " + "(and sort at least that often via spatial_sorting_interval)" + % (100 * c["halo_frac"], p.drift)) + else: + L.append(" !! %.0f%% halo is intrinsic at this size (shared memory caps the tile here)" + % (100 * c["halo_frac"])) + if c["npart"] > p.npart_cap: + L.append(" !! %.0f particles/team exceeds the %.0f budget -> watch SLM-atomic contention" + % (c["npart"], p.npart_cap)) + if c["resident"] < p.target_resident: + L.append(" !! only %d work-group(s) resident/%s -> limited latency hiding" + % (c["resident"], hw["cu"])) + return L + + +def archs_for(name): + """Expand a setting/arg value into the list of arch names to report on.""" + return ALL_ARCHS if name.lower() == "all" else [name] + + +def build_report(p): + """Run the model for the selected arch(es) and return the full report as lines.""" + lines = [] + for a in archs_for(p.arch): + try: + key, hw = resolve_arch(a) + except SystemExit as e: + lines.append(str(e)) + continue + lines.extend(report_lines(a, key, hw, p, recommend(hw, p))) + lines.append("=" * 80) + return lines + + +# ============================ +# colors: edit these +# ============================ + +COLOR_TITLE_FG = curses.COLOR_BLUE +COLOR_TEXT_FG = curses.COLOR_WHITE +COLOR_SELECTED_FG = curses.COLOR_WHITE +COLOR_SELECTED_BG = curses.COLOR_BLACK +COLOR_HINT_FG = curses.COLOR_YELLOW +COLOR_OK_FG = curses.COLOR_GREEN +COLOR_ERR_FG = curses.COLOR_RED +COLOR_KEY_FG = curses.COLOR_MAGENTA +COLOR_DIM_FG = curses.COLOR_CYAN + +PAIR_TITLE = 1 +PAIR_TEXT = 2 +PAIR_SELECTED = 3 +PAIR_HINT = 4 +PAIR_OK = 5 +PAIR_ERR = 6 +PAIR_KEY = 7 +PAIR_DIM = 8 + + +class MenuItem: + def __init__(self, label, hint="", right=None, on_enter=None, + on_space=None, disabled=None): + self.label = label + self.hint = hint + self.right = right + self.on_enter = on_enter + self.on_space = on_space + self.disabled = disabled + + +# ============================ +# TUI +# ============================ + +class App: + def __init__(self, stdscr): + self.stdscr = stdscr + self.s = Settings() + + self.state = "mainmenu" + self.stack: List[Tuple[str, int]] = [] + self.selected = 0 + self.scroll = 0 + self.message = "use arrows or j/k" + + self._init_curses() + + def _init_curses(self) -> None: + curses.curs_set(0) + self.stdscr.keypad(True) + curses.noecho() + curses.cbreak() + + if curses.has_colors(): + curses.start_color() + curses.use_default_colors() + curses.init_pair(PAIR_TITLE, COLOR_TITLE_FG, -1) + curses.init_pair(PAIR_TEXT, COLOR_TEXT_FG, -1) + curses.init_pair(PAIR_SELECTED, COLOR_SELECTED_FG, COLOR_SELECTED_BG) + curses.init_pair(PAIR_HINT, COLOR_HINT_FG, -1) + curses.init_pair(PAIR_OK, COLOR_OK_FG, -1) + curses.init_pair(PAIR_ERR, COLOR_ERR_FG, -1) + curses.init_pair(PAIR_KEY, COLOR_KEY_FG, -1) + curses.init_pair(PAIR_DIM, COLOR_DIM_FG, -1) + + def cp(self, pair_id: int) -> int: + return curses.color_pair(pair_id) if curses.has_colors() else 0 + + # ----- formatting helpers ----- + + def arch_label(self) -> str: + if self.s.arch == "all": + return "all (%s)" % " + ".join(ALL_ARCHS) + try: + return ARCH[resolve_arch(self.s.arch)[0]]["label"] + except SystemExit: + return self.s.arch + + # ----- nav stack ----- + + def push(self, st: str) -> None: + self.stack.append((self.state, self.selected)) + self.state = st + self.selected = 0 + self.scroll = 0 + self.message = "" + + def pop(self) -> None: + if self.stack: + self.state, self.selected = self.stack.pop() + else: + self.state, self.selected = "mainmenu", 0 + self.scroll = 0 + self.message = "" + + # ----- drawing ----- + + def add(self, y: int, x: int, s: str, attr: int = 0) -> None: + try: + self.stdscr.addstr(y, x, s, attr) + except curses.error: + pass + + def hline(self, y: int) -> None: + _, w = self.stdscr.getmaxyx() + try: + self.stdscr.hline(y, 0, curses.ACS_HLINE, max(0, w - 1)) + except curses.error: + pass + + def draw_keybar(self, y: int, x: int, pairs: List[Tuple[str, str]]) -> None: + cur_x = x + for key, action in pairs: + self.add(y, cur_x, key, self.cp(PAIR_KEY) | curses.A_BOLD) + cur_x += len(key) + self.add(y, cur_x, " ", self.cp(PAIR_DIM)) + cur_x += 1 + self.add(y, cur_x, action, self.cp(PAIR_HINT)) + cur_x += len(action) + self.add(y, cur_x, " ", self.cp(PAIR_DIM)) + cur_x += 3 + + def breadcrumb(self) -> str: + return { + "mainmenu": "mainmenu", + "arch": "mainmenu > architecture", + "physics": "mainmenu > physics & particles", + "tuning": "mainmenu > tuning knobs", + }.get(self.state, "mainmenu") + + def draw_menu(self, title: str, prompt: str, items: List[MenuItem]) -> None: + self.stdscr.erase() + h, w = self.stdscr.getmaxyx() + + self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD) + bc = self.breadcrumb() + self.add(0, max(2, w - 2 - len(bc)), bc, self.cp(PAIR_DIM)) + + self.draw_keybar( + 1, + 2, + [ + ("up/dn/j/k", "move"), + ("enter", "select"), + ("space", "toggle/cycle"), + ("b", "back"), + ("q", "quit"), + ], + ) + self.hline(2) + + status1 = ("arch: %s dim: %d precision: %s ppc: %g" + % (self.s.arch, self.s.dim, self.s.precision, self.s.ppc)) + status2 = ("shape_order: %d drift: %d components: %d tile range: %d-%d" + % (self.s.shape_order, self.s.drift, self.s.components, + self.s.min_tile, self.s.max_tile)) + self.add(3, 2, status1[: w - 4], self.cp(PAIR_TEXT)) + self.add(4, 2, status2[: w - 4], self.cp(PAIR_TEXT)) + self.hline(5) + + self.add(6, 2, prompt[: w - 4], self.cp(PAIR_TEXT) | curses.A_BOLD) + + list_y = 8 + footer_h = 3 + view_h = max(1, h - list_y - footer_h) + n = len(items) + + if n == 0: + self.add(list_y, 2, "(empty)", self.cp(PAIR_HINT)) + else: + self.selected = max(0, min(self.selected, n - 1)) + + if self.selected < self.scroll: + self.scroll = self.selected + if self.selected >= self.scroll + view_h: + self.scroll = self.selected - view_h + 1 + self.scroll = max(0, min(self.scroll, max(0, n - view_h))) + + shown = items[self.scroll : self.scroll + view_h] + + for i, it in enumerate(shown): + idx = self.scroll + i + sel = idx == self.selected + dis = bool(it.disabled and it.disabled()) + + row_attr = ( + self.cp(PAIR_SELECTED) | curses.A_BOLD + if sel + else (self.cp(PAIR_DIM) if dis else self.cp(PAIR_TEXT)) + ) + self.add(list_y + i, 2, (" %s" % it.label)[: w - 4], row_attr) + + if it.right: + rt = (it.right() or "").strip() + if rt: + rt = rt[: max(0, w - 6)] + x = max(2, w - 2 - len(rt)) + rt_attr = ( + row_attr + if sel + else (self.cp(PAIR_HINT) if not dis else self.cp(PAIR_DIM)) + ) + self.add(list_y + i, x, rt, rt_attr) + + if sel and it.hint: + self.add( + list_y + i, + min(w - 4, 30), + (" %s" % it.hint)[: w - 4], + self.cp(PAIR_HINT), + ) + + self.hline(h - 3) + msg = self.message or "" + if msg: + is_err = msg.startswith("error") + attr = (self.cp(PAIR_ERR) if is_err else self.cp(PAIR_OK)) | curses.A_BOLD + self.add(h - 2, 2, msg[: w - 4], attr) + self.stdscr.refresh() + + # ----- modals ----- + + def input_box(self, title: str, prompt: str, initial: str) -> Optional[str]: + h, w = self.stdscr.getmaxyx() + win_h, win_w = 9, min(86, max(46, w - 6)) + top, left = max(0, (h - win_h) // 2), max(0, (w - win_w) // 2) + + win = curses.newwin(win_h, win_w, top, left) + win.keypad(True) + win.border() + + win.addstr(1, 2, title[: win_w - 4], self.cp(PAIR_TITLE) | curses.A_BOLD) + win.addstr(2, 2, prompt[: win_w - 4], self.cp(PAIR_TEXT)) + + buf = list(initial) + curses.curs_set(1) + + while True: + win.addstr(4, 2, " " * (win_w - 4), self.cp(PAIR_TEXT)) + text = "".join(buf) + if len(text) > win_w - 4: + text = text[-(win_w - 4) :] + win.addstr(4, 2, text, self.cp(PAIR_TEXT) | curses.A_BOLD) + win.addstr(6, 2, "enter=ok esc=cancel", self.cp(PAIR_DIM)) + win.refresh() + + ch = win.getch() + if ch == 27: + curses.curs_set(0) + return None + if ch in (curses.KEY_ENTER, 10, 13): + curses.curs_set(0) + return "".join(buf).strip() + if ch in (curses.KEY_BACKSPACE, 127, 8): + if buf: + buf.pop() + elif 32 <= ch <= 126: + buf.append(chr(ch)) + + # ----- value editors ----- + + def cycle_attr(self, attr: str, options: list) -> None: + cur = getattr(self.s, attr) + if cur not in options: + setattr(self.s, attr, options[0]) + else: + setattr(self.s, attr, options[(options.index(cur) + 1) % len(options)]) + + def edit_int(self, label: str, attr: str, minv: Optional[int] = None) -> None: + val = self.input_box(label, "enter an integer:", str(getattr(self.s, attr))) + if val is None or val == "": + return + try: + n = int(val) + except ValueError: + self.message = "error: '%s' is not an integer" % val + return + if minv is not None and n < minv: + self.message = "error: %s must be >= %d" % (label, minv) + return + setattr(self.s, attr, n) + self.message = "%s = %d" % (label, n) + + def edit_float(self, label: str, attr: str, + minv: Optional[float] = None, maxv: Optional[float] = None) -> None: + val = self.input_box(label, "enter a number:", str(getattr(self.s, attr))) + if val is None or val == "": + return + try: + x = float(val) + except ValueError: + self.message = "error: '%s' is not a number" % val + return + if minv is not None and x < minv: + self.message = "error: %s must be >= %g" % (label, minv) + return + if maxv is not None and x > maxv: + self.message = "error: %s must be <= %g" % (label, maxv) + return + setattr(self.s, attr, x) + self.message = "%s = %g" % (label, x) + + # ----- report pager ----- + + def _pager_attr(self, ln: str) -> int: + s = ln.strip() + if "RECOMMENDED" in ln: + return self.cp(PAIR_OK) | curses.A_BOLD + if "<== recommended" in ln: + return self.cp(PAIR_OK) + if "INFEASIBLE" in ln or "unknown arch" in ln: + return self.cp(PAIR_ERR) | curses.A_BOLD + if s.startswith("!!"): + return self.cp(PAIR_HINT) + if s.startswith("T_TILE"): + return self.cp(PAIR_TITLE) | curses.A_BOLD + if set(s) <= {"=", "-"} and s: + return self.cp(PAIR_DIM) + return self.cp(PAIR_TEXT) + + def pager(self, title: str, lines: List[str]) -> None: + top = 0 + note = "" + while True: + self.stdscr.erase() + h, w = self.stdscr.getmaxyx() + + self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD) + self.draw_keybar( + 1, + 2, + [ + ("up/dn/j/k", "scroll"), + ("PgUp/PgDn", "page"), + ("g/G", "top/end"), + ("w", "save"), + ("b/q", "back"), + ], + ) + self.hline(2) + + list_y = 4 + view_h = max(1, h - list_y - 2) + n = len(lines) + top = max(0, min(top, max(0, n - view_h))) + + for i, ln in enumerate(lines[top : top + view_h]): + self.add(list_y + i, 2, ln[: w - 3], self._pager_attr(ln)) + + self.hline(h - 2) + footer = "line %d-%d / %d" % (top + 1, min(top + view_h, n), n) + if note: + footer += " " + note + self.add(h - 1, 2, footer[: w - 4], self.cp(PAIR_DIM)) + self.stdscr.refresh() + + ch = self.stdscr.getch() + if ch in (ord("q"), ord("Q"), ord("b"), 8, 127): + return + if ch in (curses.KEY_UP, ord("k"), ord("K")): + top -= 1 + elif ch in (curses.KEY_DOWN, ord("j"), ord("J")): + top += 1 + elif ch in (curses.KEY_PPAGE,): + top -= view_h + elif ch in (curses.KEY_NPAGE, ord(" ")): + top += view_h + elif ch in (ord("g"),): + top = 0 + elif ch in (ord("G"),): + top = n + elif ch in (ord("w"), ord("W")): + note = self._save_report(lines) + + def _save_report(self, lines: List[str]) -> str: + path = os.path.join(os.getcwd(), "ideal_tile_size_report.txt") + try: + with open(path, "w") as f: + f.write("\n".join(lines) + "\n") + return "saved to %s" % path + except OSError as e: + return "save failed: %s" % e + + def do_compute(self) -> None: + self.pager("recommendation [%s]" % self.s.arch, build_report(self.s)) + + def reset(self) -> None: + self.s = Settings() + self.message = "reset to defaults" + + # ----- menus ----- + + def menu_main(self) -> Tuple[str, str, List[MenuItem]]: + return ( + "entity tile-size advisor", + "main menu:", + [ + MenuItem( + "architecture", + "choose the target GPU (or 'all')", + right=self.arch_label, + on_enter=lambda: self.push("arch"), + ), + MenuItem( + "physics & particles", + "dim, ppc, shape order, precision, ...", + right=lambda: "dim %d / ppc %g / so %d / %s" + % (self.s.dim, self.s.ppc, self.s.shape_order, self.s.precision), + on_enter=lambda: self.push("physics"), + ), + MenuItem( + "tuning knobs", + "residency, budgets, tile range, ...", + right=lambda: "res %d / cap %g / halo %.2f" + % (self.s.target_resident, self.s.npart_cap, self.s.halo_max), + on_enter=lambda: self.push("tuning"), + ), + MenuItem( + "compute recommendation", + "run the model and show the report", + on_enter=self.do_compute, + ), + MenuItem("reset to defaults", "", on_enter=self.reset), + MenuItem("exit", "", on_enter=lambda: setattr(self, "state", "exit")), + ], + ) + + def menu_arch(self) -> Tuple[str, str, List[MenuItem]]: + def choose(name: str): + self.s.arch = name + self.pop() + + def label(name: str) -> str: + mark = "(*)" if name == self.s.arch else "( )" + return "%s %s" % (mark, name) + + def right(name: str) -> str: + if name == "all": + return " + ".join(ALL_ARCHS) + return ARCH[name]["label"] + + items = [ + MenuItem( + label(a), + "select target", + right=(lambda a=a: right(a)), + on_enter=(lambda a=a: choose(a)), + ) + for a in ARCH_CHOICES + ] + items.append(MenuItem("back", "return", on_enter=self.pop)) + return ("architecture", "pick the target GPU:", items) + + def menu_physics(self) -> Tuple[str, str, List[MenuItem]]: + def cyc_dim(): + self.cycle_attr("dim", [1, 2, 3]) + + def cyc_prec(): + self.cycle_attr("precision", ["single", "double"]) + + return ( + "physics & particles", + "set physical inputs:", + [ + MenuItem( + "dim", + "space cycles: 1 / 2 / 3", + right=lambda: str(self.s.dim), + on_enter=cyc_dim, + on_space=cyc_dim, + ), + MenuItem( + "ppc", + "particles per cell (per species)", + right=lambda: "%g" % self.s.ppc, + on_enter=lambda: self.edit_float("ppc", "ppc", minv=0.0), + ), + MenuItem( + "shape_order", + "entity particle shape order", + right=lambda: str(self.s.shape_order), + on_enter=lambda: self.edit_int("shape_order", "shape_order", minv=0), + ), + MenuItem( + "precision", + "space cycles: single / double", + right=lambda: self.s.precision, + on_enter=cyc_prec, + on_space=cyc_prec, + ), + MenuItem( + "components", + "current-field components (J has 3)", + right=lambda: str(self.s.components), + on_enter=lambda: self.edit_int("components", "components", minv=1), + ), + MenuItem( + "drift", + "team_policy_drift CMake knob: cells the scratch halo absorbs (>= spatial_sorting_interval)", + right=lambda: str(self.s.drift), + on_enter=lambda: self.edit_int("drift", "drift", minv=0), + ), + MenuItem("back", "return", on_enter=self.pop), + ], + ) + + def menu_tuning(self) -> Tuple[str, str, List[MenuItem]]: + return ( + "tuning knobs", + "set the model's budgets and ranges:", + [ + MenuItem( + "target_resident", + "work-groups resident per compute unit", + right=lambda: str(self.s.target_resident), + on_enter=lambda: self.edit_int("target_resident", "target_resident", minv=1), + ), + MenuItem( + "npart_cap", + "particle-per-tile budget (contention proxy)", + right=lambda: "%g" % self.s.npart_cap, + on_enter=lambda: self.edit_float("npart_cap", "npart_cap", minv=1.0), + ), + MenuItem( + "halo_max", + "halo fraction above which the tile is grown (0..1)", + right=lambda: "%.2f" % self.s.halo_max, + on_enter=lambda: self.edit_float("halo_max", "halo_max", minv=0.0, maxv=1.0), + ), + MenuItem( + "grid", + "cells per dim (0 disables GPU-fill check)", + right=lambda: str(self.s.grid), + on_enter=lambda: self.edit_int("grid", "grid", minv=0), + ), + MenuItem( + "balance_factor", + "min tiles per compute unit", + right=lambda: str(self.s.balance_factor), + on_enter=lambda: self.edit_int("balance_factor", "balance_factor", minv=1), + ), + MenuItem( + "min_tile", + "smallest T_TILE to consider", + right=lambda: str(self.s.min_tile), + on_enter=lambda: self.edit_int("min_tile", "min_tile", minv=1), + ), + MenuItem( + "max_tile", + "largest T_TILE to consider", + right=lambda: str(self.s.max_tile), + on_enter=lambda: self.edit_int("max_tile", "max_tile", minv=1), + ), + MenuItem("back", "return", on_enter=self.pop), + ], + ) + + def get_menu(self) -> Tuple[str, str, List[MenuItem]]: + if self.state == "mainmenu": + return self.menu_main() + if self.state == "arch": + return self.menu_arch() + if self.state == "physics": + return self.menu_physics() + if self.state == "tuning": + return self.menu_tuning() + self.state = "mainmenu" + return self.menu_main() + + # ----- navigation ----- + + def is_disabled(self, it: MenuItem) -> bool: + return bool(it.disabled and it.disabled()) + + def move_sel(self, items: List[MenuItem], delta: int) -> None: + if not items: + return + n = len(items) + start = self.selected + for _ in range(n): + self.selected = (self.selected + delta) % n + if not self.is_disabled(items[self.selected]): + return + self.selected = start + + def activate(self, items: List[MenuItem], enter: bool) -> None: + if not items: + return + it = items[self.selected] + if self.is_disabled(it): + self.message = "error: option disabled." + return + fn = it.on_enter if enter else it.on_space + if fn: + fn() + + # ----- loop ----- + + def run(self) -> None: + while True: + if self.state == "exit": + return + + title, prompt, items = self.get_menu() + self.draw_menu(title, prompt, items) + + ch = self.stdscr.getch() + + if ch in (ord("q"), ord("Q")): + self.state = "exit" + continue + if ch in (ord("b"), 8, 127): + self.pop() + continue + if ch in (curses.KEY_UP, ord("k"), ord("K")): + self.move_sel(items, -1) + continue + if ch in (curses.KEY_DOWN, ord("j"), ord("J")): + self.move_sel(items, +1) + continue + if ch in (curses.KEY_ENTER, 10, 13): + self.activate(items, enter=True) + continue + if ch == ord(" "): + self.activate(items, enter=False) + continue + + +def run_tui() -> int: + try: + curses.wrapper(lambda stdscr: App(stdscr).run()) + except KeyboardInterrupt: + return 130 + return 0 + + +# ============================ +# CLI (preserved for scripting / sweeps) +# ============================ + +def run_cli(argv) -> int: + ap = argparse.ArgumentParser(description="Recommend entity team tile size (T_TILE).") + ap.add_argument("arch", help="pvc | nvidia | amd (or a100/h100/gh200/mi250x/mi300x, or 'all')") + ap.add_argument("--dim", type=int, default=2, choices=(1, 2, 3)) + ap.add_argument("--ppc", type=float, default=16.0, help="particles per cell (per species)") + ap.add_argument("--shape-order", type=int, default=2, help="particle shape order (entity shape_order)") + ap.add_argument("--precision", choices=("single", "double"), default="single") + ap.add_argument("--components", type=int, default=3, help="current-field components (J has 3)") + ap.add_argument("--drift", type=int, default=1, + help="team_policy_drift CMake knob (compile-time): cells of drift the scratch " + "halo absorbs; size it >= spatial_sorting_interval") + ap.add_argument("--target-resident", type=int, default=2, help="work-groups resident per compute unit") + ap.add_argument("--npart-cap", type=float, default=1600, + help="particle-per-tile budget (SLM-atomic-contention / load-balance proxy)") + ap.add_argument("--halo-max", type=float, default=0.70, help="halo fraction above which the tile is grown") + ap.add_argument("--grid", type=int, default=0, help="cells per dim (optional; enables a GPU-fill check)") + ap.add_argument("--balance-factor", type=int, default=4, help="min tiles per compute unit") + ap.add_argument("--min-tile", type=int, default=4, + help="smallest T_TILE to consider (entity's team_policy_tile_sizes starts at 4)") + ap.add_argument("--max-tile", type=int, default=64) + p = ap.parse_args(argv) + + for ln in build_report(p): + print(ln) + return 0 + + +def main() -> int: + # no arguments -> interactive TUI; any argument -> scriptable CLI + if len(sys.argv) > 1: + return run_cli(sys.argv[1:]) + return run_tui() + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/input.example.toml b/input.example.toml index 741d5c2d2..87ca48f3c 100644 --- a/input.example.toml +++ b/input.example.toml @@ -301,6 +301,14 @@ # @type: bool # @default: true enable = "" + # team_policy tiled-deposit work-group (team) size + # @type: uint [>= 0] + # @default: 0 + # @note: 0 keeps Kokkos::AUTO (backend occupancy heuristic); a positive + # value overrides it, clamped to the backend/scratch maximum at + # launch. Only used in `team_policy=ON` builds. Pick a multiple of + # the device subgroup width for best occupancy (see ideal_tile_size.py) + team_policy_team_size = "" # @inferred: # - order diff --git a/src/engines/engine.hpp b/src/engines/engine.hpp index b20e163ca..057bfcb5a 100644 --- a/src/engines/engine.hpp +++ b/src/engines/engine.hpp @@ -78,10 +78,11 @@ namespace ntt { Metadomain m_metadomain; user::PGen m_pgen; - const bool is_resuming; - const simtime_t runtime; - const real_t dt; - const timestep_t max_steps; + const bool is_resuming; + const simtime_t runtime; + const real_t dt; + const std::size_t team_policy_team_size; + const timestep_t max_steps; const timestep_t start_step; const simtime_t start_time; simtime_t time; @@ -109,6 +110,8 @@ namespace ntt { , is_resuming { m_params.get("checkpoint.is_resuming") } , runtime { m_params.get("simulation.runtime") } , dt { m_params.get("algorithms.timestep.dt") } + , team_policy_team_size { m_params.get( + "algorithms.deposit.team_policy_team_size") } , max_steps { static_cast(runtime / dt) } , start_step { m_params.get("checkpoint.start_step") } , start_time { m_params.get("checkpoint.start_time") } @@ -127,6 +130,8 @@ namespace ntt { auto parameters = prm::Parameters {}; parameters.set("dt", static_cast(dt)); parameters.set("time", static_cast(time)); + parameters.set("team_policy_team_size", + static_cast(team_policy_team_size)); return parameters; } }; diff --git a/src/engines/grpic/currents.h b/src/engines/grpic/currents.h index 11c847533..cb4032f54 100644 --- a/src/engines/grpic/currents.h +++ b/src/engines/grpic/currents.h @@ -2,6 +2,8 @@ * @file engines/grpic/currents.h * @brief Current deposition and filtering routines for the GRPIC engine * @implements + * - ntt::grpic::CallDepositKernel<> -> void (flat path) + * - ntt::grpic::CallDepositKernelTiled<> -> void (TEAM_POLICY) * - ntt::grpic::CurrentsDeposit<> -> void * - ntt::grpic::CurrentsFilter<> -> void * @namespaces: @@ -12,8 +14,11 @@ #define ENGINES_GRPIC_CURRENTS_H #include "enums.h" +#include "global.h" +#include "arch/kokkos_aliases.h" #include "traits/metric.h" +#include "utils/error.h" #include "utils/log.h" #include "utils/param_container.h" @@ -26,13 +31,198 @@ namespace ntt { namespace grpic { + template + void CallDepositKernel(const Particles& species, + const M& local_metric, + const scatter_ndfield_t& scatter_cur, + real_t dt) { + Kokkos::parallel_for("CurrentsDeposit", + species.rangeActiveParticles(), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + } + +#if defined(TEAM_POLICY) + /** + * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). + * + * Identical in structure to the SRPIC launcher (`engines/srpic/currents.h`): + * iterates over `tile_layout.ntiles_total` teams; each team accumulates its + * tile's particle contributions in SLM scratch and atomically flushes to the + * global J (here `cur0`, the GRPIC half-step current). Requires the species + * to have been sorted with `team_policy` enabled (`tile_layout` populated by + * `SortSpatially`). + * + * The deposit body (`kernel::DepositOneParticle`) is + * the same shared math used by the flat path — it already carries the GR + * velocity-recovery branch — so the only engine-specific differences from + * SRPIC are the `SimEngine::GRPIC` tag and the `cur0` target. + * + * Falls back to the flat kernel for the tail `[npart_partitioned, npart)` + * exactly as SRPIC does; see the per-step coverage note in + * `kernels/currents_deposit.hpp`. + */ + template + void CallDepositKernelTiled(const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt, + int team_size_req) { + static_assert(O <= 11u, "Shape order must be <= 11"); + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + const auto& layout = species.tile_layout(); + raise::ErrorIf(layout.ntiles_total == 0u, + "CallDepositKernelTiled: tile_layout has 0 tiles — call " + "SortSpatially before CurrentsDeposit", + HERE); + raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u, + "CallDepositKernelTiled: tile_offsets size inconsistent " + "with ntiles_total", + HERE); + + auto deposit_kernel = + kernel::DepositCurrentsTiled_kernel { + cur, species, local_metric, (real_t)(species.charge()), + dt, layout, species.npart() + }; + + const auto scratch = Kokkos::PerTeam( + decltype(deposit_kernel)::scratch_bytes()); + + // Team (work-group) size. The default (team_size_req == 0) leaves + // Kokkos::AUTO, which sizes the team from the backend occupancy + // heuristic. A positive `algorithms.deposit.team_policy_team_size` + // overrides it, clamped to the scratch/backend-feasible maximum so an + // over-large request cannot abort the launch (Kokkos errors when + // team_size > team_size_max). No portable subgroup rounding is applied; + // pick a multiple of the device subgroup width (printed per arch by + // ideal_tile_size.py) for the best occupancy. + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, scratch); + if (team_size_req > 0) { + const int ts_max = policy.team_size_max(deposit_kernel, + Kokkos::ParallelForTag {}); + int ts = team_size_req; + if (ts > ts_max) { + raise::Warning( + fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds " + "the tiled-deposit maximum %d on this backend; clamping " + "to %d", + team_size_req, + ts_max, + ts_max), + HERE); + ts = ts_max; + } + policy = Kokkos::TeamPolicy<>(static_cast(layout.ntiles_total), ts); + policy.set_scratch_size(0, scratch); + logger::Checkpoint( + fmt::format("Tiled deposit: explicit team size %d", ts), + HERE); + } + Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); + + // Particles appended since the last sort (injection / MPI receive on a + // no-sort step) live past the partition and are not visited by any team + // above. Deposit that tail [npart_partitioned, npart) with the flat + // scatter-view kernel so every active particle is deposited exactly + // once. The range is empty when the species was just sorted (the + // every-step-sorted common case), so this is a no-op there. + if (species.npart() > layout.npart_partitioned) { + // `cur` is a const ref; take a non-const View handle (shallow copy, + // shares storage) so the scatter view can contribute back into it. + auto cur_nc = cur; + auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc); + Kokkos::parallel_for( + "CurrentsDepositTiledTail", + CreateParticleRangePolicy({ layout.npart_partitioned }, + { species.npart() }), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + Kokkos::Experimental::contribute(cur_nc, scatter_cur); + } + } +#endif // TEAM_POLICY + template void CurrentsDeposit(Domain& domain, const prm::Parameters& engine_params) { + const auto dt = engine_params.get("dt"); + // GRPIC deposits the half-step current into `cur0` (the engine no longer + // pre-zeros it — this is the single source of truth, matching SRPIC). + Kokkos::deep_copy(domain.fields.cur0, ZERO); + +#if defined(TEAM_POLICY) + // Optional runtime override for the tiled-deposit team (work-group) size; + // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the + // launcher (see CallDepositKernelTiled). + const auto team_size_req = static_cast( + engine_params.get("team_policy_team_size", + std::optional { 0u })); + + // Tiled deposit. Correctness no longer depends on the SoA being in a + // "sorted" state at deposit time — the tiled kernel handles a stale + // partition per-particle (escape valve for drifted particles, dead-tag + // clamp, and the launcher's flat tail pass for appended particles). The + // only case the tiled kernel cannot serve is the very first step, before + // any SortSpatially has populated a layout; that species takes the flat + // scatter-view path for that step alone. See engines/srpic/currents.h and + // kernels/currents_deposit.hpp for the full coverage argument. + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + const auto& layout = species.tile_layout(); + if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) { + logger::Checkpoint( + fmt::format("Launching currents deposit (flat, no sort yet) for " + "%d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + auto scatter_cur0 = Kokkos::Experimental::create_scatter_view( + domain.fields.cur0); + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur0, + dt); + Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0); + } else { + logger::Checkpoint( + fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + CallDepositKernelTiled(species, + domain.mesh.metric, + domain.fields.cur0, + dt, + team_size_req); + } + } +#else auto scatter_cur0 = Kokkos::Experimental::create_scatter_view( domain.fields.cur0); - const auto dt = engine_params.get("dt"); for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } logger::Checkpoint( fmt::format("Launching currents deposit kernel for %d [%s] : %lu %f", species.index(), @@ -40,36 +230,14 @@ namespace ntt { species.npart(), (double)species.charge()), HERE); - if (species.npart() == 0 || cmp::AlmostZero(species.charge())) { - continue; - } - Kokkos::parallel_for("CurrentsDeposit", - species.rangeActiveParticles(), - kernel::DepositCurrents_kernel( - scatter_cur0, - species.i1, - species.i2, - species.i3, - species.i1_prev, - species.i2_prev, - species.i3_prev, - species.dx1, - species.dx2, - species.dx3, - species.dx1_prev, - species.dx2_prev, - species.dx3_prev, - species.ux1, - species.ux2, - species.ux3, - species.phi, - species.weight, - species.tag, - domain.mesh.metric, - (real_t)(species.charge()), - dt)); + + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur0, + dt); } Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0); +#endif } template @@ -103,4 +271,4 @@ namespace ntt { } // namespace grpic } // namespace ntt -#endif // ENGINES_GRPIC_CURRENTS_H \ No newline at end of file +#endif // ENGINES_GRPIC_CURRENTS_H diff --git a/src/engines/grpic/grpic.hpp b/src/engines/grpic/grpic.hpp index fc2daa4de..621592955 100644 --- a/src/engines/grpic/grpic.hpp +++ b/src/engines/grpic/grpic.hpp @@ -416,7 +416,8 @@ namespace ntt { */ if (deposit_enabled) { timers.start("CurrentDeposit"); - Kokkos::deep_copy(dom.fields.cur0, ZERO); + // `cur0` is zeroed inside grpic::CurrentsDeposit (matching SRPIC), + // so no pre-zero is needed here. grpic::CurrentsDeposit(dom, this->engineParams()); timers.stop("CurrentDeposit"); diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp index f56874d23..fcd46fb81 100644 --- a/src/engines/reporter.cpp +++ b/src/engines/reporter.cpp @@ -32,6 +32,24 @@ namespace ntt { "%s", params.template get("simulation.name").c_str()); reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string()); +#if defined(TEAM_POLICY) + reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE); + #if defined(TEAM_POLICY_DRIFT) + reporter::AddParam(report, 4, "Halo drift", "%d", TEAM_POLICY_DRIFT); + #endif + if (params.template get( + "algorithms.deposit.team_policy_team_size") == 0u) { + reporter::AddParam(report, 4, "Team size", "%s", "AUTO (Kokkos)"); + } else { + reporter::AddParam( + report, + 4, + "Team size", + "%d (requested; clamped to backend max at launch)", + static_cast(params.template get( + "algorithms.deposit.team_policy_team_size"))); + } +#endif reporter::AddParam(report, 4, "Metric", "%s", M.to_string()); #if SHAPE_ORDER == 0 reporter::AddParam(report, 4, "Deposit", "%s", "zigzag"); diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 3afabea8a..74946ed68 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -2,7 +2,8 @@ * @file engines/srpic/currents.h * @brief Current deposition and filtering routines for the SRPIC engine * @implements - * - ntt::srpic::CallDepositKernel<> -> void + * - ntt::srpic::CallDepositKernel<> -> void (flat path) + * - ntt::srpic::CallDepositKernelTiled<> -> void (TEAM_POLICY) * - ntt::srpic::CurrentsDeposit<> -> void * - ntt::srpic::CurrentsFilter<> -> void * @namespaces: @@ -26,6 +27,8 @@ #include "kernels/currents_deposit.hpp" #include "kernels/digital_filter.hpp" +#include + namespace ntt { namespace srpic { @@ -38,34 +41,183 @@ namespace ntt { species.rangeActiveParticles(), kernel::DepositCurrents_kernel( scatter_cur, - species.i1, - species.i2, - species.i3, - species.i1_prev, - species.i2_prev, - species.i3_prev, - species.dx1, - species.dx2, - species.dx3, - species.dx1_prev, - species.dx2_prev, - species.dx3_prev, - species.ux1, - species.ux2, - species.ux3, - species.phi, - species.weight, - species.tag, + species, local_metric, (real_t)(species.charge()), dt)); } +#if defined(TEAM_POLICY) + /** + * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). + * + * Iterates over `tile_layout.ntiles_total` teams; each team accumulates + * its tile's particle contributions in SLM scratch and atomically + * flushes to the global J. Requires the species to have been sorted + * with `team_policy` enabled (`tile_layout` populated by + * `SortSpatially`). + * + * Falls back to the flat kernel if `tile_offsets` is empty — this + * happens on the first step before the first sort, or for very small + * species that exited early in `SortSpatially`. The fallback uses the + * passed-in `scatter_cur` so the caller still composes correctly. + */ + template + void CallDepositKernelTiled(const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt, + int team_size_req) { + static_assert(O <= 11u, "Shape order must be <= 11"); + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + const auto& layout = species.tile_layout(); + raise::ErrorIf(layout.ntiles_total == 0u, + "CallDepositKernelTiled: tile_layout has 0 tiles — call " + "SortSpatially before CurrentsDeposit", + HERE); + raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u, + "CallDepositKernelTiled: tile_offsets size inconsistent " + "with ntiles_total", + HERE); + + auto deposit_kernel = + kernel::DepositCurrentsTiled_kernel { + cur, species, local_metric, (real_t)(species.charge()), + dt, layout, species.npart() + }; + + const auto scratch = Kokkos::PerTeam( + decltype(deposit_kernel)::scratch_bytes()); + + // Team (work-group) size. The default (team_size_req == 0) leaves + // Kokkos::AUTO, which sizes the team from the backend occupancy + // heuristic. A positive `algorithms.deposit.team_policy_team_size` + // overrides it, clamped to the scratch/backend-feasible maximum so an + // over-large request cannot abort the launch (Kokkos errors when + // team_size > team_size_max). No portable subgroup rounding is applied; + // pick a multiple of the device subgroup width (printed per arch by + // ideal_tile_size.py) for the best occupancy. + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, scratch); + if (team_size_req > 0) { + const int ts_max = policy.team_size_max(deposit_kernel, + Kokkos::ParallelForTag {}); + int ts = team_size_req; + if (ts > ts_max) { + raise::Warning( + fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds " + "the tiled-deposit maximum %d on this backend; clamping " + "to %d", + team_size_req, + ts_max, + ts_max), + HERE); + ts = ts_max; + } + policy = Kokkos::TeamPolicy<>(static_cast(layout.ntiles_total), ts); + policy.set_scratch_size(0, scratch); + logger::Checkpoint( + fmt::format("Tiled deposit: explicit team size %d", ts), + HERE); + } + Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); + + // Particles appended since the last sort (injection / MPI receive on a + // no-sort step) live past the partition and are not visited by any team + // above. Deposit that tail [npart_partitioned, npart) with the flat + // scatter-view kernel so every active particle is deposited exactly + // once. The range is empty when the species was just sorted (the + // every-step-sorted common case), so this is a no-op there. + if (species.npart() > layout.npart_partitioned) { + // `cur` is a const ref; take a non-const View handle (shallow copy, + // shares storage) so the scatter view can contribute back into it. + auto cur_nc = cur; + auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc); + Kokkos::parallel_for( + "CurrentsDepositTiledTail", + CreateParticleRangePolicy({ layout.npart_partitioned }, + { species.npart() }), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + Kokkos::Experimental::contribute(cur_nc, scatter_cur); + } + } +#endif // TEAM_POLICY + template void CurrentsDeposit(Domain& domain, const prm::Parameters& engine_params) { const auto dt = engine_params.get("dt"); Kokkos::deep_copy(domain.fields.cur, ZERO); + +#if defined(TEAM_POLICY) + // Optional runtime override for the tiled-deposit team (work-group) size; + // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the + // launcher (see CallDepositKernelTiled). + const auto team_size_req = static_cast( + engine_params.get("team_policy_team_size", + std::optional { 0u })); + + // Tiled deposit. Correctness no longer depends on the SoA being in a + // "sorted" state at deposit time — the tiled kernel handles a stale + // partition per-particle: + // - a particle whose full stencil has drifted out of its tile is + // deposited straight to the global J view (the per-particle escape + // valve); `team_policy_drift` sizes the scratch halo so the + // common in-tile case stays in fast SLM (see currents_deposit.hpp); + // - particles dead-tagged in place since the sort are clamped out by + // the kernel and skipped by the dead-tag test; + // - particles appended past the partition since the sort (injection / + // MPI receive on a no-sort step) are deposited by the launcher's + // flat tail pass over [npart_partitioned, npart). + // Together these cover every active particle exactly once for any sort + // interval. The only case the tiled kernel cannot serve is the very + // first step, before any SortSpatially has populated a layout; that + // species takes the flat scatter-view path for that step alone. + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + const auto& layout = species.tile_layout(); + if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) { + logger::Checkpoint( + fmt::format("Launching currents deposit (flat, no sort yet) for " + "%d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + auto scatter_cur = Kokkos::Experimental::create_scatter_view( + domain.fields.cur); + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur, + dt); + Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); + } else { + logger::Checkpoint( + fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + CallDepositKernelTiled(species, + domain.mesh.metric, + domain.fields.cur, + dt, + team_size_req); + } + } +#else auto scatter_cur = Kokkos::Experimental::create_scatter_view( domain.fields.cur); for (auto& species : domain.species) { @@ -84,6 +236,7 @@ namespace ntt { CallDepositKernel(species, domain.mesh.metric, scatter_cur, dt); } Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); +#endif } template @@ -91,9 +244,11 @@ namespace ntt { Domain& domain, const SimulationParams& params) { logger::Checkpoint("Launching currents filtering kernels", HERE); - auto range = srpic::RangeWithAxisBCs(domain); const auto nfilter = params.template get( "algorithms.current_filters"); + if (nfilter == 0u) { + return; + } tuple_t size; if constexpr (M::Dim == Dim::_1D || M::Dim == Dim::_2D || M::Dim == Dim::_3D) { size[0] = domain.mesh.n_active(in::x1); @@ -104,17 +259,114 @@ namespace ntt { if constexpr (M::Dim == Dim::_3D) { size[2] = domain.mesh.n_active(in::x3); } - // !TODO: this needs to be done more efficiently + + // The filter ping-pongs `cur` <-> scratch `buff`: one up-front copy + // seeds `buff` with valid ghost cells (the kernel writes only the + // cells in its launch range), then each pass filters the input into + // the other buffer and swaps the View handles so `cur` again names the + // result. `buff` is pure scratch (also reused by CommunicateFields), so + // the permanent handle swap is transparent and `cur` always names the + // result. The single seeding copy preserves physical-boundary ghosts + // (conductor/match/atmosphere/...), which neither the kernel nor the + // MPI exchange refresh. + Kokkos::deep_copy(domain.fields.buff, domain.fields.cur); + + const auto flds_bc = domain.mesh.flds_bc(); + + // Reduced-exchange ghost-margin scheme (all coordinate types). One halo + // exchange refreshes N_GHOSTS ghost layers — enough for N_GHOSTS passes + // of the 3-point binomial if each pass also recomputes the inner ghost + // layers it will need next. We therefore extend the launch range by a + // shrinking margin `m` into the ghost zone, but only on comm-refreshed + // sides (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell + // is interior physics. Physical-boundary ghosts are never written or + // refreshed, exactly as in the per-pass loop, so the result is identical + // for every BC — while doing one exchange per N_GHOSTS passes instead of + // one per pass. (Entering the loop the ghosts are valid to distance + // N_GHOSTS: srpic.hpp runs CommunicateFields(J) immediately before + // CurrentsFilter.) + // + // Non-Cartesian axis: the theta (x2) direction is self-contained in the + // filter kernel — the axis branches only ever read/write within + // [i2_min, i2_max] and never cross the axis — so the axis needs no halo + // exchange at all (the axis current fold is done once by + // SynchronizeFields(J) before this function). The single coordinate + // dependency is that the axis cell sits at i_max(x2), one past the active + // range, and must be filtered on every pass. We therefore add a fixed +1 + // to the x2 upper bound when that side is AXIS — a physical boundary, so + // never the shrinking comm margin. This folds the old RangeWithAxisBCs + // fixup into make_range, letting the same loop serve every CoordType. + const int G = static_cast(N_GHOSTS); + const auto comm_side = [](FldsBC b) { + return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC); + }; + bool ext_lo[3] = { false, false, false }; + bool ext_hi[3] = { false, false, false }; + for (auto d { 0 }; d < static_cast(M::Dim); ++d) { + ext_lo[d] = comm_side(flds_bc[d].first); + ext_hi[d] = comm_side(flds_bc[d].second); + } + // AXIS at the upper x2 boundary needs the axis cell (i_max(x2)) included + // every pass; matches srpic::RangeWithAxisBCs. (The lower-x2 axis cell is + // already i_min(x2), so no fixup is needed there.) + bool axis_hi_x2 = false; + if constexpr (M::CoordType != Coord::Cartesian and + (M::Dim == Dim::_2D or M::Dim == Dim::_3D)) { + axis_hi_x2 = (flds_bc[1].second == FldsBC::AXIS); + } + const auto make_range = [&](int m) -> range_t { + const auto ml = [&](int d) -> ncells_t { + return ext_lo[d] ? static_cast(m) : 0u; + }; + const auto mh = [&](int d) -> ncells_t { + if (ext_hi[d]) { + return static_cast(m); + } + // axis cell fixup (x2 == dimension index 1); mutually exclusive with + // the comm margin since AXIS is not a comm side + if (d == 1 and axis_hi_x2) { + return 1u; + } + return 0u; + }; + if constexpr (M::Dim == Dim::_1D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0) }, + { domain.mesh.i_max(in::x1) + mh(0) }); + } else if constexpr (M::Dim == Dim::_2D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1) }, + { domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1) }); + } else { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1), + domain.mesh.i_min(in::x3) - ml(2) }, + { domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1), + domain.mesh.i_max(in::x3) + mh(2) }); + } + }; + int m = G - 1; for (auto i { 0u }; i < nfilter; ++i) { - Kokkos::deep_copy(domain.fields.buff, domain.fields.cur); - Kokkos::parallel_for("CurrentsFilter", - range, - kernel::DigitalFilter_kernel( - domain.fields.cur, - domain.fields.buff, - size, - domain.mesh.flds_bc())); - metadomain.CommunicateFields(domain, Comm::J); + Kokkos::parallel_for( + "CurrentsFilter", + make_range(m), + kernel::DigitalFilter_kernel( + domain.fields.buff, + domain.fields.cur, + size, + flds_bc)); + std::swap(domain.fields.cur, domain.fields.buff); + --m; + if (m < 0 or i == nfilter - 1u) { + // refresh ghosts to distance G (and leave them valid for the + // downstream field solver after the final pass) + metadomain.CommunicateFields(domain, Comm::J); + m = G - 1; + } } } diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 895026552..396b5aec8 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -25,6 +25,7 @@ #include "traits/metric.h" #include "utils/error.h" #include "utils/formatting.h" +#include "utils/sorting.h" #include "framework/containers/species.h" #include "framework/domain/grid.h" @@ -90,6 +91,14 @@ namespace ntt { const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) }; #endif + // team_policy: tile metadata produced by SortSpatially + // and consumed by the tiled deposit / pusher kernels. Lazily + // allocated on first sort. The sort backend itself (oneDPL on SYCL, + // Thrust on CUDA, std::sort on Host, Kokkos::BinSort otherwise) is + // selected at compile time based on the Kokkos device and the + // vendor libraries detected by CMake. + TileLayout m_tile_layout {}; + public: // for empty allocation Particles() {} @@ -198,6 +207,20 @@ namespace ntt { return m_ntags; } +#if defined(TEAM_POLICY) + // Build m_tile_layout.tile_offsets / npart_partitioned from the + // already-sorted tile-index keys. A separate member function (not a + // lambda local to SortSpatially) so the inner device kernel is not an + // extended __device__ lambda nested inside another lambda — which + // nvcc forbids. Lets the vendor path run the offsets pass and then + // release the keys before the SoA gather allocates its buffers. + // NOTE: must be public — nvcc forbids an extended __host__ __device__ + // lambda inside a member function with private/protected access. + void compute_tile_offsets(const array_t& tile_indices, + ncells_t total_tiles, + npart_t npart_local); +#endif + [[nodiscard]] auto memory_footprint() const -> std::size_t { std::size_t footprint = 0; @@ -276,9 +299,54 @@ namespace ntt { /** * @brief Sort particles spatially by their cell indices * @param grid The grid object to get the cell information for sorting + * @note In team_policy mode (compile-time `team_policy=ON`), also + * populates `m_tile_layout` with tile-offset and per-tile + * permutation metadata that the tiled deposit/pusher kernels + * consume. */ void SortSpatially(const Grid&); +#if defined(TEAM_POLICY) && \ + ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) + private: + /** + * @brief Apply a particle-index permutation (built by oneDPL/Thrust + * sort_by_key) to the SoA member arrays. Members are gathered + * through `perm` into a reusable `n`-sized scratch buffer + * (one per member type, shared across members of that type) + * and copied back in place, so the large persistent member + * arrays keep their storage/address and the gather makes a + * handful of transient allocations instead of one maxnpart + * buffer per member. The *_prev arrays are intentionally not + * permuted (overwritten by the next push before any read). + * Only compiled when a vendor sort backend is enabled; the + * BinSort path applies the permutation in place via + * `sorter.sort(view)` instead. + * @param perm Permutation: sorted position -> pre-sort slot index. + * @param n Number of leading (alive) particles to gather. Pass + * `npart_partitioned` so only the alive set is moved into + * `[0, n)` (the dead were binned to the sentinel tile and sort + * to the tail); the caller then drops the dead tail via + * `set_npart(n)`. + */ + void apply_permutation_to_soa(const prtl_perm_t& perm, npart_t n); + + public: +#endif + + /** + * @brief Read-only access to the tile layout produced by the most + * recent SortSpatially call. Returns a default-constructed + * layout (`ntiles_total == 0`) when the species has not yet + * been sorted. + */ + [[nodiscard]] + auto tile_layout() const -> const TileLayout& { + return m_tile_layout; + } + /** * @brief Copy particle data from device to host. */ diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp index 0fa140f93..6ede3abf0 100644 --- a/src/framework/containers/particles_comm.cpp +++ b/src/framework/containers/particles_comm.cpp @@ -266,6 +266,11 @@ namespace ntt { auto iteration = 0; auto current_received = 0; + // `tag_offsets` is the same for every direction; mirror it to the host + // once here instead of re-copying it inside the direction loop. + auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets); + Kokkos::deep_copy(tag_offsets_h, tag_offsets); + for (const auto& direction : dirs_to_comm) { const auto send_rank = send_ranks.at(direction); const auto recv_rank = recv_ranks.at(direction); @@ -291,9 +296,6 @@ namespace ntt { npart_send_in * NPLDS_I }; } - auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets); - Kokkos::deep_copy(tag_offsets_h, tag_offsets); - npart_t idx_offset = npart_dead; if (tag_send > 2) { idx_offset += tag_offsets_h(tag_send - 3); diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 904fb3fc7..5e9ae4147 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -8,10 +8,22 @@ #include "framework/containers/particles.h" #include "framework/domain/grid.h" +#if defined(TEAM_POLICY) + #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)) + #define TEAM_POLICY_USE_VENDOR_SORT + #include "utils/sort_dispatch.h" + #endif +#endif + #include #include +#include #include +#include +#include #include #include #include @@ -193,8 +205,231 @@ namespace ntt { m_is_sorted = true; } +#if defined(TEAM_POLICY) + template + void Particles::compute_tile_offsets( + const array_t& tile_indices, + ncells_t total_tiles, + npart_t npart_local) { + // Compute the per-tile prefix-sum `tile_offsets` for the tiled + // pusher from the (already sorted) `tile_indices` — monotonically + // non-decreasing for alive particles, with the dead sentinel + // `total_tiles + 1` clustered at the end. Transition-detect directly + // on it: the start of each non-empty tile is the only place a write + // happens — atomic-free in the dense branch. Empty tiles (no + // particles) are filled by a reverse pass on a small host mirror + // (`total_tiles ≈ 176K` at production scale → ~700 KB). + array_t tile_offsets { "tile_offsets", total_tiles + 1u }; + Kokkos::deep_copy(tile_offsets, static_cast(npart_local)); + + const auto total_tiles_v = total_tiles; + auto ti_v = tile_indices; + Kokkos::parallel_for( + "DetectTileBoundaries", + CreateParticleRangePolicy({ 0u }, { npart_local }), + Lambda(prtlidx_t p) { + const auto t_curr = ti_v(p); + const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr); + if (!boundary) { + return; + } + if (t_curr < total_tiles_v) { + tile_offsets(t_curr) = p; + } else { + // First dead particle — also marks the alive_count boundary + // stored at index total_tiles. + Kokkos::atomic_min(&tile_offsets(total_tiles_v), p); + } + }); + + auto h_offsets = Kokkos::create_mirror_view(tile_offsets); + Kokkos::deep_copy(h_offsets, tile_offsets); + for (auto t = static_cast(total_tiles); t-- > 0u;) { + if (h_offsets(t) > h_offsets(t + 1u)) { + h_offsets(t) = h_offsets(t + 1u); + } + } + Kokkos::deep_copy(tile_offsets, h_offsets); + + m_tile_layout.tile_offsets = tile_offsets; + // tile_offsets(total_tiles) is the alive-particle count at sort time: + // the tiles partition exactly [0, npart_partitioned). The deposit + // launcher compares this against the live npart() to detect (and + // separately deposit) particles appended since this sort. + m_tile_layout.npart_partitioned = h_offsets(total_tiles); + } +#endif // TEAM_POLICY + template void Particles::SortSpatially(const Grid& grid) { +#if defined(TEAM_POLICY) + // ---------------------- team_policy: tile-based sort ------------------ // + const auto npart_local = npart(); + if (npart_local == 0u) { + m_tile_layout = TileLayout {}; + m_is_sorted = true; + return; + } + + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0"); + + // 1. Compute per-axis tile counts and total_tiles. + const auto ncells_active = grid.n_active(); + ncells_t ntx[3] { 1u, 1u, 1u }; + ncells_t total_tiles { 1u }; + if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ntx[0] = static_cast(math::ceil( + static_cast(ncells_active[0]) / static_cast(T))); + total_tiles *= ntx[0]; + } + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ntx[1] = static_cast(math::ceil( + static_cast(ncells_active[1]) / static_cast(T))); + total_tiles *= ntx[1]; + } + if constexpr (D == Dim::_3D) { + ntx[2] = static_cast(math::ceil( + static_cast(ncells_active[2]) / static_cast(T))); + total_tiles *= ntx[2]; + } + + // 2. Compute per-particle tile key (with min(i, i_prev)). + array_t tile_indices { "tile_indices", npart_local }; + Kokkos::parallel_for( + "FillTileIndices", + rangeActiveParticles(), + sort::PositionToTileIndex { i1, + i2, + i3, + tag, + tile_indices, + ncells_active, + static_cast(T), + array_t {}, + i1_prev, + i2_prev, + i3_prev }); + + // 3. Sort. Vendor library (oneDPL/Thrust) when compiled in; + // Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers + // the dead-particle sentinel bin (total_tiles + 1u). + const ncells_t n_bins = total_tiles + 2u; + const auto slice = prtl_slice_t(0, npart_local); + + #if defined(TEAM_POLICY_USE_VENDOR_SORT) + // Vendor path: produce an explicit permutation via sort_by_key, then + // apply it to each SoA member by gathering the alive prefix through a + // reusable scratch buffer (one per member type, sized to the alive + // count, copied back in place). The *_prev arrays are skipped — see + // apply_permutation_to_soa. Peak transient = one + // `npart_partitioned × sizeof(member)` scratch at a time. + prtl_perm_t perm { "tile_perm", npart_local }; + #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::OneDPL {}); + #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Rocthrust {}); + #else + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Thrust {}); + #endif + // `tile_indices` is sorted in place by sort_by_key. Build the tile + // offsets from it now, then drop it before the gather allocates its + // `maxnpart`-sized buffers — so the keys are not co-resident with + // them at the gather's peak (#2). + compute_tile_offsets(tile_indices, total_tiles, npart_local); + tile_indices = array_t {}; + Kokkos::fence("SortSpatially: pre-gather drain"); + // Gather only the alive particles. The sort binned dead particles to + // the sentinel tile, so they occupy [npart_partitioned, npart_local) + // and `perm[0, npart_partitioned)` lists the alive slots in tile + // order. Restricting the gather to the alive count drops the dead in + // the same pass (no separate compaction), skips work on dead slots, + // and sizes the gather scratch to the alive count. The dead tail is + // released below via `set_npart`. + apply_permutation_to_soa(perm, m_tile_layout.npart_partitioned); + #else + // BinSort path: same mechanism as legacy SortSpatially (BinSort + // allocates one temp View per `sorter.sort(view)` call and frees + // it before the next), so peak transient memory is bounded. + using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { tile_indices, bin_op, false }; + sorter.create_permute_vector(); + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i1, slice)); + sorter.sort(Kokkos::subview(i1_prev, slice)); + sorter.sort(Kokkos::subview(dx1, slice)); + sorter.sort(Kokkos::subview(dx1_prev, slice)); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i2, slice)); + sorter.sort(Kokkos::subview(i2_prev, slice)); + sorter.sort(Kokkos::subview(dx2, slice)); + sorter.sort(Kokkos::subview(dx2_prev, slice)); + } + if constexpr (D == Dim::_3D) { + sorter.sort(Kokkos::subview(i3, slice)); + sorter.sort(Kokkos::subview(i3_prev, slice)); + sorter.sort(Kokkos::subview(dx3, slice)); + sorter.sort(Kokkos::subview(dx3_prev, slice)); + } + sorter.sort(Kokkos::subview(ux1, slice)); + sorter.sort(Kokkos::subview(ux2, slice)); + sorter.sort(Kokkos::subview(ux3, slice)); + sorter.sort(Kokkos::subview(weight, slice)); + sorter.sort(Kokkos::subview(tag, slice)); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + sorter.sort(Kokkos::subview(phi, slice)); + } + for (auto pldr { 0u }; pldr < npld_r(); ++pldr) { + sorter.sort(Kokkos::subview(pld_r, slice, pldr)); + } + for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { + sorter.sort(Kokkos::subview(pld_i, slice, pldi)); + } + // Apply the same permutation to `tile_indices` itself so it ends + // monotonically non-decreasing for the offsets pass, then build the + // tile offsets from it (the in-place BinSort path has no separate + // gather to hoist this ahead of). + sorter.sort(tile_indices); + compute_tile_offsets(tile_indices, total_tiles, npart_local); + #endif // TEAM_POLICY_USE_VENDOR_SORT + + // Populate `m_tile_layout` size/shape. `tile_perm` is not used in the + // current design — the SoA arrays are physically permuted into tile + // order, so consumers iterate `[tile_offsets(t), tile_offsets(t+1))` + // directly without a separate permutation indirection. + m_tile_layout.ntiles_per_axis[0] = ntx[0]; + m_tile_layout.ntiles_per_axis[1] = ntx[1]; + m_tile_layout.ntiles_per_axis[2] = ntx[2]; + m_tile_layout.ntiles_total = total_tiles; + m_tile_layout.tile_size = T; + m_tile_layout.tile_perm = prtl_perm_t {}; + m_is_sorted = true; + + // Compact-on-sort: drop the dead tail now instead of waiting for the + // periodic RemoveDead. The sort parked dead particles in the sentinel + // bin at [npart_partitioned, npart()), and the alive set is exactly + // [0, npart_partitioned) — gathered into tile order above (vendor) or + // sorted in place (BinSort). Shrinking npart() here keeps it, and + // every subsequent sort's transient buffers, tracking the alive count + // instead of ratcheting up with dead slots between clearing intervals. + // (RemoveDead remains the compactor when spatial sorting is disabled.) + set_npart(m_tile_layout.npart_partitioned); + + Kokkos::fence("SortSpatially: end of team_policy path"); +#else // !TEAM_POLICY — legacy in-place BinSort by global cell index const auto nx2 = grid.n_active(in::x2); const auto nx3 = grid.n_active(in::x3); const auto total_cells = grid.num_active(); @@ -250,13 +485,181 @@ namespace ntt { for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { sorter.sort(Kokkos::subview(pld_i, slice, pldi)); } +#endif // TEAM_POLICY } +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + namespace permute_helpers { + + // Permute a 1D SoA member `arr` by `perm` in place, using a + // caller-owned reusable `scratch` buffer. Gathers the live prefix + // `arr[perm[0..n)]` into `scratch[0..n)`, then copies it back into + // `arr[0..n)`. Unlike the old swap-the-handle approach, `arr` keeps + // its original storage and full maxnpart capacity, so the large + // persistent member arrays never change address between sorts (the + // dominant source of allocator churn / fragmentation on ROCm), and + // `scratch` is shared across every member of the same type within one + // sort -- one transient allocation per type group instead of a fresh + // maxnpart buffer per member. `scratch` is sized to the live count + // `n` (not maxnpart), which also lowers the gather's peak transient. + // The tail `arr[n, capacity)` is left untouched (stale, never read: + // consumers iterate `[0, npart())`). Cost vs the swap: one extra + // copy-back pass (~2x HBM traffic), negligible when sorting is a + // small fraction of the step. The fences keep the shared scratch from + // being overwritten by the next member's gather before this member's + // copy-back has drained it. + template + inline void permute_1d_into(V& arr, + const V& scratch, + const prtl_perm_t& perm, + npart_t n) { + if (n == 0u) { + return; + } + auto perm_v = perm; + auto arr_v = arr; + auto buf_v = scratch; + Kokkos::parallel_for( + "Permute1D", + n, + KOKKOS_LAMBDA(const npart_t p) { buf_v(p) = arr_v(perm_v(p)); }); + Kokkos::fence("permute_1d_into: gather"); + Kokkos::deep_copy( + Kokkos::subview(arr, std::make_pair(static_cast(0), n)), + Kokkos::subview(scratch, std::make_pair(static_cast(0), n))); + Kokkos::fence("permute_1d_into: copy-back"); + } + + // 2D analogue for `pld_r` / `pld_i` (`scratch` is `(>= n, ncols)`). + template + inline void permute_2d_into(V& arr, + const V& scratch, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols) { + if (n == 0u or ncols == 0u) { + return; + } + auto perm_v = perm; + auto arr_v = arr; + auto buf_v = scratch; + Kokkos::parallel_for( + "Permute2D", + CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), + KOKKOS_LAMBDA(const npart_t p, const npart_t l) { + buf_v(p, l) = arr_v(perm_v(p), l); + }); + Kokkos::fence("permute_2d_into: gather"); + Kokkos::deep_copy( + Kokkos::subview(arr, + std::make_pair(static_cast(0), n), + Kokkos::ALL), + Kokkos::subview(scratch, + std::make_pair(static_cast(0), n), + Kokkos::ALL)); + Kokkos::fence("permute_2d_into: copy-back"); + } + + } // namespace permute_helpers + + template + void Particles::apply_permutation_to_soa(const prtl_perm_t& perm, + npart_t n) { + if (n == 0u) { + return; + } + + using permute_helpers::permute_1d_into; + using permute_helpers::permute_2d_into; + + // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are intentionally + // NOT permuted. SortSpatially runs at the very end of the step loop + // (engine step_forward), and the first thing the next step's pusher + // does is overwrite prev := current (positionPush, sr.hpp / gr.hpp) + // for every active particle, before any consumer reads prev: + // - current deposit: runs after the push, which has already + // overwritten prev; species with pusher==NONE (whose prev would + // stay un-permuted) are skipped by CurrentsDeposit entirely. + // - pusher getParticlePrevPosition / piston: read prev only after + // positionPush has rewritten it within the same call. + // - checkpoint (prev is checkpoint-only, never in diagnostic + // output): on restart the first push overwrites prev before it + // is read, so restart results are unaffected; only the redundant + // prev field saved to the checkpoint differs from the old code. + // Permuting prev would therefore reorder data that is overwritten + // before it is ever observed. + // + // Each block below allocates a single `n`-sized scratch buffer that + // is reused for every member of that type, then freed before the next + // block allocates its own. The whole gather thus makes one transient + // allocation per type group (int / prtldx_t / real_t / short / each + // payload) instead of one fresh maxnpart buffer per member, and peak + // transient is a single n-sized scratch at a time. + { + array_t scratch { "perm_scratch_int", n }; + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(i1, scratch, perm, n); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(i2, scratch, perm, n); + } + if constexpr (D == Dim::_3D) { + permute_1d_into(i3, scratch, perm, n); + } + } + { + array_t scratch { "perm_scratch_prtldx", n }; + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(dx1, scratch, perm, n); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(dx2, scratch, perm, n); + } + if constexpr (D == Dim::_3D) { + permute_1d_into(dx3, scratch, perm, n); + } + } + { + array_t scratch { "perm_scratch_real", n }; + permute_1d_into(ux1, scratch, perm, n); + permute_1d_into(ux2, scratch, perm, n); + permute_1d_into(ux3, scratch, perm, n); + permute_1d_into(weight, scratch, perm, n); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + permute_1d_into(phi, scratch, perm, n); + } + } + { + array_t scratch { "perm_scratch_tag", n }; + permute_1d_into(tag, scratch, perm, n); + } + if (npld_r() > 0) { + const auto ncols = static_cast(npld_r()); + array_t scratch { "perm_scratch_pld_r", n, ncols }; + permute_2d_into(pld_r, scratch, perm, n, ncols); + } + if (npld_i() > 0) { + const auto ncols = static_cast(npld_i()); + array_t scratch { "perm_scratch_pld_i", n, ncols }; + permute_2d_into(pld_i, scratch, perm, n, ncols); + } + } +#endif // TEAM_POLICY_USE_VENDOR_SORT + +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + #define APPLY_PERM_INSTANTIATE(D, C) \ + template void Particles::apply_permutation_to_soa( \ + const prtl_perm_t&, npart_t); +#else + #define APPLY_PERM_INSTANTIATE(D, C) +#endif + #define PARTICLES_SORT(D, C) \ template auto Particles::NpartsPerTagAndOffsets() const \ -> std::pair, array_t>; \ template void Particles::RemoveDead(); \ - template void Particles::SortSpatially(const Grid&); + template void Particles::SortSpatially(const Grid&); \ + APPLY_PERM_INSTANTIATE(D, C) PARTICLES_SORT(Dim::_1D, Coord::Cartesian) PARTICLES_SORT(Dim::_2D, Coord::Cartesian) @@ -266,5 +669,6 @@ namespace ntt { PARTICLES_SORT(Dim::_3D, Coord::Spherical) PARTICLES_SORT(Dim::_3D, Coord::Qspherical) #undef PARTICLES_SORT +#undef APPLY_PERM_INSTANTIATE } // namespace ntt diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp index 791bf31a8..a2b08f8be 100644 --- a/src/framework/domain/metadomain_sort.cpp +++ b/src/framework/domain/metadomain_sort.cpp @@ -21,9 +21,7 @@ namespace ntt { const auto clearing_interval = species.clearing_interval(); if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and (step > 0u)) { - for (auto& species : domain.species) { - species.RemoveDead(); - } + species.RemoveDead(); } const auto spatial_sorting_interval = species.spatial_sorting_interval(); if ((spatial_sorting_interval > 0u) and diff --git a/src/framework/parameters/algorithms.cpp b/src/framework/parameters/algorithms.cpp index 4766db965..87c77fbd9 100644 --- a/src/framework/parameters/algorithms.cpp +++ b/src/framework/parameters/algorithms.cpp @@ -33,6 +33,11 @@ namespace ntt { deposit_enable = toml::find_or(toml_data, "algorithms", "deposit", "enable", true); deposit_order = static_cast(SHAPE_ORDER); + deposit_team_policy_team_size = toml::find_or(toml_data, + "algorithms", + "deposit", + "team_policy_team_size", + defaults::team_policy_team_size); fieldsolver_enable = toml::find_or(toml_data, "algorithms", @@ -140,6 +145,8 @@ namespace ntt { params->set("algorithms.deposit.enable", deposit_enable.value()); params->set("algorithms.deposit.order", deposit_order.value()); + params->set("algorithms.deposit.team_policy_team_size", + deposit_team_policy_team_size.value()); params->set("algorithms.fieldsolver.enable", fieldsolver_enable.value()); for (const auto& [key, value] : fieldsolver_stencil_coeffs.value()) { diff --git a/src/framework/parameters/algorithms.h b/src/framework/parameters/algorithms.h index 97edb244a..c46f480fe 100644 --- a/src/framework/parameters/algorithms.h +++ b/src/framework/parameters/algorithms.h @@ -34,6 +34,7 @@ namespace ntt { std::optional deposit_enable; std::optional deposit_order; + std::optional deposit_team_policy_team_size; std::optional fieldsolver_enable; std::optional> fieldsolver_stencil_coeffs; diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h index 314e78c0e..7212b554e 100644 --- a/src/global/arch/kokkos_aliases.h +++ b/src/global/arch/kokkos_aliases.h @@ -10,6 +10,7 @@ * - CreateRangePolicy, CreateRangePolicyOnHost * - random_number_pool_t, random_generator_t * - Random function + * - prtl_perm_t, TileLayout<> * @cpp: * - arch/kokkos_aliases.cpp * @namespaces: @@ -225,6 +226,79 @@ namespace kokkos_aliases_hidden { template using range_h_t = typename kokkos_aliases_hidden::range_h_impl::type; +// Array aliases of arbitrary type and dimensions (up to 4) +namespace kokkos_aliases_hidden { + // c++ magic + template + struct scratch_nddata_impl { + using type = void; + }; + + template + struct scratch_nddata_impl<1, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<2, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<3, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<4, T> { + using type = Kokkos::View>; + }; +} // namespace kokkos_aliases_hidden + +template +using scratch_nddata_t = typename kokkos_aliases_hidden::scratch_nddata_impl::type; + +// Defining aliases for Scratch memory ndfield +namespace kokkos_aliases_hidden { + template + struct scratch_ndfield_impl { + using type = void; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; +} // namespace kokkos_aliases_hidden + +template +using scratch_ndfield_t = + typename kokkos_aliases_hidden::scratch_ndfield_impl::type; + /** * @brief Function template for generating 1D Kokkos range policy for particles. * @tparam D Dimension @@ -258,6 +332,39 @@ template auto CreateRangePolicyOnHost(const tuple_t&, const tuple_t&) -> range_h_t; +// --------------------------- team_policy types ---------------------------- // +// Particle permutation index: maps a sorted-position p in [0, npart) to a +// pre-sort particle index. Produced by SortSpatially, consumed by tiled +// pusher and deposit kernels to walk particles tile-by-tile without +// physically re-permuting the SoA arrays in lock step every step. +using prtl_perm_t = array_t; + +// Tile layout metadata: the contract between Stream 1 (sort) and Streams +// 2/3 (tiled deposit / pusher). Scalars are host-resident; the views are +// device-resident. +// ntiles_per_axis : number of tiles along each axis (1 for unused axes). +// ntiles_total : product of ntiles_per_axis = league size for TeamPolicy. +// tile_size : tile edge length in cells (compile-time CMake knob, +// replicated here for runtime checks). +// npart_partitioned: number of (alive) particles partitioned at the last +// sort, i.e. tile_offsets(ntiles_total). The tiles cover +// exactly [0, npart_partitioned); particles appended past +// it (injection / MPI receive on a no-sort step) are not +// partitioned and must be deposited separately. +// tile_offsets : prefix-sum of per-tile particle counts; size +// ntiles_total + 1; tile t owns particles +// [tile_offsets(t), tile_offsets(t+1)). +// tile_perm : size npart, particle index sorted by tile. +template +struct TileLayout { + ncells_t ntiles_per_axis[3] { 1u, 1u, 1u }; + ncells_t ntiles_total { 0u }; + unsigned short tile_size { 0u }; + npart_t npart_partitioned { 0u }; + array_t tile_offsets; + prtl_perm_t tile_perm; +}; + // Random number pool/generator type alias // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias) using random_number_pool_t = math::Random_XorShift64_Pool; diff --git a/src/global/defaults.h b/src/global/defaults.h index e1387677e..c4f9b67f4 100644 --- a/src/global/defaults.h +++ b/src/global/defaults.h @@ -22,6 +22,8 @@ namespace ntt::defaults { const unsigned short current_filters = 0; + const std::size_t team_policy_team_size = 0; + const std::string em_pusher = "Boris"; const std::string ph_pusher = "Photon"; const timestep_t clear_interval = 100; diff --git a/src/global/global.cpp b/src/global/global.cpp index ec22fd2f3..c3aa1bcdc 100644 --- a/src/global/global.cpp +++ b/src/global/global.cpp @@ -6,8 +6,51 @@ #include #endif // MPI_ENABLED +#if defined(HIP_ENABLED) + #include + + #include +#endif // HIP_ENABLED + +namespace { +#if defined(HIP_ENABLED) + // Turn the ROCm stream-ordered allocator into a caching arena. + // + // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option + // IMPL_HIP_MALLOC_ASYNC). The default memory pool has a release + // threshold of 0, so every freed block is handed back to the driver + // at the next stream sync. With ~50 GB of particle SoA permanently + // pinned and only ~14 GB free, the per-step churn of dozens of + // large, differently-sized sort/comm scratch buffers fragments that + // free space: allocation cost grows monotonically (ParticleSort + // slowdown) until no contiguous mid-size block remains and BinSort's + // `sorted_values` allocation fails (OOM). Raising the release + // threshold to "unlimited" makes the pool retain and recycle freed + // blocks instead, which stabilizes the working set and removes both + // the slowdown and the OOM. + void ConfigureHipMemPool() { + int device = 0; + if (hipGetDevice(&device) != hipSuccess) { + return; + } + hipMemPool_t pool = nullptr; + if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or + pool == nullptr) { + return; + } + uint64_t threshold = UINT64_MAX; + (void)hipMemPoolSetAttribute(pool, + hipMemPoolAttrReleaseThreshold, + &threshold); + } +#endif // HIP_ENABLED +} // namespace + void ntt::GlobalInitialize(int argc, char* argv[]) { Kokkos::initialize(argc, argv); +#if defined(HIP_ENABLED) + ConfigureHipMemPool(); +#endif // HIP_ENABLED #if defined(MPI_ENABLED) MPI_Init(&argc, &argv); #endif // MPI_ENABLED diff --git a/src/global/utils/diag.cpp b/src/global/utils/diag.cpp index 4cf763faf..604a872ae 100644 --- a/src/global/utils/diag.cpp +++ b/src/global/utils/diag.cpp @@ -17,6 +17,7 @@ #include #endif // MPI_ENABLED +#include #include #include #include @@ -26,8 +27,8 @@ namespace diag { auto npart_stats(npart_t npart, npart_t maxnpart) - -> std::vector> { - auto stats = std::vector>(); + -> std::vector> { + auto stats = std::vector>(); const auto percentage = [](npart_t part, npart_t maxpart) -> unsigned short { return static_cast( 100.0f * static_cast(part) / static_cast(maxpart)); @@ -59,9 +60,9 @@ namespace diag { if (rank != MPI_ROOT_RANK) { return stats; } - const npart_t tot_npart = std::accumulate(mpi_npart.begin(), - mpi_npart.end(), - static_cast(0)); + const std::size_t tot_npart = std::accumulate(mpi_npart.begin(), + mpi_npart.end(), + static_cast(0)); const npart_t max_idx = std::distance( mpi_npart.begin(), std::max_element(mpi_npart.begin(), mpi_npart.end())); diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp index 77117c4b9..6113a89c7 100644 --- a/src/global/utils/reporter.cpp +++ b/src/global/utils/reporter.cpp @@ -250,6 +250,19 @@ namespace reporter { #else AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); #endif + +#if defined(TEAM_POLICY) + AddParam(report, 4, "TEAM_POLICY", "%s", "ON"); + #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)) + AddParam(report, 4, "VENDOR_SORT", "%s", "ON"); + #else + AddParam(report, 4, "VENDOR_SORT", "%s", "OFF (BinSort)"); + #endif +#else + AddParam(report, 4, "TEAM_POLICY", "%s", "OFF"); +#endif report += "\n"; return report; } diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h new file mode 100644 index 000000000..432b1c1b9 --- /dev/null +++ b/src/global/utils/sort_dispatch.h @@ -0,0 +1,317 @@ +/** + * @file utils/sort_dispatch.h + * @brief Backend-dispatched sort_by_key for team_policy SortSpatially. + * @implements + * - sort_helpers::sort_by_key_dispatch -> void (BinSort, OneDPL, Thrust, StdSort) + * @namespaces: + * - ntt::sort_helpers:: + * @macros: + * - TEAM_POLICY + * - SYCL_ENABLED, ONEDPL_ENABLED (oneDPL overload) + * - CUDA_ENABLED, THRUST_ENABLED (Thrust overload) + * + * @note Each overload produces a permutation `perm` of size N such that + * keys[perm[0]] <= keys[perm[1]] <= ... in stable order. + * Always-available overloads: BinSort (uses Kokkos::BinSort) and + * StdSort (host-side std::stable_sort fallback). The vendor-library + * overloads (OneDPL on SYCL, cub radix sort on CUDA, rocprim radix + * sort on HIP) are conditional on the respective build flags. + * @note The CUDA/HIP overloads bound the radix sort to the significant + * key bits (`significant_bits(n_bins)`) instead of the full 32, so + * only ceil(log2(n_bins)) bits are sorted — fewer radix passes than + * a full-width `thrust::sort_by_key`. Scratch is transient (freed at + * scope exit); no persistent buffer is retained (cf. 787aa045). + */ + +#ifndef GLOBAL_UTILS_SORT_DISPATCH_H +#define GLOBAL_UTILS_SORT_DISPATCH_H + +#if !defined(TEAM_POLICY) + #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined" +#endif + +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/sorting.h" + +#include +#include + +// Entity's Kokkos alias macros (arch/kokkos_aliases.h) define bare words such +// as `Function`, `Inline`, `Lambda` and `ClassLambda`. These collide with +// template-parameter and member names used inside the vendor sort headers +// (rocPRIM, cub, oneDPL) and corrupt their parsing (e.g. rocPRIM's +// `template`). Suspend the aliases across the +// vendor includes only, then restore them for the rest of the translation unit. +#pragma push_macro("Function") +#pragma push_macro("Inline") +#pragma push_macro("Lambda") +#pragma push_macro("ClassLambda") +#undef Function +#undef Inline +#undef Lambda +#undef ClassLambda + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + #include + #include +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + #include +#endif +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + #include +#endif + +#pragma pop_macro("ClassLambda") +#pragma pop_macro("Lambda") +#pragma pop_macro("Inline") +#pragma pop_macro("Function") + +#include +#include +#include + +namespace ntt::sort_helpers { + + // Number of low-order bits needed to represent keys in [0, n_bins). + // The radix sort only needs to scan these bits — bounding `end_bit` to + // ceil(log2(n_bins)) instead of 32 cuts the number of passes (e.g. 18 + // bits when total_tiles ~ 176K). Returns at least 1. + inline unsigned int significant_bits(ncells_t n_bins) { + unsigned int bits = 0u; + while (bits < 32u && + (static_cast(1u) << bits) < + static_cast(n_bins)) { + ++bits; + } + return (bits == 0u) ? 1u : bits; + } + + // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an + // upper bound on distinct key values. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t n_bins, + ::sort::backend::BinSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { keys, bin_op, false }; + sorter.create_permute_vector(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort"); + sorter.sort(perm); + Kokkos::fence("sort_by_key_dispatch BinSort: post-sort"); + } + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::OneDPL) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto* keys_ptr = keys.data(); + auto* perm_ptr = perm.data(); + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + // Drain Kokkos's queue so oneDPL's policy sees the iota'd perm even + // if oneDPL submits to a different SYCL queue internally. + exec.fence("sort_by_key_dispatch OneDPL: pre-sort"); + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr); + exec.fence("sort_by_key_dispatch OneDPL: post-sort"); + } +#endif + +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t n_bins, + ::sort::backend::Thrust) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + + // Radix sort bounded to the significant key bits. The _out buffers and + // temp storage are transient (freed at scope exit). + array_t keys_out("tile_keys_sorted", n); + prtl_perm_t perm_out("tile_perm_sorted", n); + const int end_bit = static_cast(significant_bits(n_bins)); + + exec.fence("sort_by_key_dispatch Thrust: pre-sort"); + auto stream = exec.cuda_stream(); + + // DoubleBuffer radix sort: cub ping-pongs between the supplied + // (current, alternate) buffer pairs, so `temp_bytes` holds only the + // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes). + // Nearly halves the sort's transient memory vs the plain out-of-place + // form, which matters at high npart where the N-sized temp can fail to + // allocate (device OOM at scale). + cub::DoubleBuffer d_keys(keys.data(), keys_out.data()); + cub::DoubleBuffer d_perm(perm.data(), perm_out.data()); + + std::size_t temp_bytes = 0; + auto err = cub::DeviceRadixSort::SortPairs(nullptr, + temp_bytes, + d_keys, + d_perm, + n, + 0, + end_bit, + stream); + raise::ErrorIf(err != cudaSuccess, + "cub::DeviceRadixSort::SortPairs (size query) failed", + HERE); + array_t temp("cub_radix_temp", + (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes); + err = cub::DeviceRadixSort::SortPairs(temp.data(), + temp_bytes, + d_keys, + d_perm, + n, + 0, + end_bit, + stream); + raise::ErrorIf(err != cudaSuccess, + "cub::DeviceRadixSort::SortPairs failed", + HERE); + exec.fence("sort_by_key_dispatch Thrust: post-sort"); + + // Publish results from whichever buffer cub left as Current() (depends on + // the pass count): copy sorted keys back into `keys`' storage if they + // ended up in the alternate, and point `perm` at its current buffer. + if (d_keys.Current() != keys.data()) { + auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + } + if (d_perm.Current() == perm_out.data()) { + perm = perm_out; + } + } +#endif + +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + // HIP analogue of the CUDA cub overload, using rocprim's radix sort + // (which ships with rocThrust). Same bounded-bit, out-of-place, + // transient-scratch scheme. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t n_bins, + ::sort::backend::Rocthrust) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + + array_t keys_out("tile_keys_sorted", n); + prtl_perm_t perm_out("tile_perm_sorted", n); + const unsigned int end_bit = significant_bits(n_bins); + + exec.fence("sort_by_key_dispatch Rocthrust: pre-sort"); + auto stream = exec.hip_stream(); + + // double_buffer radix sort: rocprim ping-pongs between the supplied + // (current, alternate) buffer pairs, so `temp_storage` holds only the + // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes, + // the dominant `rocprim_radix_temp`). This nearly halves the sort's + // transient memory vs the plain out-of-place form, which matters at high + // npart where that N-sized temp can fail to allocate (device OOM at scale). + rocprim::double_buffer d_keys(keys.data(), keys_out.data()); + rocprim::double_buffer d_perm(perm.data(), perm_out.data()); + + std::size_t temp_bytes = 0; + auto err = rocprim::radix_sort_pairs(nullptr, + temp_bytes, + d_keys, + d_perm, + static_cast(n), + 0u, + end_bit, + stream); + raise::ErrorIf(err != hipSuccess, + "rocprim::radix_sort_pairs (size query) failed", + HERE); + array_t temp("rocprim_radix_temp", + (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes); + err = rocprim::radix_sort_pairs(temp.data(), + temp_bytes, + d_keys, + d_perm, + static_cast(n), + 0u, + end_bit, + stream); + raise::ErrorIf(err != hipSuccess, + "rocprim::radix_sort_pairs failed", + HERE); + exec.fence("sort_by_key_dispatch Rocthrust: post-sort"); + + // Publish results from whichever buffer rocprim left as `current()` + // (depends on the pass count). If the sorted keys ended up in the + // alternate, copy them back into `keys`' storage so downstream + // (compute_tile_offsets) sees them; point `perm` at its current buffer. + if (d_keys.current() != keys.data()) { + auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + } + if (d_perm.current() == perm_out.data()) { + perm = perm_out; + } + } +#endif + + // Host fallback: indirect-sort via std::stable_sort. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::StdSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + keys); + auto perm_h = Kokkos::create_mirror_view(perm); + std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u }); + std::stable_sort(perm_h.data(), + perm_h.data() + n, + [&](npart_t a, npart_t b) { + return keys_h(a) < keys_h(b); + }); + Kokkos::deep_copy(perm, perm_h); + } + +} // namespace ntt::sort_helpers + +#endif // GLOBAL_UTILS_SORT_DISPATCH_H diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h index dbe774402..8442f5ddb 100644 --- a/src/global/utils/sorting.h +++ b/src/global/utils/sorting.h @@ -5,6 +5,7 @@ * - sort::BinBool<> * - sort::BinTag<> * - sort::PositionToTileIndex<> + * - sort::backend tag types (compile-time tag dispatch for sort_by_key) * @namespaces: * - sort:: * @note BinBool sorts by boolean values "true" then "false" @@ -62,9 +63,27 @@ namespace sort { const int m_max_bins; }; - template + /** + * @brief Bin a particle into a tile of edge length `tile_size` cells. + * @tparam D Dimension. + * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live + * particle (used to populate per-tile counts in one pass). + * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead + * of `i_curr` alone. This guarantees that a particle whose + * Esirkepov stencil straddles a tile boundary (because it + * crossed the boundary during the pusher) lands in the + * lower-indexed of the two tiles. Combined with a halo of + * `O+1` cells in the deposit's per-tile scratch, this + * keeps every particle's stencil inside its assigned + * tile's interior+halo region. See plan §S2.4. + * + * Dead particles get the sentinel `total_tiles + 1u` so they sort to the + * end (or get skipped, depending on the consumer). + */ + template struct PositionToTileIndex { const array_t i1, i2, i3; + const array_t i1_prev, i2_prev, i3_prev; const array_t tag; array_t tile_indices; ncells_t tile_size; @@ -72,20 +91,37 @@ namespace sort { ncells_t ntx2 { 0u }, ntx3 { 0u }; ncells_t total_tiles { 0u }; - - PositionToTileIndex(const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& tag, - array_t& tile_indices, + // Active-cell extents per axis. Used to clamp the bin key when + // UsePrev=true, since `i_prev` can be transiently negative after the + // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an + // MPI receive that hasn't translated frames. Without clamping, the + // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces + // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`, + // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)` + // faults on an unmapped page. + int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 }; + + PositionToTileIndex(const array_t& i1_, + const array_t& i2_, + const array_t& i3_, + const array_t& tag_, + array_t& tile_indices_, const std::vector& ncells, - ncells_t tile_size = 1u) - : i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , tag { tag } - , tile_indices { tile_indices } - , tile_size { tile_size } + ncells_t tile_size_ = 1u, + const array_t& num_ppt_ = { "num_ppt", 0u }, + const array_t& i1_prev_ = {}, + const array_t& i2_prev_ = {}, + const array_t& i3_prev_ = {}) + : i1 { i1_ } + , i2 { i2_ } + , i3 { i3_ } + , i1_prev { i1_prev_ } + , i2_prev { i2_prev_ } + , i3_prev { i3_prev_ } + , tag { tag_ } + , tile_indices { tile_indices_ } + , tile_size { tile_size_ } + , num_ppt { num_ppt_ } , ntx2 { 1u } , ntx3 { 1u } , total_tiles { 1u } { @@ -93,49 +129,122 @@ namespace sort { "ncells size must match D", HERE); if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ncells1 = static_cast(ncells[0]); npart_t ntx1 = static_cast(math::ceil( static_cast(ncells[0]) / static_cast(tile_size))); total_tiles *= ntx1; } if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ncells2 = static_cast(ncells[1]); ntx2 = static_cast(math::ceil( static_cast(ncells[1]) / static_cast(tile_size))); total_tiles *= ntx2; } if constexpr (D == Dim::_3D) { + ncells3 = static_cast(ncells[2]); ntx3 = static_cast(math::ceil( static_cast(ncells[2]) / static_cast(tile_size))); total_tiles *= ntx3; } if constexpr (Count) { - num_ppt = array_t { "num_ppt", total_tiles }; + raise::ErrorIf(num_ppt.extent(0) != total_tiles, + "num_ppt must have extent equal to total tiles", + HERE); + } + if constexpr (UsePrev) { + raise::ErrorIf( + i1_prev.extent(0) == 0u, + "PositionToTileIndex requires i1_prev to be set", + HERE); + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + raise::ErrorIf( + i2_prev.extent(0) == 0u, + "PositionToTileIndex requires i2_prev to be set", + HERE); + } + if constexpr (D == Dim::_3D) { + raise::ErrorIf( + i3_prev.extent(0) == 0u, + "PositionToTileIndex requires i3_prev to be set", + HERE); + } } } - Inline auto operator()(prtldx_t p) const { + Inline auto operator()(prtlidx_t p) const { if (tag(p) != ntt::ParticleTag::alive) { tile_indices(p) = total_tiles + 1u; } else { + // bin key per-axis: use min(i, i_prev) when UsePrev so that a + // particle straddling a boundary lands in the lower tile. + // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative + // (after the pusher's periodic-wrap path: `i_prev -= ni`) or + // out-of-range (after MPI receive without frame translation). + // Without the clamp, signed-to-unsigned promotion in + // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far + // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)` + // faults on an unmapped page. + const auto clamp_axis = [](int v, int ncells) -> int { + return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v); + }; + const auto key1 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i1(p) < i1_prev(p)) ? i1(p) : i1_prev(p); + return clamp_axis(raw, ncells1); + } else { + return i1(p); + } + }(); + const auto key2 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p); + return clamp_axis(raw, ncells2); + } else { + return i2(p); + } + }(); + const auto key3 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p); + return clamp_axis(raw, ncells3); + } else { + return i3(p); + } + }(); if constexpr (D == Dim::_1D) { - tile_indices(p) = static_cast(i1(p) / tile_size); + tile_indices(p) = static_cast(key1 / tile_size); } else if constexpr (D == Dim::_2D) { - tile_indices(p) = static_cast(i1(p) / tile_size) * ntx2 + - static_cast(i2(p) / tile_size); + tile_indices(p) = static_cast(key1 / tile_size) * ntx2 + + static_cast(key2 / tile_size); } else if constexpr (D == Dim::_3D) { - tile_indices(p) = (static_cast(i1(p) / tile_size) * ntx2 + - static_cast(i2(p) / tile_size)) * + tile_indices(p) = (static_cast(key1 / tile_size) * ntx2 + + static_cast(key2 / tile_size)) * ntx3 + - static_cast(i3(p) / tile_size); + static_cast(key3 / tile_size); } else { raise::KernelError(HERE, "Wrong D in SortSpatially"); } if constexpr (Count) { - Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u); + Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1); } } } }; + // -------------------- Backend dispatch for sort_by_key ------------------- // + // Compile-time tags for tag-dispatch into backend-specific + // sort_by_key implementations. Selection is fully compile-time: the + // backend that resolves depends on the active Kokkos device and the + // availability of the corresponding vendor library. + namespace backend { + struct OneDPL {}; + struct Thrust {}; + struct Rocthrust {}; + struct StdSort {}; + // Always-available legacy fallback using Kokkos::BinSort. + struct BinSort {}; + } // namespace backend + } // namespace sort #endif // GLOBAL_UTILS_SORTING_H diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 268187af2..98fa0b661 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -1,10 +1,24 @@ /** * @file kernels/currents_deposit.hpp - * @brief Covariant algorithms for the current deposition + * @brief Covariant algorithms for the current deposition. + * + * Two kernels share the same per-particle body + * (`kernel::DepositOneParticle`): + * - `kernel::DepositCurrents_kernel` flat (RangePolicy over particles, + * writes into a `Kokkos::Experimental::ScatterView`). Always available. + * - `kernel::DepositCurrents_kernel_tiled` team-policy + * (one team per spatial tile, accumulates into team SLM scratch with + * atomic adds, then flushes to global J). Available when `team_policy=ON` + * (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan. + * * @implements + * - kernel::deposit::PrtlPack<> + * - kernel::DepositOneParticle<> * - kernel::DepositCurrents_kernel<> + * - kernel::DepositCurrents_kernel_tiled<> (TEAM_POLICY only) * @namespaces: * - kernel:: + * - kernel::deposit:: */ #ifndef KERNELS_CURRENTS_DEPOSIT_HPP @@ -18,9 +32,11 @@ #include "utils/error.h" #include "utils/numeric.h" -#include "particle_shapes.hpp" +#include "framework/containers/particles.h" +#include "kernels/particle_shapes.hpp" #include +#include #define i_di_to_Xi(I, DI) (static_cast((I)) + static_cast((DI))) @@ -28,738 +44,1201 @@ namespace kernel { using namespace ntt; /** - * @brief Algorithm for the current deposition + * @brief Per-particle deposit body, shared between the flat and tiled + * kernels. + * + * The caller supplies a `deposit_at(idx..., comp, val)` callback that + * applies the contribution `val` to the J component `comp` at the + * **global** J cell index `idx...` (already includes the `N_GHOSTS` + * offset). The flat kernel's callback simply does + * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled + * kernel's callback translates `idx...` into per-tile scratch + * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this + * function is identical numerically and contains the only deposit math + * in the codebase. + * + * Dead particles return early. The callback is invoked once per cell + * write, with the dimension-appropriate signature: + * - 1D: `deposit_at(int g_i1, int comp, real_t val)` + * - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)` + * - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)` */ - template - class DepositCurrents_kernel { + template + Inline void DepositOneParticle(prtlidx_t p, + const ParticleArrays& prtls, + const M& metric, + real_t charge, + real_t inv_dt, + DepositFn deposit_at) { static_assert(O <= 11u, "Shape function order O must be <= 11"); - static constexpr auto D = M::Dim; + constexpr auto D = M::Dim; - scatter_ndfield_t J; - const array_t i1, i2, i3; - const array_t i1_prev, i2_prev, i3_prev; - const array_t dx1, dx2, dx3; - const array_t dx1_prev, dx2_prev, dx3_prev; - const array_t ux1, ux2, ux3; - const array_t phi; - const array_t weight; - const array_t tag; - const M metric; - const real_t charge, inv_dt; - - public: - /** - * @brief explicit constructor. - */ - DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, - const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& i1_prev, - const array_t& i2_prev, - const array_t& i3_prev, - const array_t& dx1, - const array_t& dx2, - const array_t& dx3, - const array_t& dx1_prev, - const array_t& dx2_prev, - const array_t& dx3_prev, - const array_t& ux1, - const array_t& ux2, - const array_t& ux3, - const array_t& phi, - const array_t& weight, - const array_t& tag, - const M& metric, - real_t charge, - const real_t dt) - : J { scatter_cur } - , i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , i1_prev { i1_prev } - , i2_prev { i2_prev } - , i3_prev { i3_prev } - , dx1 { dx1 } - , dx2 { dx2 } - , dx3 { dx3 } - , dx1_prev { dx1_prev } - , dx2_prev { dx2_prev } - , dx3_prev { dx3_prev } - , ux1 { ux1 } - , ux2 { ux2 } - , ux3 { ux3 } - , phi { phi } - , weight { weight } - , tag { tag } - , metric { metric } - , charge { charge } - , inv_dt { ONE / dt } { - raise::ErrorIf( - (O == 2u and N_GHOSTS < 2), - "Order of interpolation is 2, but number of ghost cells is < 2", - HERE); + if (prtls.tag(p) == ParticleTag::dead) { + return; } - /** - * @brief Iteration of the loop over particles. - * @param p index. - */ - Inline auto operator()(prtlidx_t p) const -> void { - if (tag(p) == ParticleTag::dead) { - return; - } - // recover particle velocity to deposit in unsimulated direction - vec_t vp { ZERO }; - { - coord_t xp { ZERO }; - if constexpr (D == Dim::_1D) { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - } else if constexpr (D == Dim::_2D) { - if constexpr (M::PrtlDim == Dim::_3D) { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - xp[2] = phi(p); - } else { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - } - } else { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - xp[2] = i_di_to_Xi(i3(p), dx3(p)); - } - auto inv_energy { ZERO }; - if constexpr (S == SimEngine::SRPIC) { - metric.template transform_xyz(xp, - { ux1(p), ux2(p), ux3(p) }, - vp); - inv_energy = ONE / math::sqrt(ONE + NORM_SQR(ux1(p), ux2(p), ux3(p))); + // recover particle velocity to deposit in unsimulated direction + [[maybe_unused]] + vec_t vp { ZERO }; + // `vp` only feeds the unsimulated-direction current in the 1D + // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes + // from the Esirkepov/zigzag charge motion and `vp` is never read, + // so the metric transform + 1/sqrt + NaN/Inf guard below is pure + // dead work there — skip it (also frees xp/inv_energy registers). + if constexpr (D != Dim::_3D) { + coord_t xp { ZERO }; + if constexpr (D == Dim::_1D) { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + } else if constexpr (D == Dim::_2D) { + if constexpr (M::PrtlDim == Dim::_3D) { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); + xp[2] = prtls.phi(p); } else { - coord_t xp_ { ZERO }; - xp_[0] = xp[0]; - real_t theta_Cd { xp[1] }; - const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>( - theta_Cd) }; - const auto small_angle { static_cast(constant::SMALL_ANGLE_GR) }; - const auto large_angle { static_cast( - constant::PI - constant::SMALL_ANGLE_GR) }; - if (theta_Ph < small_angle) { - theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle); - } else if (theta_Ph >= large_angle) { - theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle); - } - xp_[1] = theta_Cd; - metric.template transform(xp_, - { ux1(p), ux2(p), ux3(p) }, - vp); - inv_energy = metric.alpha(xp_) / - math::sqrt(ONE + ux1(p) * vp[0] + ux2(p) * vp[1] + - ux3(p) * vp[2]); + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); } - if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) { - vp[2] = ZERO; + } else { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); + xp[2] = i_di_to_Xi(prtls.i3(p), prtls.dx3(p)); + } + auto inv_energy { ZERO }; + if constexpr (S == SimEngine::SRPIC) { + metric.template transform_xyz( + xp, + { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) }, + vp); + inv_energy = ONE / U2GAMMA(prtls.ux1(p), prtls.ux2(p), prtls.ux3(p)); + } else { + coord_t xp_ { ZERO }; + xp_[0] = xp[0]; + real_t theta_Cd { xp[1] }; + const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>( + theta_Cd) }; + const auto small_angle { static_cast(constant::SMALL_ANGLE_GR) }; + const auto large_angle { static_cast( + constant::PI - constant::SMALL_ANGLE_GR) }; + if (theta_Ph < small_angle) { + theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle); + } else if (theta_Ph >= large_angle) { + theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle); } - vp[0] *= inv_energy; - vp[1] *= inv_energy; - vp[2] *= inv_energy; + xp_[1] = theta_Cd; + metric.template transform( + xp_, + { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) }, + vp); + inv_energy = metric.alpha(xp_) / + math::sqrt(ONE + prtls.ux1(p) * vp[0] + + prtls.ux2(p) * vp[1] + prtls.ux3(p) * vp[2]); } + if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) { + vp[2] = ZERO; + } + vp[0] *= inv_energy; + vp[1] *= inv_energy; + vp[2] *= inv_energy; + } - const real_t coeff { weight(p) * charge }; - - // ToDo: interpolation_order as parameter - if constexpr (O == 0u) { - /* - Zig-zag deposit - */ - const auto dxp_r_1 { static_cast(i1(p) == i1_prev(p)) * - (dx1(p) + dx1_prev(p)) * + const real_t coeff { prtls.weight(p) * charge }; + + if constexpr (O == 0u) { + /* + Zig-zag deposit + */ + const auto dxp_r_1 { static_cast(prtls.i1(p) == prtls.i1_prev(p)) * + (prtls.dx1(p) + prtls.dx1_prev(p)) * + static_cast(INV_2) }; + + const real_t Wx1_1 { INV_2 * + (dxp_r_1 + prtls.dx1_prev(p) + + static_cast(prtls.i1(p) > prtls.i1_prev(p))) }; + const real_t Wx1_2 { INV_2 * + (prtls.dx1(p) + dxp_r_1 + + static_cast( + static_cast(prtls.i1(p) > prtls.i1_prev(p)) + + prtls.i1_prev(p) - prtls.i1(p))) }; + const real_t Fx1_1 { (static_cast(prtls.i1(p) > prtls.i1_prev(p)) + + dxp_r_1 - prtls.dx1_prev(p)) * + coeff * inv_dt }; + const real_t Fx1_2 { (static_cast( + prtls.i1(p) - prtls.i1_prev(p) - + static_cast(prtls.i1(p) > prtls.i1_prev(p))) + + prtls.dx1(p) - dxp_r_1) * + coeff * inv_dt }; + + if constexpr (D == Dim::_1D) { + const real_t Fx2_1 { HALF * vp[1] * coeff }; + const real_t Fx2_2 { HALF * vp[1] * coeff }; + + const real_t Fx3_1 { HALF * vp[2] * coeff }; + const real_t Fx3_2 { HALF * vp[2] * coeff }; + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx1, Fx1_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2); + } else if constexpr (D == Dim::_2D || D == Dim::_3D) { + const auto dxp_r_2 { static_cast(prtls.i2(p) == prtls.i2_prev(p)) * + (prtls.dx2(p) + prtls.dx2_prev(p)) * static_cast(INV_2) }; - const real_t Wx1_1 { INV_2 * (dxp_r_1 + dx1_prev(p) + - static_cast(i1(p) > i1_prev(p))) }; - const real_t Wx1_2 { INV_2 * (dx1(p) + dxp_r_1 + - static_cast( - static_cast(i1(p) > i1_prev(p)) + - i1_prev(p) - i1(p))) }; - const real_t Fx1_1 { (static_cast(i1(p) > i1_prev(p)) + - dxp_r_1 - dx1_prev(p)) * + const real_t Wx2_1 { INV_2 * (dxp_r_2 + prtls.dx2_prev(p) + + static_cast(prtls.i2(p) > + prtls.i2_prev(p))) }; + const real_t Wx2_2 { INV_2 * + (prtls.dx2(p) + dxp_r_2 + + static_cast( + static_cast(prtls.i2(p) > prtls.i2_prev(p)) + + prtls.i2_prev(p) - prtls.i2(p))) }; + const real_t Fx2_1 { (static_cast(prtls.i2(p) > prtls.i2_prev(p)) + + dxp_r_2 - prtls.dx2_prev(p)) * coeff * inv_dt }; - const real_t Fx1_2 { (static_cast( - i1(p) - i1_prev(p) - - static_cast(i1(p) > i1_prev(p))) + - dx1(p) - dxp_r_1) * - coeff * inv_dt }; - - auto J_acc = J.access(); - - if constexpr (D == Dim::_1D) { - const real_t Fx2_1 { HALF * vp[1] * coeff }; - const real_t Fx2_2 { HALF * vp[1] * coeff }; - + const real_t Fx2_2 { + (static_cast(prtls.i2(p) - prtls.i2_prev(p) - + static_cast(prtls.i2(p) > prtls.i2_prev(p))) + + prtls.dx2(p) - dxp_r_2) * + coeff * inv_dt + }; + + if constexpr (D == Dim::_2D) { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t Fx3_2 { HALF * vp[2] * coeff }; - J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx1) += Fx1_2; - - J_acc(i1_prev(p) + N_GHOSTS, cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx2) += Fx2_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx2) += Fx2_2 * Wx1_2; - - J_acc(i1_prev(p) + N_GHOSTS, cur::jx3) += Fx3_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx3) += Fx3_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx3) += Fx3_2 * Wx1_2; - } else if constexpr (D == Dim::_2D || D == Dim::_3D) { - const auto dxp_r_2 { static_cast(i2(p) == i2_prev(p)) * - (dx2(p) + dx2_prev(p)) * - static_cast(INV_2) }; - - const real_t Wx2_1 { INV_2 * (dxp_r_2 + dx2_prev(p) + - static_cast(i2(p) > i2_prev(p))) }; - const real_t Wx2_2 { INV_2 * (dx2(p) + dxp_r_2 + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); + } else { + const auto dxp_r_3 { + static_cast(prtls.i3(p) == prtls.i3_prev(p)) * + (prtls.dx3(p) + prtls.dx3_prev(p)) * static_cast(INV_2) + }; + const real_t Wx3_1 { INV_2 * (dxp_r_3 + prtls.dx3_prev(p) + static_cast( - static_cast(i2(p) > i2_prev(p)) + - i2_prev(p) - i2(p))) }; - const real_t Fx2_1 { (static_cast(i2(p) > i2_prev(p)) + - dxp_r_2 - dx2_prev(p)) * + prtls.i3(p) > prtls.i3_prev(p))) }; + const real_t Wx3_2 { + INV_2 * + (prtls.dx3(p) + dxp_r_3 + + static_cast(static_cast(prtls.i3(p) > prtls.i3_prev(p)) + + prtls.i3_prev(p) - prtls.i3(p))) + }; + const real_t Fx3_1 { (static_cast(prtls.i3(p) > prtls.i3_prev(p)) + + dxp_r_3 - prtls.dx3_prev(p)) * coeff * inv_dt }; - const real_t Fx2_2 { (static_cast( - i2(p) - i2_prev(p) - - static_cast(i2(p) > i2_prev(p))) + - dx2(p) - dxp_r_2) * - coeff * inv_dt }; - - if constexpr (D == Dim::_2D) { - const real_t Fx3_1 { HALF * vp[2] * coeff }; - const real_t Fx3_2 { HALF * vp[2] * coeff }; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 * - (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * - (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; - } else { - const auto dxp_r_3 { static_cast(i3(p) == i3_prev(p)) * - (dx3(p) + dx3_prev(p)) * - static_cast(INV_2) }; - const real_t Wx3_1 { INV_2 * (dxp_r_3 + dx3_prev(p) + - static_cast(i3(p) > i3_prev(p))) }; - const real_t Wx3_2 { INV_2 * (dx3(p) + dxp_r_3 + - static_cast( - static_cast(i3(p) > i3_prev(p)) + - i3_prev(p) - i3(p))) }; - const real_t Fx3_1 { (static_cast(i3(p) > i3_prev(p)) + - dxp_r_3 - dx3_prev(p)) * - coeff * inv_dt }; - const real_t Fx3_2 { (static_cast( - i3(p) - i3_prev(p) - - static_cast(i3(p) > i3_prev(p))) + - dx3(p) - dxp_r_3) * - coeff * inv_dt }; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * Wx2_1 * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; + const real_t Fx3_2 { + (static_cast(prtls.i3(p) - prtls.i3_prev(p) - + static_cast(prtls.i3(p) > prtls.i3_prev(p))) + + prtls.dx3(p) - dxp_r_3) * + coeff * inv_dt + }; + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * Wx2_1 * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * Wx3_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1 * Wx3_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * Wx2_2 * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * Wx3_2); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2 * Wx3_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1 * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * Wx3_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * Wx1_1 * Wx3_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2 * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * Wx3_2); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * Wx1_2 * Wx3_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); + } + } + } else if constexpr ((O >= 1u) and (O <= 11u)) { + + // shape function in dim1 -> always required + real_t iS_x1[O + 2], fS_x1[O + 2]; + // indices of the shape function + int i1_min, i1_max; + + // call shape function + prtl_shape::for_deposit(prtls.i1_prev(p), + static_cast(prtls.dx1_prev(p)), + prtls.i1(p), + static_cast(prtls.dx1(p)), + i1_min, + i1_max, + iS_x1, + fS_x1); + + if constexpr (D == Dim::_1D) { + // (1D): fused Esirkepov, no [O+2] temporaries. + // jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) + // = -Qdx1dt * P1[i] (Eq. 38, 1D) + // Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]) (computed inline) + const real_t Qdx1dt = coeff * inv_dt; + const real_t QVx2 = coeff * vp[1]; + const real_t QVx3 = coeff * vp[2]; + + // account for ghost cells + i1_min += N_GHOSTS; + i1_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + + // Current update — fused over the union line so the J cell + // stays L1-resident across the 3 component atomic_adds. + real_t P1 = ZERO; + for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]); + if (i < di_x1) { + deposit_at(gi, cur::jx1, -Qdx1dt * P1); } + deposit_at(gi, cur::jx2, QVx2 * Wx23); + deposit_at(gi, cur::jx3, QVx3 * Wx23); } - } else if constexpr ((O >= 1u) and (O <= 11u)) { + + } else if constexpr (D == Dim::_2D) { // shape function in dim1 -> always required - real_t iS_x1[O + 2], fS_x1[O + 2]; + real_t iS_x2[O + 2], fS_x2[O + 2]; // indices of the shape function - int i1_min, i1_max; + int i2_min, i2_max; // call shape function - prtl_shape::for_deposit(i1_prev(p), - static_cast(dx1_prev(p)), - i1(p), - static_cast(dx1(p)), - i1_min, - i1_max, - iS_x1, - fS_x1); - - if constexpr (D == Dim::_1D) { - // define weight vectors - real_t Wx1[O + 2]; - real_t Wx23[O + 2]; - - // Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - // Esirkepov 2001, Eq. 38 for 1D case - Wx1[i] = fS_x1[i] - iS_x1[i]; - Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]); - } - - // contribution within the shape function stencil - real_t jx1[O + 2]; - - // prefactors for j update - const real_t Qdx1dt = coeff * inv_dt; - const real_t QVx2 = coeff * vp[1]; - const real_t QVx3 = coeff * vp[2]; - - // Calculate current contribution - jx1[0] = -Qdx1dt * Wx1[0]; -#pragma unroll - for (int i = 1; i < O + 2; ++i) { - jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i]; - } - - // account for ghost cells - i1_min += N_GHOSTS; - i1_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - J_acc(i1_min + i, cur::jx1) += jx1[i]; - } - - for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i]; - } - - for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i]; - } - - } else if constexpr (D == Dim::_2D) { - - // shape function in dim1 -> always required - real_t iS_x2[O + 2], fS_x2[O + 2]; - // indices of the shape function - int i2_min, i2_max; - - // call shape function - prtl_shape::for_deposit(i2_prev(p), - static_cast(dx2_prev(p)), - i2(p), - static_cast(dx2(p)), - i2_min, - i2_max, - iS_x2, - fS_x2); - - // define weight tensors - real_t Wx1[O + 2][O + 2]; - real_t Wx2[O + 2][O + 2]; - real_t Wx3[O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - // Esirkepov 2001, Eq. 38 (simplified) - Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]); - - Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]); - - Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) + - iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i])); - } - } - - // contribution within the shape function stencil - real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2]; - - // prefactors for j update - const real_t Qdx1dt = coeff * inv_dt; - const real_t Qdx2dt = coeff * inv_dt; - const real_t QVx3 = coeff * vp[2]; - - // Calculate current contribution - - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[0][j] = -Qdx1dt * Wx1[0][j]; - } - -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j]; - } - } - - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][0] = -Qdx2dt * Wx2[i][0]; - } - -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j]; - } - } - - // account for ghost cells - i1_min += N_GHOSTS; - i2_min += N_GHOSTS; - i1_max += N_GHOSTS; - i2_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - const int di_x2 = i2_max - i2_min; - - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j]; + prtl_shape::for_deposit(prtls.i2_prev(p), + static_cast(prtls.dx2_prev(p)), + prtls.i2(p), + static_cast(prtls.dx2(p)), + i2_min, + i2_max, + iS_x2, + fS_x2); + + /** + * (2D): fused Esirkepov, no [O+2]^2 temporaries. + * + * Esirkepov 2001 Eq. 38 (simplified) is separable: with + * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and + * P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']), + * jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j]) + * jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i]) + * Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i]) + * + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) ) + * with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the + * old explicit Wx/jx tensors up to FP reassociation; + * charge-conserving by construction. Prefix sums carried as + * running scalars, so the only per-thread state is the + * existing 1D shape arrays. + */ + const real_t QVx3 = coeff * vp[2]; + // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * HALF; + + // account for ghost cells + i1_min += N_GHOSTS; + i2_min += N_GHOSTS; + i1_max += N_GHOSTS; + i2_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + const int di_x2 = i2_max - i2_min; + + // Current update — fused over the union plane so the J cell + // line stays L1-resident across the 3 component atomic_adds. + real_t P1 = ZERO; + for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t iSx1 = iS_x1[i]; + const real_t fSx1 = fS_x1[i]; + const real_t A1 = fSx1 + iSx1; // jx2 cross-factor + real_t P2 = ZERO; + for (int j = 0; j <= di_x2; ++j) { + P2 += fS_x2[j] - iS_x2[j]; + const int gj = i2_min + j; + const real_t iSx2 = iS_x2[j]; + const real_t fSx2 = fS_x2[j]; + if (i < di_x1) { + deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2)); } - } - - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j < di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j]; + if (j < di_x2) { + deposit_at(gi, gj, cur::jx2, cf * P2 * A1); } + const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) + + iSx2 * (HALF * fSx1 + iSx1)); + deposit_at(gi, gj, cur::jx3, QVx3 * Wx3); } + } - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j]; - } - } + } else if constexpr (D == Dim::_3D) { + // shape function in dim2 + real_t iS_x2[O + 2], fS_x2[O + 2]; + // indices of the shape function + int i2_min, i2_max; + // call shape function + prtl_shape::for_deposit(prtls.i2_prev(p), + static_cast(prtls.dx2_prev(p)), + prtls.i2(p), + static_cast(prtls.dx2(p)), + i2_min, + i2_max, + iS_x2, + fS_x2); + + // shape function in dim3 + real_t iS_x3[O + 2], fS_x3[O + 2]; + // indices of the shape function + int i3_min, i3_max; - } else if constexpr (D == Dim::_3D) { - // shape function in dim2 - real_t iS_x2[O + 2], fS_x2[O + 2]; - // indices of the shape function - int i2_min, i2_max; - // call shape function - prtl_shape::for_deposit(i2_prev(p), - static_cast(dx2_prev(p)), - i2(p), - static_cast(dx2(p)), - i2_min, - i2_max, - iS_x2, - fS_x2); - - // shape function in dim3 - real_t iS_x3[O + 2], fS_x3[O + 2]; - // indices of the shape function - int i3_min, i3_max; - - // call shape function - prtl_shape::for_deposit(i3_prev(p), - static_cast(dx3_prev(p)), - i3(p), - static_cast(dx3(p)), - i3_min, - i3_max, - iS_x3, - fS_x3); - - // define weight tensors - real_t Wx1[O + 2][O + 2][O + 2]; - real_t Wx2[O + 2][O + 2][O + 2]; - real_t Wx3[O + 2][O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - // Esirkepov 2001, Eq. 31 - Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) * - ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) + - HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k])); - - Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) * - (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] + - HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k])); - - Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) * - (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] + - HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i])); + // call shape function + prtl_shape::for_deposit(prtls.i3_prev(p), + static_cast(prtls.dx3_prev(p)), + prtls.i3(p), + static_cast(prtls.dx3(p)), + i3_min, + i3_max, + iS_x3, + fS_x3); + + /** + * fused Esirkepov, no (O+2)^3 temporaries. + * + * The Esirkepov 3D current (2001, Eq. 31) is separable: with + * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise + * P2[j], P3[k]) the cumulative-sum currents collapse to + * + * jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k) + * jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k) + * jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j) + * + * with the 1D-shape cross-factors + * + * G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k] + * + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k]) + * H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k] + * + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k]) + * F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j] + * + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i]) + * + * and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the + * same value as the old explicit Wx/jx tensors up to + * floating-point reassociation: charge-conserving by + * construction (the Esirkepov decomposition is exact). The + * prefix sums are carried as running scalars in the deposit + * loop, so the only per-thread state is the existing 1D shape + * arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs + * and no private-memory tensor traffic). + */ + + // account for ghost cells + i1_min += N_GHOSTS; + i2_min += N_GHOSTS; + i3_min += N_GHOSTS; + i1_max += N_GHOSTS; + i2_max += N_GHOSTS; + i3_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + const int di_x2 = i2_max - i2_min; + const int di_x3 = i3_max - i3_min; + + // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * THIRD; + + /** + * Current update — fused over the union cube so the J cell + * line stays L1-resident across the 3 component atomic_adds. + * Per-cell branches on (i 11 not supported. Seriously. " + "What are you even doing here? Entity already goes to 11!"); + } + } - // Calculate current contribution + /** + * @brief Flat current-deposition kernel. + * + * One thread per particle (RangePolicy). Writes are coalesced through a + * `Kokkos::Experimental::ScatterView` to avoid per-thread atomics on + * global J. Constructor signature is unchanged from prior versions — + * `engines/srpic/currents.h` continues to call it identically. + */ + template + class DepositCurrents_kernel { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + static constexpr auto D = M::Dim; - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[0][j][k] = -Qdxdt * Wx1[0][j][k]; - } - } + scatter_ndfield_t J; + const ParticleArrays prtls; + const M metric; + const real_t charge, inv_dt; -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k]; - } - } - } + public: + DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, + const ParticleArrays& prtls, + const M& metric, + real_t charge, + const real_t dt) + : J { scatter_cur } + , prtls { prtls } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } { + raise::ErrorIf( + (O == 2u and N_GHOSTS < 2), + "Order of interpolation is 2, but number of ghost cells is < 2", + HERE); + } - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][0][k] = -Qdydt * Wx2[i][0][k]; - } - } + Inline auto operator()(prtlidx_t p) const -> void { + auto J_acc = J.access(); + if constexpr (D == Dim::_1D) { + DepositOneParticle(p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int comp, real_t v) { + J_acc(g_i1, comp) += v; + }); + } else if constexpr (D == Dim::_2D) { + DepositOneParticle(p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int comp, real_t v) { + J_acc(g_i1, g_i2, comp) += v; + }); + } else if constexpr (D == Dim::_3D) { + DepositOneParticle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { + J_acc(g_i1, g_i2, g_i3, comp) += v; + }); + } + } + }; -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k]; - } - } - } + /** + * @brief Tiled current-deposition kernel. + * + * One team per spatial tile (`league_size = ntiles_total`). Each team + * accumulates particle contributions into a per-team scratch buffer of + * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = O + 1` cells per + * side. Scratch atomics live in SLM (PVC: ~5–10 cycles per + * `atomic_add`); the global J is touched only once per scratch cell at + * flush time. Compared with the flat scatter-view kernel: + * - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile + * instead of (stencil writes per particle × particles) + * - per-particle stencil writes are tile-local (SLM) instead of + * scattering through global HBM + * + * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for + * A/B benchmarking against the flat scatter-view kernel — its narrow + * stencil typically makes scratch alloc/zero/flush overhead a + * regression there, but it's good to be able to measure the + * crossover. To revert and use flat for zigzag-only builds, change + * the dispatch in `engines/srpic/currents.h` from + * `#if defined(TEAM_POLICY)` to + * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`. + * + * Particle iteration order is governed by `tile_offsets`: tile `t` + * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort. + * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping + * the SoA arrays consistent with that. + * + * **Halo sizing and escape valve.** Sort runs at the end of a step + * (see `srpic.hpp`); a particle is pushed once per step thereafter, so + * its `min(i, i_prev)` may differ from the bin key by one cell of drift + * per step elapsed since the last sort. The scratch HALO is + * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag + * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with + * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the `team_policy_drift` + * CMake knob (macro TEAM_POLICY_DRIFT) — the number of cells a particle + * may drift between two sorts that the halo is sized to absorb — and `1` + * by default (the every-step-sorted common case). It is independent of + * the sort cadence, which is set at runtime via `spatial_sorting_interval`; + * particles that drift past the halo take the escape valve below. + * + * Correctness does **not** depend on the halo size. Any particle whose + * full stencil escapes the scratch tile — because it drifted further + * than `DRIFT`, was reordered far from its tile by a no-sort-step + * `CommunicateParticles`, or because the halo is otherwise undersized — + * is deposited *as a whole* via a direct, bounds-clipped + * `Kokkos::atomic_add` on the global J view (the per-particle escape + * valve). Each particle's stencil is therefore deposited exactly once + * (entirely to SLM scratch when it fits, entirely to global J when it + * does not), so the path is charge-conserving; it is merely slower per + * write. Sizing `DRIFT` to the typical between-sort drift keeps the + * common case in fast SLM; sorting less often (or drifting past the + * halo) only costs escape-valve traffic, never accuracy. + * + * **Partition coverage.** The team iteration covers only the particles + * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped + * to the live `npart`. Particles appended past the partition since the + * sort are not seen here; the launcher (`engines/srpic/currents.h`) + * deposits that tail with the flat kernel so every active particle is + * covered exactly once regardless of sort cadence. + */ + template + class DepositCurrentsTiled_kernel { + static_assert(O <= 11u, "Shape order O must be <= 11"); + static_assert(T_TILE > 0u, "T_TILE must be positive"); + static constexpr auto D = M::Dim; - // jx3 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx3[i][j][0] = -Qdydt * Wx3[i][j][0]; - } - } + /** + * Per-side scratch halo, derived from first principles. + * + * total halo = stencil_reach(O) + drift_between_sort_and_deposit + * + * stencil_reach(O) — maximum cells the deposit writes ABOVE + * min(i, i_prev) under CFL |v * dt/dx| <= 1/2: + * - O == 0 (zigzag): writes { i_prev, i_prev+1, i, i+1 } => +2 + * - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide + * array but only O+1 entries are non-zero, and the union + * window satisfies `i_max - i_min <= O+1` (see + * particle_shapes.hpp::for_deposit). The genuine one-sided + * reach above min(i, i_prev) is therefore O, not O+1 — the + * old `O+1` carried one extra cell of conservative padding + * on top of the already-conservative drift term below. + * + * drift — sort runs at end-of-step (see srpic.hpp), so a particle is + * pushed once per step between its last sort and a given deposit. With + * a runtime sort interval of `K` (spatial_sorting_interval), a particle + * drifts at most `K` cells (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) + * before the next sort. The `team_policy_drift` CMake knob (macro + * TEAM_POLICY_DRIFT) sets DRIFT independently of `K`, sizing the halo so + * a particle that drifts up to DRIFT cells still deposits inside its + * tile scratch. DRIFT defaults to 1 (the sorted-every-step common case); + * any particle that drifts past the halo (e.g. a larger sort interval, + * or a CFL excursion) takes the per-particle global-J escape valve + * below — correct, only slower (see + * the class doc-comment for why this is charge-conserving). + */ + static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast(O); + // One-sided footprint reach for the per-particle escape valve: the + // deposit writes at most this many cells above max(i,i_prev) (and fewer + // below min), so [min - FOOTPRINT_REACH, max + FOOTPRINT_REACH] in cell + // coords conservatively bounds every deposited cell for any order + // (Esirkepov reaches max+O; O=0 zigzag reaches max+1). + static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast(O); +#if defined(TEAM_POLICY_DRIFT) + static constexpr int DRIFT = static_cast(TEAM_POLICY_DRIFT); +#else + static constexpr int DRIFT = 1; +#endif + static constexpr int HALO = STENCIL_REACH + DRIFT; + static constexpr int TE = static_cast(T_TILE) + 2 * HALO; + + using exec_space = Kokkos::DefaultExecutionSpace; + using team_policy = Kokkos::TeamPolicy; + using member_t = typename team_policy::member_type; + + ndfield_t J; + ParticleArrays prtls; + const M metric; + const real_t charge, inv_dt; + + // Tile metadata produced by SortSpatially. + array_t tile_offsets; + ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; + ncells_t total_tiles { 0u }; -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 1; k < O + 2; ++k) { - jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k]; - } - } - } + /** + * Current active-particle count. `tile_offsets` partitions only the + * particles that existed at the last sort ([0, layout.npart_partitioned)); + * `npart` may differ if the pusher dead-tagged particles in place since. + * Each team clamps its `[tile_offsets(t), tile_offsets(t+1))` slice to + * `npart` so stale slots past the live array are never read. Particles + * appended *beyond* the partition (npart > npart_partitioned) are not seen + * by any team here — the launcher deposits that tail separately. + */ + npart_t npart { 0u }; - // account for ghost cells - i1_min += N_GHOSTS; - i2_min += N_GHOSTS; - i3_min += N_GHOSTS; - i1_max += N_GHOSTS; - i2_max += N_GHOSTS; - i3_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - const int di_x2 = i2_max - i2_min; - const int di_x3 = i3_max - i3_min; - - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - for (int k = 0; k <= di_x3; ++k) { - J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx1) += jx1[i][j][k]; - } - } - } + /** + * J's full storage extent including all ghost cells. Used to clip + * the cooperative flush so that a partial tile at the high end of + * the domain does not over-write past the J view. + */ + int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 }; - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j < di_x2; ++j) { - for (int k = 0; k <= di_x3; ++k) { - J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx2) += jx2[i][j][k]; - } - } - } + public: + DepositCurrentsTiled_kernel(const ndfield_t& cur, + const ParticleArrays& prtls, + const M& metric, + real_t charge, + real_t dt, + const TileLayout& layout, + npart_t npart) + : J { cur } + , prtls { prtls } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } + , tile_offsets { layout.tile_offsets } + , ntx1 { layout.ntiles_per_axis[0] } + , ntx2 { layout.ntiles_per_axis[1] } + , ntx3 { layout.ntiles_per_axis[2] } + , total_tiles { layout.ntiles_total } + , npart { npart } { + raise::ErrorIf( + layout.tile_size != T_TILE, + "Tiled deposit launched with mismatched T_TILE and runtime tile_size", + HERE); + /** + * @note: HALO is allowed to exceed N_GHOSTS. The cooperative + * scratch→J flush and the per-particle escape valve both bounds-clip + * their writes against `j_ext*` so writes that would land past J's + * ghost stripe are silently dropped (they only ever come from a + * particle whose stencil reaches into the domain ghost region, where + * CommunicateFields will re-supply the contribution). + */ + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + j_ext1 = static_cast(cur.extent(0)); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + j_ext2 = static_cast(cur.extent(1)); + } + if constexpr (D == Dim::_3D) { + j_ext3 = static_cast(cur.extent(2)); + } + } - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - for (int k = 0; k < di_x3; ++k) { - J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx3) += jx3[i][j][k]; - } - } - } + /** + * @brief Per-team scratch size in bytes. Used by the launcher to set + * `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`. + */ + static constexpr size_t scratch_bytes() { + // The component count (3) is a *static* extent of scratch_ndfield_t + // (View / **[3] / ***[3]), so shmem_size() takes only the + // dynamic spatial extents — passing 3 as well trips Kokkos' + // `rank_dynamic != number of arguments` abort. This matches the + // scratch View construction below, which also omits the 3. + if constexpr (D == Dim::_1D) { + return scratch_ndfield_t::shmem_size(TE); + } else if constexpr (D == Dim::_2D) { + return scratch_ndfield_t::shmem_size(TE, TE); + } else { + return scratch_ndfield_t::shmem_size(TE, TE, TE); + } + } - } // dim - } else { // order - raise::KernelError( - HERE, - "Unsupported interpolation order. O > 11 not supported. Seriously. " - "What are you even doing here? Entity already goes to 11!"); + Inline void operator()(const member_t& team) const { + const auto tile_id = static_cast(team.league_rank()); + /** + * Tile coordinates (tile-grid indices) → tile origin in **active** + * cell coords (no ghost offset). Using ncells_t to match the linearised + * tile index produced by SortSpatially. + */ + ncells_t tx1 = 0, tx2 = 0, tx3 = 0; + if constexpr (D == Dim::_1D) { + tx1 = tile_id; + } else if constexpr (D == Dim::_2D) { + tx1 = tile_id / ntx2; + tx2 = tile_id - tx1 * ntx2; + } else { + const auto plane = ntx2 * ntx3; + tx1 = tile_id / plane; + const auto rem = tile_id - tx1 * plane; + tx2 = rem / ntx3; + tx3 = rem - tx2 * ntx3; + } + /** + * origin_active = lowest active-cell index in the tile (no ghost). + * origin_J = same value translated into J's storage coordinate + * (i.e. plus N_GHOSTS). + * origin_J_low = J coordinate of scratch index 0 (i.e. origin_J - HALO). + * local index `li` in scratch ↔ global J index `gi = li + origin_J_low`. + */ + const int origin_J1_low = static_cast(tx1 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J2_low = static_cast(tx2 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J3_low = static_cast(tx3 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + + // Allocate scratch and cooperatively zero-fill it. + if constexpr (D == Dim::_1D) { + scratch_ndfield_t scr { team.team_scratch(0), TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3), + [&](ncells_t idx) { + const auto li = idx / 3; + const auto c = idx - li * 3; + scr(li, c) = ZERO; + }); + team.team_barrier(); + + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](prtlidx_t p) { + /** + * Per-particle escape valve: route the WHOLE particle to the + * global J view when its Esirkepov footprint does not fit + * inside this tile's scratch window [0,TE); only particles + * fully inside the tile touch SLM scratch. A particle drifts + * out of its tile when sorted less often than every step. + * + * The conservative footprint bound in cell coords, + * [min(i,i_prev) - O, max(i,i_prev) + O], covers + * prtl_shape::for_deposit for any order (i_min >= + * min-floor(O/2), i_max <= max+O), so when `to_scratch` is true + * every deposited cell is provably in [0,TE) and the scratch write + * needs no per-cell bounds test. The global path bounds-clips + * against J's storage extent (writes past the ghost stripe are + * re-supplied by SynchronizeFields(J)). + */ + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE); + DepositOneParticle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int comp, real_t v) { + if (to_scratch) { + Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v); + } else if (g_i1 >= 0 and g_i1 < j_ext1) { + // Bounds-clip the escape-valve write against J's storage, + // exactly as the cooperative flush does. Cells past the + // ghost stripe are re-supplied by SynchronizeFields(J); an + // unclipped write here faults the GPU (an escaped boundary + // particle's stencil can reach past j_ext1). + Kokkos::atomic_add(&J(g_i1, comp), v); + } + }); + }); + team.team_barrier(); + + /** + * Cooperative flush of scratch to global J. Bounds-clip against + * the J view extent in case a partial high-end tile (or non-zero + * halo at domain edges) would otherwise write past J. + */ + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const auto li = idx / 3; + const auto c = idx - li * 3; + const auto gi = li + origin_J1_low; + if (gi < 0 or gi >= j_ext1) { + return; + } + const real_t v = scr(li, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, c), v); + } + }); + } else if constexpr (D == Dim::_2D) { + scratch_ndfield_t scr { team.team_scratch(0), TE, TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3), + [&](const int idx) { + const auto lij = idx / 3; + const auto c = idx - lij * 3; + const auto li = lij / TE; + const auto lj = lij - li * TE; + scr(li, lj, c) = ZERO; + }); + team.team_barrier(); + + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](prtlidx_t p) { + // See 1D branch for rationale: route the whole particle to the + // global escape valve unless its full footprint fits in scratch. + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J2_low; + const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J2_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and + hi2 < TE); + DepositOneParticle( + p, + prtls, + metric, + charge, + inv_dt, + [&](const int g_i1, const int g_i2, int comp, real_t v) { + if (to_scratch) { + Kokkos::atomic_add( + &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp), + v); + } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and + g_i2 < j_ext2) { + // Bounds-clip as the cooperative flush does; an unclipped + // escape-valve write faults the GPU when an escaped boundary + // particle's stencil reaches past j_ext. + Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); + } + }); + }); + team.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3), + [&](const int idx) { + const auto lij = idx / 3; + const auto c = idx - lij * 3; + const auto li = lij / TE; + const auto lj = lij - li * TE; + const auto gi = li + origin_J1_low; + const auto gj = lj + origin_J2_low; + if ((gi < 0 or gi >= j_ext1) or + (gj < 0 or gj >= j_ext2)) { + return; + } + const real_t v = scr(li, lj, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, c), v); + } + }); + } else if constexpr (D == Dim::_3D) { + scratch_ndfield_t scr { team.team_scratch(0), TE, TE, TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3), + [&](const int idx) { + const auto lijk = idx / 3; + const auto c = idx - lijk * 3; + const auto li = lijk / (TE * TE); + const auto rem = lijk - li * TE * TE; + const auto lj = rem / TE; + const auto lk = rem - lj * TE; + scr(li, lj, lk, c) = ZERO; + }); + team.team_barrier(); + + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](prtlidx_t p) { + // See 1D branch for rationale: route the whole particle to the + // global escape valve unless its full footprint fits in scratch. + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p); + const int i3c = prtls.i3(p), i3p = prtls.i3_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J2_low; + const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J2_low; + const int lo3 = (i3c < i3p ? i3c : i3p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J3_low; + const int hi3 = (i3c > i3p ? i3c : i3p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J3_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and + hi2 < TE and lo3 >= 0 and hi3 < TE); + DepositOneParticle( + p, + prtls, + metric, + charge, + inv_dt, + [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) { + if (to_scratch) { + Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, + g_i2 - origin_J2_low, + g_i3 - origin_J3_low, + comp), + v); + } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and + g_i2 < j_ext2 and g_i3 >= 0 and g_i3 < j_ext3) { + // Bounds-clip as the cooperative flush does; an unclipped + // escape-valve write faults the GPU when an escaped boundary + // particle's stencil reaches past j_ext. + Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); + } + }); + }); + team.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + const int gk = lk + origin_J3_low; + if ((gi < 0 or gi >= j_ext1) or + (gj < 0 or gj >= j_ext2) or + (gk < 0 or gk >= j_ext3)) { + return; + } + const real_t v = scr(li, lj, lk, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, gk, c), v); + } + }); } } }; + } // namespace kernel #undef i_di_to_Xi diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt index d6dc295af..9a2e2865e 100644 --- a/tests/framework/CMakeLists.txt +++ b/tests/framework/CMakeLists.txt @@ -44,6 +44,12 @@ else() gen_test(particles_sort false) endif() +# team_policy X-3: per-backend sort_by_key permutation test (only built +# when the compile-time team_policy toggle is on). +if(${team_policy}) + gen_test(sort_by_key false) +endif() + gen_test(fields false) gen_test(grid_mesh false) if(${DEBUG}) diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp index 1ba10d828..a2b82456e 100644 --- a/tests/framework/particles_sort.cpp +++ b/tests/framework/particles_sort.cpp @@ -31,12 +31,14 @@ auto main(int argc, char* argv[]) -> int { ntt::EmissionType::NONE, 2u, 1u); - auto& i1_p = prtls.i1; - auto& i2_p = prtls.i2; - auto& tag_p = prtls.tag; - auto& weight_p = prtls.weight; - auto& pld_r = prtls.pld_r; - auto& pld_i = prtls.pld_i; + auto& i1_p = prtls.i1; + auto& i2_p = prtls.i2; + auto& i1_prev_p = prtls.i1_prev; + auto& i2_prev_p = prtls.i2_prev; + auto& tag_p = prtls.tag; + auto& weight_p = prtls.weight; + auto& pld_r = prtls.pld_r; + auto& pld_i = prtls.pld_i; Kokkos::parallel_for( "InitParticles", prtls.maxnpart(), @@ -61,9 +63,14 @@ auto main(int argc, char* argv[]) -> int { i2_p(p) = 23u; weight_p(p) = 3.0; } - pld_r(p, 0) = weight_p(p) + static_cast(0.5); - pld_r(p, 1) = weight_p(p) + static_cast(10.5); - pld_i(p, 0) = static_cast(weight_p(p) + 10.0); + // team_policy keys on min(i, i_prev); without a meaningful + // i_prev every key would collapse to 0. Set i_prev = i so the + // tile key reduces to the particle's current cell. + i1_prev_p(p) = i1_p(p); + i2_prev_p(p) = i2_p(p); + pld_r(p, 0) = weight_p(p) + static_cast(0.5); + pld_r(p, 1) = weight_p(p) + static_cast(10.5); + pld_i(p, 0) = static_cast(weight_p(p) + 10.0); } else { tag_p(p) = ntt::ParticleTag::dead; } @@ -75,39 +82,92 @@ auto main(int argc, char* argv[]) -> int { prtls.SortSpatially(grid); + auto i1_h = Kokkos::create_mirror_view(prtls.i1); + auto i2_h = Kokkos::create_mirror_view(prtls.i2); + auto tag_h = Kokkos::create_mirror_view(prtls.tag); auto weight_h = Kokkos::create_mirror_view(prtls.weight); auto pld_r_h = Kokkos::create_mirror_view(prtls.pld_r); auto pld_i_h = Kokkos::create_mirror_view(prtls.pld_i); + Kokkos::deep_copy(i1_h, prtls.i1); + Kokkos::deep_copy(i2_h, prtls.i2); + Kokkos::deep_copy(tag_h, prtls.tag); Kokkos::deep_copy(weight_h, prtls.weight); Kokkos::deep_copy(pld_r_h, prtls.pld_r); Kokkos::deep_copy(pld_i_h, prtls.pld_i); - for (auto p { 0u }; p < 75u; ++p) { - if (p < 16u) { - raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE); - } else if (p < 33u) { - raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE); - } else if (p < 46u) { - raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE); - } else if (p < 59u) { - raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE); - } else { - raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE); - } - if (p < 59u) { - raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast(0.5), - "error in sorting particle real payload 0", - HERE); - raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast(10.5), - "error in sorting particle real payload 1", + // Tile geometry, mirroring sort::PositionToTileIndex. T = 1 (no + // team_policy) reproduces the legacy per-cell ordering. +#if defined(TEAM_POLICY) + const ncells_t T = static_cast(TEAM_POLICY_TILE_SIZE); +#else + const ncells_t T = 1u; +#endif + const auto na = grid.n_active(); + const ncells_t ntx2 = (na[1] + T - 1u) / T; + const auto tile_of = [&](int a, int b) -> ncells_t { + return (static_cast(a) / T) * ntx2 + + (static_cast(b) / T); + }; + + // SortSpatially is order-by-tile, not order-by-cell: assert the + // invariants that hold for any tile size rather than a hardwired + // permutation. (1) alive particles form a prefix sorted by + // non-decreasing tile index; (2) every SoA member is permuted by the + // *same* permutation, so each alive slot still satisfies + // pld == f(weight); (3) no alive particle is lost. Only [0, npart()) + // is defined after a sort. The team_policy path compacts — it drops + // the dead, so npart() equals the alive count and [0, npart()) is + // entirely alive; the legacy (non-team) path keeps the dead as a + // weight == -1 suffix, leaving npart() unchanged. Iterating + // [0, npart()) exercises both: the prefix-sorted / no-alive-after-dead + // checks below hold either way. +#if defined(TEAM_POLICY) + raise::ErrorIf(prtls.npart() != 59u, + "team_policy sort must compact: npart() should equal " + "the alive count", + HERE); +#else + raise::ErrorIf(prtls.npart() != 66u, + "legacy sort should leave npart() unchanged", + HERE); +#endif + bool seen_dead = false; + bool have_prev = false; + ncells_t prev_tile = 0u; + npart_t n_alive_obs = 0u; + for (auto p { 0u }; p < prtls.npart(); ++p) { + if (tag_h(p) != ntt::ParticleTag::alive) { + seen_dead = true; + raise::ErrorIf(weight_h(p) != -1.0, + "dead particle has unexpected weight", HERE); - raise::ErrorIf( - pld_i_h(p, 0) != - static_cast(weight_h(p) + static_cast(10.0)), - "error in sorting particle integer payload 0", - HERE); + continue; } + raise::ErrorIf(seen_dead, + "alive particle after a dead one (not sorted to prefix)", + HERE); + const auto tile = tile_of(i1_h(p), i2_h(p)); + raise::ErrorIf(have_prev && (tile < prev_tile), + "alive particles not sorted by tile index", + HERE); + prev_tile = tile; + have_prev = true; + ++n_alive_obs; + raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast(0.5), + "error in sorting particle real payload 0", + HERE); + raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast(10.5), + "error in sorting particle real payload 1", + HERE); + raise::ErrorIf( + pld_i_h(p, 0) != + static_cast(weight_h(p) + static_cast(10.0)), + "error in sorting particle integer payload 0", + HERE); } + raise::ErrorIf(n_alive_obs != 59u, + "wrong number of alive particles after sort", + HERE); } { // 3D @@ -129,11 +189,14 @@ auto main(int argc, char* argv[]) -> int { ntt::EmissionType::NONE, 0u, 0u); - auto& i1_p = prtls.i1; - auto& i2_p = prtls.i2; - auto& i3_p = prtls.i3; - auto& tag_p = prtls.tag; - auto& weight_p = prtls.weight; + auto& i1_p = prtls.i1; + auto& i2_p = prtls.i2; + auto& i3_p = prtls.i3; + auto& i1_prev_p = prtls.i1_prev; + auto& i2_prev_p = prtls.i2_prev; + auto& i3_prev_p = prtls.i3_prev; + auto& tag_p = prtls.tag; + auto& weight_p = prtls.weight; Kokkos::parallel_for( "InitParticles", prtls.maxnpart(), @@ -167,6 +230,11 @@ auto main(int argc, char* argv[]) -> int { i3_p(p) = 7u; weight_p(p) = 4.0; } + // see 2D block: i_prev = i so the team_policy tile key reduces + // to the particle's current cell. + i1_prev_p(p) = i1_p(p); + i2_prev_p(p) = i2_p(p); + i3_prev_p(p) = i3_p(p); } else { tag_p(p) = ntt::ParticleTag::dead; } @@ -178,23 +246,73 @@ auto main(int argc, char* argv[]) -> int { prtls.SortSpatially(grid); + auto i1_h = Kokkos::create_mirror_view(prtls.i1); + auto i2_h = Kokkos::create_mirror_view(prtls.i2); + auto i3_h = Kokkos::create_mirror_view(prtls.i3); + auto tag_h = Kokkos::create_mirror_view(prtls.tag); auto weight_h = Kokkos::create_mirror_view(prtls.weight); + Kokkos::deep_copy(i1_h, prtls.i1); + Kokkos::deep_copy(i2_h, prtls.i2); + Kokkos::deep_copy(i3_h, prtls.i3); + Kokkos::deep_copy(tag_h, prtls.tag); Kokkos::deep_copy(weight_h, prtls.weight); - for (auto p { 0u }; p < 75u; ++p) { - if (p < 13u) { - raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE); - } else if (p < 26u) { - raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE); - } else if (p < 39u) { - raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE); - } else if (p < 46u) { - raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE); - } else if (p < 59u) { - raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE); - } else { - raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE); + + // Same invariants as the 2D block (no payloads here): alive prefix + // sorted by non-decreasing tile index, alive count preserved. The + // team_policy path compacts the dead away (npart() == alive count); + // the legacy path keeps them as a weight == -1 suffix. T = 1 + // reproduces the legacy per-cell order. +#if defined(TEAM_POLICY) + const ncells_t T = static_cast(TEAM_POLICY_TILE_SIZE); +#else + const ncells_t T = 1u; +#endif + const auto na = grid.n_active(); + const ncells_t ntx2 = (na[1] + T - 1u) / T; + const ncells_t ntx3 = (na[2] + T - 1u) / T; + const auto tile_of = [&](int a, int b, int c) -> ncells_t { + return ((static_cast(a) / T) * ntx2 + + (static_cast(b) / T)) * + ntx3 + + (static_cast(c) / T); + }; + +#if defined(TEAM_POLICY) + raise::ErrorIf(prtls.npart() != 59u, + "team_policy sort must compact: npart() should equal " + "the alive count", + HERE); +#else + raise::ErrorIf(prtls.npart() != 66u, + "legacy sort should leave npart() unchanged", + HERE); +#endif + bool seen_dead = false; + bool have_prev = false; + ncells_t prev_tile = 0u; + npart_t n_alive_obs = 0u; + for (auto p { 0u }; p < prtls.npart(); ++p) { + if (tag_h(p) != ntt::ParticleTag::alive) { + seen_dead = true; + raise::ErrorIf(weight_h(p) != -1.0, + "dead particle has unexpected weight", + HERE); + continue; } + raise::ErrorIf(seen_dead, + "alive particle after a dead one (not sorted to prefix)", + HERE); + const auto tile = tile_of(i1_h(p), i2_h(p), i3_h(p)); + raise::ErrorIf(have_prev && (tile < prev_tile), + "alive particles not sorted by tile index", + HERE); + prev_tile = tile; + have_prev = true; + ++n_alive_obs; } + raise::ErrorIf(n_alive_obs != 59u, + "wrong number of alive particles after sort", + HERE); } } catch (const std::exception& e) { diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp new file mode 100644 index 000000000..9ccecc732 --- /dev/null +++ b/tests/framework/sort_by_key.cpp @@ -0,0 +1,113 @@ +/** + * @brief X-3 (team_policy) — sort_by_key permutation test. + * + * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch` + * that is compiled in for the current Kokkos device. For each backend: + * 1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised). + * 2. Call sort_by_key_dispatch. + * 3. Verify that keys[perm[i]] is sorted in non-decreasing order. + * + * Stability is verified for the BinSort and StdSort backends (the others + * promise stability per their documentation but we don't bake that into + * the test). + * + * Built only when `team_policy=ON` at CMake time. + */ +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/error.h" +#include "utils/sort_dispatch.h" +#include "utils/sorting.h" + +#include + +#include +#include + +namespace { + + using namespace ntt; + + template + void test_one_backend(const char* label, Backend tag) { + const std::vector keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u }; + const npart_t n = keys_host_init.size(); + const ncells_t n_max = 6u; // bin range [0, n_max) + + array_t keys { "keys", n }; + auto keys_h = Kokkos::create_mirror_view(keys); + for (npart_t i = 0u; i < n; ++i) { + keys_h(i) = keys_host_init[i]; + } + Kokkos::deep_copy(keys, keys_h); + + prtl_perm_t perm { "perm", n }; + + sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag); + + auto perm_h = Kokkos::create_mirror_view(perm); + Kokkos::deep_copy(perm_h, perm); + + // Validate: keys[perm[0]] <= keys[perm[1]] <= ... + for (npart_t i = 1u; i < n; ++i) { + const auto a = keys_host_init[perm_h(i - 1u)]; + const auto b = keys_host_init[perm_h(i)]; + raise::ErrorIf( + a > b, + std::string("sort_by_key_dispatch produced non-sorted permutation " + "for backend ") + + label, + HERE); + } + + // Validate: perm is a permutation of [0, n). + std::vector seen(n, 0); + for (npart_t i = 0u; i < n; ++i) { + const auto idx = perm_h(i); + raise::ErrorIf(idx >= n, + std::string("permutation index out of range for " + "backend ") + + label, + HERE); + seen[idx] += 1; + } + for (npart_t i = 0u; i < n; ++i) { + raise::ErrorIf(seen[i] != 1, + std::string("permutation not a bijection for backend ") + + label, + HERE); + } + + std::cout << "[OK] sort_by_key_dispatch<" << label << ">: " + << "keys[perm] sorted, perm is a bijection." << std::endl; + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + ntt::GlobalInitialize(argc, argv); + try { + // Always-available backends. + test_one_backend("BinSort", ::sort::backend::BinSort {}); +#if !defined(DEVICE_ENABLED) + test_one_backend("StdSort", ::sort::backend::StdSort {}); +#endif +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + test_one_backend("OneDPL", ::sort::backend::OneDPL {}); +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + test_one_backend("Thrust", ::sort::backend::Thrust {}); +#endif +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + test_one_backend("Rocthrust", ::sort::backend::Rocthrust {}); +#endif + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + ntt::GlobalFinalize(); + return 1; + } + ntt::GlobalFinalize(); + return 0; +} diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp index d0f0a412c..2f926ce28 100644 --- a/tests/global/tiling.cpp +++ b/tests/global/tiling.cpp @@ -1,6 +1,5 @@ #include "arch/kokkos_aliases.h" #include "utils/error.h" -#include "utils/formatting.h" #include "utils/numeric.h" #include "utils/sorting.h" @@ -122,37 +121,27 @@ void test_tiling(const array_t& i1, const auto ntiles = nt1 * nt2 * nt3; - auto position_to_tile_kernel = sort::PositionToTileIndex { - i1, i2, i3, tag, tile_indices, ncells, ts - }; - Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel); - const auto num_ppt = position_to_tile_kernel.num_ppt; + array_t num_ppt { "num_ppt", ntiles }; + Kokkos::parallel_for( + "Tiling", + npart, + sort::PositionToTileIndex { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt }); Kokkos::parallel_for( "Checking", npart, Lambda(prtlidx_t p) { CheckValue(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts); }); - raise::ErrorIf( - num_ppt.extent(0) != ntiles, - fmt::format("num_ppt size does not match number of tiles %u != %u", - num_ppt.extent(0), - ntiles), - HERE); npart_t tot_alive = 0u; Kokkos::parallel_reduce( "CountAliveInTiles", ntiles, - Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); }, + Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); }, tot_alive); - raise::ErrorIf( - tot_alive != npart - ndead, - fmt::format("Error in counting particles per tile: %u != %u - %u", - tot_alive, - npart, - ndead), - HERE); + raise::ErrorIf(tot_alive != npart - ndead, + "Error in counting particles per tile", + HERE); } } diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt index f1438e108..0fe23f578 100644 --- a/tests/kernels/CMakeLists.txt +++ b/tests/kernels/CMakeLists.txt @@ -26,6 +26,9 @@ endfunction() gen_test(faraday_mink) gen_test(ampere_mink) gen_test(deposit) +if(${team_policy}) + gen_test(deposit_tiled) +endif() gen_test(digital_filter) gen_test(particle_moments) gen_test(fields_to_phys) diff --git a/tests/kernels/deposit.cpp b/tests/kernels/deposit.cpp index 24f544202..10095d304 100644 --- a/tests/kernels/deposit.cpp +++ b/tests/kernels/deposit.cpp @@ -142,15 +142,21 @@ void testDeposit(const std::vector& res, auto J_scat = Kokkos::Experimental::create_scatter_view(J); + // The deposit kernel now takes a `ParticleArrays` SoA struct instead of + // the individual per-component arrays. Pack the per-test arrays into one; + // payload (pld_*) members stay default (unused here). + ParticleArrays pa; + pa.i1 = i1, pa.i2 = i2, pa.i3 = i3; + pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev; + pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3; + pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev; + pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3; + pa.phi = phi, pa.weight = weight, pa.tag = tag; + // clang-format off Kokkos::parallel_for("CurrentsDeposit", 10, kernel::DepositCurrents_kernel(J_scat, - i1, i2, i3, - i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, - dx1_prev, dx2_prev, dx3_prev, - ux1, ux2, ux3, - phi, weight, tag, + pa, metric, charge, inv_dt)); // clang-format on diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp new file mode 100644 index 000000000..f058d6f30 --- /dev/null +++ b/tests/kernels/deposit_tiled.cpp @@ -0,0 +1,592 @@ +/** + * @file tests/kernels/deposit_tiled.cpp + * @brief X-1 numerical-equivalence test for the tiled deposit kernel. + * + * Runs the flat (`DepositCurrents_kernel`) and tiled + * (`DepositCurrentsTiled_kernel`) kernels on identical particle SoA inputs + * for shape orders O = 1..11 and asserts that the resulting J array is + * identical cell-by-cell within a small floating-point tolerance. + * + * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test + * matches the per-particle setup used in `deposit.cpp` so that any + * regression in the shared `kernel::deposit::deposit_one_particle` body + * is caught by both tests. + */ + +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/comparators.h" + +#include "metrics/minkowski.h" + +#include "kernels/currents_deposit.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace { + + using namespace ntt; + + void errorIf(bool condition, const std::string& msg) { + if (condition) { + throw std::runtime_error(msg); + } + } + + template + void put_value(const array_t& arr, T value, int i) { + auto h = Kokkos::create_mirror_view(arr); + h(i) = value; + Kokkos::deep_copy(arr, h); + } + + // Pack the per-test SoA arrays into a ParticleArrays — the struct both + // deposit kernels take. Payload (pld_*) members stay default; these are + // 2D Cartesian Minkowski cases, so phi/i3/dx3 are present but unread. + ParticleArrays pack_arrays(const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag) { + ParticleArrays pa; + pa.i1 = i1, pa.i2 = i2, pa.i3 = i3; + pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev; + pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3; + pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev; + pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3; + pa.phi = phi, pa.weight = weight, pa.tag = tag; + return pa; + } + + // Builds tile_offsets for a single-particle test. Particle 0 is alive + // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead + // sentinel and are never referenced by tile_offsets — so the tiled + // kernel never iterates over them. + array_t build_tile_offsets_single_particle(ncells_t ntx1, + ncells_t ntx2, + ncells_t tx1, + ncells_t tx2) { + const ncells_t total_tiles = ntx1 * ntx2; + const ncells_t hot_tile = tx1 * ntx2 + tx2; + array_t offsets("tile_offsets", total_tiles + 1u); + auto h = Kokkos::create_mirror_view(offsets); + for (ncells_t t = 0; t <= total_tiles; ++t) { + h(t) = (t <= hot_tile) ? static_cast(0) + : static_cast(1); + } + Kokkos::deep_copy(offsets, h); + return offsets; + } + + // Buckets ALL `n_alive` particles (slots 0..n_alive-1) into tile 0, + // modelling a maximally-stale tile layout: every particle was "sorted" + // into tile 0 but now sits anywhere in the domain (as happens when the + // SoA drifts / is reordered between sorts). Every particle except the + // few that genuinely live near the origin must therefore take the + // per-particle escape valve to global J. + array_t build_tile_offsets_all_in_tile0(ncells_t total_tiles, + npart_t n_alive) { + array_t offsets("tile_offsets", total_tiles + 1u); + auto h = Kokkos::create_mirror_view(offsets); + h(0) = static_cast(0); + for (ncells_t t = 1; t <= total_tiles; ++t) { + h(t) = n_alive; + } + Kokkos::deep_copy(offsets, h); + return offsets; + } + + // Cell-by-cell comparison of two J fields; throws on mismatch. + void compare_J_fields(const ndfield_t& J_flat, + const ndfield_t& J_tiled, + unsigned short O, + unsigned short T_TILE, + const char* label) { + auto h_flat = Kokkos::create_mirror_view(J_flat); + auto h_tiled = Kokkos::create_mirror_view(J_tiled); + Kokkos::deep_copy(h_flat, J_flat); + Kokkos::deep_copy(h_tiled, J_tiled); + + const real_t eps = static_cast(1.0e-5); + real_t max_diff = ZERO; + int fail_count = 0; + for (ncells_t i = 0; i < h_flat.extent(0); ++i) { + for (ncells_t j = 0; j < h_flat.extent(1); ++j) { + for (int c = 0; c < 3; ++c) { + const real_t a = h_flat(i, j, c); + const real_t b = h_tiled(i, j, c); + const real_t diff = math::fabs(a - b); + const real_t mag = math::max(math::fabs(a), math::fabs(b)); + if (diff > max_diff) { + max_diff = diff; + } + if (diff > eps * math::max(mag, static_cast(1.0))) { + if (fail_count < 5) { + std::cerr << " [" << label << "] J(" << i << "," << j + << ",c=" << c << ") flat=" << a << " tiled=" << b + << " diff=" << diff << '\n'; + } + ++fail_count; + } + } + } + } + if (fail_count > 0) { + std::cerr << "deposit_tiled[" << label << "] FAILED for O=" << O + << " T_TILE=" << T_TILE << " : " << fail_count + << " mismatches; max_diff=" << max_diff << '\n'; + throw std::logic_error("DepositCurrentsTiled_kernel mismatch"); + } + std::cerr << "deposit_tiled[" << label << "] OK O=" << O + << " T_TILE=" << T_TILE << " max_diff=" << max_diff << '\n'; + } + + // Intrinsic charge-conservation check on a single deposited J field. + // Esirkepov/zigzag deposits satisfy the discrete continuity equation, so + // the spatial sum of the discrete divergence div.J = dJx/dx + dJy/dy + // vanishes whenever the summation region encloses every particle's full + // stencil (J == 0 on the region's outer boundary). This is evaluated on + // J_tiled ALONE -- it does not compare against the flat reference -- so it + // certifies the per-particle escape valve deposits each drifted particle's + // stencil as one coherent unit: no cell dropped, duplicated, or split + // between SLM scratch and global J. (The run_drift_case order guard keeps + // every stencil inside [0, j_ext), so the extreme ghost cells stay zero and + // the telescoping boundary flux is genuinely zero rather than clipped.) + // Accumulated in double regardless of build precision to keep the + // tolerance tight. + void check_charge_conservation(const ndfield_t& J, + unsigned short O, + unsigned short T_TILE, + const char* label) { + auto h = Kokkos::create_mirror_view(J); + Kokkos::deep_copy(h, J); + + double sum_div = 0.0; // Sum over the field of div.J (jx1 -> dx, jx2 -> dy). + double abs_tot = 0.0; // Total |J|, sets the relative tolerance scale. + for (ncells_t i = 1; i < h.extent(0); ++i) { + for (ncells_t j = 1; j < h.extent(1); ++j) { + sum_div += (static_cast(h(i, j, 0)) - + static_cast(h(i - 1, j, 0))) + + (static_cast(h(i, j, 1)) - + static_cast(h(i, j - 1, 1))); + } + } + for (ncells_t i = 0; i < h.extent(0); ++i) { + for (ncells_t j = 0; j < h.extent(1); ++j) { + abs_tot += std::fabs(static_cast(h(i, j, 0))) + + std::fabs(static_cast(h(i, j, 1))); + } + } + const double tol = 1.0e-5 * (abs_tot > 1.0 ? abs_tot : 1.0); + if (std::fabs(sum_div) > tol) { + std::cerr << "deposit_tiled[" << label + << "] CHARGE NON-CONSERVED for O=" << O << " T_TILE=" << T_TILE + << " : sum(div.J)=" << sum_div << " tol=" << tol + << " (abs_tot=" << abs_tot << ")\n"; + throw std::logic_error( + "DepositCurrentsTiled_kernel charge non-conservation"); + } + std::cerr << "deposit_tiled[" << label << "] charge-conserved O=" << O + << " T_TILE=" << T_TILE << " sum(div.J)=" << sum_div << '\n'; + } + + template + void run_one_case() { + using metric_t = metric::Minkowski; + constexpr unsigned short nx1 = 50u, nx2 = 50u; + metric_t metric { { nx1, nx2 }, + { { 0.0, 55.0 }, { 0.0, 55.0 } }, + {} }; + + // Particle setup (mirrors deposit.cpp). + const int i0 = 25, j0 = 21, i0f = 24, j0f = 20; + const real_t uz = 2.5; + const prtldx_t dxi = static_cast(0.65); + const prtldx_t dxf = static_cast(0.99); + const prtldx_t dyi = static_cast(0.65); + const prtldx_t dyf = static_cast(0.80); + + array_t i1 { "i1", 10 }; + array_t i2 { "i2", 10 }; + array_t i3 { "i3", 10 }; + array_t i1_prev { "i1_prev", 10 }; + array_t i2_prev { "i2_prev", 10 }; + array_t i3_prev { "i3_prev", 10 }; + array_t dx1 { "dx1", 10 }; + array_t dx2 { "dx2", 10 }; + array_t dx3 { "dx3", 10 }; + array_t dx1_prev { "dx1_prev", 10 }; + array_t dx2_prev { "dx2_prev", 10 }; + array_t dx3_prev { "dx3_prev", 10 }; + array_t ux1 { "ux1", 10 }; + array_t ux2 { "ux2", 10 }; + array_t ux3 { "ux3", 10 }; + array_t phi { "phi", 10 }; + array_t weight { "weight", 10 }; + array_t tag { "tag", 10 }; + const real_t charge = 1.0, dt = 1.0; + + put_value(i1, i0f, 0); + put_value(i2, j0f, 0); + put_value(i1_prev, i0, 0); + put_value(i2_prev, j0, 0); + put_value(dx1, dxf, 0); + put_value(dx2, dyf, 0); + put_value(dx1_prev, dxi, 0); + put_value(dx2_prev, dyi, 0); + put_value(ux1, ZERO, 0); + put_value(ux2, ZERO, 0); + put_value(ux3, uz, 0); + put_value(weight, 1.0, 0); + put_value(tag, ParticleTag::alive, 0); + + // Run the flat kernel. + ndfield_t J_flat { "J_flat", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat); + Kokkos::parallel_for( + "FlatDeposit", + 10, + kernel::DepositCurrents_kernel( + J_scat, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt)); + Kokkos::Experimental::contribute(J_flat, J_scat); + Kokkos::fence("flat deposit done"); + } + + // Run the tiled kernel. Build a TileLayout with one alive particle + // landing in its expected tile (sort key = min(i, i_prev) / T_TILE). + ndfield_t J_tiled { "J_tiled", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + const auto sort_i1 = static_cast( + (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp + const auto sort_i2 = static_cast((j0 < j0f) ? j0 : j0f); + const auto ntx1 = static_cast( + std::ceil(static_cast(nx1) / static_cast(T_TILE))); + const auto ntx2 = static_cast( + std::ceil(static_cast(nx2) / static_cast(T_TILE))); + const auto tx1 = static_cast(sort_i1) / T_TILE; + const auto tx2 = static_cast(sort_i2) / T_TILE; + + TileLayout layout; + layout.ntiles_per_axis[0] = ntx1; + layout.ntiles_per_axis[1] = ntx2; + layout.ntiles_per_axis[2] = 1u; + layout.ntiles_total = ntx1 * ntx2; + layout.tile_size = T_TILE; + layout.tile_offsets = build_tile_offsets_single_particle(ntx1, + ntx2, + tx1, + tx2); + + using kernel_t = + kernel::DepositCurrentsTiled_kernel; + // npart = full slot count (10): the lone alive particle sits in slot 0 + // and the per-tile slice clamp keeps the (dead) tail out. + kernel_t kern { J_tiled, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt, layout, + static_cast(10) }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, + Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("TiledDeposit", policy, kern); + Kokkos::fence("tiled deposit done"); + } + + // Compare J_flat vs J_tiled cell-by-cell. + auto h_flat = Kokkos::create_mirror_view(J_flat); + auto h_tiled = Kokkos::create_mirror_view(J_tiled); + Kokkos::deep_copy(h_flat, J_flat); + Kokkos::deep_copy(h_tiled, J_tiled); + + const real_t eps = static_cast(1.0e-5); + real_t max_diff = ZERO; + int fail_count = 0; + for (ncells_t i = 0; i < h_flat.extent(0); ++i) { + for (ncells_t j = 0; j < h_flat.extent(1); ++j) { + for (int c = 0; c < 3; ++c) { + const real_t a = h_flat(i, j, c); + const real_t b = h_tiled(i, j, c); + const real_t diff = math::fabs(a - b); + const real_t mag = math::max(math::fabs(a), math::fabs(b)); + if (diff > max_diff) { + max_diff = diff; + } + if (diff > eps * math::max(mag, static_cast(1.0))) { + if (fail_count < 5) { + std::cerr << " J(" << i << "," << j << ",c=" << c + << ") flat=" << a << " tiled=" << b + << " diff=" << diff << '\n'; + } + ++fail_count; + } + } + } + } + if (fail_count > 0) { + std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O + << " T_TILE=" << T_TILE + << " : " << fail_count << " mismatches; max_diff=" << max_diff + << '\n'; + throw std::logic_error("DepositCurrentsTiled_kernel mismatch"); + } + std::cerr << "X-1 deposit_tiled OK O=" << O << " T_TILE=" << T_TILE + << " max_diff=" << max_diff << '\n'; + } + + // Drift / stale-layout regression test for the per-particle escape valve. + // + // A population of alive particles is spread across the whole domain + // (including the near-boundary cells that deposit into the ghost stripe) + // but the tile layout buckets them ALL into tile 0 — i.e. the layout is + // maximally stale w.r.t. their real positions, exactly the situation that + // arises when the SoA is reordered/appended between sorts. The tiled + // kernel must route every out-of-tile particle to the global J view and + // reproduce the flat deposit cell-for-cell: no charge dropped or + // double-counted at any drift distance. This is the property that broke + // when `spatial_sorting_interval > 1` left drifted particles depositing + // partial stencils, producing a density/E line at the decomposition + // boundary. + template + void run_drift_case() { + // This case deposits boundary-adjacent particles (cells touching the + // ghost stripe), so an order-O stencil must fit inside the field's + // N_GHOSTS ghost layers. N_GHOSTS is a compile-time constant fixed by + // the build's SHAPE_ORDER ((SHAPE_ORDER+1)/2 + 1); a build whose ghost + // width is smaller than order O requires would deposit outside the + // field -- silent on GPU (no Kokkos View bounds guard; the overshoot + // cells carry zero shape-weight so results still match) but heap + // corruption on a host/SERIAL build. Skip those orders here; build at + // the matching SHAPE_ORDER to drift-test higher orders. The equivalence + // ("X-1") cases above stay interior, so they exercise all orders. + if constexpr ((O + 1u) / 2u + 1u > N_GHOSTS) { + std::cerr << "deposit_tiled[drift] SKIP O=" << O << " T_TILE=" << T_TILE + << " (needs N_GHOSTS>=" << ((O + 1u) / 2u + 1u) + << ", build has " << N_GHOSTS << ")\n"; + return; + } + using metric_t = metric::Minkowski; + constexpr unsigned short nx1 = 50u, nx2 = 50u; + metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} }; + + constexpr int n_slots = 64; + constexpr int n_base = 5; + const int bases[n_base] = { 1, 13, 25, 37, 48 }; + const int n_alive = n_base * n_base; // 25 + + array_t i1 { "i1", n_slots }, i2 { "i2", n_slots }, + i3 { "i3", n_slots }; + array_t i1_prev { "i1_prev", n_slots }, + i2_prev { "i2_prev", n_slots }, i3_prev { "i3_prev", n_slots }; + array_t dx1 { "dx1", n_slots }, dx2 { "dx2", n_slots }, + dx3 { "dx3", n_slots }; + array_t dx1_prev { "dx1_prev", n_slots }, + dx2_prev { "dx2_prev", n_slots }, dx3_prev { "dx3_prev", n_slots }; + array_t ux1 { "ux1", n_slots }, ux2 { "ux2", n_slots }, + ux3 { "ux3", n_slots }; + array_t phi { "phi", n_slots }, weight { "weight", n_slots }; + array_t tag { "tag", n_slots }; + const real_t charge = 1.0, dt = 1.0; + + // Fill alive particles on host (slots >= n_alive stay zero == dead). + auto h_i1 = Kokkos::create_mirror_view(i1); + auto h_i2 = Kokkos::create_mirror_view(i2); + auto h_i1p = Kokkos::create_mirror_view(i1_prev); + auto h_i2p = Kokkos::create_mirror_view(i2_prev); + auto h_dx1 = Kokkos::create_mirror_view(dx1); + auto h_dx2 = Kokkos::create_mirror_view(dx2); + auto h_dx1p = Kokkos::create_mirror_view(dx1_prev); + auto h_dx2p = Kokkos::create_mirror_view(dx2_prev); + auto h_ux3 = Kokkos::create_mirror_view(ux3); + auto h_w = Kokkos::create_mirror_view(weight); + auto h_tag = Kokkos::create_mirror_view(tag); + int p = 0; + for (int a = 0; a < n_base; ++a) { + for (int b = 0; b < n_base; ++b, ++p) { + h_i1p(p) = bases[a]; + h_i1(p) = bases[a] - 1; + h_i2p(p) = bases[b]; + h_i2(p) = bases[b] - 1; + h_dx1p(p) = static_cast(0.65); + h_dx1(p) = static_cast(0.99); + h_dx2p(p) = static_cast(0.65); + h_dx2(p) = static_cast(0.80); + h_ux3(p) = static_cast(2.5); + h_w(p) = static_cast(1.0); + h_tag(p) = ParticleTag::alive; + } + } + Kokkos::deep_copy(i1, h_i1); + Kokkos::deep_copy(i2, h_i2); + Kokkos::deep_copy(i1_prev, h_i1p); + Kokkos::deep_copy(i2_prev, h_i2p); + Kokkos::deep_copy(dx1, h_dx1); + Kokkos::deep_copy(dx2, h_dx2); + Kokkos::deep_copy(dx1_prev, h_dx1p); + Kokkos::deep_copy(dx2_prev, h_dx2p); + Kokkos::deep_copy(ux3, h_ux3); + Kokkos::deep_copy(weight, h_w); + Kokkos::deep_copy(tag, h_tag); + + // Flat reference over all slots (dead slots are skipped internally). + ndfield_t J_flat { "J_flat", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat); + Kokkos::parallel_for( + "FlatDepositDrift", + n_slots, + kernel::DepositCurrents_kernel( + J_scat, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt)); + Kokkos::Experimental::contribute(J_flat, J_scat); + Kokkos::fence("flat drift deposit done"); + } + + // Tiled with a maximally-stale layout: all alive particles in tile 0. + ndfield_t J_tiled { "J_tiled", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + const auto ntx1 = static_cast( + std::ceil(static_cast(nx1) / static_cast(T_TILE))); + const auto ntx2 = static_cast( + std::ceil(static_cast(nx2) / static_cast(T_TILE))); + + TileLayout layout; + layout.ntiles_per_axis[0] = ntx1; + layout.ntiles_per_axis[1] = ntx2; + layout.ntiles_per_axis[2] = 1u; + layout.ntiles_total = ntx1 * ntx2; + layout.tile_size = T_TILE; + layout.tile_offsets = build_tile_offsets_all_in_tile0( + ntx1 * ntx2, + static_cast(n_alive)); + + using kernel_t = + kernel::DepositCurrentsTiled_kernel; + // npart = n_alive: the stale layout buckets all alive particles into + // tile 0, so the team must walk [0, n_alive) and route the drifted + // ones to the global-J escape valve. + kernel_t kern { J_tiled, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt, layout, + static_cast(n_alive) }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, + Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("TiledDepositDrift", policy, kern); + Kokkos::fence("tiled drift deposit done"); + } + + compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift"); + // Self-contained conservation check on the escape-valve output: the + // drifted particles all take the per-particle global-J path, so this + // certifies that path is charge-conserving without leaning on J_flat. + check_charge_conservation(J_tiled, O, T_TILE, "drift"); + } + + template + void run_all_orders() { + run_one_case<0u, T_TILE>(); + run_one_case<1u, T_TILE>(); + run_one_case<2u, T_TILE>(); + run_one_case<3u, T_TILE>(); + run_one_case<4u, T_TILE>(); + run_one_case<5u, T_TILE>(); + run_one_case<6u, T_TILE>(); + run_one_case<7u, T_TILE>(); + run_one_case<8u, T_TILE>(); + run_one_case<9u, T_TILE>(); + run_one_case<10u, T_TILE>(); + run_one_case<11u, T_TILE>(); + + // Stale-layout / drift regression (per-particle escape valve). + run_drift_case<0u, T_TILE>(); + run_drift_case<1u, T_TILE>(); + run_drift_case<2u, T_TILE>(); + run_drift_case<3u, T_TILE>(); + run_drift_case<4u, T_TILE>(); + run_drift_case<5u, T_TILE>(); + run_drift_case<6u, T_TILE>(); + run_drift_case<7u, T_TILE>(); + run_drift_case<8u, T_TILE>(); + run_drift_case<9u, T_TILE>(); + run_drift_case<10u, T_TILE>(); + run_drift_case<11u, T_TILE>(); + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + Kokkos::initialize(argc, argv); + try { + // Run with each tile-size choice from the validated CMake list. + run_all_orders<4u>(); + run_all_orders<6u>(); + run_all_orders<8u>(); + run_all_orders<10u>(); + run_all_orders<12u>(); + run_all_orders<14u>(); + run_all_orders<16u>(); + } catch (std::exception& e) { + std::cerr << e.what() << '\n'; + Kokkos::finalize(); + return 1; + } + Kokkos::finalize(); + return 0; +}