From a016b6ba50bd44eff82fa8c34acef85b0999805a Mon Sep 17 00:00:00 2001 From: haykh Date: Mon, 18 May 2026 08:51:01 -0400 Subject: [PATCH 01/33] version bump --- CMakeLists.txt | 75 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd5f0c258..5482348f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(PROJECT_NAME entity) project( ${PROJECT_NAME} - VERSION 1.4.0 + VERSION 1.5.0 LANGUAGES CXX C) add_compile_options("-D ENTITY_VERSION=\"${PROJECT_VERSION}\"") set(hash_cmd "git diff --quiet src/ && echo $(git rev-parse HEAD) ") @@ -28,35 +28,35 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/defaults.cmake) # defaults set(DEBUG - ${default_debug} - CACHE BOOL "Debug mode") + ${default_debug} + CACHE BOOL "Debug mode") set(precision - ${default_precision} - CACHE STRING "Precision") + ${default_precision} + CACHE STRING "Precision") set(deposit - ${default_deposit} - CACHE STRING "Deposit") + ${default_deposit} + CACHE STRING "Deposit") set(shape_order - ${default_shape_order} - CACHE STRING "Shape function") + ${default_shape_order} + CACHE STRING "Shape function") set(pgen - ${default_pgen} - CACHE STRING "Problem generator") + ${default_pgen} + CACHE STRING "Problem generator") set(output - ${default_output} - CACHE BOOL "Enable output") + ${default_output} + CACHE BOOL "Enable output") set(mpi - ${default_mpi} - CACHE BOOL "Use MPI") + ${default_mpi} + CACHE BOOL "Use MPI") set(gpu_aware_mpi - ${default_gpu_aware_mpi} - CACHE BOOL "Enable GPU-aware MPI") + ${default_gpu_aware_mpi} + CACHE BOOL "Enable GPU-aware MPI") # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) @@ -65,42 +65,42 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if(${DEBUG} STREQUAL "OFF") set(CMAKE_BUILD_TYPE - Release - CACHE STRING "CMake build type") + Release + CACHE STRING "CMake build type") set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -DNDEBUG -O3") else() set(CMAKE_BUILD_TYPE - Debug - CACHE STRING "CMake build type") + Debug + CACHE STRING "CMake build type") set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -DDEBUG -Wall -Wextra -Wno-unknown-pragmas") + "${CMAKE_CXX_FLAGS} -DDEBUG -Wall -Wextra -Wno-unknown-pragmas") endif() # options set(precisions - "single" "double" - CACHE STRING "Precisions") + "single" "double" + CACHE STRING "Precisions") set(deposits - "zigzag" "esirkepov" - CACHE STRING "Deposits") + "zigzag" "esirkepov" + CACHE STRING "Deposits") if(${deposit} STREQUAL "zigzag") set(shape_order - ${default_shape_order} - CACHE STRING "Shape functions") + ${default_shape_order} + CACHE STRING "Shape functions") endif() set(shape_orders - "1;2;3;4;5;6;7;8;9;10;11" - CACHE STRING "Shape orders") + "1;2;3;4;5;6;7;8;9;10;11" + CACHE STRING "Shape orders") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.cmake) # ------------------------- Third-Party Tests ------------------------------ # set(BUILD_TESTING - OFF - CACHE BOOL "Build tests") + OFF + CACHE BOOL "Build tests") # ------------------------ Third-party dependencies ------------------------ # include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/dependencies.cmake) @@ -129,8 +129,8 @@ else() endif() if(("${Kokkos_DEVICES}" MATCHES "CUDA") - OR ("${Kokkos_DEVICES}" MATCHES "HIP") - OR ("${Kokkos_DEVICES}" MATCHES "SYCL")) + OR ("${Kokkos_DEVICES}" MATCHES "HIP") + OR ("${Kokkos_DEVICES}" MATCHES "SYCL")) set(DEVICE_ENABLED ON) else() set(DEVICE_ENABLED OFF) @@ -148,8 +148,8 @@ if(${mpi}) endif() else() set(gpu_aware_mpi - OFF - CACHE BOOL "Use explicit copy when using MPI + GPU") + OFF + CACHE BOOL "Use explicit copy when using MPI + GPU") endif() endif() @@ -195,7 +195,8 @@ else() string(REPLACE "/" "_" pg_nodir ${pg_nodir}) set(pgen_suffix "_${pg_nodir}") set_problem_generator(${pg}) - add_subdirectory(${SRC_DIR}/engines ${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/engines) + add_subdirectory(${SRC_DIR}/engines + 
${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/engines) add_subdirectory(${SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${pg_nodir}/src) list(APPEND pgens_short ${PGEN}) endforeach() From 00fa0bf65a35c111c94dc8924ee4e6eda8285629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:25:22 +0200 Subject: [PATCH 02/33] port: team policy and vendor specific sort --- CMakeLists.txt | 66 + cmake/defaults.cmake | 15 + cmake/report.cmake | 25 + src/engines/srpic/currents.h | 135 +- src/framework/containers/particles.h | 58 + src/framework/containers/particles_sort.cpp | 341 ++++- src/global/arch/kokkos_aliases.h | 27 + src/global/utils/sort_dispatch.h | 171 +++ src/global/utils/sorting.h | 155 ++- src/kernels/currents_deposit.hpp | 1395 ++++++++++++------- tests/framework/CMakeLists.txt | 6 + tests/framework/sort_by_key.cpp | 110 ++ tests/global/tiling.cpp | 29 +- tests/kernels/CMakeLists.txt | 3 + tests/kernels/deposit_tiled.cpp | 262 ++++ 15 files changed, 2286 insertions(+), 512 deletions(-) create mode 100644 src/global/utils/sort_dispatch.h create mode 100644 tests/framework/sort_by_key.cpp create mode 100644 tests/kernels/deposit_tiled.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index cd5f0c258..acac1d7d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,16 @@ set(gpu_aware_mpi ${default_gpu_aware_mpi} CACHE BOOL "Enable GPU-aware MPI") +set(team_policy + ${default_team_policy} + CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels") +set(team_policy_tile_size + ${default_team_policy_tile_size} + CACHE STRING "team_policy tile edge length in cells") +set(team_policy_tile_sizes + "4;6;8;10;12" + CACHE STRING "team_policy tile-size choices") + # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -136,6 +146,62 @@ else() set(DEVICE_ENABLED OFF) endif() +# ------------------------------ team_policy wiring ------------------------ # 
+if(${team_policy}) + list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx) + if(_tps_idx EQUAL -1) + message(FATAL_ERROR + "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, " + "got '${team_policy_tile_size}'${ColorReset}") + endif() + add_compile_options("-D TEAM_POLICY") + add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}") + + # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically + # when found; falls back to Kokkos::BinSort otherwise. + if("${Kokkos_DEVICES}" MATCHES "SYCL") + find_package(oneDPL QUIET) + if(oneDPL_FOUND) + message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key") + add_compile_options("-D ONEDPL_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} oneDPL) + else() + message(STATUS "team_policy: oneDPL not found; using BinSort fallback " + "for SYCL sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "CUDA") + find_package(Thrust QUIET) + if(Thrust_FOUND) + message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key") + add_compile_options("-D THRUST_ENABLED") + else() + message(STATUS "team_policy: Thrust not found; using BinSort fallback " + "for CUDA sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "HIP") + # rocThrust ships with ROCm and exposes the same thrust:: API. Using + # it lets the HIP backend build a single permutation via + # sort_by_key and gather all SoA members through one reused buffer, + # instead of the legacy per-member Kokkos::BinSort path which + # allocates a fresh `sorted_values` buffer for every member every + # step (the dominant source of allocator churn / fragmentation on + # ROCm). 
+ find_package(rocthrust QUIET) + if(rocthrust_FOUND) + message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") + add_compile_options("-D ROCTHRUST_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) + else() + message(STATUS "team_policy: rocThrust not found; using BinSort " + "fallback for HIP sort_by_key") + endif() + endif() +endif() + # MPI if(${mpi}) find_or_fetch_dependency(MPI FALSE REQUIRED) diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index fb8790019..a85accf84 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -92,3 +92,18 @@ else() endif() set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY}) + set(default_team_policy + $ENV{Entity_ENABLE_TEAM_POLICY} + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +else() + set(default_team_policy + OFF + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +endif() +set_property(CACHE default_team_policy PROPERTY TYPE BOOL) + +set(default_team_policy_tile_size + 8 + CACHE INTERNAL "Default tile edge length in cells for team_policy") diff --git a/cmake/report.cmake b/cmake/report.cmake index 7b145bfc7..65a22a7a6 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED}) GPU_AWARE_MPI_REPORT 46) endif() +printchoices( + "Team Policy" + "team_policy" + "${ON_OFF_VALUES}" + ${team_policy} + OFF + "${Green}" + TEAM_POLICY_REPORT + 46) +if(${team_policy}) + printchoices( + "Team Tile Size" + "team_policy_tile_size" + "${team_policy_tile_sizes}" + ${team_policy_tile_size} + ${default_team_policy_tile_size} + "${Blue}" + TEAM_POLICY_TILE_SIZE_REPORT + 46) +endif() printchoices( "Debug mode" "DEBUG" @@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED}) string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") endif() +string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") +if(${team_policy}) + string(APPEND REPORT_TEXT " " 
${TEAM_POLICY_TILE_SIZE_REPORT} "\n") +endif() + string( APPEND REPORT_TEXT diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 3afabea8a..faf0bb3ad 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -2,7 +2,8 @@ * @file engines/srpic/currents.h * @brief Current deposition and filtering routines for the SRPIC engine * @implements - * - ntt::srpic::CallDepositKernel<> -> void + * - ntt::srpic::CallDepositKernel<> -> void (flat path) + * - ntt::srpic::CallDepositKernelTiled<> -> void (TEAM_POLICY) * - ntt::srpic::CurrentsDeposit<> -> void * - ntt::srpic::CurrentsFilter<> -> void * @namespaces: @@ -61,11 +62,142 @@ namespace ntt { dt)); } +#if defined(TEAM_POLICY) + /** + * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). + * + * Iterates over `tile_layout.ntiles_total` teams; each team accumulates + * its tile's particle contributions in SLM scratch and atomically + * flushes to the global J. Requires the species to have been sorted + * with `team_policy` enabled (`tile_layout` populated by + * `SortSpatially`). + * + * Does not fall back by itself: raises an error if the layout has no + * tiles or if `tile_offsets` is inconsistent with `ntiles_total`. The + * caller (`CurrentsDeposit`) routes not-yet-sorted species through the + * flat `CallDepositKernel` scatter-view path instead. 
+ */ + template + void CallDepositKernelTiled( + const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt) { + static_assert(O <= 11u, "Shape order must be <= 11"); + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + const auto& layout = species.tile_layout(); + raise::ErrorIf(layout.ntiles_total == 0u, + "CallDepositKernelTiled: tile_layout has 0 tiles — call " + "SortSpatially before CurrentsDeposit", + HERE); + raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u, + "CallDepositKernelTiled: tile_offsets size inconsistent " + "with ntiles_total", + HERE); + + using kernel_t = kernel::DepositCurrents_kernel_tiled; + kernel_t kern { cur, + species.i1, + species.i2, + species.i3, + species.i1_prev, + species.i2_prev, + species.i3_prev, + species.dx1, + species.dx2, + species.dx3, + species.dx1_prev, + species.dx2_prev, + species.dx3_prev, + species.ux1, + species.ux2, + species.ux3, + species.phi, + species.weight, + species.tag, + local_metric, + (real_t)(species.charge()), + dt, + layout }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("CurrentsDepositTiled", policy, kern); + } +#endif // TEAM_POLICY + template void CurrentsDeposit(Domain& domain, const prm::Parameters& engine_params) { const auto dt = engine_params.get("dt"); Kokkos::deep_copy(domain.fields.cur, ZERO); + +#if defined(TEAM_POLICY) + + // First-step fallback: if any contributing species has not been + // sorted yet (tile_layout still empty), fall back to the flat + // scatter-view path for that step. Subsequent steps see populated + // layouts and use the tiled kernel. 
+ bool any_unsorted = false; + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + if (species.tile_layout().ntiles_total == 0u or + species.tile_layout().tile_offsets.extent(0) == 0u) { + any_unsorted = true; + break; + } + } + if (any_unsorted) { + auto scatter_cur = Kokkos::Experimental::create_scatter_view( + domain.fields.cur); + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + logger::Checkpoint( + fmt::format("Launching currents deposit (flat fallback, no sort yet) " + "for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur, + dt); + } + Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); + } else { + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + logger::Checkpoint( + fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + + CallDepositKernelTiled(species, + domain.mesh.metric, + domain.fields.cur, + dt); + } + } +#else auto scatter_cur = Kokkos::Experimental::create_scatter_view( domain.fields.cur); for (auto& species : domain.species) { @@ -84,6 +216,7 @@ namespace ntt { CallDepositKernel(species, domain.mesh.metric, scatter_cur, dt); } Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); +#endif } template diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 895026552..0a15cb5d9 100644 --- a/src/framework/containers/particles.h +++ 
b/src/framework/containers/particles.h @@ -25,6 +25,7 @@ #include "traits/metric.h" #include "utils/error.h" #include "utils/formatting.h" +#include "utils/sorting.h" #include "framework/containers/species.h" #include "framework/domain/grid.h" @@ -90,6 +91,30 @@ namespace ntt { const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) }; #endif + // team_policy: tile metadata produced by SortSpatially + // and consumed by the tiled deposit / pusher kernels. Lazily + // allocated on first sort. The sort backend itself (oneDPL on SYCL, + // Thrust on CUDA, rocThrust on HIP, Kokkos::BinSort otherwise) is + // selected at compile time based on the Kokkos device and the + // vendor libraries detected by CMake. + TileLayout m_tile_layout {}; + +#if defined(TEAM_POLICY) && \ + ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) + // Persistent byte scratch reused by every SoA-member gather in + // `apply_permutation_to_soa`, across all members and all timesteps. + // Without this each member would allocate (and free) its own + // transient buffer every sort; recycling one persistent buffer + // removes that allocation churn entirely — the structural fix for + // the ROCm sort slowdown / fragmentation. Grown monotonically to + // the largest required size, never shrunk. Kokkos device + // allocations are over-aligned (>= 8 B), so reinterpreting the + // bytes as any SoA element type (<= 8 B PODs) is well-defined. 
+ array_t m_perm_scratch {}; +#endif + public: // for empty allocation Particles() {} @@ -276,9 +301,42 @@ namespace ntt { /** * @brief Sort particles spatially by their cell indices * @param grid The grid object to get the cell information for sorting + * @note In team_policy mode (compile-time `team_policy=ON`), also + * populates `m_tile_layout` with tile-offset and per-tile + * permutation metadata that the tiled deposit/pusher kernels + * consume. */ void SortSpatially(const Grid&); +#if defined(TEAM_POLICY) && \ + ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) + private: + /** + * @brief Apply a particle-index permutation (built by oneDPL/Thrust + * sort_by_key) to every SoA member array. Sequential — one + * transient buffer at a time, fenced before scope exit. + * Only compiled when a vendor sort backend is enabled; the + * BinSort path applies the permutation in place via + * `sorter.sort(view)` instead. + */ + void apply_permutation_to_soa(const prtl_perm_t& perm); + + public: +#endif + + /** + * @brief Read-only access to the tile layout produced by the most + * recent SortSpatially call. Returns a default-constructed + * layout (`ntiles_total == 0`) when the species has not yet + * been sorted. + */ + [[nodiscard]] + auto tile_layout() const -> const TileLayout& { + return m_tile_layout; + } + /** * @brief Copy particle data from device to host. 
*/ diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 904fb3fc7..c04813e12 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -8,10 +8,22 @@ #include "framework/containers/particles.h" #include "framework/domain/grid.h" +#if defined(TEAM_POLICY) + #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)) + #define TEAM_POLICY_USE_VENDOR_SORT + #include "utils/sort_dispatch.h" + #endif +#endif + #include #include +#include #include +#include +#include #include #include #include @@ -195,6 +207,192 @@ namespace ntt { template void Particles::SortSpatially(const Grid& grid) { +#if defined(TEAM_POLICY) + // ---------------------- team_policy: tile-based sort ------------------ // + const auto npart_local = npart(); + if (npart_local == 0u) { + m_tile_layout = TileLayout {}; + m_is_sorted = true; + return; + } + + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0"); + + // 1. Compute per-axis tile counts and total_tiles. + const auto ncells_active = grid.n_active(); + ncells_t ntx[3] { 1u, 1u, 1u }; + ncells_t total_tiles { 1u }; + if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ntx[0] = static_cast(math::ceil( + static_cast(ncells_active[0]) / static_cast(T))); + total_tiles *= ntx[0]; + } + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ntx[1] = static_cast(math::ceil( + static_cast(ncells_active[1]) / static_cast(T))); + total_tiles *= ntx[1]; + } + if constexpr (D == Dim::_3D) { + ntx[2] = static_cast(math::ceil( + static_cast(ncells_active[2]) / static_cast(T))); + total_tiles *= ntx[2]; + } + + // 2. Compute per-particle tile key (with min(i, i_prev)). 
+ array_t tile_indices { "tile_indices", npart_local }; + Kokkos::parallel_for( + "FillTileIndices", + rangeActiveParticles(), + sort::PositionToTileIndex { i1, + i2, + i3, + tag, + tile_indices, + ncells_active, + static_cast(T), + array_t {}, + i1_prev, + i2_prev, + i3_prev }); + + // 3. Sort. Vendor library (oneDPL/Thrust/rocThrust) when compiled in; + // Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers + // the dead-particle sentinel bin (total_tiles + 1u). + const ncells_t n_bins = total_tiles + 2u; + const auto slice = prtl_slice_t(0, npart_local); + #if defined(TEAM_POLICY_USE_VENDOR_SORT) + // Vendor path: produce an explicit permutation via sort_by_key, + // then apply it to each SoA member with a sequential one-buffer + // gather (peak transient = one `npart × sizeof(member)` buffer). + prtl_perm_t perm { "tile_perm", npart_local }; + #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::OneDPL {}); + #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Rocthrust {}); + #else + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Thrust {}); + #endif + Kokkos::fence("SortSpatially: pre-gather drain"); + apply_permutation_to_soa(perm); + #else + // BinSort path: same mechanism as legacy SortSpatially (BinSort + // allocates one temp View per `sorter.sort(view)` call and frees + // it before the next), so peak transient memory is bounded. 
+ using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { tile_indices, bin_op, false }; + sorter.create_permute_vector(); + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i1, slice)); + sorter.sort(Kokkos::subview(i1_prev, slice)); + sorter.sort(Kokkos::subview(dx1, slice)); + sorter.sort(Kokkos::subview(dx1_prev, slice)); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i2, slice)); + sorter.sort(Kokkos::subview(i2_prev, slice)); + sorter.sort(Kokkos::subview(dx2, slice)); + sorter.sort(Kokkos::subview(dx2_prev, slice)); + } + if constexpr (D == Dim::_3D) { + sorter.sort(Kokkos::subview(i3, slice)); + sorter.sort(Kokkos::subview(i3_prev, slice)); + sorter.sort(Kokkos::subview(dx3, slice)); + sorter.sort(Kokkos::subview(dx3_prev, slice)); + } + sorter.sort(Kokkos::subview(ux1, slice)); + sorter.sort(Kokkos::subview(ux2, slice)); + sorter.sort(Kokkos::subview(ux3, slice)); + sorter.sort(Kokkos::subview(weight, slice)); + sorter.sort(Kokkos::subview(tag, slice)); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + sorter.sort(Kokkos::subview(phi, slice)); + } + for (auto pldr { 0u }; pldr < npld_r(); ++pldr) { + sorter.sort(Kokkos::subview(pld_r, slice, pldr)); + } + for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { + sorter.sort(Kokkos::subview(pld_i, slice, pldi)); + } + // Apply the same permutation to `tile_indices` itself so it ends + // monotonically non-decreasing for the offsets pass below. + sorter.sort(tile_indices); + #endif // TEAM_POLICY_USE_VENDOR_SORT + + // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled + // pusher. 
`tile_indices` is now sorted (monotonically + // non-decreasing for alive particles, dead sentinel + // `total_tiles + 1` clustered at the end) — vendor sort_by_key + // sorts keys in place; the BinSort path explicitly applies the + // same permutation to `tile_indices` above. Transition-detect + // directly on it: the start of each non-empty tile is the only + // place a write happens — atomic-free in the dense branch. + // Empty tiles (no particles) are filled by a reverse pass on a + // small host mirror (`total_tiles ≈ 176K` at production scale → + // ~700 KB). + { + array_t tile_offsets { "tile_offsets", total_tiles + 1u }; + Kokkos::deep_copy(tile_offsets, static_cast(npart_local)); + + const auto total_tiles_v = total_tiles; + auto ti_v = tile_indices; + Kokkos::parallel_for( + "DetectTileBoundaries", + rangeActiveParticles(), + Lambda(prtlidx_t p) { + const auto t_curr = ti_v(p); + const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr); + if (!boundary) { + return; + } + if (t_curr < total_tiles_v) { + tile_offsets(t_curr) = p; + } else { + // First dead particle — also marks the alive_count boundary + // stored at index total_tiles. + Kokkos::atomic_min(&tile_offsets(total_tiles_v), p); + } + }); + + auto h_offsets = Kokkos::create_mirror_view(tile_offsets); + Kokkos::deep_copy(h_offsets, tile_offsets); + for (auto t = static_cast(total_tiles); t-- > 0u;) { + if (h_offsets(t) > h_offsets(t + 1u)) { + h_offsets(t) = h_offsets(t + 1u); + } + } + Kokkos::deep_copy(tile_offsets, h_offsets); + + m_tile_layout.tile_offsets = tile_offsets; + } + + // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used + // in the current design — the SoA arrays are physically permuted + // into tile order, so consumers iterate + // `[tile_offsets(t), tile_offsets(t+1))` directly without a + // separate permutation indirection. 
+ m_tile_layout.ntiles_per_axis[0] = ntx[0]; + m_tile_layout.ntiles_per_axis[1] = ntx[1]; + m_tile_layout.ntiles_per_axis[2] = ntx[2]; + m_tile_layout.ntiles_total = total_tiles; + m_tile_layout.tile_size = T; + m_tile_layout.tile_perm = prtl_perm_t {}; + m_is_sorted = true; + + Kokkos::fence("SortSpatially: end of team_policy path"); +#else // !TEAM_POLICY — legacy in-place BinSort by global cell index const auto nx2 = grid.n_active(in::x2); const auto nx3 = grid.n_active(in::x3); const auto total_cells = grid.num_active(); @@ -250,13 +448,153 @@ namespace ntt { for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { sorter.sort(Kokkos::subview(pld_i, slice, pldi)); } +#endif // TEAM_POLICY + } + +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + namespace permute_helpers { + + // Permute a 1D SoA member array `arr` in place by `perm`, gathering + // through `scratch` — a persistent byte buffer reused by every + // member and every timestep (no per-call allocation). An unmanaged + // typed view aliases the scratch bytes; the caller guarantees + // `scratch` is large enough and that Kokkos' device over-alignment + // covers the element type. + template + inline void permute_1d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + const array_t& scratch) { + if (n == 0u) { + return; + } + using value_t = typename V::non_const_value_type; + using buf_t = Kokkos::View>; + buf_t buf(reinterpret_cast(scratch.data()), n); + auto perm_v = perm; + auto arr_v = arr; + Kokkos::parallel_for( + "Permute1D", + n, + KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); }); + Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf); + Kokkos::fence("permute_1d_inplace: end"); + } + + // 2D analogue for `pld_r` / `pld_i`. 
+ template + inline void permute_2d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols, + const array_t& scratch) { + if (n == 0u or ncols == 0u) { + return; + } + using value_t = typename V::non_const_value_type; + using buf_t = Kokkos::View>; + buf_t buf(reinterpret_cast(scratch.data()), n, ncols); + auto perm_v = perm; + auto arr_v = arr; + Kokkos::parallel_for( + "Permute2D", + CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), + KOKKOS_LAMBDA(const npart_t p, const npart_t l) { + buf(p, l) = arr_v(perm_v(p), l); + }); + Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL), + buf); + Kokkos::fence("permute_2d_inplace: end"); + } + + } // namespace permute_helpers + + template + void Particles::apply_permutation_to_soa(const prtl_perm_t& perm) { + const auto n = npart(); + if (n == 0u) { + return; + } + + // Size the persistent scratch once to the largest gather any member + // needs this call: 1D members need n * sizeof(real_t) bytes (the + // widest element); the 2D payloads need n * ncols * elem bytes. + // Grown monotonically, never shrunk — so after warmup this incurs + // no allocation at all. 
+ std::size_t need = static_cast(n) * sizeof(real_t); + if (npld_r() > 0) { + need = std::max(need, + static_cast(n) * + static_cast(npld_r()) * sizeof(real_t)); + } + if (npld_i() > 0) { + need = std::max(need, + static_cast(n) * + static_cast(npld_i()) * sizeof(npart_t)); + } + if (m_perm_scratch.extent(0) < need) { + m_perm_scratch = array_t { "perm_scratch", need }; + } + const auto& scratch = m_perm_scratch; + + using permute_helpers::permute_1d_inplace; + using permute_helpers::permute_2d_inplace; + + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_inplace(i1, perm, n, scratch); + permute_1d_inplace(dx1, perm, n, scratch); + permute_1d_inplace(i1_prev, perm, n, scratch); + permute_1d_inplace(dx1_prev, perm, n, scratch); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_inplace(i2, perm, n, scratch); + permute_1d_inplace(dx2, perm, n, scratch); + permute_1d_inplace(i2_prev, perm, n, scratch); + permute_1d_inplace(dx2_prev, perm, n, scratch); + } + if constexpr (D == Dim::_3D) { + permute_1d_inplace(i3, perm, n, scratch); + permute_1d_inplace(dx3, perm, n, scratch); + permute_1d_inplace(i3_prev, perm, n, scratch); + permute_1d_inplace(dx3_prev, perm, n, scratch); + } + permute_1d_inplace(ux1, perm, n, scratch); + permute_1d_inplace(ux2, perm, n, scratch); + permute_1d_inplace(ux3, perm, n, scratch); + permute_1d_inplace(weight, perm, n, scratch); + permute_1d_inplace(tag, perm, n, scratch); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + permute_1d_inplace(phi, perm, n, scratch); + } + if (npld_r() > 0) { + permute_2d_inplace(pld_r, perm, n, static_cast(npld_r()), scratch); + } + if (npld_i() > 0) { + permute_2d_inplace(pld_i, perm, n, static_cast(npld_i()), scratch); + } } +#endif // TEAM_POLICY_USE_VENDOR_SORT + +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + #define APPLY_PERM_INSTANTIATE(D, C) \ + template void Particles::apply_permutation_to_soa( \ + const prtl_perm_t&); +#else + #define 
APPLY_PERM_INSTANTIATE(D, C) +#endif #define PARTICLES_SORT(D, C) \ template auto Particles::NpartsPerTagAndOffsets() const \ -> std::pair, array_t>; \ template void Particles::RemoveDead(); \ - template void Particles::SortSpatially(const Grid&); + template void Particles::SortSpatially(const Grid&); \ + APPLY_PERM_INSTANTIATE(D, C) PARTICLES_SORT(Dim::_1D, Coord::Cartesian) PARTICLES_SORT(Dim::_2D, Coord::Cartesian) @@ -266,5 +604,6 @@ namespace ntt { PARTICLES_SORT(Dim::_3D, Coord::Spherical) PARTICLES_SORT(Dim::_3D, Coord::Qspherical) #undef PARTICLES_SORT +#undef APPLY_PERM_INSTANTIATE } // namespace ntt diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h index 314e78c0e..43f21ce47 100644 --- a/src/global/arch/kokkos_aliases.h +++ b/src/global/arch/kokkos_aliases.h @@ -10,6 +10,7 @@ * - CreateRangePolicy, CreateRangePolicyOnHost * - random_number_pool_t, random_generator_t * - Random function + * - prtl_perm_t, TileLayout<> * @cpp: * - arch/kokkos_aliases.cpp * @namespaces: @@ -258,6 +259,32 @@ template auto CreateRangePolicyOnHost(const tuple_t&, const tuple_t&) -> range_h_t; +// --------------------------- team_policy types ---------------------------- // +// Particle permutation index: maps a sorted-position p in [0, npart) to a +// pre-sort particle index. Produced by SortSpatially, consumed by tiled +// pusher and deposit kernels to walk particles tile-by-tile without +// physically re-permuting the SoA arrays in lock step every step. +using prtl_perm_t = array_t; + +// Tile layout metadata: the contract between Stream 1 (sort) and Streams +// 2/3 (tiled deposit / pusher). All members are device-resident. +// ntiles_per_axis : number of tiles along each axis (1 for unused axes). +// ntiles_total : product of ntiles_per_axis = league size for TeamPolicy. +// tile_size : tile edge length in cells (compile-time CMake knob, +// replicated here for runtime checks). 
+// tile_offsets : prefix-sum of per-tile particle counts; size +// ntiles_total + 1; tile t owns particles +// [tile_offsets(t), tile_offsets(t+1)). +// tile_perm : reserved; SortSpatially currently leaves it empty. +template +struct TileLayout { + ncells_t ntiles_per_axis[3] { 1u, 1u, 1u }; + ncells_t ntiles_total { 0u }; + unsigned short tile_size { 0u }; + array_t tile_offsets; + prtl_perm_t tile_perm; +}; + // Random number pool/generator type alias // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias) using random_number_pool_t = math::Random_XorShift64_Pool; diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h new file mode 100644 index 000000000..506a90742 --- /dev/null +++ b/src/global/utils/sort_dispatch.h @@ -0,0 +1,171 @@ +/** + * @file utils/sort_dispatch.h + * @brief Backend-dispatched sort_by_key for team_policy SortSpatially. + * @implements + * - sort_helpers::sort_by_key_dispatch -> void (BinSort, OneDPL, Thrust, Rocthrust, StdSort) + * @namespaces: + * - ntt::sort_helpers:: + * @macros: + * - TEAM_POLICY + * - SYCL_ENABLED, ONEDPL_ENABLED (oneDPL overload) + * - CUDA_ENABLED, THRUST_ENABLED (Thrust overload) + * + * @note Each overload produces a permutation `perm` of size N such that + * keys[perm[0]] <= keys[perm[1]] <= ... in stable order. + * Always-available overloads: BinSort (uses Kokkos::BinSort) and + * StdSort (host-side std::stable_sort fallback). The vendor-library + * overloads (OneDPL on SYCL, Thrust on CUDA, rocThrust on HIP) are conditional on the + * respective build flags. 
+ */ + +#ifndef GLOBAL_UTILS_SORT_DISPATCH_H +#define GLOBAL_UTILS_SORT_DISPATCH_H + +#if !defined(TEAM_POLICY) + #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined" +#endif + +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/sorting.h" + +#include +#include + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + #include + #include +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + #include + #include + #include +#endif +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + #include + #include + #include + #include +#endif + +#include +#include + +namespace ntt::sort_helpers { + + // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an + // upper bound on distinct key values. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t n_bins, + ::sort::backend::BinSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { keys, bin_op, false }; + sorter.create_permute_vector(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort"); + sorter.sort(perm); + Kokkos::fence("sort_by_key_dispatch BinSort: post-sort"); + } + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::OneDPL) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto* keys_ptr = keys.data(); + auto* perm_ptr = perm.data(); + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + // Drain Kokkos's queue so 
oneDPL's policy sees the iota'd perm even
+    // if oneDPL submits to a different SYCL queue internally.
+    exec.fence("sort_by_key_dispatch OneDPL: pre-sort");
+    auto queue  = exec.sycl_queue();
+    auto policy = oneapi::dpl::execution::make_device_policy(queue);
+    // NOTE(review): this header promises a permutation "in stable order";
+    // confirm that oneapi::dpl::sort_by_key guarantees stability in the
+    // targeted oneDPL release, or switch to a stable variant.
+    oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr);
+    exec.fence("sort_by_key_dispatch OneDPL: post-sort");
+  }
+#endif
+
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  // CUDA Thrust overload: fills `perm` with 0..n-1 and sorts it by `keys`.
+  // `n_bins` is unused (kept for a uniform dispatch signature).
+  // NOTE(review): the sort runs in place on the storage behind `keys` (the
+  // const& only protects the view handle), so `keys` is left sorted on
+  // return, unlike the BinSort / StdSort overloads. Confirm that callers do
+  // not rely on the original key order.
+  inline void sort_by_key_dispatch(const array_t& keys,
+                                   prtl_perm_t& perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Thrust) {
+    const auto n = static_cast(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort");
+    thrust::device_ptr kp(keys.data());
+    thrust::device_ptr pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    // stable variant: thrust::sort_by_key does not guarantee that equal
+    // keys keep their relative (iota) order, which this header promises.
+    thrust::stable_sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Thrust: post-sort");
+  }
+#endif
+
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc
+  // device_ptr-based algorithms dispatch to the HIP backend. Mirrors
+  // the CUDA Thrust overload (including the in-place sort of `keys`).
+  inline void sort_by_key_dispatch(const array_t& keys,
+                                   prtl_perm_t& perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Rocthrust) {
+    const auto n = static_cast(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort");
+    thrust::device_ptr kp(keys.data());
+    thrust::device_ptr pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    // stable variant, for the same reason as in the CUDA Thrust overload.
+    thrust::stable_sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort");
+  }
+#endif
+
+  // Host fallback: indirect-sort via std::stable_sort.
+ inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::StdSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + keys); + auto perm_h = Kokkos::create_mirror_view(perm); + std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u }); + std::stable_sort(perm_h.data(), + perm_h.data() + n, + [&](npart_t a, npart_t b) { + return keys_h(a) < keys_h(b); + }); + Kokkos::deep_copy(perm, perm_h); + } + +} // namespace ntt::sort_helpers + +#endif // GLOBAL_UTILS_SORT_DISPATCH_H diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h index dbe774402..8442f5ddb 100644 --- a/src/global/utils/sorting.h +++ b/src/global/utils/sorting.h @@ -5,6 +5,7 @@ * - sort::BinBool<> * - sort::BinTag<> * - sort::PositionToTileIndex<> + * - sort::backend tag types (compile-time tag dispatch for sort_by_key) * @namespaces: * - sort:: * @note BinBool sorts by boolean values "true" then "false" @@ -62,9 +63,27 @@ namespace sort { const int m_max_bins; }; - template + /** + * @brief Bin a particle into a tile of edge length `tile_size` cells. + * @tparam D Dimension. + * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live + * particle (used to populate per-tile counts in one pass). + * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead + * of `i_curr` alone. This guarantees that a particle whose + * Esirkepov stencil straddles a tile boundary (because it + * crossed the boundary during the pusher) lands in the + * lower-indexed of the two tiles. Combined with a halo of + * `O+1` cells in the deposit's per-tile scratch, this + * keeps every particle's stencil inside its assigned + * tile's interior+halo region. See plan §S2.4. + * + * Dead particles get the sentinel `total_tiles + 1u` so they sort to the + * end (or get skipped, depending on the consumer). 
+ */ + template struct PositionToTileIndex { const array_t i1, i2, i3; + const array_t i1_prev, i2_prev, i3_prev; const array_t tag; array_t tile_indices; ncells_t tile_size; @@ -72,20 +91,37 @@ namespace sort { ncells_t ntx2 { 0u }, ntx3 { 0u }; ncells_t total_tiles { 0u }; - - PositionToTileIndex(const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& tag, - array_t& tile_indices, + // Active-cell extents per axis. Used to clamp the bin key when + // UsePrev=true, since `i_prev` can be transiently negative after the + // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an + // MPI receive that hasn't translated frames. Without clamping, the + // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces + // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`, + // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)` + // faults on an unmapped page. + int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 }; + + PositionToTileIndex(const array_t& i1_, + const array_t& i2_, + const array_t& i3_, + const array_t& tag_, + array_t& tile_indices_, const std::vector& ncells, - ncells_t tile_size = 1u) - : i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , tag { tag } - , tile_indices { tile_indices } - , tile_size { tile_size } + ncells_t tile_size_ = 1u, + const array_t& num_ppt_ = { "num_ppt", 0u }, + const array_t& i1_prev_ = {}, + const array_t& i2_prev_ = {}, + const array_t& i3_prev_ = {}) + : i1 { i1_ } + , i2 { i2_ } + , i3 { i3_ } + , i1_prev { i1_prev_ } + , i2_prev { i2_prev_ } + , i3_prev { i3_prev_ } + , tag { tag_ } + , tile_indices { tile_indices_ } + , tile_size { tile_size_ } + , num_ppt { num_ppt_ } , ntx2 { 1u } , ntx3 { 1u } , total_tiles { 1u } { @@ -93,49 +129,122 @@ namespace sort { "ncells size must match D", HERE); if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ncells1 = static_cast(ncells[0]); npart_t ntx1 = static_cast(math::ceil( static_cast(ncells[0]) / 
static_cast(tile_size))); total_tiles *= ntx1; } if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ncells2 = static_cast(ncells[1]); ntx2 = static_cast(math::ceil( static_cast(ncells[1]) / static_cast(tile_size))); total_tiles *= ntx2; } if constexpr (D == Dim::_3D) { + ncells3 = static_cast(ncells[2]); ntx3 = static_cast(math::ceil( static_cast(ncells[2]) / static_cast(tile_size))); total_tiles *= ntx3; } if constexpr (Count) { - num_ppt = array_t { "num_ppt", total_tiles }; + raise::ErrorIf(num_ppt.extent(0) != total_tiles, + "num_ppt must have extent equal to total tiles", + HERE); + } + if constexpr (UsePrev) { + raise::ErrorIf( + i1_prev.extent(0) == 0u, + "PositionToTileIndex requires i1_prev to be set", + HERE); + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + raise::ErrorIf( + i2_prev.extent(0) == 0u, + "PositionToTileIndex requires i2_prev to be set", + HERE); + } + if constexpr (D == Dim::_3D) { + raise::ErrorIf( + i3_prev.extent(0) == 0u, + "PositionToTileIndex requires i3_prev to be set", + HERE); + } } } - Inline auto operator()(prtldx_t p) const { + Inline auto operator()(prtlidx_t p) const { if (tag(p) != ntt::ParticleTag::alive) { tile_indices(p) = total_tiles + 1u; } else { + // bin key per-axis: use min(i, i_prev) when UsePrev so that a + // particle straddling a boundary lands in the lower tile. + // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative + // (after the pusher's periodic-wrap path: `i_prev -= ni`) or + // out-of-range (after MPI receive without frame translation). + // Without the clamp, signed-to-unsigned promotion in + // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far + // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)` + // faults on an unmapped page. + const auto clamp_axis = [](int v, int ncells) -> int { + return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v); + }; + const auto key1 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i1(p) < i1_prev(p)) ? 
i1(p) : i1_prev(p);
+            return clamp_axis(raw, ncells1);
+          } else {
+            return i1(p);
+          }
+        }();
+        // Axes beyond D never enter the linearised index below, and the
+        // constructor only requires i2_prev / i3_prev to be set for the
+        // dimensions that use them. Reading i2(p) / i3(p) (or their _prev
+        // counterparts) unconditionally would index a possibly empty view
+        // in 1D / 2D, so the unused keys are pinned to 0 at compile time.
+        const auto key2 = [&]() -> int {
+          if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+            if constexpr (UsePrev) {
+              const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p);
+              return clamp_axis(raw, ncells2);
+            } else {
+              return i2(p);
+            }
+          } else {
+            return 0;
+          }
+        }();
+        const auto key3 = [&]() -> int {
+          if constexpr (D == Dim::_3D) {
+            if constexpr (UsePrev) {
+              const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p);
+              return clamp_axis(raw, ncells3);
+            } else {
+              return i3(p);
+            }
+          } else {
+            return 0;
+          }
+        }();
         if constexpr (D == Dim::_1D) {
-          tile_indices(p) = static_cast(i1(p) / tile_size);
+          tile_indices(p) = static_cast(key1 / tile_size);
         } else if constexpr (D == Dim::_2D) {
-          tile_indices(p) = static_cast(i1(p) / tile_size) * ntx2 +
-                            static_cast(i2(p) / tile_size);
+          tile_indices(p) = static_cast(key1 / tile_size) * ntx2 +
+                            static_cast(key2 / tile_size);
         } else if constexpr (D == Dim::_3D) {
-          tile_indices(p) = (static_cast(i1(p) / tile_size) * ntx2 +
-                             static_cast(i2(p) / tile_size)) *
+          tile_indices(p) = (static_cast(key1 / tile_size) * ntx2 +
+                             static_cast(key2 / tile_size)) *
                               ntx3 +
-                            static_cast(i3(p) / tile_size);
+                            static_cast(key3 / tile_size);
         } else {
           raise::KernelError(HERE, "Wrong D in SortSpatially");
         }
         if constexpr (Count) {
-          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u);
+          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1);
         }
       }
     }
   };
+  // -------------------- Backend dispatch for sort_by_key ------------------- //
+  // Compile-time tags for tag-dispatch into backend-specific
+  // sort_by_key implementations. Selection is fully compile-time: the
+  // backend that resolves depends on the active Kokkos device and the
+  // availability of the corresponding vendor library.
+  namespace backend {
+    struct OneDPL {};
+    struct Thrust {};
+    struct Rocthrust {};
+    struct StdSort {};
+    // Always-available legacy fallback using Kokkos::BinSort.
+ struct BinSort {}; + } // namespace backend + } // namespace sort #endif // GLOBAL_UTILS_SORTING_H diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 268187af2..5eb7ff2b6 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -1,10 +1,24 @@ /** * @file kernels/currents_deposit.hpp - * @brief Covariant algorithms for the current deposition + * @brief Covariant algorithms for the current deposition. + * + * Two kernels share the same per-particle body + * (`kernel::deposit::deposit_one_particle`): + * - `kernel::DepositCurrents_kernel` flat (RangePolicy over particles, + * writes into a `Kokkos::Experimental::ScatterView`). Always available. + * - `kernel::DepositCurrents_kernel_tiled` team-policy + * (one team per spatial tile, accumulates into team SLM scratch with + * atomic adds, then flushes to global J). Available when `team_policy=ON` + * (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan. + * * @implements + * - kernel::deposit::PrtlPack<> + * - kernel::deposit::deposit_one_particle<> * - kernel::DepositCurrents_kernel<> + * - kernel::DepositCurrents_kernel_tiled<> (TEAM_POLICY only) * @namespaces: * - kernel:: + * - kernel::deposit:: */ #ifndef KERNELS_CURRENTS_DEPOSIT_HPP @@ -18,100 +32,99 @@ #include "utils/error.h" #include "utils/numeric.h" -#include "particle_shapes.hpp" +#include "kernels/particle_shapes.hpp" #include +#include #define i_di_to_Xi(I, DI) (static_cast((I)) + static_cast((DI))) namespace kernel { using namespace ntt; - /** - * @brief Algorithm for the current deposition - */ - template - class DepositCurrents_kernel { - static_assert(O <= 11u, "Shape function order O must be <= 11"); - static constexpr auto D = M::Dim; - - scatter_ndfield_t J; - const array_t i1, i2, i3; - const array_t i1_prev, i2_prev, i3_prev; - const array_t dx1, dx2, dx3; - const array_t dx1_prev, dx2_prev, dx3_prev; - const array_t ux1, ux2, ux3; - const array_t phi; - const array_t 
weight; - const array_t tag; - const M metric; - const real_t charge, inv_dt; + namespace deposit { - public: /** - * @brief explicit constructor. + * @brief Per-particle reference pack consumed by both the flat and tiled + * deposit kernels. The same set of SoA references is captured by + * each kernel; bundling them here keeps the helper's argument + * list manageable and ensures every consumer reads the same + * view aliases. */ - DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, - const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& i1_prev, - const array_t& i2_prev, - const array_t& i3_prev, - const array_t& dx1, - const array_t& dx2, - const array_t& dx3, - const array_t& dx1_prev, - const array_t& dx2_prev, - const array_t& dx3_prev, - const array_t& ux1, - const array_t& ux2, - const array_t& ux3, - const array_t& phi, - const array_t& weight, - const array_t& tag, - const M& metric, - real_t charge, - const real_t dt) - : J { scatter_cur } - , i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , i1_prev { i1_prev } - , i2_prev { i2_prev } - , i3_prev { i3_prev } - , dx1 { dx1 } - , dx2 { dx2 } - , dx3 { dx3 } - , dx1_prev { dx1_prev } - , dx2_prev { dx2_prev } - , dx3_prev { dx3_prev } - , ux1 { ux1 } - , ux2 { ux2 } - , ux3 { ux3 } - , phi { phi } - , weight { weight } - , tag { tag } - , metric { metric } - , charge { charge } - , inv_dt { ONE / dt } { - raise::ErrorIf( - (O == 2u and N_GHOSTS < 2), - "Order of interpolation is 2, but number of ghost cells is < 2", - HERE); - } + template + struct PrtlPack { + array_t i1, i2, i3; + array_t i1_prev, i2_prev, i3_prev; + array_t dx1, dx2, dx3; + array_t dx1_prev, dx2_prev, dx3_prev; + array_t ux1, ux2, ux3; + array_t phi; + array_t weight; + array_t tag; + }; /** - * @brief Iteration of the loop over particles. - * @param p index. + * @brief Per-particle deposit body, shared between the flat and tiled + * kernels. 
+ * + * The caller supplies a `deposit_at(idx..., comp, val)` callback that + * applies the contribution `val` to the J component `comp` at the + * **global** J cell index `idx...` (already includes the `N_GHOSTS` + * offset). The flat kernel's callback simply does + * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled + * kernel's callback translates `idx...` into per-tile scratch + * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this + * function is identical numerically and contains the only deposit math + * in the codebase. + * + * Dead particles return early. The callback is invoked once per cell + * write, with the dimension-appropriate signature: + * - 1D: `deposit_at(int g_i1, int comp, real_t val)` + * - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)` + * - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)` */ - Inline auto operator()(prtlidx_t p) const -> void { + template + Inline void deposit_one_particle(prtlidx_t p, + const PrtlPack& prtls, + const M& metric, + real_t charge, + real_t inv_dt, + DepositFn deposit_at) { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + constexpr auto D = M::Dim; + + const auto& i1 = prtls.i1; + const auto& i2 = prtls.i2; + const auto& i3 = prtls.i3; + const auto& i1_prev = prtls.i1_prev; + const auto& i2_prev = prtls.i2_prev; + const auto& i3_prev = prtls.i3_prev; + const auto& dx1 = prtls.dx1; + const auto& dx2 = prtls.dx2; + const auto& dx3 = prtls.dx3; + const auto& dx1_prev = prtls.dx1_prev; + const auto& dx2_prev = prtls.dx2_prev; + const auto& dx3_prev = prtls.dx3_prev; + const auto& ux1 = prtls.ux1; + const auto& ux2 = prtls.ux2; + const auto& ux3 = prtls.ux3; + const auto& phi = prtls.phi; + const auto& weight = prtls.weight; + const auto& tag = prtls.tag; + if (tag(p) == ParticleTag::dead) { return; } + // recover particle velocity to deposit in unsimulated direction - vec_t vp { ZERO }; - { + [[maybe_unused]] vec_t vp { ZERO }; + 
// `vp` only feeds the unsimulated-direction current in the 1D + // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes + // from the Esirkepov/zigzag charge motion and `vp` is never read, + // so the metric transform + 1/sqrt + NaN/Inf guard below is pure + // dead work there — skip it (also frees xp/inv_energy registers). + if constexpr (D != Dim::_3D) { coord_t xp { ZERO }; if constexpr (D == Dim::_1D) { xp[0] = i_di_to_Xi(i1(p), dx1(p)); @@ -167,7 +180,6 @@ namespace kernel { const real_t coeff { weight(p) * charge }; - // ToDo: interpolation_order as parameter if constexpr (O == 0u) { /* Zig-zag deposit @@ -191,8 +203,6 @@ namespace kernel { dx1(p) - dxp_r_1) * coeff * inv_dt }; - auto J_acc = J.access(); - if constexpr (D == Dim::_1D) { const real_t Fx2_1 { HALF * vp[1] * coeff }; const real_t Fx2_2 { HALF * vp[1] * coeff }; @@ -200,18 +210,18 @@ namespace kernel { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t Fx3_2 { HALF * vp[2] * coeff }; - J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx1) += Fx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2); - J_acc(i1_prev(p) + N_GHOSTS, cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx2) += Fx2_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx2) += Fx2_2 * Wx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2); - J_acc(i1_prev(p) + N_GHOSTS, cur::jx3) += Fx3_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx3) += Fx3_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx3) += Fx3_2 * Wx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 
* (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2); } else if constexpr (D == Dim::_2D || D == Dim::_3D) { const auto dxp_r_2 { static_cast(i2(p) == i2_prev(p)) * (dx2(p) + dx2_prev(p)) * @@ -236,51 +246,73 @@ namespace kernel { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t Fx3_2 { HALF * vp[2] * coeff }; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 * - (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * - (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; + deposit_at(i1_prev(p) 
+ N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); } else { const auto dxp_r_3 { static_cast(i3(p) == i3_prev(p)) * (dx3(p) + dx3_prev(p)) * @@ -300,107 +332,131 @@ namespace kernel { dx3(p) - dxp_r_3) * coeff * inv_dt }; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * Wx2_1 * (ONE - 
Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - 
J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * Wx2_1 * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * Wx3_1); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1 * Wx3_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * Wx2_2 * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * Wx3_2); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2 * Wx3_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx2, + 
Fx2_1 * Wx1_1 * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * Wx3_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * Wx1_1 * Wx3_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2 * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * Wx3_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * Wx1_2 * Wx3_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); } } } else if constexpr ((O >= 1u) and (O <= 11u)) { @@ -421,33 +477,14 @@ namespace kernel { fS_x1); if constexpr (D == Dim::_1D) { - // define weight vectors - real_t Wx1[O + 2]; 
- real_t Wx23[O + 2]; - - // Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - // Esirkepov 2001, Eq. 38 for 1D case - Wx1[i] = fS_x1[i] - iS_x1[i]; - Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]); - } - - // contribution within the shape function stencil - real_t jx1[O + 2]; - - // prefactors for j update + // (1D): fused Esirkepov, no [O+2] temporaries. + // jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) + // = -Qdx1dt * P1[i] (Eq. 38, 1D) + // Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]) (computed inline) const real_t Qdx1dt = coeff * inv_dt; const real_t QVx2 = coeff * vp[1]; const real_t QVx3 = coeff * vp[2]; - // Calculate current contribution - jx1[0] = -Qdx1dt * Wx1[0]; -#pragma unroll - for (int i = 1; i < O + 2; ++i) { - jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i]; - } - // account for ghost cells i1_min += N_GHOSTS; i1_max += N_GHOSTS; @@ -455,21 +492,18 @@ namespace kernel { // get number of update indices for asymmetric movement const int di_x1 = i1_max - i1_min; - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - J_acc(i1_min + i, cur::jx1) += jx1[i]; - } - + // Current update — fused over the union line so the J cell + // stays L1-resident across the 3 component atomic_adds. 
+ real_t P1 = ZERO; for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i]; - } - - for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i]; + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]); + if (i < di_x1) { + deposit_at(gi, cur::jx1, -Qdx1dt * P1); + } + deposit_at(gi, cur::jx2, QVx2 * Wx23); + deposit_at(gi, cur::jx3, QVx3 * Wx23); } } else if constexpr (D == Dim::_2D) { @@ -489,63 +523,23 @@ namespace kernel { iS_x2, fS_x2); - // define weight tensors - real_t Wx1[O + 2][O + 2]; - real_t Wx2[O + 2][O + 2]; - real_t Wx3[O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - // Esirkepov 2001, Eq. 38 (simplified) - Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]); - - Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]); - - Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) + - iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i])); - } - } - - // contribution within the shape function stencil - real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2]; - - // prefactors for j update - const real_t Qdx1dt = coeff * inv_dt; - const real_t Qdx2dt = coeff * inv_dt; - const real_t QVx3 = coeff * vp[2]; - - // Calculate current contribution - - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[0][j] = -Qdx1dt * Wx1[0][j]; - } - -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j]; - } - } - - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][0] = -Qdx2dt * Wx2[i][0]; - } - -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j]; - } - } + // (2D): fused Esirkepov, no [O+2]^2 temporaries. + // + // Esirkepov 2001 Eq. 
38 (simplified) is separable: with + // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and + // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']), + // jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j]) + // jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i]) + // Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i]) + // + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) ) + // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the + // old explicit Wx/jx tensors up to FP reassociation; + // charge-conserving by construction. Prefix sums carried as + // running scalars, so the only per-thread state is the + // existing 1D shape arrays. + const real_t QVx3 = coeff * vp[2]; + // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * HALF; // account for ghost cells i1_min += N_GHOSTS; @@ -557,26 +551,30 @@ namespace kernel { const int di_x1 = i1_max - i1_min; const int di_x2 = i2_max - i2_min; - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j]; - } - } - - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j < di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j]; - } - } - + // Current update — fused over the union plane so the J cell + // line stays L1-resident across the 3 component atomic_adds. 
+ real_t P1 = ZERO; for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t iSx1 = iS_x1[i]; + const real_t fSx1 = fS_x1[i]; + const real_t A1 = fSx1 + iSx1; // jx2 cross-factor + real_t P2 = ZERO; for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j]; + P2 += fS_x2[j] - iS_x2[j]; + const int gj = i2_min + j; + const real_t iSx2 = iS_x2[j]; + const real_t fSx2 = fS_x2[j]; + if (i < di_x1) { + deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2)); + } + if (j < di_x2) { + deposit_at(gi, gj, cur::jx2, cf * P2 * A1); + } + const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) + + iSx2 * (HALF * fSx1 + iSx1)); + deposit_at(gi, gj, cur::jx3, QVx3 * Wx3); } } @@ -610,104 +608,33 @@ namespace kernel { iS_x3, fS_x3); - // define weight tensors - real_t Wx1[O + 2][O + 2][O + 2]; - real_t Wx2[O + 2][O + 2][O + 2]; - real_t Wx3[O + 2][O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - // Esirkepov 2001, Eq. 
31 - Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) * - ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) + - HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k])); - - Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) * - (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] + - HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k])); - - Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) * - (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] + - HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i])); - } - } - } - - // contribution within the shape function stencil - real_t jx1[O + 2][O + 2][O + 2], jx2[O + 2][O + 2][O + 2], - jx3[O + 2][O + 2][O + 2]; - - // prefactors to j update - const real_t Qdxdt = coeff * inv_dt; - const real_t Qdydt = coeff * inv_dt; - const real_t Qdzdt = coeff * inv_dt; - - // Calculate current contribution - - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[0][j][k] = -Qdxdt * Wx1[0][j][k]; - } - } - -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k]; - } - } - } - - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][0][k] = -Qdydt * Wx2[i][0][k]; - } - } - -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k]; - } - } - } - - // jx3 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx3[i][j][0] = -Qdydt * Wx3[i][j][0]; - } - } - -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 1; k < O + 2; ++k) { - jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k]; - } - } - } + // fused Esirkepov, no 
(O+2)^3 temporaries. + // + // The Esirkepov 3D current (2001, Eq. 31) is separable: with + // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise + // P2[j], P3[k]) the cumulative-sum currents collapse to + // + // jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k) + // jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k) + // jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j) + // + // with the 1D-shape cross-factors + // + // G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k] + // + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k]) + // H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k] + // + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k]) + // F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j] + // + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i]) + // + // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the + // same value as the old explicit Wx/jx tensors up to + // floating-point reassociation: charge-conserving by + // construction (the Esirkepov decomposition is exact). The + // prefix sums are carried as running scalars in the deposit + // loop, so the only per-thread state is the existing 1D shape + // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs + // and no private-memory tensor traffic). // account for ghost cells i1_min += N_GHOSTS; @@ -722,31 +649,50 @@ namespace kernel { const int di_x2 = i2_max - i2_min; const int di_x3 = i3_max - i3_min; + // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * THIRD; + /* - Current update + Current update — fused over the union cube so the J cell + line stays L1-resident across the 3 component atomic_adds. 
+ Per-cell branches on (i + class DepositCurrents_kernel { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + static constexpr auto D = M::Dim; + + scatter_ndfield_t J; + deposit::PrtlPack prtls; + const M metric; + const real_t charge, inv_dt; + + public: + DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, + const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag, + const M& metric, + real_t charge, + const real_t dt) + : J { scatter_cur } + , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, phi, weight, tag } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } { + raise::ErrorIf( + (O == 2u and N_GHOSTS < 2), + "Order of interpolation is 2, but number of ghost cells is < 2", + HERE); + } + + Inline auto operator()(prtlidx_t p) const -> void { + auto J_acc = J.access(); + if constexpr (D == Dim::_1D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int comp, real_t v) { + J_acc(g_i1, comp) += v; + }); + } else if constexpr (D == Dim::_2D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int comp, real_t v) { + J_acc(g_i1, g_i2, comp) += v; + }); + } else if constexpr (D == Dim::_3D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { + J_acc(g_i1, g_i2, g_i3, comp) += v; + }); + } + } }; + +#if defined(TEAM_POLICY) + + /** + * @brief Tiled current-deposition kernel. 
+ * + * One team per spatial tile (`league_size = ntiles_total`). Each team + * accumulates particle contributions into a per-team scratch buffer of + * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = STENCIL_REACH + DRIFT` + * cells per side (`O + 1` for `O >= 1`, `3` for `O == 0`). Scratch atomics live in SLM (PVC: ~5–10 cycles per + * `atomic_add`); the global J is touched only once per scratch cell at + * flush time. Compared with the flat scatter-view kernel: + * - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile + * instead of (stencil writes per particle × particles) + * - per-particle stencil writes are tile-local (SLM) instead of + * scattering through global HBM + * + * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for + * A/B benchmarking against the flat scatter-view kernel — its narrow + * stencil typically makes scratch alloc/zero/flush overhead a + * regression there, but it's good to be able to measure the + * crossover. To revert and use flat for zigzag-only builds, change + * the dispatch in `engines/srpic/currents.h` from + * `#if defined(TEAM_POLICY)` to + * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`. + * + * Particle iteration order is governed by `tile_offsets`: tile `t` + * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort. + * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping + * the SoA arrays consistent with that. + * + * **Halo sizing and escape valve.** Sort runs at the end of the + * previous step (see `srpic.hpp`), so at deposit time the particle + * has already been pushed once — its `min(i, i_prev)` may differ + * from the bin key by one cell of drift per step elapsed since the + * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where + * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i, + * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for + * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one + * guaranteed post-sort pusher step.
+ * + * HALO is sized for the *common* (every-step-sorted) case, not for + * a worst-case sort cadence: correctness does **not** depend on it. + * Any particle whose stencil escapes the scratch tile — because it + * drifted further than `DRIFT` (e.g. a large runtime + * `spatial_sorting_interval`), or because the halo is otherwise + * undersized — silently falls back to a direct, bounds-clipped + * `Kokkos::atomic_add` on the global J view. That path is + * charge-conserving (each particle's stencil is deposited exactly + * once, partly to private SLM scratch and partly to global J, and + * scratch is flushed once via `atomic_add`); it is merely slower + * per write. Sorting less often than every step therefore costs + * escape-valve traffic, never accuracy. + */ + template + class DepositCurrents_kernel_tiled { + static_assert(O <= 11u, "Shape order O must be <= 11"); + static_assert(T_TILE > 0u, "T_TILE must be positive"); + static constexpr auto D = M::Dim; + + // Per-side scratch halo, derived from first principles. + // + // total halo = stencil_reach(O) + drift_between_sort_and_deposit + // + // stencil_reach(O) — maximum cells the deposit writes ABOVE + // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2: + // - O == 0 (zigzag): writes {i_prev, i_prev+1, i, i+1} ⇒ +2 + // - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide + // array but only O+1 entries are non-zero, and the union + // window satisfies `i_max - i_min <= O+1` (see + // particle_shapes.hpp::for_deposit). The genuine one-sided + // reach above min(i, i_prev) is therefore O, not O+1 — the + // old `O+1` carried one extra cell of conservative padding + // on top of the already-conservative drift term below. + // + // drift — sort runs at end-of-step (see srpic.hpp), so a particle + // sees exactly one pusher step before the *next* step's deposit + // when sorted every step (the common case). DRIFT is therefore a + // fixed constant of 1, NOT a compile-time function of the runtime + // sort cadence. 
Sizing the halo for the common case (rather than a + // worst-case sort interval) is what keeps the scratch small enough + // for good occupancy; a species sorted less often than + // every step just drifts past the halo and takes the global-J + // escape valve more often — correct, only slower (see the class + // doc-comment for why this is charge-conserving). + // + static constexpr int STENCIL_REACH = (O == 0u) + ? 2 + : static_cast(O); + static constexpr int DRIFT = 1; + static constexpr int HALO = STENCIL_REACH + DRIFT; + static constexpr int TE = static_cast(T_TILE) + 2 * HALO; + + using exec_space = Kokkos::DefaultExecutionSpace; + using team_policy = Kokkos::TeamPolicy; + using member_t = typename team_policy::member_type; + using scratch_mem = typename exec_space::scratch_memory_space; + + // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components) + // is fixed by a runtime extent so we don't need a separate dimension + // template per component count. + using scratch_1d_t = Kokkos::View>; + using scratch_2d_t = Kokkos::View>; + using scratch_3d_t = Kokkos::View>; + + ndfield_t J; + deposit::PrtlPack prtls; + const M metric; + const real_t charge, inv_dt; + + // Tile metadata produced by SortSpatially. + array_t tile_offsets; + ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; + ncells_t total_tiles { 0u }; + + // J's full storage extent including all ghost cells. Used to clip + // the cooperative flush so that a partial tile at the high end of + // the domain does not over-write past the J view. 
+ int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 }; + + public: + DepositCurrents_kernel_tiled(const ndfield_t& cur, + const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag, + const M& metric, + real_t charge, + const real_t dt, + const TileLayout& layout) + : J { cur } + , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, phi, weight, tag } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } + , tile_offsets { layout.tile_offsets } + , ntx1 { layout.ntiles_per_axis[0] } + , ntx2 { layout.ntiles_per_axis[1] } + , ntx3 { layout.ntiles_per_axis[2] } + , total_tiles { layout.ntiles_total } { + raise::ErrorIf( + layout.tile_size != T_TILE, + "Tiled deposit launched with mismatched T_TILE and runtime tile_size", + HERE); + // Note: HALO is allowed to exceed N_GHOSTS. The cooperative + // scratch→J flush and the per-particle escape valve both bounds-clip + // their writes against `j_ext*` so writes that would land past J's + // ghost stripe are silently dropped (they only ever come from a + // particle whose stencil reaches into the domain ghost region, where + // CommunicateFields will re-supply the contribution). + if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) { + j_ext1 = static_cast(cur.extent(0)); + } + if constexpr (D == Dim::_2D || D == Dim::_3D) { + j_ext2 = static_cast(cur.extent(1)); + } + if constexpr (D == Dim::_3D) { + j_ext3 = static_cast(cur.extent(2)); + } + } + + /** + * @brief Per-team scratch size in bytes. Used by the launcher to set + * `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`. 
+ */ + static constexpr std::size_t scratch_bytes() { + if constexpr (D == Dim::_1D) { + return scratch_1d_t::shmem_size(TE, 3); + } else if constexpr (D == Dim::_2D) { + return scratch_2d_t::shmem_size(TE, TE, 3); + } else { + return scratch_3d_t::shmem_size(TE, TE, TE, 3); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const member_t& team) const { + const auto tile_id = static_cast(team.league_rank()); + // Tile coordinates (tile-grid indices) → tile origin in **active** + // cell coords (no ghost offset). Using ncells_t to match the linearised + // tile index produced by SortSpatially. + ncells_t tx1 = 0, tx2 = 0, tx3 = 0; + if constexpr (D == Dim::_1D) { + tx1 = tile_id; + } else if constexpr (D == Dim::_2D) { + tx1 = tile_id / ntx2; + tx2 = tile_id - tx1 * ntx2; + } else { + const auto plane = ntx2 * ntx3; + tx1 = tile_id / plane; + const auto rem = tile_id - tx1 * plane; + tx2 = rem / ntx3; + tx3 = rem - tx2 * ntx3; + } + // origin_active = lowest active-cell index in the tile (no ghost). + // origin_J = same value translated into J's storage coordinate + // (i.e. plus N_GHOSTS). + // origin_J_low = J coordinate of scratch index 0 (i.e. origin_J - HALO). + // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`. + const int origin_J1_low = static_cast(tx1 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J2_low = static_cast(tx2 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J3_low = static_cast(tx3 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + + // Allocate scratch and cooperatively zero-fill it. 
+ if constexpr (D == Dim::_1D) { + scratch_1d_t scr(team.team_scratch(0), TE, 3); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const int li = idx / 3; + const int c = idx - li * 3; + scr(li, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // Escape valve: a particle whose stencil reaches past the + // tile's scratch (e.g. exceeded the compile-time + // STENCIL_REACH + DRIFT budget) falls back to a direct + // atomic_add on the global J view. Bounds-clipped against + // J's storage extent so writes past the domain ghost stripe + // are dropped (matches the cooperative flush below; those + // contributions are re-supplied by SynchronizeFields(J)). + [&](int g_i1, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + if (li >= 0 && li < TE) { + Kokkos::atomic_add(&scr(li, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d) { + Kokkos::atomic_add(&J(g_i1, comp), v); + } + }); + }); + team.team_barrier(); + + // Cooperative flush of scratch to global J. Bounds-clip against + // the J view extent in case a partial high-end tile (or non-zero + // halo at domain edges) would otherwise write past J. 
+ const int e1 = j_ext1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const int li = idx / 3; + const int c = idx - li * 3; + const int gi = li + origin_J1_low; + if (gi < 0 || gi >= e1) { + return; + } + const real_t v = scr(li, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, c), v); + } + }); + } else if constexpr (D == Dim::_2D) { + scratch_2d_t scr(team.team_scratch(0), TE, TE, 3); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * TE * 3), + [&](const int idx) { + const int lij = idx / 3; + const int c = idx - lij * 3; + const int li = lij / TE; + const int lj = lij - li * TE; + scr(li, lj, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + const int e2_d = j_ext2; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // See 1D branch for rationale. 
+ [&](int g_i1, int g_i2, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + const int lj = g_i2 - origin_J2_low; + if (li >= 0 && li < TE && lj >= 0 && lj < TE) { + Kokkos::atomic_add(&scr(li, lj, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && + g_i2 < e2_d) { + Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); + } + }); + }); + team.team_barrier(); + + const int e1 = j_ext1; + const int e2 = j_ext2; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * TE * 3), + [&](const int idx) { + const int lij = idx / 3; + const int c = idx - lij * 3; + const int li = lij / TE; + const int lj = lij - li * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) { + return; + } + const real_t v = scr(li, lj, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, c), v); + } + }); + } else if constexpr (D == Dim::_3D) { + scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3); + const int cells = TE * TE * TE; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, cells * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + scr(li, lj, lk, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + const int e2_d = j_ext2; + const int e3_d = j_ext3; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // See 1D branch for rationale. 
+ [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + const int lj = g_i2 - origin_J2_low; + const int lk = g_i3 - origin_J3_low; + if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 && + lk < TE) { + Kokkos::atomic_add(&scr(li, lj, lk, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && + g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) { + Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); + } + }); + }); + team.team_barrier(); + + const int e1 = j_ext1; + const int e2 = j_ext2; + const int e3 = j_ext3; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, cells * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + const int gk = lk + origin_J3_low; + if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 || + gk >= e3) { + return; + } + const real_t v = scr(li, lj, lk, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, gk, c), v); + } + }); + } + } + }; + +#endif // TEAM_POLICY + } // namespace kernel #undef i_di_to_Xi diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt index d6dc295af..9a2e2865e 100644 --- a/tests/framework/CMakeLists.txt +++ b/tests/framework/CMakeLists.txt @@ -44,6 +44,12 @@ else() gen_test(particles_sort false) endif() +# team_policy X-3: per-backend sort_by_key permutation test (only built +# when the compile-time team_policy toggle is on). 
+if(${team_policy}) + gen_test(sort_by_key false) +endif() + gen_test(fields false) gen_test(grid_mesh false) if(${DEBUG}) diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp new file mode 100644 index 000000000..94dc51d0e --- /dev/null +++ b/tests/framework/sort_by_key.cpp @@ -0,0 +1,110 @@ +/** + * @brief X-3 (team_policy) — sort_by_key permutation test. + * + * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch` + * that is compiled in for the current Kokkos device. For each backend: + * 1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised). + * 2. Call sort_by_key_dispatch. + * 3. Verify that keys[perm[i]] is sorted in non-decreasing order. + * + * Stability is NOT checked: the test only asserts that keys[perm] is + * non-decreasing and that perm is a bijection on [0, n), so it passes for + * stable and unstable backends alike. + * + * Built only when `team_policy=ON` at CMake time. + */ +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/error.h" +#include "utils/sort_dispatch.h" +#include "utils/sorting.h" + +#include + +#include +#include + +namespace { + + using namespace ntt; + + template + void test_one_backend(const char* label, Backend tag) { + const std::vector keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u }; + const npart_t n = keys_host_init.size(); + const ncells_t n_max = 6u; // bin range [0, n_max) + + array_t keys { "keys", n }; + auto keys_h = Kokkos::create_mirror_view(keys); + for (npart_t i = 0u; i < n; ++i) { + keys_h(i) = keys_host_init[i]; + } + Kokkos::deep_copy(keys, keys_h); + + prtl_perm_t perm { "perm", n }; + + sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag); + + auto perm_h = Kokkos::create_mirror_view(perm); + Kokkos::deep_copy(perm_h, perm); + + // Validate: keys[perm[0]] <= keys[perm[1]] <= ...
+ for (npart_t i = 1u; i < n; ++i) { + const auto a = keys_host_init[perm_h(i - 1u)]; + const auto b = keys_host_init[perm_h(i)]; + raise::ErrorIf( + a > b, + std::string("sort_by_key_dispatch produced non-sorted permutation " + "for backend ") + + label, + HERE); + } + + // Validate: perm is a permutation of [0, n). + std::vector seen(n, 0); + for (npart_t i = 0u; i < n; ++i) { + const auto idx = perm_h(i); + raise::ErrorIf(idx >= n, + std::string("permutation index out of range for " + "backend ") + + label, + HERE); + seen[idx] += 1; + } + for (npart_t i = 0u; i < n; ++i) { + raise::ErrorIf(seen[i] != 1, + std::string("permutation not a bijection for backend ") + + label, + HERE); + } + + std::cout << "[OK] sort_by_key_dispatch<" << label << ">: " + << "keys[perm] sorted, perm is a bijection." << std::endl; + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + ntt::GlobalInitialize(argc, argv); + try { + // Always-available backends. + test_one_backend("BinSort", ::sort::backend::BinSort {}); +#if !defined(DEVICE_ENABLED) + test_one_backend("StdSort", ::sort::backend::StdSort {}); +#endif +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + test_one_backend("OneDPL", ::sort::backend::OneDPL {}); +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + test_one_backend("Thrust", ::sort::backend::Thrust {}); +#endif + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + ntt::GlobalFinalize(); + return 1; + } + ntt::GlobalFinalize(); + return 0; +} diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp index d0f0a412c..2f926ce28 100644 --- a/tests/global/tiling.cpp +++ b/tests/global/tiling.cpp @@ -1,6 +1,5 @@ #include "arch/kokkos_aliases.h" #include "utils/error.h" -#include "utils/formatting.h" #include "utils/numeric.h" #include "utils/sorting.h" @@ -122,37 +121,27 @@ void test_tiling(const array_t& i1, const auto ntiles = nt1 * nt2 * nt3; - auto position_to_tile_kernel = sort::PositionToTileIndex { - i1, i2, 
i3, tag, tile_indices, ncells, ts - }; - Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel); - const auto num_ppt = position_to_tile_kernel.num_ppt; + array_t num_ppt { "num_ppt", ntiles }; + Kokkos::parallel_for( + "Tiling", + npart, + sort::PositionToTileIndex { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt }); Kokkos::parallel_for( "Checking", npart, Lambda(prtlidx_t p) { CheckValue(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts); }); - raise::ErrorIf( - num_ppt.extent(0) != ntiles, - fmt::format("num_ppt size does not match number of tiles %u != %u", - num_ppt.extent(0), - ntiles), - HERE); npart_t tot_alive = 0u; Kokkos::parallel_reduce( "CountAliveInTiles", ntiles, - Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); }, + Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); }, tot_alive); - raise::ErrorIf( - tot_alive != npart - ndead, - fmt::format("Error in counting particles per tile: %u != %u - %u", - tot_alive, - npart, - ndead), - HERE); + raise::ErrorIf(tot_alive != npart - ndead, + "Error in counting particles per tile", + HERE); } } diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt index f1438e108..0fe23f578 100644 --- a/tests/kernels/CMakeLists.txt +++ b/tests/kernels/CMakeLists.txt @@ -26,6 +26,9 @@ endfunction() gen_test(faraday_mink) gen_test(ampere_mink) gen_test(deposit) +if(${team_policy}) + gen_test(deposit_tiled) +endif() gen_test(digital_filter) gen_test(particle_moments) gen_test(fields_to_phys) diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp new file mode 100644 index 000000000..87f4f3f4d --- /dev/null +++ b/tests/kernels/deposit_tiled.cpp @@ -0,0 +1,262 @@ +/** + * @file tests/kernels/deposit_tiled.cpp + * @brief X-1 numerical-equivalence test for the tiled deposit kernel. 
+ * + * Runs the flat (`DepositCurrents_kernel`) and tiled + * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs + * for shape orders O = 0..11 and asserts that the resulting J array is + * identical cell-by-cell within a small floating-point tolerance. + * + * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test + * matches the per-particle setup used in `deposit.cpp` so that any + * regression in the shared `kernel::deposit::deposit_one_particle` body + * is caught by both tests. + */ + +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/comparators.h" + +#include "metrics/minkowski.h" + +#include "kernels/currents_deposit.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace { + + using namespace ntt; + + void errorIf(bool condition, const std::string& msg) { + if (condition) { + throw std::runtime_error(msg); + } + } + + template + void put_value(const array_t& arr, T value, int i) { + auto h = Kokkos::create_mirror_view(arr); + h(i) = value; + Kokkos::deep_copy(arr, h); + } + + // Builds tile_offsets for a single-particle test. Particle 0 is alive + // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead + // sentinel and are never referenced by tile_offsets — so the tiled + // kernel never iterates over them. + array_t build_tile_offsets_single_particle(ncells_t ntx1, + ncells_t ntx2, + ncells_t tx1, + ncells_t tx2) { + const ncells_t total_tiles = ntx1 * ntx2; + const ncells_t hot_tile = tx1 * ntx2 + tx2; + array_t offsets("tile_offsets", total_tiles + 1u); + auto h = Kokkos::create_mirror_view(offsets); + for (ncells_t t = 0; t <= total_tiles; ++t) { + h(t) = (t <= hot_tile) ?
static_cast(0) + : static_cast(1); + } + Kokkos::deep_copy(offsets, h); + return offsets; + } + + template + void run_one_case() { + using metric_t = metric::Minkowski; + constexpr unsigned short nx1 = 50u, nx2 = 50u; + metric_t metric { { nx1, nx2 }, + { { 0.0, 55.0 }, { 0.0, 55.0 } }, + {} }; + + // Particle setup (mirrors deposit.cpp). + const int i0 = 25, j0 = 21, i0f = 24, j0f = 20; + const real_t uz = 2.5; + const prtldx_t dxi = static_cast(0.65); + const prtldx_t dxf = static_cast(0.99); + const prtldx_t dyi = static_cast(0.65); + const prtldx_t dyf = static_cast(0.80); + + array_t i1 { "i1", 10 }; + array_t i2 { "i2", 10 }; + array_t i3 { "i3", 10 }; + array_t i1_prev { "i1_prev", 10 }; + array_t i2_prev { "i2_prev", 10 }; + array_t i3_prev { "i3_prev", 10 }; + array_t dx1 { "dx1", 10 }; + array_t dx2 { "dx2", 10 }; + array_t dx3 { "dx3", 10 }; + array_t dx1_prev { "dx1_prev", 10 }; + array_t dx2_prev { "dx2_prev", 10 }; + array_t dx3_prev { "dx3_prev", 10 }; + array_t ux1 { "ux1", 10 }; + array_t ux2 { "ux2", 10 }; + array_t ux3 { "ux3", 10 }; + array_t phi { "phi", 10 }; + array_t weight { "weight", 10 }; + array_t tag { "tag", 10 }; + const real_t charge = 1.0, dt = 1.0; + + put_value(i1, i0f, 0); + put_value(i2, j0f, 0); + put_value(i1_prev, i0, 0); + put_value(i2_prev, j0, 0); + put_value(dx1, dxf, 0); + put_value(dx2, dyf, 0); + put_value(dx1_prev, dxi, 0); + put_value(dx2_prev, dyi, 0); + put_value(ux1, ZERO, 0); + put_value(ux2, ZERO, 0); + put_value(ux3, uz, 0); + put_value(weight, 1.0, 0); + put_value(tag, ParticleTag::alive, 0); + + // Run the flat kernel. 
+ ndfield_t J_flat { "J_flat", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat); + Kokkos::parallel_for( + "FlatDeposit", + 10, + kernel::DepositCurrents_kernel( + J_scat, + i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag, + metric, charge, dt)); + Kokkos::Experimental::contribute(J_flat, J_scat); + Kokkos::fence("flat deposit done"); + } + + // Run the tiled kernel. Build a TileLayout with one alive particle + // landing in its expected tile (sort key = min(i, i_prev) / T_TILE). + ndfield_t J_tiled { "J_tiled", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + const auto sort_i1 = static_cast( + (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp + const auto sort_i2 = static_cast((j0 < j0f) ? j0 : j0f); + const auto ntx1 = static_cast( + std::ceil(static_cast(nx1) / static_cast(T_TILE))); + const auto ntx2 = static_cast( + std::ceil(static_cast(nx2) / static_cast(T_TILE))); + const auto tx1 = static_cast(sort_i1) / T_TILE; + const auto tx2 = static_cast(sort_i2) / T_TILE; + + TileLayout layout; + layout.ntiles_per_axis[0] = ntx1; + layout.ntiles_per_axis[1] = ntx2; + layout.ntiles_per_axis[2] = 1u; + layout.ntiles_total = ntx1 * ntx2; + layout.tile_size = T_TILE; + layout.tile_offsets = build_tile_offsets_single_particle(ntx1, + ntx2, + tx1, + tx2); + + using kernel_t = + kernel::DepositCurrents_kernel_tiled; + kernel_t kern { J_tiled, + i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag, + metric, charge, dt, layout }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, + Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("TiledDeposit", policy, kern); + Kokkos::fence("tiled deposit done"); + } + + // Compare J_flat vs J_tiled cell-by-cell. 
+ auto h_flat = Kokkos::create_mirror_view(J_flat); + auto h_tiled = Kokkos::create_mirror_view(J_tiled); + Kokkos::deep_copy(h_flat, J_flat); + Kokkos::deep_copy(h_tiled, J_tiled); + + const real_t eps = static_cast(1.0e-5); + real_t max_diff = ZERO; + int fail_count = 0; + for (ncells_t i = 0; i < h_flat.extent(0); ++i) { + for (ncells_t j = 0; j < h_flat.extent(1); ++j) { + for (int c = 0; c < 3; ++c) { + const real_t a = h_flat(i, j, c); + const real_t b = h_tiled(i, j, c); + const real_t diff = math::fabs(a - b); + const real_t mag = math::max(math::fabs(a), math::fabs(b)); + if (diff > max_diff) { + max_diff = diff; + } + if (diff > eps * math::max(mag, static_cast(1.0))) { + if (fail_count < 5) { + std::cerr << " J(" << i << "," << j << ",c=" << c + << ") flat=" << a << " tiled=" << b + << " diff=" << diff << '\n'; + } + ++fail_count; + } + } + } + } + if (fail_count > 0) { + std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O + << " T_TILE=" << T_TILE + << " : " << fail_count << " mismatches; max_diff=" << max_diff + << '\n'; + throw std::logic_error("DepositCurrents_kernel_tiled mismatch"); + } + std::cerr << "X-1 deposit_tiled OK O=" << O << " T_TILE=" << T_TILE + << " max_diff=" << max_diff << '\n'; + } + + template + void run_all_orders() { + run_one_case<0u, T_TILE>(); + run_one_case<1u, T_TILE>(); + run_one_case<2u, T_TILE>(); + run_one_case<3u, T_TILE>(); + run_one_case<4u, T_TILE>(); + run_one_case<5u, T_TILE>(); + run_one_case<6u, T_TILE>(); + run_one_case<7u, T_TILE>(); + run_one_case<8u, T_TILE>(); + run_one_case<9u, T_TILE>(); + run_one_case<10u, T_TILE>(); + run_one_case<11u, T_TILE>(); + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + Kokkos::initialize(argc, argv); + try { + // Run with each tile-size choice from the validated CMake list. 
+ run_all_orders<4u>(); + run_all_orders<8u>(); + run_all_orders<12u>(); + } catch (std::exception& e) { + std::cerr << e.what() << '\n'; + Kokkos::finalize(); + return 1; + } + Kokkos::finalize(); + return 0; +} From 04d403c606147e0d651cac28f930d5592f808c84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:25:38 +0200 Subject: [PATCH 03/33] removed redundant inner loop over species --- src/framework/domain/metadomain_sort.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp index 791bf31a8..a2b08f8be 100644 --- a/src/framework/domain/metadomain_sort.cpp +++ b/src/framework/domain/metadomain_sort.cpp @@ -21,9 +21,7 @@ namespace ntt { const auto clearing_interval = species.clearing_interval(); if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and (step > 0u)) { - for (auto& species : domain.species) { - species.RemoveDead(); - } + species.RemoveDead(); } const auto spatial_sorting_interval = species.spatial_sorting_interval(); if ((spatial_sorting_interval > 0u) and From 5830c0740e43851f5d26afb8e35438160be8a007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:26:11 +0200 Subject: [PATCH 04/33] frontier-specific memory pool allocation --- src/global/global.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/global/global.cpp b/src/global/global.cpp index ec22fd2f3..c3aa1bcdc 100644 --- a/src/global/global.cpp +++ b/src/global/global.cpp @@ -6,8 +6,51 @@ #include #endif // MPI_ENABLED +#if defined(HIP_ENABLED) + #include + + #include +#endif // HIP_ENABLED + +namespace { +#if defined(HIP_ENABLED) + // Turn the ROCm stream-ordered allocator into a caching arena. + // + // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option + // IMPL_HIP_MALLOC_ASYNC). 
The default memory pool has a release + // threshold of 0, so every freed block is handed back to the driver + // at the next stream sync. With ~50 GB of particle SoA permanently + // pinned and only ~14 GB free, the per-step churn of dozens of + // large, differently-sized sort/comm scratch buffers fragments that + // free space: allocation cost grows monotonically (ParticleSort + // slowdown) until no contiguous mid-size block remains and BinSort's + // `sorted_values` allocation fails (OOM). Raising the release + // threshold to "unlimited" makes the pool retain and recycle freed + // blocks instead, which stabilizes the working set and removes both + // the slowdown and the OOM. + void ConfigureHipMemPool() { + int device = 0; + if (hipGetDevice(&device) != hipSuccess) { + return; + } + hipMemPool_t pool = nullptr; + if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or + pool == nullptr) { + return; + } + uint64_t threshold = UINT64_MAX; + (void)hipMemPoolSetAttribute(pool, + hipMemPoolAttrReleaseThreshold, + &threshold); + } +#endif // HIP_ENABLED +} // namespace + void ntt::GlobalInitialize(int argc, char* argv[]) { Kokkos::initialize(argc, argv); +#if defined(HIP_ENABLED) + ConfigureHipMemPool(); +#endif // HIP_ENABLED #if defined(MPI_ENABLED) MPI_Init(&argc, &argv); #endif // MPI_ENABLED From cf14e9f11891afc732bfb9d289b2c16c53af770c Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 26 May 2026 17:06:27 +0000 Subject: [PATCH 05/33] support more tile sizes --- CMakeLists.txt | 2 +- tests/kernels/deposit_tiled.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 248a34e7b..06494a7db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,7 @@ set(team_policy_tile_size ${default_team_policy_tile_size} CACHE STRING "team_policy tile edge length in cells") set(team_policy_tile_sizes - "4;6;8;10;12" + "4;6;8;10;12;14;16" CACHE STRING "team_policy tile-size choices") # 
-------------------------- Compilation settings -------------------------- # diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp index 87f4f3f4d..504cc3818 100644 --- a/tests/kernels/deposit_tiled.cpp +++ b/tests/kernels/deposit_tiled.cpp @@ -250,8 +250,12 @@ auto main(int argc, char* argv[]) -> int { try { // Run with each tile-size choice from the validated CMake list. run_all_orders<4u>(); + run_all_orders<6u>(); run_all_orders<8u>(); + run_all_orders<10u>(); run_all_orders<12u>(); + run_all_orders<14u>(); + run_all_orders<16u>(); } catch (std::exception& e) { std::cerr << e.what() << '\n'; Kokkos::finalize(); From 787aa045750fa2c582b54d02c5e8d3f4265de1e0 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 26 May 2026 17:12:06 +0000 Subject: [PATCH 06/33] removed persistent sort scratch to reduce memory overhead --- src/framework/containers/particles.h | 16 --- src/framework/containers/particles_sort.cpp | 111 +++++++------------- 2 files changed, 38 insertions(+), 89 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 0a15cb5d9..4f0770729 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -99,22 +99,6 @@ namespace ntt { // vendor libraries detected by CMake. TileLayout m_tile_layout {}; -#if defined(TEAM_POLICY) && \ - ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ - (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ - (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) - // Persistent byte scratch reused by every SoA-member gather in - // `apply_permutation_to_soa`, across all members and all timesteps. - // Without this each member would allocate (and free) its own - // transient buffer every sort; recycling one persistent buffer - // removes that allocation churn entirely — the structural fix for - // the ROCm sort slowdown / fragmentation. Grown monotonically to - // the largest required size, never shrunk. 
Kokkos device - // allocations are over-aligned (>= 8 B), so reinterpreting the - // bytes as any SoA element type (<= 8 B PODs) is well-defined. - array_t m_perm_scratch {}; -#endif - public: // for empty allocation Particles() {} diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index c04813e12..0317e0b5f 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -454,28 +454,20 @@ namespace ntt { #if defined(TEAM_POLICY_USE_VENDOR_SORT) namespace permute_helpers { - // Permute a 1D SoA member array `arr` in place by `perm`, gathering - // through `scratch` — a persistent byte buffer reused by every - // member and every timestep (no per-call allocation). An unmanaged - // typed view aliases the scratch bytes; the caller guarantees - // `scratch` is large enough and that Kokkos' device over-alignment - // covers the element type. + // Permute a 1D SoA member array `arr` in place by `perm`, using a + // single transient buffer of size `n`. Buffer is freed at scope + // exit; the explicit fence right before that drains queued GPU + // work referencing it. template - inline void permute_1d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n, - const array_t& scratch) { + inline void permute_1d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n) { if (n == 0u) { return; } - using value_t = typename V::non_const_value_type; - using buf_t = Kokkos::View>; - buf_t buf(reinterpret_cast(scratch.data()), n); - auto perm_v = perm; - auto arr_v = arr; + V buf(std::string(arr.label()) + "_perm_buf", n); + auto perm_v = perm; + auto arr_v = arr; Kokkos::parallel_for( "Permute1D", n, @@ -486,22 +478,16 @@ namespace ntt { // 2D analogue for `pld_r` / `pld_i`. 
template - inline void permute_2d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n, - npart_t ncols, - const array_t& scratch) { + inline void permute_2d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols) { if (n == 0u or ncols == 0u) { return; } - using value_t = typename V::non_const_value_type; - using buf_t = Kokkos::View>; - buf_t buf(reinterpret_cast(scratch.data()), n, ncols); - auto perm_v = perm; - auto arr_v = arr; + V buf(std::string(arr.label()) + "_perm_buf", n, ncols); + auto perm_v = perm; + auto arr_v = arr; Kokkos::parallel_for( "Permute2D", CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), @@ -522,61 +508,40 @@ namespace ntt { return; } - // Size the persistent scratch once to the largest gather any member - // needs this call: 1D members need n * sizeof(real_t) bytes (the - // widest element); the 2D payloads need n * ncols * elem bytes. - // Grown monotonically, never shrunk — so after warmup this incurs - // no allocation at all. - std::size_t need = static_cast(n) * sizeof(real_t); - if (npld_r() > 0) { - need = std::max(need, - static_cast(n) * - static_cast(npld_r()) * sizeof(real_t)); - } - if (npld_i() > 0) { - need = std::max(need, - static_cast(n) * - static_cast(npld_i()) * sizeof(npart_t)); - } - if (m_perm_scratch.extent(0) < need) { - m_perm_scratch = array_t { "perm_scratch", need }; - } - const auto& scratch = m_perm_scratch; - using permute_helpers::permute_1d_inplace; using permute_helpers::permute_2d_inplace; if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i1, perm, n, scratch); - permute_1d_inplace(dx1, perm, n, scratch); - permute_1d_inplace(i1_prev, perm, n, scratch); - permute_1d_inplace(dx1_prev, perm, n, scratch); + permute_1d_inplace(i1, perm, n); + permute_1d_inplace(dx1, perm, n); + permute_1d_inplace(i1_prev, perm, n); + permute_1d_inplace(dx1_prev, perm, n); } if constexpr (D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i2, perm, n, scratch); 
- permute_1d_inplace(dx2, perm, n, scratch); - permute_1d_inplace(i2_prev, perm, n, scratch); - permute_1d_inplace(dx2_prev, perm, n, scratch); + permute_1d_inplace(i2, perm, n); + permute_1d_inplace(dx2, perm, n); + permute_1d_inplace(i2_prev, perm, n); + permute_1d_inplace(dx2_prev, perm, n); } if constexpr (D == Dim::_3D) { - permute_1d_inplace(i3, perm, n, scratch); - permute_1d_inplace(dx3, perm, n, scratch); - permute_1d_inplace(i3_prev, perm, n, scratch); - permute_1d_inplace(dx3_prev, perm, n, scratch); - } - permute_1d_inplace(ux1, perm, n, scratch); - permute_1d_inplace(ux2, perm, n, scratch); - permute_1d_inplace(ux3, perm, n, scratch); - permute_1d_inplace(weight, perm, n, scratch); - permute_1d_inplace(tag, perm, n, scratch); + permute_1d_inplace(i3, perm, n); + permute_1d_inplace(dx3, perm, n); + permute_1d_inplace(i3_prev, perm, n); + permute_1d_inplace(dx3_prev, perm, n); + } + permute_1d_inplace(ux1, perm, n); + permute_1d_inplace(ux2, perm, n); + permute_1d_inplace(ux3, perm, n); + permute_1d_inplace(weight, perm, n); + permute_1d_inplace(tag, perm, n); if constexpr (D == Dim::_2D and C != Coord::Cartesian) { - permute_1d_inplace(phi, perm, n, scratch); + permute_1d_inplace(phi, perm, n); } if (npld_r() > 0) { - permute_2d_inplace(pld_r, perm, n, static_cast(npld_r()), scratch); + permute_2d_inplace(pld_r, perm, n, static_cast(npld_r())); } if (npld_i() > 0) { - permute_2d_inplace(pld_i, perm, n, static_cast(npld_i()), scratch); + permute_2d_inplace(pld_i, perm, n, static_cast(npld_i())); } } #endif // TEAM_POLICY_USE_VENDOR_SORT From 827cf264c2903b3179252c88f19c0a7bb090357d Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 2 Jun 2026 18:20:33 +0000 Subject: [PATCH 07/33] added team policy reporting --- src/engines/reporter.cpp | 3 +++ src/global/utils/reporter.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp index f56874d23..3caeda5de 100644 --- a/src/engines/reporter.cpp +++ 
b/src/engines/reporter.cpp @@ -32,6 +32,9 @@ namespace ntt { "%s", params.template get("simulation.name").c_str()); reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string()); +#if defined(TEAM_POLICY) + reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE); +#endif reporter::AddParam(report, 4, "Metric", "%s", M.to_string()); #if SHAPE_ORDER == 0 reporter::AddParam(report, 4, "Deposit", "%s", "zigzag"); diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp index 77117c4b9..a4b10eee6 100644 --- a/src/global/utils/reporter.cpp +++ b/src/global/utils/reporter.cpp @@ -250,6 +250,12 @@ namespace reporter { #else AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); #endif + +#if defined(TEAM_POLICY) + AddParam(report, 4, "TEAM_POLICY", "%s", "ON"); +#else + AddParam(report, 4, "TEAM_POLICY", "%s", "OFF"); +#endif report += "\n"; return report; } From 4b81914aae5cbc15cea19217b284113f617750f3 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 2 Jun 2026 19:08:47 +0000 Subject: [PATCH 08/33] explicitly bind GPU Transport Layer for GPU aware MPI on Frontier --- CMakeLists.txt | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06494a7db..0137671a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,6 +211,62 @@ if(${mpi}) if(${DEVICE_ENABLED}) if(${gpu_aware_mpi}) add_compile_options("-D GPU_AWARE_MPI") + + # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only + # handle device pointers if the GPU Transport Layer (GTL) library + # is linked. The Cray compiler wrappers (cc/CC) inject this + # automatically, but we build with hipcc/nvcc directly, so + # find_package(MPI) only finds base libmpi and the GTL is left + # out -> MPI_Sendrecv on a device pointer fails with + # "OFI ... Bad address". Add it explicitly here. + # + # Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel> + # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). 
Their absence means this is + # not a Cray MPICH build, in which case nothing extra is needed. + if("${Kokkos_DEVICES}" MATCHES "HIP") + set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906) + elseif("${Kokkos_DEVICES}" MATCHES "CUDA") + set(_gtl_accels nvidia90 nvidia80 nvidia70) + elseif("${Kokkos_DEVICES}" MATCHES "SYCL") + set(_gtl_accels ponteVecchio) + else() + set(_gtl_accels "") + endif() + + set(_gtl_dir "") + set(_gtl_libflag "") + foreach(_accel ${_gtl_accels}) + if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}})) + # strip the leading "-L" from the Cray-provided value + string(REGEX REPLACE "^-L" "" + _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}") + string(REGEX REPLACE "^-l" "" + _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}") + endif() + endforeach() + + if(_gtl_dir AND _gtl_libflag) + find_library(MPI_GTL_LIBRARY + NAMES ${_gtl_libflag} + HINTS "${_gtl_dir}" + NO_DEFAULT_PATH) + if(MPI_GTL_LIBRARY) + message(STATUS + "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}") + set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY}) + else() + message(FATAL_ERROR + "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL " + "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. " + "GPU-aware MPI will crash at runtime without it. Make sure the " + "craype-accel module is loaded, or build with gpu_aware_mpi=OFF." 
+ "${ColorReset}") + endif() + else() + message(STATUS + "GPU-aware MPI: no Cray GTL environment found; assuming the MPI " + "implementation is GPU-aware without an extra transport library.") + endif() endif() else() set(gpu_aware_mpi From a603ecb0b11662282cfd613bf8bff085c5aead70 Mon Sep 17 00:00:00 2001 From: haykh Date: Wed, 3 Jun 2026 10:38:29 -0400 Subject: [PATCH 09/33] minor refactor --- src/engines/srpic/currents.h | 92 +- src/global/arch/kokkos_aliases.h | 73 ++ src/kernels/currents_deposit.hpp | 1760 ++++++++++++++---------------- 3 files changed, 934 insertions(+), 991 deletions(-) diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index faf0bb3ad..63746d101 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -39,30 +39,12 @@ namespace ntt { species.rangeActiveParticles(), kernel::DepositCurrents_kernel( scatter_cur, - species.i1, - species.i2, - species.i3, - species.i1_prev, - species.i2_prev, - species.i3_prev, - species.dx1, - species.dx2, - species.dx3, - species.dx1_prev, - species.dx2_prev, - species.dx3_prev, - species.ux1, - species.ux2, - species.ux3, - species.phi, - species.weight, - species.tag, + species, local_metric, (real_t)(species.charge()), dt)); } -#if defined(TEAM_POLICY) /** * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). * @@ -78,11 +60,10 @@ namespace ntt { * passed-in `scatter_cur` so the caller still composes correctly. 
*/ template - void CallDepositKernelTiled( - const Particles& species, - const M& local_metric, - const ndfield_t& cur, - real_t dt) { + void CallDepositKernelTiled(const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt) { static_assert(O <= 11u, "Shape order must be <= 11"); constexpr unsigned short T = static_cast( TEAM_POLICY_TILE_SIZE); @@ -96,40 +77,18 @@ namespace ntt { "with ntiles_total", HERE); - using kernel_t = kernel::DepositCurrents_kernel_tiled; - kernel_t kern { cur, - species.i1, - species.i2, - species.i3, - species.i1_prev, - species.i2_prev, - species.i3_prev, - species.dx1, - species.dx2, - species.dx3, - species.dx1_prev, - species.dx2_prev, - species.dx3_prev, - species.ux1, - species.ux2, - species.ux3, - species.phi, - species.weight, - species.tag, - local_metric, - (real_t)(species.charge()), - dt, - layout }; + auto deposit_kernel = + kernel::DepositCurrentsTiled_kernel { + cur, species, local_metric, (real_t)(species.charge()), dt, layout + }; Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), Kokkos::AUTO); - policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes())); - Kokkos::parallel_for("CurrentsDepositTiled", policy, kern); + policy.set_scratch_size( + 0, + Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes())); + Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); } -#endif // TEAM_POLICY template void CurrentsDeposit(Domain& domain, @@ -164,17 +123,18 @@ namespace ntt { continue; } logger::Checkpoint( - fmt::format("Launching currents deposit (flat fallback, no sort yet) " - "for %d [%s] : %lu %f", - species.index(), - species.label().c_str(), - species.npart(), - (double)species.charge()), + fmt::format( + "Launching currents deposit (flat fallback, no sort yet) " + "for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), HERE); CallDepositKernel(species, - domain.mesh.metric, - scatter_cur, 
- dt); + domain.mesh.metric, + scatter_cur, + dt); } Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); } else { @@ -192,9 +152,9 @@ namespace ntt { HERE); CallDepositKernelTiled(species, - domain.mesh.metric, - domain.fields.cur, - dt); + domain.mesh.metric, + domain.fields.cur, + dt); } } #else diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h index 43f21ce47..4fe88cdf7 100644 --- a/src/global/arch/kokkos_aliases.h +++ b/src/global/arch/kokkos_aliases.h @@ -226,6 +226,79 @@ namespace kokkos_aliases_hidden { template using range_h_t = typename kokkos_aliases_hidden::range_h_impl::type; +// Array aliases of arbitrary type and dimensions (up to 4) +namespace kokkos_aliases_hidden { + // c++ magic + template + struct scratch_nddata_impl { + using type = void; + }; + + template + struct scratch_nddata_impl<1, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<2, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<3, T> { + using type = Kokkos::View>; + }; + + template + struct scratch_nddata_impl<4, T> { + using type = Kokkos::View>; + }; +} // namespace kokkos_aliases_hidden + +template +using scratch_nddata_t = typename kokkos_aliases_hidden::scratch_nddata_impl::type; + +// Defining aliases for Scratch memory ndfield +namespace kokkos_aliases_hidden { + template + struct scratch_ndfield_impl { + using type = void; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; + + template + struct scratch_ndfield_impl { + using type = Kokkos::View>; + }; +} // namespace kokkos_aliases_hidden + +template +using scratch_ndfield_t = + typename kokkos_aliases_hidden::scratch_ndfield_impl::type; + /** * @brief Function template for generating 1D Kokkos range policy for particles. 
* @tparam D Dimension diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 5eb7ff2b6..79252621a 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -3,7 +3,7 @@ * @brief Covariant algorithms for the current deposition. * * Two kernels share the same per-particle body - * (`kernel::deposit::deposit_one_particle`): + * (`kernel::DepositOneParticle`): * - `kernel::DepositCurrents_kernel` flat (RangePolicy over particles, * writes into a `Kokkos::Experimental::ScatterView`). Always available. * - `kernel::DepositCurrents_kernel_tiled` team-policy @@ -13,7 +13,7 @@ * * @implements * - kernel::deposit::PrtlPack<> - * - kernel::deposit::deposit_one_particle<> + * - kernel::DepositOneParticle<> * - kernel::DepositCurrents_kernel<> * - kernel::DepositCurrents_kernel_tiled<> (TEAM_POLICY only) * @namespaces: @@ -32,6 +32,7 @@ #include "utils/error.h" #include "utils/numeric.h" +#include "framework/containers/particles.h" #include "kernels/particle_shapes.hpp" #include @@ -42,671 +43,645 @@ namespace kernel { using namespace ntt; - namespace deposit { - - /** - * @brief Per-particle reference pack consumed by both the flat and tiled - * deposit kernels. The same set of SoA references is captured by - * each kernel; bundling them here keeps the helper's argument - * list manageable and ensures every consumer reads the same - * view aliases. - */ - template - struct PrtlPack { - array_t i1, i2, i3; - array_t i1_prev, i2_prev, i3_prev; - array_t dx1, dx2, dx3; - array_t dx1_prev, dx2_prev, dx3_prev; - array_t ux1, ux2, ux3; - array_t phi; - array_t weight; - array_t tag; - }; + /** + * @brief Per-particle deposit body, shared between the flat and tiled + * kernels. + * + * The caller supplies a `deposit_at(idx..., comp, val)` callback that + * applies the contribution `val` to the J component `comp` at the + * **global** J cell index `idx...` (already includes the `N_GHOSTS` + * offset). 
The flat kernel's callback simply does + * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled + * kernel's callback translates `idx...` into per-tile scratch + * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this + * function is identical numerically and contains the only deposit math + * in the codebase. + * + * Dead particles return early. The callback is invoked once per cell + * write, with the dimension-appropriate signature: + * - 1D: `deposit_at(int g_i1, int comp, real_t val)` + * - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)` + * - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)` + */ + template + Inline void DepositOneParticle(prtlidx_t p, + const ParticleArrays& prtls, + const M& metric, + real_t charge, + real_t inv_dt, + DepositFn deposit_at) { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + constexpr auto D = M::Dim; - /** - * @brief Per-particle deposit body, shared between the flat and tiled - * kernels. - * - * The caller supplies a `deposit_at(idx..., comp, val)` callback that - * applies the contribution `val` to the J component `comp` at the - * **global** J cell index `idx...` (already includes the `N_GHOSTS` - * offset). The flat kernel's callback simply does - * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled - * kernel's callback translates `idx...` into per-tile scratch - * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this - * function is identical numerically and contains the only deposit math - * in the codebase. - * - * Dead particles return early. 
The callback is invoked once per cell - * write, with the dimension-appropriate signature: - * - 1D: `deposit_at(int g_i1, int comp, real_t val)` - * - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)` - * - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)` - */ - template - Inline void deposit_one_particle(prtlidx_t p, - const PrtlPack& prtls, - const M& metric, - real_t charge, - real_t inv_dt, - DepositFn deposit_at) { - static_assert(O <= 11u, "Shape function order O must be <= 11"); - constexpr auto D = M::Dim; - - const auto& i1 = prtls.i1; - const auto& i2 = prtls.i2; - const auto& i3 = prtls.i3; - const auto& i1_prev = prtls.i1_prev; - const auto& i2_prev = prtls.i2_prev; - const auto& i3_prev = prtls.i3_prev; - const auto& dx1 = prtls.dx1; - const auto& dx2 = prtls.dx2; - const auto& dx3 = prtls.dx3; - const auto& dx1_prev = prtls.dx1_prev; - const auto& dx2_prev = prtls.dx2_prev; - const auto& dx3_prev = prtls.dx3_prev; - const auto& ux1 = prtls.ux1; - const auto& ux2 = prtls.ux2; - const auto& ux3 = prtls.ux3; - const auto& phi = prtls.phi; - const auto& weight = prtls.weight; - const auto& tag = prtls.tag; - - if (tag(p) == ParticleTag::dead) { - return; - } + if (prtls.tag(p) == ParticleTag::dead) { + return; + } - // recover particle velocity to deposit in unsimulated direction - [[maybe_unused]] vec_t vp { ZERO }; - // `vp` only feeds the unsimulated-direction current in the 1D - // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes - // from the Esirkepov/zigzag charge motion and `vp` is never read, - // so the metric transform + 1/sqrt + NaN/Inf guard below is pure - // dead work there — skip it (also frees xp/inv_energy registers). 
- if constexpr (D != Dim::_3D) { - coord_t xp { ZERO }; - if constexpr (D == Dim::_1D) { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - } else if constexpr (D == Dim::_2D) { - if constexpr (M::PrtlDim == Dim::_3D) { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - xp[2] = phi(p); - } else { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - } - } else { - xp[0] = i_di_to_Xi(i1(p), dx1(p)); - xp[1] = i_di_to_Xi(i2(p), dx2(p)); - xp[2] = i_di_to_Xi(i3(p), dx3(p)); - } - auto inv_energy { ZERO }; - if constexpr (S == SimEngine::SRPIC) { - metric.template transform_xyz(xp, - { ux1(p), ux2(p), ux3(p) }, - vp); - inv_energy = ONE / math::sqrt(ONE + NORM_SQR(ux1(p), ux2(p), ux3(p))); + // recover particle velocity to deposit in unsimulated direction + [[maybe_unused]] + vec_t vp { ZERO }; + // `vp` only feeds the unsimulated-direction current in the 1D + // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes + // from the Esirkepov/zigzag charge motion and `vp` is never read, + // so the metric transform + 1/sqrt + NaN/Inf guard below is pure + // dead work there — skip it (also frees xp/inv_energy registers). 
+ if constexpr (D != Dim::_3D) { + coord_t xp { ZERO }; + if constexpr (D == Dim::_1D) { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + } else if constexpr (D == Dim::_2D) { + if constexpr (M::PrtlDim == Dim::_3D) { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); + xp[2] = prtls.phi(p); } else { - coord_t xp_ { ZERO }; - xp_[0] = xp[0]; - real_t theta_Cd { xp[1] }; - const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>( - theta_Cd) }; - const auto small_angle { static_cast(constant::SMALL_ANGLE_GR) }; - const auto large_angle { static_cast( - constant::PI - constant::SMALL_ANGLE_GR) }; - if (theta_Ph < small_angle) { - theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle); - } else if (theta_Ph >= large_angle) { - theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle); - } - xp_[1] = theta_Cd; - metric.template transform(xp_, - { ux1(p), ux2(p), ux3(p) }, - vp); - inv_energy = metric.alpha(xp_) / - math::sqrt(ONE + ux1(p) * vp[0] + ux2(p) * vp[1] + - ux3(p) * vp[2]); + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); } - if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) { - vp[2] = ZERO; + } else { + xp[0] = i_di_to_Xi(prtls.i1(p), prtls.dx1(p)); + xp[1] = i_di_to_Xi(prtls.i2(p), prtls.dx2(p)); + xp[2] = i_di_to_Xi(prtls.i3(p), prtls.dx3(p)); + } + auto inv_energy { ZERO }; + if constexpr (S == SimEngine::SRPIC) { + metric.template transform_xyz( + xp, + { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) }, + vp); + inv_energy = ONE / U2GAMMA(prtls.ux1(p), prtls.ux2(p), prtls.ux3(p)); + } else { + coord_t xp_ { ZERO }; + xp_[0] = xp[0]; + real_t theta_Cd { xp[1] }; + const auto theta_Ph { metric.template convert<2, Crd::Cd, Crd::Ph>( + theta_Cd) }; + const auto small_angle { static_cast(constant::SMALL_ANGLE_GR) }; + const auto large_angle { static_cast( + constant::PI - constant::SMALL_ANGLE_GR) }; + if (theta_Ph < small_angle) { 
+ theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(small_angle); + } else if (theta_Ph >= large_angle) { + theta_Cd = metric.template convert<2, Crd::Ph, Crd::Cd>(large_angle); } - vp[0] *= inv_energy; - vp[1] *= inv_energy; - vp[2] *= inv_energy; + xp_[1] = theta_Cd; + metric.template transform( + xp_, + { prtls.ux1(p), prtls.ux2(p), prtls.ux3(p) }, + vp); + inv_energy = metric.alpha(xp_) / + math::sqrt(ONE + prtls.ux1(p) * vp[0] + + prtls.ux2(p) * vp[1] + prtls.ux3(p) * vp[2]); + } + if (Kokkos::isnan(vp[2]) || Kokkos::isinf(vp[2])) { + vp[2] = ZERO; } + vp[0] *= inv_energy; + vp[1] *= inv_energy; + vp[2] *= inv_energy; + } - const real_t coeff { weight(p) * charge }; + const real_t coeff { prtls.weight(p) * charge }; + + if constexpr (O == 0u) { + /* + Zig-zag deposit + */ + const auto dxp_r_1 { static_cast(prtls.i1(p) == prtls.i1_prev(p)) * + (prtls.dx1(p) + prtls.dx1_prev(p)) * + static_cast(INV_2) }; + + const real_t Wx1_1 { INV_2 * + (dxp_r_1 + prtls.dx1_prev(p) + + static_cast(prtls.i1(p) > prtls.i1_prev(p))) }; + const real_t Wx1_2 { INV_2 * + (prtls.dx1(p) + dxp_r_1 + + static_cast( + static_cast(prtls.i1(p) > prtls.i1_prev(p)) + + prtls.i1_prev(p) - prtls.i1(p))) }; + const real_t Fx1_1 { (static_cast(prtls.i1(p) > prtls.i1_prev(p)) + + dxp_r_1 - prtls.dx1_prev(p)) * + coeff * inv_dt }; + const real_t Fx1_2 { (static_cast( + prtls.i1(p) - prtls.i1_prev(p) - + static_cast(prtls.i1(p) > prtls.i1_prev(p))) + + prtls.dx1(p) - dxp_r_1) * + coeff * inv_dt }; - if constexpr (O == 0u) { - /* - Zig-zag deposit - */ - const auto dxp_r_1 { static_cast(i1(p) == i1_prev(p)) * - (dx1(p) + dx1_prev(p)) * + if constexpr (D == Dim::_1D) { + const real_t Fx2_1 { HALF * vp[1] * coeff }; + const real_t Fx2_2 { HALF * vp[1] * coeff }; + + const real_t Fx3_1 { HALF * vp[2] * coeff }; + const real_t Fx3_2 { HALF * vp[2] * coeff }; + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx1, Fx1_2); + + 
deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2); + } else if constexpr (D == Dim::_2D || D == Dim::_3D) { + const auto dxp_r_2 { static_cast(prtls.i2(p) == prtls.i2_prev(p)) * + (prtls.dx2(p) + prtls.dx2_prev(p)) * static_cast(INV_2) }; - const real_t Wx1_1 { INV_2 * (dxp_r_1 + dx1_prev(p) + - static_cast(i1(p) > i1_prev(p))) }; - const real_t Wx1_2 { INV_2 * (dx1(p) + dxp_r_1 + - static_cast( - static_cast(i1(p) > i1_prev(p)) + - i1_prev(p) - i1(p))) }; - const real_t Fx1_1 { (static_cast(i1(p) > i1_prev(p)) + - dxp_r_1 - dx1_prev(p)) * - coeff * inv_dt }; - const real_t Fx1_2 { (static_cast( - i1(p) - i1_prev(p) - - static_cast(i1(p) > i1_prev(p))) + - dx1(p) - dxp_r_1) * + const real_t Wx2_1 { INV_2 * (dxp_r_2 + prtls.dx2_prev(p) + + static_cast(prtls.i2(p) > + prtls.i2_prev(p))) }; + const real_t Wx2_2 { INV_2 * + (prtls.dx2(p) + dxp_r_2 + + static_cast( + static_cast(prtls.i2(p) > prtls.i2_prev(p)) + + prtls.i2_prev(p) - prtls.i2(p))) }; + const real_t Fx2_1 { (static_cast(prtls.i2(p) > prtls.i2_prev(p)) + + dxp_r_2 - prtls.dx2_prev(p)) * coeff * inv_dt }; - - if constexpr (D == Dim::_1D) { - const real_t Fx2_1 { HALF * vp[1] * coeff }; - const real_t Fx2_2 { HALF * vp[1] * coeff }; - + const real_t Fx2_2 { + (static_cast(prtls.i2(p) - prtls.i2_prev(p) - + static_cast(prtls.i2(p) > prtls.i2_prev(p))) + + prtls.dx2(p) - dxp_r_2) * + coeff * inv_dt + }; + + if constexpr (D == Dim::_2D) { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t 
Fx3_2 { HALF * vp[2] * coeff }; - deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1); - deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2); - - deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1); - deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2)); - deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2); - - deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1); - deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2)); - deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2); - } else if constexpr (D == Dim::_2D || D == Dim::_3D) { - const auto dxp_r_2 { static_cast(i2(p) == i2_prev(p)) * - (dx2(p) + dx2_prev(p)) * - static_cast(INV_2) }; - - const real_t Wx2_1 { INV_2 * (dxp_r_2 + dx2_prev(p) + - static_cast(i2(p) > i2_prev(p))) }; - const real_t Wx2_2 { INV_2 * (dx2(p) + dxp_r_2 + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + 
prtls.i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); + } else { + const auto dxp_r_3 { + static_cast(prtls.i3(p) == prtls.i3_prev(p)) * + (prtls.dx3(p) + prtls.dx3_prev(p)) * static_cast(INV_2) + }; + const real_t Wx3_1 { INV_2 * (dxp_r_3 + prtls.dx3_prev(p) + static_cast( - static_cast(i2(p) > i2_prev(p)) + - i2_prev(p) - i2(p))) }; - const real_t Fx2_1 { (static_cast(i2(p) > i2_prev(p)) + - dxp_r_2 - dx2_prev(p)) * - coeff * inv_dt }; - const real_t Fx2_2 { (static_cast( - i2(p) - i2_prev(p) - - static_cast(i2(p) > i2_prev(p))) + - dx2(p) - dxp_r_2) * + prtls.i3(p) > prtls.i3_prev(p))) }; + const real_t Wx3_2 { + INV_2 * + (prtls.dx3(p) + dxp_r_3 + + static_cast(static_cast(prtls.i3(p) > prtls.i3_prev(p)) + + prtls.i3_prev(p) - prtls.i3(p))) + }; + const real_t Fx3_1 { (static_cast(prtls.i3(p) > prtls.i3_prev(p)) + + dxp_r_3 - prtls.dx3_prev(p)) * coeff * inv_dt }; + const real_t Fx3_2 { + (static_cast(prtls.i3(p) - prtls.i3_prev(p) - + static_cast(prtls.i3(p) > prtls.i3_prev(p))) + + prtls.dx3(p) - dxp_r_3) * + coeff * inv_dt + }; + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS 
+ 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * Wx2_1 * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * Wx3_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1 * Wx3_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * Wx2_2 * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * Wx3_2); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2 * Wx3_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1 * (ONE - Wx3_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * Wx3_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * Wx1_1 * Wx3_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2 * (ONE - Wx3_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * Wx3_2); + 
deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * Wx1_2 * Wx3_2); + + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(prtls.i1_prev(p) + N_GHOSTS, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(prtls.i1_prev(p) + N_GHOSTS + 1, + prtls.i2_prev(p) + N_GHOSTS + 1, + prtls.i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(prtls.i1(p) + N_GHOSTS, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(prtls.i1(p) + N_GHOSTS + 1, + prtls.i2(p) + N_GHOSTS + 1, + prtls.i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); + } + } + } else if constexpr ((O >= 1u) and (O <= 11u)) { + + // shape function in dim1 -> always required + real_t iS_x1[O + 2], fS_x1[O + 2]; + // indices of the shape function + int i1_min, i1_max; + + // call shape function + prtl_shape::for_deposit(prtls.i1_prev(p), + static_cast(prtls.dx1_prev(p)), + prtls.i1(p), + static_cast(prtls.dx1(p)), + i1_min, + i1_max, + iS_x1, + fS_x1); - if constexpr (D == Dim::_2D) { - const real_t Fx3_1 { HALF * vp[2] * coeff }; - const real_t Fx3_2 { HALF * vp[2] * coeff }; - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx1, - Fx1_1 * (ONE - Wx2_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - 
cur::jx1, - Fx1_1 * Wx2_1); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx1, - Fx1_2 * (ONE - Wx2_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - cur::jx1, - Fx1_2 * Wx2_2); - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx2, - Fx2_1 * (ONE - Wx1_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx2, - Fx2_1 * Wx1_1); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx2, - Fx2_2 * (ONE - Wx1_2)); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - cur::jx2, - Fx2_2 * Wx1_2); - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * Wx1_1 * (ONE - Wx2_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3, - Fx3_1 * (ONE - Wx1_1) * Wx2_1); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3, - Fx3_1 * Wx1_1 * Wx2_1); - - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * Wx1_2 * (ONE - Wx2_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - cur::jx3, - Fx3_2 * (ONE - Wx1_2) * Wx2_2); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - cur::jx3, - Fx3_2 * Wx1_2 * Wx2_2); - } else { - const auto dxp_r_3 { static_cast(i3(p) == i3_prev(p)) * - (dx3(p) + dx3_prev(p)) * - static_cast(INV_2) }; - const real_t Wx3_1 { INV_2 * (dxp_r_3 + dx3_prev(p) + - static_cast(i3(p) > i3_prev(p))) }; - const real_t Wx3_2 { INV_2 * (dx3(p) + dxp_r_3 + - static_cast( - static_cast(i3(p) > i3_prev(p)) + - i3_prev(p) - i3(p))) }; - const real_t Fx3_1 { (static_cast(i3(p) > i3_prev(p)) + - dxp_r_3 - dx3_prev(p)) * - coeff * inv_dt }; - const real_t Fx3_2 { (static_cast( - i3(p) - i3_prev(p) - - static_cast(i3(p) > i3_prev(p))) + - dx3(p) - dxp_r_3) * - coeff * 
inv_dt }; - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx1, - Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx1, - Fx1_1 * Wx2_1 * (ONE - Wx3_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1, - Fx1_1 * (ONE - Wx2_1) * Wx3_1); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1, - Fx1_1 * Wx2_1 * Wx3_1); - - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx1, - Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx1, - Fx1_2 * Wx2_2 * (ONE - Wx3_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx1, - Fx1_2 * (ONE - Wx2_2) * Wx3_2); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS + 1, - cur::jx1, - Fx1_2 * Wx2_2 * Wx3_2); - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2, - Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2, - Fx2_1 * Wx1_1 * (ONE - Wx3_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2, - Fx2_1 * (ONE - Wx1_1) * Wx3_1); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2, - Fx2_1 * Wx1_1 * Wx3_1); - - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2, - Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2)); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2, - Fx2_2 * Wx1_2 * (ONE - Wx3_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2, - Fx2_2 * (ONE - Wx1_2) * Wx3_2); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) 
+ N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2, - Fx2_2 * Wx1_2 * Wx3_2); - - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * Wx1_1 * (ONE - Wx2_1)); - deposit_at(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * (ONE - Wx1_1) * Wx2_1); - deposit_at(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3, - Fx3_1 * Wx1_1 * Wx2_1); - - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * Wx1_2 * (ONE - Wx2_2)); - deposit_at(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * (ONE - Wx1_2) * Wx2_2); - deposit_at(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3, - Fx3_2 * Wx1_2 * Wx2_2); + if constexpr (D == Dim::_1D) { + // (1D): fused Esirkepov, no [O+2] temporaries. + // jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) + // = -Qdx1dt * P1[i] (Eq. 38, 1D) + // Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]) (computed inline) + const real_t Qdx1dt = coeff * inv_dt; + const real_t QVx2 = coeff * vp[1]; + const real_t QVx3 = coeff * vp[2]; + + // account for ghost cells + i1_min += N_GHOSTS; + i1_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + + // Current update — fused over the union line so the J cell + // stays L1-resident across the 3 component atomic_adds. 
+ real_t P1 = ZERO; + for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]); + if (i < di_x1) { + deposit_at(gi, cur::jx1, -Qdx1dt * P1); } + deposit_at(gi, cur::jx2, QVx2 * Wx23); + deposit_at(gi, cur::jx3, QVx3 * Wx23); } - } else if constexpr ((O >= 1u) and (O <= 11u)) { + + } else if constexpr (D == Dim::_2D) { // shape function in dim1 -> always required - real_t iS_x1[O + 2], fS_x1[O + 2]; + real_t iS_x2[O + 2], fS_x2[O + 2]; // indices of the shape function - int i1_min, i1_max; + int i2_min, i2_max; // call shape function - prtl_shape::for_deposit(i1_prev(p), - static_cast(dx1_prev(p)), - i1(p), - static_cast(dx1(p)), - i1_min, - i1_max, - iS_x1, - fS_x1); - - if constexpr (D == Dim::_1D) { - // (1D): fused Esirkepov, no [O+2] temporaries. - // jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) - // = -Qdx1dt * P1[i] (Eq. 38, 1D) - // Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]) (computed inline) - const real_t Qdx1dt = coeff * inv_dt; - const real_t QVx2 = coeff * vp[1]; - const real_t QVx3 = coeff * vp[2]; - - // account for ghost cells - i1_min += N_GHOSTS; - i1_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - - // Current update — fused over the union line so the J cell - // stays L1-resident across the 3 component atomic_adds. - real_t P1 = ZERO; - for (int i = 0; i <= di_x1; ++i) { - P1 += fS_x1[i] - iS_x1[i]; - const int gi = i1_min + i; - const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]); + prtl_shape::for_deposit(prtls.i2_prev(p), + static_cast(prtls.dx2_prev(p)), + prtls.i2(p), + static_cast(prtls.dx2(p)), + i2_min, + i2_max, + iS_x2, + fS_x2); + + /** + * (2D): fused Esirkepov, no [O+2]^2 temporaries. + * + * Esirkepov 2001 Eq. 
38 (simplified) is separable: with + * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and + * P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']), + * jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j]) + * jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i]) + * Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i]) + * + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) ) + * with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the + * old explicit Wx/jx tensors up to FP reassociation; + * charge-conserving by construction. Prefix sums carried as + * running scalars, so the only per-thread state is the + * existing 1D shape arrays. + */ + const real_t QVx3 = coeff * vp[2]; + // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * HALF; + + // account for ghost cells + i1_min += N_GHOSTS; + i2_min += N_GHOSTS; + i1_max += N_GHOSTS; + i2_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + const int di_x2 = i2_max - i2_min; + + // Current update — fused over the union plane so the J cell + // line stays L1-resident across the 3 component atomic_adds. 
+ real_t P1 = ZERO; + for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t iSx1 = iS_x1[i]; + const real_t fSx1 = fS_x1[i]; + const real_t A1 = fSx1 + iSx1; // jx2 cross-factor + real_t P2 = ZERO; + for (int j = 0; j <= di_x2; ++j) { + P2 += fS_x2[j] - iS_x2[j]; + const int gj = i2_min + j; + const real_t iSx2 = iS_x2[j]; + const real_t fSx2 = fS_x2[j]; if (i < di_x1) { - deposit_at(gi, cur::jx1, -Qdx1dt * P1); + deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2)); } - deposit_at(gi, cur::jx2, QVx2 * Wx23); - deposit_at(gi, cur::jx3, QVx3 * Wx23); + if (j < di_x2) { + deposit_at(gi, gj, cur::jx2, cf * P2 * A1); + } + const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) + + iSx2 * (HALF * fSx1 + iSx1)); + deposit_at(gi, gj, cur::jx3, QVx3 * Wx3); } + } + + } else if constexpr (D == Dim::_3D) { + // shape function in dim2 + real_t iS_x2[O + 2], fS_x2[O + 2]; + // indices of the shape function + int i2_min, i2_max; + // call shape function + prtl_shape::for_deposit(prtls.i2_prev(p), + static_cast(prtls.dx2_prev(p)), + prtls.i2(p), + static_cast(prtls.dx2(p)), + i2_min, + i2_max, + iS_x2, + fS_x2); + + // shape function in dim3 + real_t iS_x3[O + 2], fS_x3[O + 2]; + // indices of the shape function + int i3_min, i3_max; - } else if constexpr (D == Dim::_2D) { - - // shape function in dim1 -> always required - real_t iS_x2[O + 2], fS_x2[O + 2]; - // indices of the shape function - int i2_min, i2_max; - - // call shape function - prtl_shape::for_deposit(i2_prev(p), - static_cast(dx2_prev(p)), - i2(p), - static_cast(dx2(p)), - i2_min, - i2_max, - iS_x2, - fS_x2); - - // (2D): fused Esirkepov, no [O+2]^2 temporaries. - // - // Esirkepov 2001 Eq. 
38 (simplified) is separable: with - // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and - // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']), - // jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j]) - // jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i]) - // Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i]) - // + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) ) - // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the - // old explicit Wx/jx tensors up to FP reassociation; - // charge-conserving by construction. Prefix sums carried as - // running scalars, so the only per-thread state is the - // existing 1D shape arrays. - const real_t QVx3 = coeff * vp[2]; - // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt) - const real_t cf = -(coeff * inv_dt) * HALF; - - // account for ghost cells - i1_min += N_GHOSTS; - i2_min += N_GHOSTS; - i1_max += N_GHOSTS; - i2_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - const int di_x2 = i2_max - i2_min; - - // Current update — fused over the union plane so the J cell - // line stays L1-resident across the 3 component atomic_adds. - real_t P1 = ZERO; - for (int i = 0; i <= di_x1; ++i) { - P1 += fS_x1[i] - iS_x1[i]; - const int gi = i1_min + i; - const real_t iSx1 = iS_x1[i]; - const real_t fSx1 = fS_x1[i]; - const real_t A1 = fSx1 + iSx1; // jx2 cross-factor - real_t P2 = ZERO; - for (int j = 0; j <= di_x2; ++j) { - P2 += fS_x2[j] - iS_x2[j]; - const int gj = i2_min + j; - const real_t iSx2 = iS_x2[j]; - const real_t fSx2 = fS_x2[j]; + // call shape function + prtl_shape::for_deposit(prtls.i3_prev(p), + static_cast(prtls.dx3_prev(p)), + prtls.i3(p), + static_cast(prtls.dx3(p)), + i3_min, + i3_max, + iS_x3, + fS_x3); + + /** + * fused Esirkepov, no (O+2)^3 temporaries. + * + * The Esirkepov 3D current (2001, Eq. 
31) is separable: with + * P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise + * P2[j], P3[k]) the cumulative-sum currents collapse to + * + * jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k) + * jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k) + * jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j) + * + * with the 1D-shape cross-factors + * + * G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k] + * + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k]) + * H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k] + * + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k]) + * F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j] + * + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i]) + * + * and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the + * same value as the old explicit Wx/jx tensors up to + * floating-point reassociation: charge-conserving by + * construction (the Esirkepov decomposition is exact). The + * prefix sums are carried as running scalars in the deposit + * loop, so the only per-thread state is the existing 1D shape + * arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs + * and no private-memory tensor traffic). + */ + + // account for ghost cells + i1_min += N_GHOSTS; + i2_min += N_GHOSTS; + i3_min += N_GHOSTS; + i1_max += N_GHOSTS; + i2_max += N_GHOSTS; + i3_max += N_GHOSTS; + + // get number of update indices for asymmetric movement + const int di_x1 = i1_max - i1_min; + const int di_x2 = i2_max - i2_min; + const int di_x3 = i3_max - i3_min; + + // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * THIRD; + + /** + * Current update — fused over the union cube so the J cell + * line stays L1-resident across the 3 component atomic_adds. 
+ * Per-cell branches on (i(i2_prev(p), - static_cast(dx2_prev(p)), - i2(p), - static_cast(dx2(p)), - i2_min, - i2_max, - iS_x2, - fS_x2); - - // shape function in dim3 - real_t iS_x3[O + 2], fS_x3[O + 2]; - // indices of the shape function - int i3_min, i3_max; - - // call shape function - prtl_shape::for_deposit(i3_prev(p), - static_cast(dx3_prev(p)), - i3(p), - static_cast(dx3(p)), - i3_min, - i3_max, - iS_x3, - fS_x3); - - // fused Esirkepov, no (O+2)^3 temporaries. - // - // The Esirkepov 3D current (2001, Eq. 31) is separable: with - // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise - // P2[j], P3[k]) the cumulative-sum currents collapse to - // - // jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k) - // jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k) - // jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j) - // - // with the 1D-shape cross-factors - // - // G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k] - // + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k]) - // H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k] - // + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k]) - // F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j] - // + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i]) - // - // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the - // same value as the old explicit Wx/jx tensors up to - // floating-point reassociation: charge-conserving by - // construction (the Esirkepov decomposition is exact). The - // prefix sums are carried as running scalars in the deposit - // loop, so the only per-thread state is the existing 1D shape - // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs - // and no private-memory tensor traffic). 
- - // account for ghost cells - i1_min += N_GHOSTS; - i2_min += N_GHOSTS; - i3_min += N_GHOSTS; - i1_max += N_GHOSTS; - i2_max += N_GHOSTS; - i3_max += N_GHOSTS; - - // get number of update indices for asymmetric movement - const int di_x1 = i1_max - i1_min; - const int di_x2 = i2_max - i2_min; - const int di_x3 = i3_max - i3_min; - - // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt) - const real_t cf = -(coeff * inv_dt) * THIRD; - - /* - Current update — fused over the union cube so the J cell - line stays L1-resident across the 3 component atomic_adds. - Per-cell branches on (i 11 not supported. Seriously. " - "What are you even doing here? Entity already goes to 11!"); - } + } // dim + } else { // order + raise::KernelError( + HERE, + "Unsupported interpolation order. O > 11 not supported. Seriously. " + "What are you even doing here? Entity already goes to 11!"); } - - } // namespace deposit + } /** * @brief Flat current-deposition kernel. @@ -721,38 +696,19 @@ namespace kernel { static_assert(O <= 11u, "Shape function order O must be <= 11"); static constexpr auto D = M::Dim; - scatter_ndfield_t J; - deposit::PrtlPack prtls; - const M metric; - const real_t charge, inv_dt; + scatter_ndfield_t J; + const ParticleArrays prtls; + const M metric; + const real_t charge, inv_dt; public: DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, - const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& i1_prev, - const array_t& i2_prev, - const array_t& i3_prev, - const array_t& dx1, - const array_t& dx2, - const array_t& dx3, - const array_t& dx1_prev, - const array_t& dx2_prev, - const array_t& dx3_prev, - const array_t& ux1, - const array_t& ux2, - const array_t& ux3, - const array_t& phi, - const array_t& weight, - const array_t& tag, + const ParticleArrays& prtls, const M& metric, real_t charge, const real_t dt) : J { scatter_cur } - , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, dx1_prev, dx2_prev, 
dx3_prev, - ux1, ux2, ux3, phi, weight, tag } + , prtls { prtls } , metric { metric } , charge { charge } , inv_dt { ONE / dt } { @@ -765,27 +721,25 @@ namespace kernel { Inline auto operator()(prtlidx_t p) const -> void { auto J_acc = J.access(); if constexpr (D == Dim::_1D) { - deposit::deposit_one_particle( - p, - prtls, - metric, - charge, - inv_dt, - [&](int g_i1, int comp, real_t v) { - J_acc(g_i1, comp) += v; - }); + DepositOneParticle(p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int comp, real_t v) { + J_acc(g_i1, comp) += v; + }); } else if constexpr (D == Dim::_2D) { - deposit::deposit_one_particle( - p, - prtls, - metric, - charge, - inv_dt, - [&](int g_i1, int g_i2, int comp, real_t v) { - J_acc(g_i1, g_i2, comp) += v; - }); + DepositOneParticle(p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int comp, real_t v) { + J_acc(g_i1, g_i2, comp) += v; + }); } else if constexpr (D == Dim::_3D) { - deposit::deposit_one_particle( + DepositOneParticle( p, prtls, metric, @@ -798,8 +752,6 @@ namespace kernel { } }; -#if defined(TEAM_POLICY) - /** * @brief Tiled current-deposition kernel. * @@ -851,107 +803,74 @@ namespace kernel { * per write. Sorting less often than every step therefore costs * escape-valve traffic, never accuracy. */ - template - class DepositCurrents_kernel_tiled { + template + class DepositCurrentsTiled_kernel { static_assert(O <= 11u, "Shape order O must be <= 11"); static_assert(T_TILE > 0u, "T_TILE must be positive"); static constexpr auto D = M::Dim; - // Per-side scratch halo, derived from first principles. 
- // - // total halo = stencil_reach(O) + drift_between_sort_and_deposit - // - // stencil_reach(O) — maximum cells the deposit writes ABOVE - // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2: - // - O == 0 (zigzag): writes {i_prev, i_prev+1, i, i+1} ⇒ +2 - // - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide - // array but only O+1 entries are non-zero, and the union - // window satisfies `i_max - i_min <= O+1` (see - // particle_shapes.hpp::for_deposit). The genuine one-sided - // reach above min(i, i_prev) is therefore O, not O+1 — the - // old `O+1` carried one extra cell of conservative padding - // on top of the already-conservative drift term below. - // - // drift — sort runs at end-of-step (see srpic.hpp), so a particle - // sees exactly one pusher step before the *next* step's deposit - // when sorted every step (the common case). DRIFT is therefore a - // fixed constant of 1, NOT a compile-time function of the runtime - // sort cadence. Sizing the halo for the common case (rather than a - // worst-case sort interval) is what keeps the scratch small enough - // for good occupancy; a species sorted less often than - // every step just drifts past the halo and takes the global-J - // escape valve more often — correct, only slower (see the class - // doc-comment for why this is charge-conserving). - // - static constexpr int STENCIL_REACH = (O == 0u) - ? 2 - : static_cast(O); - static constexpr int DRIFT = 1; - static constexpr int HALO = STENCIL_REACH + DRIFT; - static constexpr int TE = static_cast(T_TILE) + 2 * HALO; - - using exec_space = Kokkos::DefaultExecutionSpace; - using team_policy = Kokkos::TeamPolicy; - using member_t = typename team_policy::member_type; - using scratch_mem = typename exec_space::scratch_memory_space; - - // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components) - // is fixed by a runtime extent so we don't need a separate dimension - // template per component count. 
- using scratch_1d_t = Kokkos::View>; - using scratch_2d_t = Kokkos::View>; - using scratch_3d_t = Kokkos::View>; - - ndfield_t J; - deposit::PrtlPack prtls; - const M metric; - const real_t charge, inv_dt; + /** + * Per-side scratch halo, derived from first principles. + * + * total halo = stencil_reach(O) + drift_between_sort_and_deposit + * + * stencil_reach(O) — maximum cells the deposit writes ABOVE + * min(i, i_prev) under CFL |v * dt/dx| <= 1/2: + * - O == 0 (zigzag): writes { i_prev, i_prev+1, i, i+1 } => +2 + * - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide + * array but only O+1 entries are non-zero, and the union + * window satisfies `i_max - i_min <= O+1` (see + * particle_shapes.hpp::for_deposit). The genuine one-sided + * reach above min(i, i_prev) is therefore O, not O+1 — the + * old `O+1` carried one extra cell of conservative padding + * on top of the already-conservative drift term below. + * + * drift — sort runs at end-of-step (see srpic.hpp), so a particle + * sees exactly one pusher step before the *next* step's deposit + * when sorted every step (the common case). DRIFT is therefore a + * fixed constant of 1, NOT a compile-time function of the runtime + * sort cadence. Sizing the halo for the common case (rather than a + * worst-case sort interval) is what keeps the scratch small enough + * for good occupancy; a species sorted less often than + * every step just drifts past the halo and takes the global-J + * escape valve more often — correct, only slower (see the class + * doc-comment for why this is charge-conserving). + */ + static constexpr int STENCIL_REACH = (O == 0u) ? 
2 : static_cast(O); + static constexpr int DRIFT = 1; + static constexpr int HALO = STENCIL_REACH + DRIFT; + static constexpr int TE = static_cast(T_TILE) + 2 * HALO; + + using exec_space = Kokkos::DefaultExecutionSpace; + using team_policy = Kokkos::TeamPolicy; + using member_t = typename team_policy::member_type; + + ndfield_t J; + ParticleArrays prtls; + const M metric; + const real_t charge, inv_dt; // Tile metadata produced by SortSpatially. - array_t tile_offsets; - ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; - ncells_t total_tiles { 0u }; + array_t tile_offsets; + ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; + ncells_t total_tiles { 0u }; - // J's full storage extent including all ghost cells. Used to clip - // the cooperative flush so that a partial tile at the high end of - // the domain does not over-write past the J view. - int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 }; + /** + * J's full storage extent including all ghost cells. Used to clip + * the cooperative flush so that a partial tile at the high end of + * the domain does not over-write past the J view. 
+ */ + int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 }; public: - DepositCurrents_kernel_tiled(const ndfield_t& cur, - const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& i1_prev, - const array_t& i2_prev, - const array_t& i3_prev, - const array_t& dx1, - const array_t& dx2, - const array_t& dx3, - const array_t& dx1_prev, - const array_t& dx2_prev, - const array_t& dx3_prev, - const array_t& ux1, - const array_t& ux2, - const array_t& ux3, - const array_t& phi, - const array_t& weight, - const array_t& tag, - const M& metric, - real_t charge, - const real_t dt, - const TileLayout& layout) + DepositCurrentsTiled_kernel(const ndfield_t& cur, + const ParticleArrays& prtls, + const M& metric, + real_t charge, + real_t dt, + const TileLayout& layout) : J { cur } - , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, dx1_prev, dx2_prev, dx3_prev, - ux1, ux2, ux3, phi, weight, tag } + , prtls { prtls } , metric { metric } , charge { charge } , inv_dt { ONE / dt } @@ -964,16 +883,18 @@ namespace kernel { layout.tile_size != T_TILE, "Tiled deposit launched with mismatched T_TILE and runtime tile_size", HERE); - // Note: HALO is allowed to exceed N_GHOSTS. The cooperative - // scratch→J flush and the per-particle escape valve both bounds-clip - // their writes against `j_ext*` so writes that would land past J's - // ghost stripe are silently dropped (they only ever come from a - // particle whose stencil reaches into the domain ghost region, where - // CommunicateFields will re-supply the contribution). - if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) { + /** + * @note: HALO is allowed to exceed N_GHOSTS. 
The cooperative + * scratch→J flush and the per-particle escape valve both bounds-clip + * their writes against `j_ext*` so writes that would land past J's + * ghost stripe are silently dropped (they only ever come from a + * particle whose stencil reaches into the domain ghost region, where + * CommunicateFields will re-supply the contribution). + */ + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { j_ext1 = static_cast(cur.extent(0)); } - if constexpr (D == Dim::_2D || D == Dim::_3D) { + if constexpr (D == Dim::_2D or D == Dim::_3D) { j_ext2 = static_cast(cur.extent(1)); } if constexpr (D == Dim::_3D) { @@ -985,23 +906,24 @@ namespace kernel { * @brief Per-team scratch size in bytes. Used by the launcher to set * `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`. */ - static constexpr std::size_t scratch_bytes() { + static constexpr size_t scratch_bytes() { if constexpr (D == Dim::_1D) { - return scratch_1d_t::shmem_size(TE, 3); + return scratch_ndfield_t::shmem_size(TE, 3); } else if constexpr (D == Dim::_2D) { - return scratch_2d_t::shmem_size(TE, TE, 3); + return scratch_ndfield_t::shmem_size(TE, TE, 3); } else { - return scratch_3d_t::shmem_size(TE, TE, TE, 3); + return scratch_ndfield_t::shmem_size(TE, TE, TE, 3); } } - KOKKOS_INLINE_FUNCTION - void operator()(const member_t& team) const { + Inline void operator()(const member_t& team) const { const auto tile_id = static_cast(team.league_rank()); - // Tile coordinates (tile-grid indices) → tile origin in **active** - // cell coords (no ghost offset). Using ncells_t to match the linearised - // tile index produced by SortSpatially. - ncells_t tx1 = 0, tx2 = 0, tx3 = 0; + /** + * Tile coordinates (tile-grid indices) → tile origin in **active** + * cell coords (no ghost offset). Using ncells_t to match the linearised + * tile index produced by SortSpatially. 
+ */ + ncells_t tx1 = 0, tx2 = 0, tx3 = 0; if constexpr (D == Dim::_1D) { tx1 = tile_id; } else if constexpr (D == Dim::_2D) { @@ -1014,213 +936,201 @@ namespace kernel { tx2 = rem / ntx3; tx3 = rem - tx2 * ntx3; } - // origin_active = lowest active-cell index in the tile (no ghost). - // origin_J = same value translated into J's storage coordinate - // (i.e. plus N_GHOSTS). - // origin_J_low = J coordinate of scratch index 0 (i.e. origin_J - HALO). - // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`. - const int origin_J1_low = static_cast(tx1 * T_TILE) - + static_cast(N_GHOSTS) - HALO; - const int origin_J2_low = static_cast(tx2 * T_TILE) - + static_cast(N_GHOSTS) - HALO; - const int origin_J3_low = static_cast(tx3 * T_TILE) - + static_cast(N_GHOSTS) - HALO; + /** + * origin_active = lowest active-cell index in the tile (no ghost). + * origin_J = same value translated into J's storage coordinate + * (i.e. plus N_GHOSTS). + * origin_J_low = J coordinate of scratch index 0 (i.e. origin_J - HALO). + * local index `li` in scratch ↔ global J index `gi = li + origin_J_low`. + */ + const int origin_J1_low = static_cast(tx1 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J2_low = static_cast(tx2 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J3_low = static_cast(tx3 * T_TILE) + + static_cast(N_GHOSTS) - HALO; // Allocate scratch and cooperatively zero-fill it. 
if constexpr (D == Dim::_1D) { - scratch_1d_t scr(team.team_scratch(0), TE, 3); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, TE * 3), - [&](const int idx) { - const int li = idx / 3; - const int c = idx - li * 3; - scr(li, c) = ZERO; - }); + scratch_ndfield_t scr { team.team_scratch(0), TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3), + [&](ncells_t idx) { + const auto li = idx / 3; + const auto c = idx - li * 3; + scr(li, c) = ZERO; + }); team.team_barrier(); const auto p_begin = tile_offsets(tile_id); const auto p_end = tile_offsets(tile_id + 1u); - const int e1_d = j_ext1; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), - [&](const npart_t p) { - deposit::deposit_one_particle( + [&](prtlidx_t p) { + DepositOneParticle( p, prtls, metric, charge, inv_dt, - // Escape valve: a particle whose stencil reaches past the - // tile's scratch (e.g. exceeded the compile-time - // STENCIL_REACH + DRIFT budget) falls back to a direct - // atomic_add on the global J view. Bounds-clipped against - // J's storage extent so writes past the domain ghost stripe - // are dropped (matches the cooperative flush below; those - // contributions are re-supplied by SynchronizeFields(J)). + /** + * Escape valve: a particle whose stencil reaches past the + * tile's scratch (e.g. exceeded the compile-time + * STENCIL_REACH + DRIFT budget) falls back to a direct + * atomic_add on the global J view. Bounds-clipped against + * J's storage extent so writes past the domain ghost stripe + * are dropped (matches the cooperative flush below; those + * contributions are re-supplied by SynchronizeFields(J)). 
+ */ [&](int g_i1, int comp, real_t v) { const int li = g_i1 - origin_J1_low; - if (li >= 0 && li < TE) { + if (li >= 0 and li < TE) { Kokkos::atomic_add(&scr(li, comp), v); - } else if (g_i1 >= 0 && g_i1 < e1_d) { + } else if (g_i1 >= 0 and g_i1 < j_ext1) { Kokkos::atomic_add(&J(g_i1, comp), v); } }); }); team.team_barrier(); - // Cooperative flush of scratch to global J. Bounds-clip against - // the J view extent in case a partial high-end tile (or non-zero - // halo at domain edges) would otherwise write past J. - const int e1 = j_ext1; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, TE * 3), - [&](const int idx) { - const int li = idx / 3; - const int c = idx - li * 3; - const int gi = li + origin_J1_low; - if (gi < 0 || gi >= e1) { - return; - } - const real_t v = scr(li, c); - if (v != ZERO) { - Kokkos::atomic_add(&J(gi, c), v); - } - }); + /** + * Cooperative flush of scratch to global J. Bounds-clip against + * the J view extent in case a partial high-end tile (or non-zero + * halo at domain edges) would otherwise write past J. 
+ */ + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const auto li = idx / 3; + const auto c = idx - li * 3; + const auto gi = li + origin_J1_low; + if (gi < 0 or gi >= j_ext1) { + return; + } + const real_t v = scr(li, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, c), v); + } + }); } else if constexpr (D == Dim::_2D) { - scratch_2d_t scr(team.team_scratch(0), TE, TE, 3); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, TE * TE * 3), - [&](const int idx) { - const int lij = idx / 3; - const int c = idx - lij * 3; - const int li = lij / TE; - const int lj = lij - li * TE; - scr(li, lj, c) = ZERO; - }); + scratch_ndfield_t scr { team.team_scratch(0), TE, TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3), + [&](const int idx) { + const auto lij = idx / 3; + const auto c = idx - lij * 3; + const auto li = lij / TE; + const auto lj = lij - li * TE; + scr(li, lj, c) = ZERO; + }); team.team_barrier(); const auto p_begin = tile_offsets(tile_id); const auto p_end = tile_offsets(tile_id + 1u); - const int e1_d = j_ext1; - const int e2_d = j_ext2; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), - [&](const npart_t p) { - deposit::deposit_one_particle( + [&](prtlidx_t p) { + DepositOneParticle( p, prtls, metric, charge, inv_dt, // See 1D branch for rationale. 
- [&](int g_i1, int g_i2, int comp, real_t v) { - const int li = g_i1 - origin_J1_low; - const int lj = g_i2 - origin_J2_low; - if (li >= 0 && li < TE && lj >= 0 && lj < TE) { + [&](const int g_i1, const int g_i2, int comp, real_t v) { + const auto li = g_i1 - origin_J1_low; + const auto lj = g_i2 - origin_J2_low; + if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE)) { Kokkos::atomic_add(&scr(li, lj, comp), v); - } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && - g_i2 < e2_d) { + } else if ((g_i1 >= 0 and g_i1 < j_ext1) and + (g_i2 >= 0 and g_i2 < j_ext2)) { Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); } }); }); team.team_barrier(); - const int e1 = j_ext1; - const int e2 = j_ext2; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, TE * TE * 3), - [&](const int idx) { - const int lij = idx / 3; - const int c = idx - lij * 3; - const int li = lij / TE; - const int lj = lij - li * TE; - const int gi = li + origin_J1_low; - const int gj = lj + origin_J2_low; - if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) { - return; - } - const real_t v = scr(li, lj, c); - if (v != ZERO) { - Kokkos::atomic_add(&J(gi, gj, c), v); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, SQR(TE) * 3), + [&](const int idx) { + const auto lij = idx / 3; + const auto c = idx - lij * 3; + const auto li = lij / TE; + const auto lj = lij - li * TE; + const auto gi = li + origin_J1_low; + const auto gj = lj + origin_J2_low; + if ((gi < 0 or gi >= j_ext1) or + (gj < 0 or gj >= j_ext2)) { + return; + } + const real_t v = scr(li, lj, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, c), v); + } + }); } else if constexpr (D == Dim::_3D) { - scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3); - const int cells = TE * TE * TE; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, cells * 3), - [&](const int idx) { - const int lijk = idx / 3; - const int c = idx - lijk * 3; - const int li = lijk / (TE * TE); - const int rem = lijk - li * TE * TE; - const int lj = rem / TE; - 
const int lk = rem - lj * TE; - scr(li, lj, lk, c) = ZERO; - }); + scratch_ndfield_t scr { team.team_scratch(0), TE, TE, TE }; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3), + [&](const int idx) { + const auto lijk = idx / 3; + const auto c = idx - lijk * 3; + const auto li = lijk / (TE * TE); + const auto rem = lijk - li * TE * TE; + const auto lj = rem / TE; + const auto lk = rem - lj * TE; + scr(li, lj, lk, c) = ZERO; + }); team.team_barrier(); const auto p_begin = tile_offsets(tile_id); const auto p_end = tile_offsets(tile_id + 1u); - const int e1_d = j_ext1; - const int e2_d = j_ext2; - const int e3_d = j_ext3; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), - [&](const npart_t p) { - deposit::deposit_one_particle( + [&](prtlidx_t p) { + DepositOneParticle( p, prtls, metric, charge, inv_dt, // See 1D branch for rationale. - [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { - const int li = g_i1 - origin_J1_low; - const int lj = g_i2 - origin_J2_low; - const int lk = g_i3 - origin_J3_low; - if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 && - lk < TE) { + [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) { + const auto li = g_i1 - origin_J1_low; + const auto lj = g_i2 - origin_J2_low; + const auto lk = g_i3 - origin_J3_low; + if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE) and + (lk >= 0 and lk < TE)) { Kokkos::atomic_add(&scr(li, lj, lk, comp), v); - } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && - g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) { + } else if ((g_i1 >= 0 and g_i1 < j_ext1) and + (g_i2 >= 0 and g_i2 < j_ext2) and + (g_i3 >= 0 and g_i3 < j_ext3)) { Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); } }); }); team.team_barrier(); - const int e1 = j_ext1; - const int e2 = j_ext2; - const int e3 = j_ext3; - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, cells * 3), - [&](const int idx) { - const int lijk = idx / 3; - const int c = idx - lijk * 3; - const int li = 
lijk / (TE * TE); - const int rem = lijk - li * TE * TE; - const int lj = rem / TE; - const int lk = rem - lj * TE; - const int gi = li + origin_J1_low; - const int gj = lj + origin_J2_low; - const int gk = lk + origin_J3_low; - if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 || - gk >= e3) { - return; - } - const real_t v = scr(li, lj, lk, c); - if (v != ZERO) { - Kokkos::atomic_add(&J(gi, gj, gk, c), v); - } - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, CUBE(TE) * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + const int gk = lk + origin_J3_low; + if ((gi < 0 or gi >= j_ext1) or + (gj < 0 or gj >= j_ext2) or + (gk < 0 or gk >= j_ext3)) { + return; + } + const real_t v = scr(li, lj, lk, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, gk, c), v); + } + }); } } }; -#endif // TEAM_POLICY - } // namespace kernel #undef i_di_to_Xi From b5e05ad7130ca88c44ecdecdb6c46690fb2e2a9e Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 19 Jun 2026 20:13:33 +0000 Subject: [PATCH 10/33] compile-time sorting interval with team policies and sorting speedup --- CMakeLists.txt | 13 ++ cmake/defaults.cmake | 5 + cmake/report.cmake | 14 ++ src/engines/srpic/currents.h | 90 ++++--- src/framework/containers/particles.h | 13 +- src/framework/containers/particles_sort.cpp | 110 +++++---- src/framework/parameters/parameters.cpp | 7 + src/framework/parameters/particles.cpp | 9 + src/global/arch/kokkos_aliases.h | 25 +- src/kernels/currents_deposit.hpp | 245 ++++++++++++++------ tests/framework/particles_sort.cpp | 12 +- 11 files changed, 372 insertions(+), 171 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0137671a0..d7bea74f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,10 @@ 
set(team_policy_tile_size set(team_policy_tile_sizes "4;6;8;10;12;14;16" CACHE STRING "team_policy tile-size choices") +set(team_policy_sort + ${default_team_policy_sort} + CACHE STRING + "team_policy hardwired spatial sorting interval; >0 overrides the runtime spatial_sorting_interval and sizes the tiled deposit scratch halo (0 = use runtime)") # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) @@ -157,6 +161,15 @@ if(${team_policy}) add_compile_options("-D TEAM_POLICY") add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}") + # Optional compile-time hardwired sort interval. When > 0, it (a) overrides + # the runtime spatial_sorting_interval (see framework/parameters) and (b) + # sizes the tiled deposit scratch halo to DRIFT = interval, so a particle + # drifting over a full sort interval still deposits inside its tile scratch. + if(team_policy_sort GREATER 0) + add_compile_options( + "-D TEAM_POLICY_SORT_INTERVAL=${team_policy_sort}") + endif() + # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically # when found; falls back to Kokkos::BinSort otherwise. 
if("${Kokkos_DEVICES}" MATCHES "SYCL") diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index a85accf84..619f48dc1 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -107,3 +107,8 @@ set_property(CACHE default_team_policy PROPERTY TYPE BOOL) set(default_team_policy_tile_size 8 CACHE INTERNAL "Default tile edge length in cells for team_policy") + +set(default_team_policy_sort + 0 + CACHE INTERNAL + "Default hardwired spatial sorting interval for team_policy (0 = runtime)") diff --git a/cmake/report.cmake b/cmake/report.cmake index 65a22a7a6..e6036d366 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -141,6 +141,17 @@ if(${team_policy}) "${Blue}" TEAM_POLICY_TILE_SIZE_REPORT 46) + if(team_policy_sort GREATER 0) + printchoices( + "Team Sort Interval" + "team_policy_sort" + "${team_policy_sort}" + ${team_policy_sort} + 0 + "${Blue}" + TEAM_POLICY_SORT_INTERVAL_REPORT + 46) + endif() endif() printchoices( "Debug mode" @@ -220,6 +231,9 @@ endif() string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") if(${team_policy}) string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n") + if(team_policy_sort GREATER 0) + string(APPEND REPORT_TEXT " " ${TEAM_POLICY_SORT_INTERVAL_REPORT} "\n") + endif() endif() string( diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 63746d101..eadda70b9 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -79,7 +79,8 @@ namespace ntt { auto deposit_kernel = kernel::DepositCurrentsTiled_kernel { - cur, species, local_metric, (real_t)(species.charge()), dt, layout + cur, species, local_metric, (real_t)(species.charge()), + dt, layout, species.npart() }; Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), @@ -88,6 +89,30 @@ namespace ntt { 0, Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes())); Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); + + // Particles appended since the last sort (injection / MPI receive 
on a + // no-sort step) live past the partition and are not visited by any team + // above. Deposit that tail [npart_partitioned, npart) with the flat + // scatter-view kernel so every active particle is deposited exactly + // once. The range is empty when the species was just sorted (the + // every-step-sorted common case), so this is a no-op there. + if (species.npart() > layout.npart_partitioned) { + // `cur` is a const ref; take a non-const View handle (shallow copy, + // shares storage) so the scatter view can contribute back into it. + auto cur_nc = cur; + auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc); + Kokkos::parallel_for( + "CurrentsDepositTiledTail", + CreateParticleRangePolicy({ layout.npart_partitioned }, + { species.npart() }), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + Kokkos::Experimental::contribute(cur_nc, scatter_cur); + } } template @@ -98,51 +123,45 @@ namespace ntt { #if defined(TEAM_POLICY) - // First-step fallback: if any contributing species has not been - // sorted yet (tile_layout still empty), fall back to the flat - // scatter-view path for that step. Subsequent steps see populated - // layouts and use the tiled kernel. - bool any_unsorted = false; + // Tiled deposit. 
Correctness no longer depends on the SoA being in a + // "sorted" state at deposit time — the tiled kernel handles a stale + // partition per-particle: + // - a particle whose full stencil has drifted out of its tile is + // deposited straight to the global J view (the per-particle escape + // valve); `team_policy_sort_interval` sizes the scratch halo so the + // common in-tile case stays in fast SLM (see currents_deposit.hpp); + // - particles dead-tagged in place since the sort are clamped out by + // the kernel and skipped by the dead-tag test; + // - particles appended past the partition since the sort (injection / + // MPI receive on a no-sort step) are deposited by the launcher's + // flat tail pass over [npart_partitioned, npart). + // Together these cover every active particle exactly once for any sort + // interval. The only case the tiled kernel cannot serve is the very + // first step, before any SortSpatially has populated a layout; that + // species takes the flat scatter-view path for that step alone. 
for (auto& species : domain.species) { if ((species.pusher() == ParticlePusher::NONE) or (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { continue; } - if (species.tile_layout().ntiles_total == 0u or - species.tile_layout().tile_offsets.extent(0) == 0u) { - any_unsorted = true; - break; - } - } - if (any_unsorted) { - auto scatter_cur = Kokkos::Experimental::create_scatter_view( - domain.fields.cur); - for (auto& species : domain.species) { - if ((species.pusher() == ParticlePusher::NONE) or - (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { - continue; - } + const auto& layout = species.tile_layout(); + if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) { logger::Checkpoint( - fmt::format( - "Launching currents deposit (flat fallback, no sort yet) " - "for %d [%s] : %lu %f", - species.index(), - species.label().c_str(), - species.npart(), - (double)species.charge()), + fmt::format("Launching currents deposit (flat, no sort yet) for " + "%d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), HERE); + auto scatter_cur = Kokkos::Experimental::create_scatter_view( + domain.fields.cur); CallDepositKernel(species, domain.mesh.metric, scatter_cur, dt); - } - Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); - } else { - for (auto& species : domain.species) { - if ((species.pusher() == ParticlePusher::NONE) or - (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { - continue; - } + Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); + } else { logger::Checkpoint( fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", species.index(), @@ -150,7 +169,6 @@ namespace ntt { species.npart(), (double)species.charge()), HERE); - CallDepositKernelTiled(species, domain.mesh.metric, domain.fields.cur, diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 
4f0770729..0877efd15 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -299,11 +299,14 @@ namespace ntt { private: /** * @brief Apply a particle-index permutation (built by oneDPL/Thrust - * sort_by_key) to every SoA member array. Sequential — one - * transient buffer at a time, fenced before scope exit. - * Only compiled when a vendor sort backend is enabled; the - * BinSort path applies the permutation in place via - * `sorter.sort(view)` instead. + * sort_by_key) to the SoA member arrays. Each member is + * gathered into a fresh full-capacity buffer whose handle is + * then swapped in (no copy-back), one buffer at a time, fenced + * before the old storage is released. The *_prev arrays are + * intentionally not permuted (overwritten by the next push + * before any read). Only compiled when a vendor sort backend + * is enabled; the BinSort path applies the permutation in + * place via `sorter.sort(view)` instead. */ void apply_permutation_to_soa(const prtl_perm_t& perm); diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 0317e0b5f..a52b013f7 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -264,8 +264,11 @@ namespace ntt { const auto slice = prtl_slice_t(0, npart_local); #if defined(TEAM_POLICY_USE_VENDOR_SORT) // Vendor path: produce an explicit permutation via sort_by_key, - // then apply it to each SoA member with a sequential one-buffer - // gather (peak transient = one `npart × sizeof(member)` buffer. + // then apply it to each SoA member by gathering into a fresh + // full-capacity buffer and swapping the View handle in (no + // copy-back). The *_prev arrays are skipped — see + // apply_permutation_to_soa. Peak transient = one + // `maxnpart × sizeof(member)` buffer at a time. 
prtl_perm_t perm { "tile_perm", npart_local }; #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) sort_helpers::sort_by_key_dispatch(tile_indices, @@ -376,6 +379,11 @@ namespace ntt { Kokkos::deep_copy(tile_offsets, h_offsets); m_tile_layout.tile_offsets = tile_offsets; + // tile_offsets(total_tiles) is the alive-particle count at sort time: + // the tiles partition exactly [0, npart_partitioned). The deposit + // launcher compares this against the live npart() to detect (and + // separately deposit) particles appended since this sort. + m_tile_layout.npart_partitioned = h_offsets(total_tiles); } // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used @@ -454,38 +462,43 @@ namespace ntt { #if defined(TEAM_POLICY_USE_VENDOR_SORT) namespace permute_helpers { - // Permute a 1D SoA member array `arr` in place by `perm`, using a - // single transient buffer of size `n`. Buffer is freed at scope - // exit; the explicit fence right before that drains queued GPU - // work referencing it. + // Permute a 1D SoA member array `arr` by `perm`. Gathers into a + // fresh buffer allocated at the member's full capacity (maxnpart), + // then swaps the View handle in. This avoids the redundant copy-back + // pass of the old gather-then-deep_copy approach (~2x less HBM + // traffic). Allocating at full capacity preserves the member's spare + // room for injection; the untouched tail [n, capacity) is + // zero-initialized by Kokkos (cleaner than the stale values the old + // deep_copy left there). The fence drains the gather (which reads the + // old storage) before the swap drops the last reference to it. 
template - inline void permute_1d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n) { + inline void permute_1d_swap(V& arr, + const prtl_perm_t& perm, + npart_t n) { if (n == 0u) { return; } - V buf(std::string(arr.label()) + "_perm_buf", n); + V buf(arr.label(), arr.extent(0)); auto perm_v = perm; auto arr_v = arr; Kokkos::parallel_for( "Permute1D", n, KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); }); - Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf); - Kokkos::fence("permute_1d_inplace: end"); + Kokkos::fence("permute_1d_swap: end"); + arr = buf; } // 2D analogue for `pld_r` / `pld_i`. template - inline void permute_2d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n, - npart_t ncols) { + inline void permute_2d_swap(V& arr, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols) { if (n == 0u or ncols == 0u) { return; } - V buf(std::string(arr.label()) + "_perm_buf", n, ncols); + V buf(arr.label(), arr.extent(0), arr.extent(1)); auto perm_v = perm; auto arr_v = arr; Kokkos::parallel_for( @@ -494,9 +507,8 @@ namespace ntt { KOKKOS_LAMBDA(const npart_t p, const npart_t l) { buf(p, l) = arr_v(perm_v(p), l); }); - Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL), - buf); - Kokkos::fence("permute_2d_inplace: end"); + Kokkos::fence("permute_2d_swap: end"); + arr = buf; } } // namespace permute_helpers @@ -508,40 +520,50 @@ namespace ntt { return; } - using permute_helpers::permute_1d_inplace; - using permute_helpers::permute_2d_inplace; - + using permute_helpers::permute_1d_swap; + using permute_helpers::permute_2d_swap; + + // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are intentionally + // NOT permuted. 
SortSpatially runs at the very end of the step loop + // (engine step_forward), and the first thing the next step's pusher + // does is overwrite prev := current (positionPush, sr.hpp / gr.hpp) + // for every active particle, before any consumer reads prev: + // - current deposit: runs after the push, which has already + // overwritten prev; species with pusher==NONE (whose prev would + // stay un-permuted) are skipped by CurrentsDeposit entirely. + // - pusher getParticlePrevPosition / piston: read prev only after + // positionPush has rewritten it within the same call. + // - checkpoint (prev is checkpoint-only, never in diagnostic + // output): on restart the first push overwrites prev before it + // is read, so restart results are unaffected; only the redundant + // prev field saved to the checkpoint differs from the old code. + // Permuting prev would therefore reorder data that is overwritten + // before it is ever observed. if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i1, perm, n); - permute_1d_inplace(dx1, perm, n); - permute_1d_inplace(i1_prev, perm, n); - permute_1d_inplace(dx1_prev, perm, n); + permute_1d_swap(i1, perm, n); + permute_1d_swap(dx1, perm, n); } if constexpr (D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i2, perm, n); - permute_1d_inplace(dx2, perm, n); - permute_1d_inplace(i2_prev, perm, n); - permute_1d_inplace(dx2_prev, perm, n); + permute_1d_swap(i2, perm, n); + permute_1d_swap(dx2, perm, n); } if constexpr (D == Dim::_3D) { - permute_1d_inplace(i3, perm, n); - permute_1d_inplace(dx3, perm, n); - permute_1d_inplace(i3_prev, perm, n); - permute_1d_inplace(dx3_prev, perm, n); - } - permute_1d_inplace(ux1, perm, n); - permute_1d_inplace(ux2, perm, n); - permute_1d_inplace(ux3, perm, n); - permute_1d_inplace(weight, perm, n); - permute_1d_inplace(tag, perm, n); + permute_1d_swap(i3, perm, n); + permute_1d_swap(dx3, perm, n); + } + permute_1d_swap(ux1, perm, n); + permute_1d_swap(ux2, perm, 
n); + permute_1d_swap(ux3, perm, n); + permute_1d_swap(weight, perm, n); + permute_1d_swap(tag, perm, n); if constexpr (D == Dim::_2D and C != Coord::Cartesian) { - permute_1d_inplace(phi, perm, n); + permute_1d_swap(phi, perm, n); } if (npld_r() > 0) { - permute_2d_inplace(pld_r, perm, n, static_cast(npld_r())); + permute_2d_swap(pld_r, perm, n, static_cast(npld_r())); } if (npld_i() > 0) { - permute_2d_inplace(pld_i, perm, n, static_cast(npld_i())); + permute_2d_swap(pld_i, perm, n, static_cast(npld_i())); } } #endif // TEAM_POLICY_USE_VENDOR_SORT diff --git a/src/framework/parameters/parameters.cpp b/src/framework/parameters/parameters.cpp index 7372da510..4116783a1 100644 --- a/src/framework/parameters/parameters.cpp +++ b/src/framework/parameters/parameters.cpp @@ -67,11 +67,18 @@ namespace ntt { "clear_interval", defaults::clear_interval); set("particles.clear_interval", global_clearing_interval); +#if defined(TEAM_POLICY_SORT_INTERVAL) + // See particles.cpp: the compile-time team_policy_sort_interval overrides + // the runtime value (kept consistent here for the stored global param). + const auto global_spatial_sorting_interval = static_cast( + TEAM_POLICY_SORT_INTERVAL); +#else const auto global_spatial_sorting_interval = toml::find_or( toml_data, "particles", "spatial_sorting_interval", 0u); +#endif set("particles.spatial_sorting_interval", global_spatial_sorting_interval); set("scales.n0", ppc0 / get("scales.V0")); diff --git a/src/framework/parameters/particles.cpp b/src/framework/parameters/particles.cpp index 6f1a23c03..46192035b 100644 --- a/src/framework/parameters/particles.cpp +++ b/src/framework/parameters/particles.cpp @@ -121,10 +121,19 @@ namespace ntt { sp, "clear_interval", global_clearing_interval); +#if defined(TEAM_POLICY_SORT_INTERVAL) + // Compile-time hardwired sort interval (the `team_policy_sort_interval` + // CMake knob). 
It overrides whatever the input file requested so the + // tiled deposit's scratch halo — sized for exactly this cadence — is + // guaranteed to contain every particle's drift between sorts. + const auto spatial_sorting_interval = static_cast( + TEAM_POLICY_SORT_INTERVAL); +#else const auto spatial_sorting_interval = toml::find_or( sp, "spatial_sorting_interval", global_spatial_sorting_interval); +#endif auto pusher_str = toml::find_or(sp, "pusher", std::string(def_pusher)); const auto npayloads_real = toml::find_or(sp, "n_payloads_real", diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h index 4fe88cdf7..7212b554e 100644 --- a/src/global/arch/kokkos_aliases.h +++ b/src/global/arch/kokkos_aliases.h @@ -340,20 +340,27 @@ auto CreateRangePolicyOnHost(const tuple_t&, using prtl_perm_t = array_t; // Tile layout metadata: the contract between Stream 1 (sort) and Streams -// 2/3 (tiled deposit / pusher). All members are device-resident. -// ntiles_per_axis : number of tiles along each axis (1 for unused axes). -// ntiles_total : product of ntiles_per_axis = league size for TeamPolicy. -// tile_size : tile edge length in cells (compile-time CMake knob, -// replicated here for runtime checks). -// tile_offsets : prefix-sum of per-tile particle counts; size -// ntiles_total + 1; tile t owns particles -// [tile_offsets(t), tile_offsets(t+1)). -// tile_perm : size npart, particle index sorted by tile. +// 2/3 (tiled deposit / pusher). Scalars are host-resident; the views are +// device-resident. +// ntiles_per_axis : number of tiles along each axis (1 for unused axes). +// ntiles_total : product of ntiles_per_axis = league size for TeamPolicy. +// tile_size : tile edge length in cells (compile-time CMake knob, +// replicated here for runtime checks). +// npart_partitioned: number of (alive) particles partitioned at the last +// sort, i.e. tile_offsets(ntiles_total). 
The tiles cover +// exactly [0, npart_partitioned); particles appended past +// it (injection / MPI receive on a no-sort step) are not +// partitioned and must be deposited separately. +// tile_offsets : prefix-sum of per-tile particle counts; size +// ntiles_total + 1; tile t owns particles +// [tile_offsets(t), tile_offsets(t+1)). +// tile_perm : size npart, particle index sorted by tile. template struct TileLayout { ncells_t ntiles_per_axis[3] { 1u, 1u, 1u }; ncells_t ntiles_total { 0u }; unsigned short tile_size { 0u }; + npart_t npart_partitioned { 0u }; array_t tile_offsets; prtl_perm_t tile_perm; }; diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 79252621a..8840a1046 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -780,28 +780,37 @@ namespace kernel { * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping * the SoA arrays consistent with that. * - * **Halo sizing and escape valve.** Sort runs at the end of the - * previous step (see `srpic.hpp`), so at deposit time the particle - * has already been pushed once — its `min(i, i_prev)` may differ - * from the bin key by one cell of drift per step elapsed since the - * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where - * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i, - * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for - * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one - * guaranteed post-sort pusher step. + * **Halo sizing and escape valve.** Sort runs at the end of a step + * (see `srpic.hpp`); a particle is pushed once per step thereafter, so + * its `min(i, i_prev)` may differ from the bin key by one cell of drift + * per step elapsed since the last sort. The scratch HALO is + * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag + * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with + * `|Δi|=1`) and `O` for Esirkepov. 
`DRIFT` is the + * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL) + * when set — the hardwired sort interval, hence the maximum drift any + * particle accrues between sorts — and `1` otherwise (the + * every-step-sorted common case). * - * HALO is sized for the *common* (every-step-sorted) case, not for - * a worst-case sort cadence: correctness does **not** depend on it. - * Any particle whose stencil escapes the scratch tile — because it - * drifted further than `DRIFT` (e.g. a large runtime - * `spatial_sorting_interval`), or because the halo is otherwise - * undersized — silently falls back to a direct, bounds-clipped - * `Kokkos::atomic_add` on the global J view. That path is - * charge-conserving (each particle's stencil is deposited exactly - * once, partly to private SLM scratch and partly to global J, and - * scratch is flushed once via `atomic_add`); it is merely slower - * per write. Sorting less often than every step therefore costs + * Correctness does **not** depend on the halo size. Any particle whose + * full stencil escapes the scratch tile — because it drifted further + * than `DRIFT`, was reordered far from its tile by a no-sort-step + * `CommunicateParticles`, or because the halo is otherwise undersized — + * is deposited *as a whole* via a direct, bounds-clipped + * `Kokkos::atomic_add` on the global J view (the per-particle escape + * valve). Each particle's stencil is therefore deposited exactly once + * (entirely to SLM scratch when it fits, entirely to global J when it + * does not), so the path is charge-conserving; it is merely slower per + * write. Sizing `DRIFT` to the sort interval keeps the common, + * within-interval drift in fast SLM; sorting less often only costs * escape-valve traffic, never accuracy. + * + * **Partition coverage.** The team iteration covers only the particles + * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped + * to the live `npart`. 
Particles appended past the partition since the + * sort are not seen here; the launcher (`engines/srpic/currents.h`) + * deposits that tail with the flat kernel so every active particle is + * covered exactly once regardless of sort cadence. */ template class DepositCurrentsTiled_kernel { @@ -825,21 +834,33 @@ namespace kernel { * old `O+1` carried one extra cell of conservative padding * on top of the already-conservative drift term below. * - * drift — sort runs at end-of-step (see srpic.hpp), so a particle - * sees exactly one pusher step before the *next* step's deposit - * when sorted every step (the common case). DRIFT is therefore a - * fixed constant of 1, NOT a compile-time function of the runtime - * sort cadence. Sizing the halo for the common case (rather than a - * worst-case sort interval) is what keeps the scratch small enough - * for good occupancy; a species sorted less often than - * every step just drifts past the halo and takes the global-J - * escape valve more often — correct, only slower (see the class - * doc-comment for why this is charge-conserving). + * drift — sort runs at end-of-step (see srpic.hpp), so a particle is + * pushed once per step between its last sort and a given deposit. With + * a sort interval of `K`, a particle therefore drifts at most `K` cells + * (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) before the next sort. The + * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL) + * pins that interval at compile time and feeds it here as DRIFT, sizing + * the halo so a fully-interval-drifted particle still deposits inside + * its tile scratch. When the knob is unset, DRIFT defaults to 1 (the + * sorted-every-step common case); any particle that drifts past the halo + * (e.g. a larger runtime interval, or a CFL excursion) takes the + * per-particle global-J escape valve below — correct, only slower (see + * the class doc-comment for why this is charge-conserving). 
*/ - static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast(O); - static constexpr int DRIFT = 1; - static constexpr int HALO = STENCIL_REACH + DRIFT; - static constexpr int TE = static_cast(T_TILE) + 2 * HALO; + static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast(O); + // One-sided footprint reach for the per-particle escape valve: the + // deposit writes at most this many cells above max(i,i_prev) (and fewer + // below min), so [min - FOOTPRINT_REACH, max + FOOTPRINT_REACH] in cell + // coords conservatively bounds every deposited cell for any order + // (Esirkepov reaches max+O; O=0 zigzag reaches max+1). + static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast(O); +#if defined(TEAM_POLICY_SORT_INTERVAL) + static constexpr int DRIFT = static_cast(TEAM_POLICY_SORT_INTERVAL); +#else + static constexpr int DRIFT = 1; +#endif + static constexpr int HALO = STENCIL_REACH + DRIFT; + static constexpr int TE = static_cast(T_TILE) + 2 * HALO; using exec_space = Kokkos::DefaultExecutionSpace; using team_policy = Kokkos::TeamPolicy; @@ -855,6 +876,17 @@ namespace kernel { ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; ncells_t total_tiles { 0u }; + /** + * Current active-particle count. `tile_offsets` partitions only the + * particles that existed at the last sort ([0, layout.npart_partitioned)); + * `npart` may differ if the pusher dead-tagged particles in place since. + * Each team clamps its `[tile_offsets(t), tile_offsets(t+1))` slice to + * `npart` so stale slots past the live array are never read. Particles + * appended *beyond* the partition (npart > npart_partitioned) are not seen + * by any team here — the launcher deposits that tail separately. + */ + npart_t npart { 0u }; + /** * J's full storage extent including all ghost cells. 
Used to clip * the cooperative flush so that a partial tile at the high end of @@ -868,7 +900,8 @@ namespace kernel { const M& metric, real_t charge, real_t dt, - const TileLayout& layout) + const TileLayout& layout, + npart_t npart) : J { cur } , prtls { prtls } , metric { metric } @@ -878,7 +911,8 @@ namespace kernel { , ntx1 { layout.ntiles_per_axis[0] } , ntx2 { layout.ntiles_per_axis[1] } , ntx3 { layout.ntiles_per_axis[2] } - , total_tiles { layout.ntiles_total } { + , total_tiles { layout.ntiles_total } + , npart { npart } { raise::ErrorIf( layout.tile_size != T_TILE, "Tiled deposit launched with mismatched T_TILE and runtime tile_size", @@ -907,12 +941,17 @@ namespace kernel { * `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`. */ static constexpr size_t scratch_bytes() { + // The component count (3) is a *static* extent of scratch_ndfield_t + // (View / **[3] / ***[3]), so shmem_size() takes only the + // dynamic spatial extents — passing 3 as well trips Kokkos' + // `rank_dynamic != number of arguments` abort. This matches the + // scratch View construction below, which also omits the 3. if constexpr (D == Dim::_1D) { - return scratch_ndfield_t::shmem_size(TE, 3); + return scratch_ndfield_t::shmem_size(TE); } else if constexpr (D == Dim::_2D) { - return scratch_ndfield_t::shmem_size(TE, TE, 3); + return scratch_ndfield_t::shmem_size(TE, TE); } else { - return scratch_ndfield_t::shmem_size(TE, TE, TE, 3); + return scratch_ndfield_t::shmem_size(TE, TE, TE); } } @@ -961,31 +1000,49 @@ namespace kernel { }); team.team_barrier(); - const auto p_begin = tile_offsets(tile_id); - const auto p_end = tile_offsets(tile_id + 1u); + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? 
t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), [&](prtlidx_t p) { + /** + * Per-particle escape valve: route the WHOLE particle to the + * global J view when its Esirkepov footprint does not fit + * inside this tile's scratch window [0,TE); only particles + * fully inside the tile touch SLM scratch. A particle drifts + * out of its tile when sorted less often than every step. + * + * The conservative footprint bound in cell coords, + * [min(i,i_prev) - FOOTPRINT_REACH, max(i,i_prev) + FOOTPRINT_REACH], covers + * prtl_shape::for_deposit for any order (i_min >= + * min-floor(O/2), i_max <= max+O), so when `to_scratch` is true + * every deposited cell is provably in [0,TE) and the scratch write + * needs no per-cell bounds test. NOTE(review): the global path + * below writes J unclipped (the former j_ext clip was removed); + * confirm every escaped stencil stays inside J's storage extent. + */ + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE); DepositOneParticle( p, prtls, metric, charge, inv_dt, - /** - * Escape valve: a particle whose stencil reaches past the - * tile's scratch (e.g. exceeded the compile-time - * STENCIL_REACH + DRIFT budget) falls back to a direct - * atomic_add on the global J view. Bounds-clipped against - * J's storage extent so writes past the domain ghost stripe - * are dropped (matches the cooperative flush below; those - * contributions are re-supplied by SynchronizeFields(J)).
- */ [&](int g_i1, int comp, real_t v) { - const int li = g_i1 - origin_J1_low; - if (li >= 0 and li < TE) { - Kokkos::atomic_add(&scr(li, comp), v); - } else if (g_i1 >= 0 and g_i1 < j_ext1) { + if (to_scratch) { + Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v); + //} else if (g_i1 >= 0 and g_i1 < j_ext1) { + } else { Kokkos::atomic_add(&J(g_i1, comp), v); } }); @@ -1022,25 +1079,42 @@ namespace kernel { }); team.team_barrier(); - const auto p_begin = tile_offsets(tile_id); - const auto p_end = tile_offsets(tile_id + 1u); + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), [&](prtlidx_t p) { + // See 1D branch for rationale: route the whole particle to the + // global escape valve unless its full footprint fits in scratch. + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J2_low; + const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J2_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and + hi2 < TE); DepositOneParticle( p, prtls, metric, charge, inv_dt, - // See 1D branch for rationale. 
[&](const int g_i1, const int g_i2, int comp, real_t v) { - const auto li = g_i1 - origin_J1_low; - const auto lj = g_i2 - origin_J2_low; - if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE)) { - Kokkos::atomic_add(&scr(li, lj, comp), v); - } else if ((g_i1 >= 0 and g_i1 < j_ext1) and - (g_i2 >= 0 and g_i2 < j_ext2)) { + if (to_scratch) { + Kokkos::atomic_add( + &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp), + v); + } else { Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); } }); @@ -1078,28 +1152,49 @@ namespace kernel { }); team.team_barrier(); - const auto p_begin = tile_offsets(tile_id); - const auto p_end = tile_offsets(tile_id + 1u); + // Clamp the tile's particle slice to the live array: slots past + // `npart` may hold stale (possibly alive-tagged) data from a prior + // step's compaction and must not be re-deposited. + const auto t_lo = tile_offsets(tile_id); + const auto t_hi = tile_offsets(tile_id + 1u); + const auto p_begin = (t_lo < npart) ? t_lo : npart; + const auto p_end = (t_hi < npart) ? t_hi : npart; Kokkos::parallel_for( Kokkos::TeamThreadRange(team, p_begin, p_end), [&](prtlidx_t p) { + // See 1D branch for rationale: route the whole particle to the + // global escape valve unless its full footprint fits in scratch. + const int i1c = prtls.i1(p), i1p = prtls.i1_prev(p); + const int i2c = prtls.i2(p), i2p = prtls.i2_prev(p); + const int i3c = prtls.i3(p), i3p = prtls.i3_prev(p); + const int lo1 = (i1c < i1p ? i1c : i1p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J1_low; + const int hi1 = (i1c > i1p ? i1c : i1p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J1_low; + const int lo2 = (i2c < i2p ? i2c : i2p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J2_low; + const int hi2 = (i2c > i2p ? i2c : i2p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J2_low; + const int lo3 = (i3c < i3p ? i3c : i3p) + static_cast(N_GHOSTS) - + FOOTPRINT_REACH - origin_J3_low; + const int hi3 = (i3c > i3p ? 
i3c : i3p) + static_cast(N_GHOSTS) + + FOOTPRINT_REACH - origin_J3_low; + const bool to_scratch = (lo1 >= 0 and hi1 < TE and lo2 >= 0 and + hi2 < TE and lo3 >= 0 and hi3 < TE); DepositOneParticle( p, prtls, metric, charge, inv_dt, - // See 1D branch for rationale. [&](const int g_i1, const int g_i2, const int g_i3, int comp, real_t v) { - const auto li = g_i1 - origin_J1_low; - const auto lj = g_i2 - origin_J2_low; - const auto lk = g_i3 - origin_J3_low; - if ((li >= 0 and li < TE) and (lj >= 0 and lj < TE) and - (lk >= 0 and lk < TE)) { - Kokkos::atomic_add(&scr(li, lj, lk, comp), v); - } else if ((g_i1 >= 0 and g_i1 < j_ext1) and - (g_i2 >= 0 and g_i2 < j_ext2) and - (g_i3 >= 0 and g_i3 < j_ext3)) { + if (to_scratch) { + Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, + g_i2 - origin_J2_low, + g_i3 - origin_J3_low, + comp), + v); + } else { Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); } }); diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp index 1ba10d828..6945c962f 100644 --- a/tests/framework/particles_sort.cpp +++ b/tests/framework/particles_sort.cpp @@ -82,7 +82,11 @@ auto main(int argc, char* argv[]) -> int { Kokkos::deep_copy(pld_r_h, prtls.pld_r); Kokkos::deep_copy(pld_i_h, prtls.pld_i); - for (auto p { 0u }; p < 75u; ++p) { + // Only [0, npart) is defined after a sort. The swap-based gather in + // apply_permutation_to_soa replaces each SoA View, zero-filling the + // spare capacity [npart, maxnpart) (a don't-care region overwritten + // by injection), so the old "tail preserved" check no longer holds. + for (auto p { 0u }; p < 66u; ++p) { if (p < 16u) { raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE); } else if (p < 33u) { @@ -180,7 +184,11 @@ auto main(int argc, char* argv[]) -> int { auto weight_h = Kokkos::create_mirror_view(prtls.weight); Kokkos::deep_copy(weight_h, prtls.weight); - for (auto p { 0u }; p < 75u; ++p) { + // Only [0, npart) is defined after a sort. 
The swap-based gather in + // apply_permutation_to_soa replaces each SoA View, zero-filling the + // spare capacity [npart, maxnpart) (a don't-care region overwritten + // by injection), so the old "tail preserved" check no longer holds. + for (auto p { 0u }; p < 66u; ++p) { if (p < 13u) { raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE); } else if (p < 26u) { From 4f4409d507b942e9a7a15cc3c6a3abb69af07dd7 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 19 Jun 2026 20:13:50 +0000 Subject: [PATCH 11/33] fix test for tiled deposit --- tests/kernels/deposit_tiled.cpp | 289 ++++++++++++++++++++++++++++++-- 1 file changed, 273 insertions(+), 16 deletions(-) diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp index 504cc3818..3cc2f62b0 100644 --- a/tests/kernels/deposit_tiled.cpp +++ b/tests/kernels/deposit_tiled.cpp @@ -3,7 +3,7 @@ * @brief X-1 numerical-equivalence test for the tiled deposit kernel. * * Runs the flat (`DepositCurrents_kernel`) and tiled - * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs + * (`DepositCurrentsTiled_kernel`) kernels on identical particle SoA inputs * for shape orders O = 1..11 and asserts that the resulting J array is * identical cell-by-cell within a small floating-point tolerance. * @@ -48,6 +48,37 @@ namespace { Kokkos::deep_copy(arr, h); } + // Pack the per-test SoA arrays into a ParticleArrays — the struct both + // deposit kernels take. Payload (pld_*) members stay default; these are + // 2D Cartesian Minkowski cases, so phi/i3/dx3 are present but unread. 
+ ParticleArrays pack_arrays(const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag) { + ParticleArrays pa; + pa.i1 = i1, pa.i2 = i2, pa.i3 = i3; + pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev; + pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3; + pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev; + pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3; + pa.phi = phi, pa.weight = weight, pa.tag = tag; + return pa; + } + // Builds tile_offsets for a single-particle test. Particle 0 is alive // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead // sentinel and are never referenced by tile_offsets — so the tiled @@ -68,6 +99,69 @@ namespace { return offsets; } + // Buckets ALL `n_alive` particles (slots 0..n_alive-1) into tile 0, + // modelling a maximally-stale tile layout: every particle was "sorted" + // into tile 0 but now sits anywhere in the domain (as happens when the + // SoA drifts / is reordered between sorts). Every particle except the + // few that genuinely live near the origin must therefore take the + // per-particle escape valve to global J. + array_t build_tile_offsets_all_in_tile0(ncells_t total_tiles, + npart_t n_alive) { + array_t offsets("tile_offsets", total_tiles + 1u); + auto h = Kokkos::create_mirror_view(offsets); + h(0) = static_cast(0); + for (ncells_t t = 1; t <= total_tiles; ++t) { + h(t) = n_alive; + } + Kokkos::deep_copy(offsets, h); + return offsets; + } + + // Cell-by-cell comparison of two J fields; throws on mismatch. 
+ void compare_J_fields(const ndfield_t& J_flat, + const ndfield_t& J_tiled, + unsigned short O, + unsigned short T_TILE, + const char* label) { + auto h_flat = Kokkos::create_mirror_view(J_flat); + auto h_tiled = Kokkos::create_mirror_view(J_tiled); + Kokkos::deep_copy(h_flat, J_flat); + Kokkos::deep_copy(h_tiled, J_tiled); + + const real_t eps = static_cast(1.0e-5); + real_t max_diff = ZERO; + int fail_count = 0; + for (ncells_t i = 0; i < h_flat.extent(0); ++i) { + for (ncells_t j = 0; j < h_flat.extent(1); ++j) { + for (int c = 0; c < 3; ++c) { + const real_t a = h_flat(i, j, c); + const real_t b = h_tiled(i, j, c); + const real_t diff = math::fabs(a - b); + const real_t mag = math::max(math::fabs(a), math::fabs(b)); + if (diff > max_diff) { + max_diff = diff; + } + if (diff > eps * math::max(mag, static_cast(1.0))) { + if (fail_count < 5) { + std::cerr << " [" << label << "] J(" << i << "," << j + << ",c=" << c << ") flat=" << a << " tiled=" << b + << " diff=" << diff << '\n'; + } + ++fail_count; + } + } + } + } + if (fail_count > 0) { + std::cerr << "deposit_tiled[" << label << "] FAILED for O=" << O + << " T_TILE=" << T_TILE << " : " << fail_count + << " mismatches; max_diff=" << max_diff << '\n'; + throw std::logic_error("DepositCurrentsTiled_kernel mismatch"); + } + std::cerr << "deposit_tiled[" << label << "] OK O=" << O + << " T_TILE=" << T_TILE << " max_diff=" << max_diff << '\n'; + } + template void run_one_case() { using metric_t = metric::Minkowski; @@ -129,12 +223,12 @@ namespace { 10, kernel::DepositCurrents_kernel( J_scat, - i1, i2, i3, - i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, - dx1_prev, dx2_prev, dx3_prev, - ux1, ux2, ux3, - phi, weight, tag, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), metric, charge, dt)); Kokkos::Experimental::contribute(J_flat, J_scat); Kokkos::fence("flat deposit done"); @@ -168,15 +262,18 @@ namespace { tx2); using 
kernel_t = - kernel::DepositCurrents_kernel_tiled; + kernel::DepositCurrentsTiled_kernel; + // npart = full slot count (10): the lone alive particle sits in slot 0 + // and the per-tile slice clamp keeps the (dead) tail out. kernel_t kern { J_tiled, - i1, i2, i3, - i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, - dx1_prev, dx2_prev, dx3_prev, - ux1, ux2, ux3, - phi, weight, tag, - metric, charge, dt, layout }; + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt, layout, + static_cast(10) }; Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), Kokkos::AUTO); @@ -221,12 +318,158 @@ namespace { << " T_TILE=" << T_TILE << " : " << fail_count << " mismatches; max_diff=" << max_diff << '\n'; - throw std::logic_error("DepositCurrents_kernel_tiled mismatch"); + throw std::logic_error("DepositCurrentsTiled_kernel mismatch"); } std::cerr << "X-1 deposit_tiled OK O=" << O << " T_TILE=" << T_TILE << " max_diff=" << max_diff << '\n'; } + // Drift / stale-layout regression test for the per-particle escape valve. + // + // A population of alive particles is spread across the whole domain + // (including the near-boundary cells that deposit into the ghost stripe) + // but the tile layout buckets them ALL into tile 0 — i.e. the layout is + // maximally stale w.r.t. their real positions, exactly the situation that + // arises when the SoA is reordered/appended between sorts. The tiled + // kernel must route every out-of-tile particle to the global J view and + // reproduce the flat deposit cell-for-cell: no charge dropped or + // double-counted at any drift distance. This is the property that broke + // when `spatial_sorting_interval > 1` left drifted particles depositing + // partial stencils, producing a density/E line at the decomposition + // boundary. 
+ template + void run_drift_case() { + using metric_t = metric::Minkowski; + constexpr unsigned short nx1 = 50u, nx2 = 50u; + metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} }; + + constexpr int n_slots = 64; + constexpr int n_base = 5; + const int bases[n_base] = { 1, 13, 25, 37, 48 }; + const int n_alive = n_base * n_base; // 25 + + array_t i1 { "i1", n_slots }, i2 { "i2", n_slots }, + i3 { "i3", n_slots }; + array_t i1_prev { "i1_prev", n_slots }, + i2_prev { "i2_prev", n_slots }, i3_prev { "i3_prev", n_slots }; + array_t dx1 { "dx1", n_slots }, dx2 { "dx2", n_slots }, + dx3 { "dx3", n_slots }; + array_t dx1_prev { "dx1_prev", n_slots }, + dx2_prev { "dx2_prev", n_slots }, dx3_prev { "dx3_prev", n_slots }; + array_t ux1 { "ux1", n_slots }, ux2 { "ux2", n_slots }, + ux3 { "ux3", n_slots }; + array_t phi { "phi", n_slots }, weight { "weight", n_slots }; + array_t tag { "tag", n_slots }; + const real_t charge = 1.0, dt = 1.0; + + // Fill alive particles on host (slots >= n_alive stay zero == dead). 
+ auto h_i1 = Kokkos::create_mirror_view(i1); + auto h_i2 = Kokkos::create_mirror_view(i2); + auto h_i1p = Kokkos::create_mirror_view(i1_prev); + auto h_i2p = Kokkos::create_mirror_view(i2_prev); + auto h_dx1 = Kokkos::create_mirror_view(dx1); + auto h_dx2 = Kokkos::create_mirror_view(dx2); + auto h_dx1p = Kokkos::create_mirror_view(dx1_prev); + auto h_dx2p = Kokkos::create_mirror_view(dx2_prev); + auto h_ux3 = Kokkos::create_mirror_view(ux3); + auto h_w = Kokkos::create_mirror_view(weight); + auto h_tag = Kokkos::create_mirror_view(tag); + int p = 0; + for (int a = 0; a < n_base; ++a) { + for (int b = 0; b < n_base; ++b, ++p) { + h_i1p(p) = bases[a]; + h_i1(p) = bases[a] - 1; + h_i2p(p) = bases[b]; + h_i2(p) = bases[b] - 1; + h_dx1p(p) = static_cast(0.65); + h_dx1(p) = static_cast(0.99); + h_dx2p(p) = static_cast(0.65); + h_dx2(p) = static_cast(0.80); + h_ux3(p) = static_cast(2.5); + h_w(p) = static_cast(1.0); + h_tag(p) = ParticleTag::alive; + } + } + Kokkos::deep_copy(i1, h_i1); + Kokkos::deep_copy(i2, h_i2); + Kokkos::deep_copy(i1_prev, h_i1p); + Kokkos::deep_copy(i2_prev, h_i2p); + Kokkos::deep_copy(dx1, h_dx1); + Kokkos::deep_copy(dx2, h_dx2); + Kokkos::deep_copy(dx1_prev, h_dx1p); + Kokkos::deep_copy(dx2_prev, h_dx2p); + Kokkos::deep_copy(ux3, h_ux3); + Kokkos::deep_copy(weight, h_w); + Kokkos::deep_copy(tag, h_tag); + + // Flat reference over all slots (dead slots are skipped internally). 
+ ndfield_t J_flat { "J_flat", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat); + Kokkos::parallel_for( + "FlatDepositDrift", + n_slots, + kernel::DepositCurrents_kernel( + J_scat, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt)); + Kokkos::Experimental::contribute(J_flat, J_scat); + Kokkos::fence("flat drift deposit done"); + } + + // Tiled with a maximally-stale layout: all alive particles in tile 0. + ndfield_t J_tiled { "J_tiled", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + const auto ntx1 = static_cast( + std::ceil(static_cast(nx1) / static_cast(T_TILE))); + const auto ntx2 = static_cast( + std::ceil(static_cast(nx2) / static_cast(T_TILE))); + + TileLayout layout; + layout.ntiles_per_axis[0] = ntx1; + layout.ntiles_per_axis[1] = ntx2; + layout.ntiles_per_axis[2] = 1u; + layout.ntiles_total = ntx1 * ntx2; + layout.tile_size = T_TILE; + layout.tile_offsets = build_tile_offsets_all_in_tile0( + ntx1 * ntx2, + static_cast(n_alive)); + + using kernel_t = + kernel::DepositCurrentsTiled_kernel; + // npart = n_alive: the stale layout buckets all alive particles into + // tile 0, so the team must walk [0, n_alive) and route the drifted + // ones to the global-J escape valve. 
+ kernel_t kern { J_tiled, + pack_arrays(i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag), + metric, charge, dt, layout, + static_cast(n_alive) }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, + Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("TiledDepositDrift", policy, kern); + Kokkos::fence("tiled drift deposit done"); + } + + compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift"); + } + template void run_all_orders() { run_one_case<0u, T_TILE>(); @@ -241,6 +484,20 @@ namespace { run_one_case<9u, T_TILE>(); run_one_case<10u, T_TILE>(); run_one_case<11u, T_TILE>(); + + // Stale-layout / drift regression (per-particle escape valve). + run_drift_case<0u, T_TILE>(); + run_drift_case<1u, T_TILE>(); + run_drift_case<2u, T_TILE>(); + run_drift_case<3u, T_TILE>(); + run_drift_case<4u, T_TILE>(); + run_drift_case<5u, T_TILE>(); + run_drift_case<6u, T_TILE>(); + run_drift_case<7u, T_TILE>(); + run_drift_case<8u, T_TILE>(); + run_drift_case<9u, T_TILE>(); + run_drift_case<10u, T_TILE>(); + run_drift_case<11u, T_TILE>(); } } // namespace From 4dfaba918443fb6dc374f2d6c661c5836e5742dc Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Fri, 19 Jun 2026 22:36:26 -0400 Subject: [PATCH 12/33] replaced compile-time sort intervale with compile-time drift halo size --- CMakeLists.txt | 40 ++++++++++++++----------- cmake/defaults.cmake | 6 ++-- cmake/report.cmake | 24 +++++++-------- src/engines/srpic/currents.h | 2 +- src/framework/parameters/parameters.cpp | 7 ----- src/framework/parameters/particles.cpp | 9 ------ src/kernels/currents_deposit.hpp | 39 ++++++++++++------------ 7 files changed, 56 insertions(+), 71 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7bea74f0..f8409209b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,10 +67,10 @@ set(team_policy_tile_size set(team_policy_tile_sizes 
"4;6;8;10;12;14;16" CACHE STRING "team_policy tile-size choices") -set(team_policy_sort - ${default_team_policy_sort} +set(team_policy_drift + ${default_team_policy_drift} CACHE STRING - "team_policy hardwired spatial sorting interval; >0 overrides the runtime spatial_sorting_interval and sizes the tiled deposit scratch halo (0 = use runtime)") + "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.") # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) @@ -161,14 +161,13 @@ if(${team_policy}) add_compile_options("-D TEAM_POLICY") add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}") - # Optional compile-time hardwired sort interval. When > 0, it (a) overrides - # the runtime spatial_sorting_interval (see framework/parameters) and (b) - # sizes the tiled deposit scratch halo to DRIFT = interval, so a particle - # drifting over a full sort interval still deposits inside its tile scratch. - if(team_policy_sort GREATER 0) - add_compile_options( - "-D TEAM_POLICY_SORT_INTERVAL=${team_policy_sort}") - endif() + # Compile-time tiled-deposit scratch halo drift. Sizes the halo so a + # particle that drifts up to DRIFT cells between two sorts still deposits + # inside its tile scratch; particles drifting further take the + # per-particle global-J escape valve (correct, only slower). This is + # independent of the sort cadence, which is set at runtime via + # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case). + add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}") # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically # when found; falls back to Kokkos::BinSort otherwise. 
@@ -196,18 +195,23 @@ if(${team_policy}) endif() if("${Kokkos_DEVICES}" MATCHES "HIP") - # rocThrust ships with ROCm and exposes the same thrust:: API. Using - # it lets the HIP backend build a single permutation via - # sort_by_key and gather all SoA members through one reused buffer, - # instead of the legacy per-member Kokkos::BinSort path which - # allocates a fresh `sorted_values` buffer for every member every - # step (the dominant source of allocator churn / fragmentation on - # ROCm). + # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's + # bounded-bit radix sort directly (rocprim is rocThrust's own + # dependency, so its headers come in transitively; we find it + # explicitly to keep the include path robust). This builds a single + # permutation that gathers all SoA members, instead of the legacy + # per-member Kokkos::BinSort path which allocates a fresh + # `sorted_values` buffer for every member every step (the dominant + # source of allocator churn / fragmentation on ROCm). 
find_package(rocthrust QUIET) if(rocthrust_FOUND) message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") add_compile_options("-D ROCTHRUST_ENABLED") set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) + find_package(rocprim QUIET) + if(rocprim_FOUND) + set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim) + endif() else() message(STATUS "team_policy: rocThrust not found; using BinSort " "fallback for HIP sort_by_key") diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 619f48dc1..01427921f 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -108,7 +108,7 @@ set(default_team_policy_tile_size 8 CACHE INTERNAL "Default tile edge length in cells for team_policy") -set(default_team_policy_sort - 0 +set(default_team_policy_drift + 1 CACHE INTERNAL - "Default hardwired spatial sorting interval for team_policy (0 = runtime)") + "Default tiled-deposit scratch halo drift for team_policy (cells between sorts)") diff --git a/cmake/report.cmake b/cmake/report.cmake index e6036d366..d90dfb085 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -141,17 +141,15 @@ if(${team_policy}) "${Blue}" TEAM_POLICY_TILE_SIZE_REPORT 46) - if(team_policy_sort GREATER 0) - printchoices( - "Team Sort Interval" - "team_policy_sort" - "${team_policy_sort}" - ${team_policy_sort} - 0 - "${Blue}" - TEAM_POLICY_SORT_INTERVAL_REPORT - 46) - endif() + printchoices( + "Team Deposit Drift" + "team_policy_drift" + "${team_policy_drift}" + ${team_policy_drift} + 1 + "${Blue}" + TEAM_POLICY_DRIFT_REPORT + 46) endif() printchoices( "Debug mode" @@ -231,9 +229,7 @@ endif() string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") if(${team_policy}) string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n") - if(team_policy_sort GREATER 0) - string(APPEND REPORT_TEXT " " ${TEAM_POLICY_SORT_INTERVAL_REPORT} "\n") - endif() + string(APPEND REPORT_TEXT " " ${TEAM_POLICY_DRIFT_REPORT} "\n") endif() string( diff --git a/src/engines/srpic/currents.h 
b/src/engines/srpic/currents.h index eadda70b9..b8cfe63ab 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -128,7 +128,7 @@ namespace ntt { // partition per-particle: // - a particle whose full stencil has drifted out of its tile is // deposited straight to the global J view (the per-particle escape - // valve); `team_policy_sort_interval` sizes the scratch halo so the + // valve); `team_policy_drift` sizes the scratch halo so the // common in-tile case stays in fast SLM (see currents_deposit.hpp); // - particles dead-tagged in place since the sort are clamped out by // the kernel and skipped by the dead-tag test; diff --git a/src/framework/parameters/parameters.cpp b/src/framework/parameters/parameters.cpp index 4116783a1..7372da510 100644 --- a/src/framework/parameters/parameters.cpp +++ b/src/framework/parameters/parameters.cpp @@ -67,18 +67,11 @@ namespace ntt { "clear_interval", defaults::clear_interval); set("particles.clear_interval", global_clearing_interval); -#if defined(TEAM_POLICY_SORT_INTERVAL) - // See particles.cpp: the compile-time team_policy_sort_interval overrides - // the runtime value (kept consistent here for the stored global param). - const auto global_spatial_sorting_interval = static_cast( - TEAM_POLICY_SORT_INTERVAL); -#else const auto global_spatial_sorting_interval = toml::find_or( toml_data, "particles", "spatial_sorting_interval", 0u); -#endif set("particles.spatial_sorting_interval", global_spatial_sorting_interval); set("scales.n0", ppc0 / get("scales.V0")); diff --git a/src/framework/parameters/particles.cpp b/src/framework/parameters/particles.cpp index 46192035b..6f1a23c03 100644 --- a/src/framework/parameters/particles.cpp +++ b/src/framework/parameters/particles.cpp @@ -121,19 +121,10 @@ namespace ntt { sp, "clear_interval", global_clearing_interval); -#if defined(TEAM_POLICY_SORT_INTERVAL) - // Compile-time hardwired sort interval (the `team_policy_sort_interval` - // CMake knob). 
It overrides whatever the input file requested so the - // tiled deposit's scratch halo — sized for exactly this cadence — is - // guaranteed to contain every particle's drift between sorts. - const auto spatial_sorting_interval = static_cast( - TEAM_POLICY_SORT_INTERVAL); -#else const auto spatial_sorting_interval = toml::find_or( sp, "spatial_sorting_interval", global_spatial_sorting_interval); -#endif auto pusher_str = toml::find_or(sp, "pusher", std::string(def_pusher)); const auto npayloads_real = toml::find_or(sp, "n_payloads_real", diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 8840a1046..b92b02423 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -786,11 +786,12 @@ namespace kernel { * per step elapsed since the last sort. The scratch HALO is * `STENCIL_REACH(O) + DRIFT`, where `STENCIL_REACH = 2` for zigzag * (writes `{i_prev, i_prev+1, i, i+1}` ⇒ +2 above `min(i, i_prev)` with - * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the - * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL) - * when set — the hardwired sort interval, hence the maximum drift any - * particle accrues between sorts — and `1` otherwise (the - * every-step-sorted common case). + * `|Δi|=1`) and `O` for Esirkepov. `DRIFT` is the `team_policy_drift` + * CMake knob (macro TEAM_POLICY_DRIFT) — the number of cells a particle + * may drift between two sorts that the halo is sized to absorb — and `1` + * by default (the every-step-sorted common case). It is independent of + * the sort cadence, which is set at runtime via `spatial_sorting_interval`; + * particles that drift past the halo take the escape valve below. * * Correctness does **not** depend on the halo size. Any particle whose * full stencil escapes the scratch tile — because it drifted further @@ -801,9 +802,9 @@ namespace kernel { * valve). 
Each particle's stencil is therefore deposited exactly once * (entirely to SLM scratch when it fits, entirely to global J when it * does not), so the path is charge-conserving; it is merely slower per - * write. Sizing `DRIFT` to the sort interval keeps the common, - * within-interval drift in fast SLM; sorting less often only costs - * escape-valve traffic, never accuracy. + * write. Sizing `DRIFT` to the typical between-sort drift keeps the + * common case in fast SLM; sorting less often (or drifting past the + * halo) only costs escape-valve traffic, never accuracy. * * **Partition coverage.** The team iteration covers only the particles * partitioned at the last sort, `[0, layout.npart_partitioned)`, clamped @@ -836,15 +837,15 @@ namespace kernel { * * drift — sort runs at end-of-step (see srpic.hpp), so a particle is * pushed once per step between its last sort and a given deposit. With - * a sort interval of `K`, a particle therefore drifts at most `K` cells - * (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) before the next sort. The - * `team_policy_sort_interval` CMake knob (macro TEAM_POLICY_SORT_INTERVAL) - * pins that interval at compile time and feeds it here as DRIFT, sizing - * the halo so a fully-interval-drifted particle still deposits inside - * its tile scratch. When the knob is unset, DRIFT defaults to 1 (the - * sorted-every-step common case); any particle that drifts past the halo - * (e.g. a larger runtime interval, or a CFL excursion) takes the - * per-particle global-J escape valve below — correct, only slower (see + * a runtime sort interval of `K` (spatial_sorting_interval), a particle + * drifts at most `K` cells (CFL |v dt/dx| <= 1/2 ⇒ |Δi| <= 1 per step) + * before the next sort. The `team_policy_drift` CMake knob (macro + * TEAM_POLICY_DRIFT) sets DRIFT independently of `K`, sizing the halo so + * a particle that drifts up to DRIFT cells still deposits inside its + * tile scratch. 
DRIFT defaults to 1 (the sorted-every-step common case); + * any particle that drifts past the halo (e.g. a larger sort interval, + * or a CFL excursion) takes the per-particle global-J escape valve + * below — correct, only slower (see * the class doc-comment for why this is charge-conserving). */ static constexpr int STENCIL_REACH = (O == 0u) ? 2 : static_cast(O); @@ -854,8 +855,8 @@ namespace kernel { // coords conservatively bounds every deposited cell for any order // (Esirkepov reaches max+O; O=0 zigzag reaches max+1). static constexpr int FOOTPRINT_REACH = (O == 0u) ? 1 : static_cast(O); -#if defined(TEAM_POLICY_SORT_INTERVAL) - static constexpr int DRIFT = static_cast(TEAM_POLICY_SORT_INTERVAL); +#if defined(TEAM_POLICY_DRIFT) + static constexpr int DRIFT = static_cast(TEAM_POLICY_DRIFT); #else static constexpr int DRIFT = 1; #endif From cd1be6b383839fb72a31e078c3da5a59f0246d71 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Fri, 19 Jun 2026 22:37:08 -0400 Subject: [PATCH 13/33] AMD-specific sorting improvements --- src/framework/containers/particles.h | 12 ++ src/framework/containers/particles_sort.cpp | 128 +++++++------ src/global/utils/sort_dispatch.h | 154 ++++++++++++--- tests/framework/particles_sort.cpp | 196 ++++++++++++++------ tests/framework/sort_by_key.cpp | 3 + 5 files changed, 353 insertions(+), 140 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 0877efd15..511a07a62 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -99,6 +99,18 @@ namespace ntt { // vendor libraries detected by CMake. TileLayout m_tile_layout {}; +#if defined(TEAM_POLICY) + // Build m_tile_layout.tile_offsets / npart_partitioned from the + // already-sorted tile-index keys. A separate member function (not a + // lambda local to SortSpatially) so the inner device kernel is not an + // extended __device__ lambda nested inside another lambda — which + // nvcc forbids. 
Lets the vendor path run the offsets pass and then + // release the keys before the SoA gather allocates its buffers. + void compute_tile_offsets(const array_t& tile_indices, + ncells_t total_tiles, + npart_t npart_local); +#endif + public: // for empty allocation Particles() {} diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index a52b013f7..3db6c1438 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -205,6 +205,61 @@ namespace ntt { m_is_sorted = true; } +#if defined(TEAM_POLICY) + template + void Particles::compute_tile_offsets( + const array_t& tile_indices, + ncells_t total_tiles, + npart_t npart_local) { + // Compute the per-tile prefix-sum `tile_offsets` for the tiled + // pusher from the (already sorted) `tile_indices` — monotonically + // non-decreasing for alive particles, with the dead sentinel + // `total_tiles + 1` clustered at the end. Transition-detect directly + // on it: the start of each non-empty tile is the only place a write + // happens — atomic-free in the dense branch. Empty tiles (no + // particles) are filled by a reverse pass on a small host mirror + // (`total_tiles ≈ 176K` at production scale → ~700 KB). + array_t tile_offsets { "tile_offsets", total_tiles + 1u }; + Kokkos::deep_copy(tile_offsets, static_cast(npart_local)); + + const auto total_tiles_v = total_tiles; + auto ti_v = tile_indices; + Kokkos::parallel_for( + "DetectTileBoundaries", + CreateParticleRangePolicy({ 0u }, { npart_local }), + Lambda(prtlidx_t p) { + const auto t_curr = ti_v(p); + const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr); + if (!boundary) { + return; + } + if (t_curr < total_tiles_v) { + tile_offsets(t_curr) = p; + } else { + // First dead particle — also marks the alive_count boundary + // stored at index total_tiles. 
+ Kokkos::atomic_min(&tile_offsets(total_tiles_v), p); + } + }); + + auto h_offsets = Kokkos::create_mirror_view(tile_offsets); + Kokkos::deep_copy(h_offsets, tile_offsets); + for (auto t = static_cast(total_tiles); t-- > 0u;) { + if (h_offsets(t) > h_offsets(t + 1u)) { + h_offsets(t) = h_offsets(t + 1u); + } + } + Kokkos::deep_copy(tile_offsets, h_offsets); + + m_tile_layout.tile_offsets = tile_offsets; + // tile_offsets(total_tiles) is the alive-particle count at sort time: + // the tiles partition exactly [0, npart_partitioned). The deposit + // launcher compares this against the live npart() to detect (and + // separately deposit) particles appended since this sort. + m_tile_layout.npart_partitioned = h_offsets(total_tiles); + } +#endif // TEAM_POLICY + template void Particles::SortSpatially(const Grid& grid) { #if defined(TEAM_POLICY) @@ -262,6 +317,7 @@ namespace ntt { // the dead-particle sentinel bin (total_tiles + 1u). const ncells_t n_bins = total_tiles + 2u; const auto slice = prtl_slice_t(0, npart_local); + #if defined(TEAM_POLICY_USE_VENDOR_SORT) // Vendor path: produce an explicit permutation via sort_by_key, // then apply it to each SoA member by gathering into a fresh @@ -286,6 +342,12 @@ namespace ntt { n_bins, sort::backend::Thrust {}); #endif + // `tile_indices` is sorted in place by sort_by_key. Build the tile + // offsets from it now, then drop it before the gather allocates its + // `maxnpart`-sized buffers — so the keys are not co-resident with + // them at the gather's peak (#2). + compute_tile_offsets(tile_indices, total_tiles, npart_local); + tile_indices = array_t {}; Kokkos::fence("SortSpatially: pre-gather drain"); apply_permutation_to_soa(perm); #else @@ -330,67 +392,17 @@ namespace ntt { sorter.sort(Kokkos::subview(pld_i, slice, pldi)); } // Apply the same permutation to `tile_indices` itself so it ends - // monotonically non-decreasing for the offsets pass below. 
+ // monotonically non-decreasing for the offsets pass, then build the + // tile offsets from it (the in-place BinSort path has no separate + // gather to hoist this ahead of). sorter.sort(tile_indices); + compute_tile_offsets(tile_indices, total_tiles, npart_local); #endif // TEAM_POLICY_USE_VENDOR_SORT - // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled - // pusher. `tile_indices` is now sorted (monotonically - // non-decreasing for alive particles, dead sentinel - // `total_tiles + 1` clustered at the end) — vendor sort_by_key - // sorts keys in place; the BinSort path explicitly applies the - // same permutation to `tile_indices` above. Transition-detect - // directly on it: the start of each non-empty tile is the only - // place a write happens — atomic-free in the dense branch. - // Empty tiles (no particles) are filled by a reverse pass on a - // small host mirror (`total_tiles ≈ 176K` at production scale → - // ~700 KB). - { - array_t tile_offsets { "tile_offsets", total_tiles + 1u }; - Kokkos::deep_copy(tile_offsets, static_cast(npart_local)); - - const auto total_tiles_v = total_tiles; - auto ti_v = tile_indices; - Kokkos::parallel_for( - "DetectTileBoundaries", - rangeActiveParticles(), - Lambda(prtlidx_t p) { - const auto t_curr = ti_v(p); - const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr); - if (!boundary) { - return; - } - if (t_curr < total_tiles_v) { - tile_offsets(t_curr) = p; - } else { - // First dead particle — also marks the alive_count boundary - // stored at index total_tiles. 
- Kokkos::atomic_min(&tile_offsets(total_tiles_v), p); - } - }); - - auto h_offsets = Kokkos::create_mirror_view(tile_offsets); - Kokkos::deep_copy(h_offsets, tile_offsets); - for (auto t = static_cast(total_tiles); t-- > 0u;) { - if (h_offsets(t) > h_offsets(t + 1u)) { - h_offsets(t) = h_offsets(t + 1u); - } - } - Kokkos::deep_copy(tile_offsets, h_offsets); - - m_tile_layout.tile_offsets = tile_offsets; - // tile_offsets(total_tiles) is the alive-particle count at sort time: - // the tiles partition exactly [0, npart_partitioned). The deposit - // launcher compares this against the live npart() to detect (and - // separately deposit) particles appended since this sort. - m_tile_layout.npart_partitioned = h_offsets(total_tiles); - } - - // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used - // in the current design — the SoA arrays are physically permuted - // into tile order, so consumers iterate - // `[tile_offsets(t), tile_offsets(t+1))` directly without a - // separate permutation indirection. + // Populate `m_tile_layout` size/shape. `tile_perm` is not used in the + // current design — the SoA arrays are physically permuted into tile + // order, so consumers iterate `[tile_offsets(t), tile_offsets(t+1))` + // directly without a separate permutation indirection. m_tile_layout.ntiles_per_axis[0] = ntx[0]; m_tile_layout.ntiles_per_axis[1] = ntx[1]; m_tile_layout.ntiles_per_axis[2] = ntx[2]; diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h index 506a90742..cfc389464 100644 --- a/src/global/utils/sort_dispatch.h +++ b/src/global/utils/sort_dispatch.h @@ -14,8 +14,13 @@ * keys[perm[0]] <= keys[perm[1]] <= ... in stable order. * Always-available overloads: BinSort (uses Kokkos::BinSort) and * StdSort (host-side std::stable_sort fallback). The vendor-library - * overloads (OneDPL on SYCL, Thrust on CUDA) are conditional on the - * respective build flags. 
+ * overloads (OneDPL on SYCL, cub radix sort on CUDA, rocprim radix + * sort on HIP) are conditional on the respective build flags. + * @note The CUDA/HIP overloads bound the radix sort to the significant + * key bits (`significant_bits(n_bins)`) instead of the full 32, so + * only ceil(log2(n_bins)) bits are sorted — fewer radix passes than + * a full-width `thrust::sort_by_key`. Scratch is transient (freed at + * scope exit); no persistent buffer is retained (cf. 787aa045). */ #ifndef GLOBAL_UTILS_SORT_DISPATCH_H @@ -38,22 +43,32 @@ #include #endif #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) - #include - #include - #include + #include #endif #if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) - #include - #include - #include - #include + #include #endif #include +#include #include namespace ntt::sort_helpers { + // Number of low-order bits needed to represent keys in [0, n_bins). + // The radix sort only needs to scan these bits — bounding `end_bit` to + // ceil(log2(n_bins)) instead of 32 cuts the number of passes (e.g. 18 + // bits when total_tiles ~ 176K). Returns at least 1. + inline unsigned int significant_bits(ncells_t n_bins) { + unsigned int bits = 0u; + while (bits < 32u && + (static_cast(1u) << bits) < + static_cast(n_bins)) { + ++bits; + } + return (bits == 0u) ? 1u : bits; + } + // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an // upper bound on distinct key values. 
inline void sort_by_key_dispatch(const array_t& keys, @@ -109,39 +124,126 @@ namespace ntt::sort_helpers { #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) inline void sort_by_key_dispatch(const array_t& keys, prtl_perm_t& perm, - ncells_t /*n_bins*/, + ncells_t n_bins, ::sort::backend::Thrust) { const auto n = static_cast(keys.extent(0)); if (n == 0u) { return; } - Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort"); - thrust::device_ptr kp(keys.data()); - thrust::device_ptr pp(perm.data()); - thrust::sequence(pp, pp + n); - thrust::sort_by_key(kp, kp + n, pp); - Kokkos::fence("sort_by_key_dispatch Thrust: post-sort"); + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + + // Out-of-place radix sort bounded to the significant key bits. The + // _out buffers and temp storage are transient (freed at scope exit). + array_t keys_out("tile_keys_sorted", n); + prtl_perm_t perm_out("tile_perm_sorted", n); + const int end_bit = static_cast(significant_bits(n_bins)); + + exec.fence("sort_by_key_dispatch Thrust: pre-sort"); + auto stream = exec.cuda_stream(); + + std::size_t temp_bytes = 0; + auto err = cub::DeviceRadixSort::SortPairs(nullptr, + temp_bytes, + keys.data(), + keys_out.data(), + perm.data(), + perm_out.data(), + n, + 0, + end_bit, + stream); + raise::ErrorIf(err != cudaSuccess, + "cub::DeviceRadixSort::SortPairs (size query) failed", + HERE); + array_t temp("cub_radix_temp", + (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes); + err = cub::DeviceRadixSort::SortPairs(temp.data(), + temp_bytes, + keys.data(), + keys_out.data(), + perm.data(), + perm_out.data(), + n, + 0, + end_bit, + stream); + raise::ErrorIf(err != cudaSuccess, + "cub::DeviceRadixSort::SortPairs failed", + HERE); + exec.fence("sort_by_key_dispatch Thrust: post-sort"); + + // Publish sorted keys back in place; swap the sorted permutation in. 
+ auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + perm = perm_out; } #endif #if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) - // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc - // device_ptr-based algorithms dispatch to the HIP backend. Mirrors - // the CUDA Thrust overload. + // HIP analogue of the CUDA cub overload, using rocprim's radix sort + // (which ships with rocThrust). Same bounded-bit, out-of-place, + // transient-scratch scheme. inline void sort_by_key_dispatch(const array_t& keys, prtl_perm_t& perm, - ncells_t /*n_bins*/, + ncells_t n_bins, ::sort::backend::Rocthrust) { const auto n = static_cast(keys.extent(0)); if (n == 0u) { return; } - Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort"); - thrust::device_ptr kp(keys.data()); - thrust::device_ptr pp(perm.data()); - thrust::sequence(pp, pp + n); - thrust::sort_by_key(kp, kp + n, pp); - Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort"); + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + + array_t keys_out("tile_keys_sorted", n); + prtl_perm_t perm_out("tile_perm_sorted", n); + const unsigned int end_bit = significant_bits(n_bins); + + exec.fence("sort_by_key_dispatch Rocthrust: pre-sort"); + auto stream = exec.hip_stream(); + + std::size_t temp_bytes = 0; + auto err = rocprim::radix_sort_pairs(nullptr, + temp_bytes, + keys.data(), + keys_out.data(), + perm.data(), + perm_out.data(), + static_cast(n), + 0u, + end_bit, + stream); + raise::ErrorIf(err != hipSuccess, + "rocprim::radix_sort_pairs (size query) failed", + HERE); + array_t temp("rocprim_radix_temp", + (temp_bytes == 0u) ? 
std::size_t { 1 } : temp_bytes); + err = rocprim::radix_sort_pairs(temp.data(), + temp_bytes, + keys.data(), + keys_out.data(), + perm.data(), + perm_out.data(), + static_cast(n), + 0u, + end_bit, + stream); + raise::ErrorIf(err != hipSuccess, + "rocprim::radix_sort_pairs failed", + HERE); + exec.fence("sort_by_key_dispatch Rocthrust: post-sort"); + + auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + perm = perm_out; } #endif diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp index 6945c962f..4e4881d98 100644 --- a/tests/framework/particles_sort.cpp +++ b/tests/framework/particles_sort.cpp @@ -31,12 +31,14 @@ auto main(int argc, char* argv[]) -> int { ntt::EmissionType::NONE, 2u, 1u); - auto& i1_p = prtls.i1; - auto& i2_p = prtls.i2; - auto& tag_p = prtls.tag; - auto& weight_p = prtls.weight; - auto& pld_r = prtls.pld_r; - auto& pld_i = prtls.pld_i; + auto& i1_p = prtls.i1; + auto& i2_p = prtls.i2; + auto& i1_prev_p = prtls.i1_prev; + auto& i2_prev_p = prtls.i2_prev; + auto& tag_p = prtls.tag; + auto& weight_p = prtls.weight; + auto& pld_r = prtls.pld_r; + auto& pld_i = prtls.pld_i; Kokkos::parallel_for( "InitParticles", prtls.maxnpart(), @@ -61,9 +63,14 @@ auto main(int argc, char* argv[]) -> int { i2_p(p) = 23u; weight_p(p) = 3.0; } - pld_r(p, 0) = weight_p(p) + static_cast(0.5); - pld_r(p, 1) = weight_p(p) + static_cast(10.5); - pld_i(p, 0) = static_cast(weight_p(p) + 10.0); + // team_policy keys on min(i, i_prev); without a meaningful + // i_prev every key would collapse to 0. Set i_prev = i so the + // tile key reduces to the particle's current cell. 
+ i1_prev_p(p) = i1_p(p); + i2_prev_p(p) = i2_p(p); + pld_r(p, 0) = weight_p(p) + static_cast(0.5); + pld_r(p, 1) = weight_p(p) + static_cast(10.5); + pld_i(p, 0) = static_cast(weight_p(p) + 10.0); } else { tag_p(p) = ntt::ParticleTag::dead; } @@ -75,43 +82,78 @@ auto main(int argc, char* argv[]) -> int { prtls.SortSpatially(grid); + auto i1_h = Kokkos::create_mirror_view(prtls.i1); + auto i2_h = Kokkos::create_mirror_view(prtls.i2); + auto tag_h = Kokkos::create_mirror_view(prtls.tag); auto weight_h = Kokkos::create_mirror_view(prtls.weight); auto pld_r_h = Kokkos::create_mirror_view(prtls.pld_r); auto pld_i_h = Kokkos::create_mirror_view(prtls.pld_i); + Kokkos::deep_copy(i1_h, prtls.i1); + Kokkos::deep_copy(i2_h, prtls.i2); + Kokkos::deep_copy(tag_h, prtls.tag); Kokkos::deep_copy(weight_h, prtls.weight); Kokkos::deep_copy(pld_r_h, prtls.pld_r); Kokkos::deep_copy(pld_i_h, prtls.pld_i); - // Only [0, npart) is defined after a sort. The swap-based gather in - // apply_permutation_to_soa replaces each SoA View, zero-filling the - // spare capacity [npart, maxnpart) (a don't-care region overwritten - // by injection), so the old "tail preserved" check no longer holds. + // Tile geometry, mirroring sort::PositionToTileIndex. T = 1 (no + // team_policy) reproduces the legacy per-cell ordering. +#if defined(TEAM_POLICY) + const ncells_t T = static_cast(TEAM_POLICY_TILE_SIZE); +#else + const ncells_t T = 1u; +#endif + const auto na = grid.n_active(); + const ncells_t ntx2 = (na[1] + T - 1u) / T; + const auto tile_of = [&](int a, int b) -> ncells_t { + return (static_cast(a) / T) * ntx2 + + (static_cast(b) / T); + }; + + // SortSpatially is order-by-tile, not order-by-cell: assert the + // invariants that hold for any tile size rather than a hardwired + // permutation. 
(1) alive particles form a prefix sorted by + // non-decreasing tile index; (2) dead particles (weight == -1) form + // the suffix; (3) every SoA member is permuted by the *same* + // permutation, so each alive slot still satisfies pld == f(weight); + // (4) no alive particle is lost. Only [0, npart) is defined after a + // sort — the swap-gather zero-fills [npart, maxnpart). + bool seen_dead = false; + bool have_prev = false; + ncells_t prev_tile = 0u; + npart_t n_alive_obs = 0u; for (auto p { 0u }; p < 66u; ++p) { - if (p < 16u) { - raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE); - } else if (p < 33u) { - raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE); - } else if (p < 46u) { - raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE); - } else if (p < 59u) { - raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE); - } else { - raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE); - } - if (p < 59u) { - raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast(0.5), - "error in sorting particle real payload 0", - HERE); - raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast(10.5), - "error in sorting particle real payload 1", + if (tag_h(p) != ntt::ParticleTag::alive) { + seen_dead = true; + raise::ErrorIf(weight_h(p) != -1.0, + "dead particle has unexpected weight", HERE); - raise::ErrorIf( - pld_i_h(p, 0) != - static_cast(weight_h(p) + static_cast(10.0)), - "error in sorting particle integer payload 0", - HERE); + continue; } + raise::ErrorIf(seen_dead, + "alive particle after a dead one (not sorted to prefix)", + HERE); + const auto tile = tile_of(i1_h(p), i2_h(p)); + raise::ErrorIf(have_prev && (tile < prev_tile), + "alive particles not sorted by tile index", + HERE); + prev_tile = tile; + have_prev = true; + ++n_alive_obs; + raise::ErrorIf(pld_r_h(p, 0) != weight_h(p) + static_cast(0.5), + "error in sorting particle real payload 0", + HERE); + 
raise::ErrorIf(pld_r_h(p, 1) != weight_h(p) + static_cast(10.5), + "error in sorting particle real payload 1", + HERE); + raise::ErrorIf( + pld_i_h(p, 0) != + static_cast(weight_h(p) + static_cast(10.0)), + "error in sorting particle integer payload 0", + HERE); } + raise::ErrorIf(n_alive_obs != 59u, + "wrong number of alive particles after sort", + HERE); } { // 3D @@ -133,11 +175,14 @@ auto main(int argc, char* argv[]) -> int { ntt::EmissionType::NONE, 0u, 0u); - auto& i1_p = prtls.i1; - auto& i2_p = prtls.i2; - auto& i3_p = prtls.i3; - auto& tag_p = prtls.tag; - auto& weight_p = prtls.weight; + auto& i1_p = prtls.i1; + auto& i2_p = prtls.i2; + auto& i3_p = prtls.i3; + auto& i1_prev_p = prtls.i1_prev; + auto& i2_prev_p = prtls.i2_prev; + auto& i3_prev_p = prtls.i3_prev; + auto& tag_p = prtls.tag; + auto& weight_p = prtls.weight; Kokkos::parallel_for( "InitParticles", prtls.maxnpart(), @@ -171,6 +216,11 @@ auto main(int argc, char* argv[]) -> int { i3_p(p) = 7u; weight_p(p) = 4.0; } + // see 2D block: i_prev = i so the team_policy tile key reduces + // to the particle's current cell. + i1_prev_p(p) = i1_p(p); + i2_prev_p(p) = i2_p(p); + i3_prev_p(p) = i3_p(p); } else { tag_p(p) = ntt::ParticleTag::dead; } @@ -182,27 +232,61 @@ auto main(int argc, char* argv[]) -> int { prtls.SortSpatially(grid); + auto i1_h = Kokkos::create_mirror_view(prtls.i1); + auto i2_h = Kokkos::create_mirror_view(prtls.i2); + auto i3_h = Kokkos::create_mirror_view(prtls.i3); + auto tag_h = Kokkos::create_mirror_view(prtls.tag); auto weight_h = Kokkos::create_mirror_view(prtls.weight); + Kokkos::deep_copy(i1_h, prtls.i1); + Kokkos::deep_copy(i2_h, prtls.i2); + Kokkos::deep_copy(i3_h, prtls.i3); + Kokkos::deep_copy(tag_h, prtls.tag); Kokkos::deep_copy(weight_h, prtls.weight); - // Only [0, npart) is defined after a sort. 
The swap-based gather in - // apply_permutation_to_soa replaces each SoA View, zero-filling the - // spare capacity [npart, maxnpart) (a don't-care region overwritten - // by injection), so the old "tail preserved" check no longer holds. + + // Same invariants as the 2D block (no payloads here): alive prefix + // sorted by non-decreasing tile index, dead (weight == -1) suffix, + // alive count preserved. T = 1 reproduces the legacy per-cell order. +#if defined(TEAM_POLICY) + const ncells_t T = static_cast(TEAM_POLICY_TILE_SIZE); +#else + const ncells_t T = 1u; +#endif + const auto na = grid.n_active(); + const ncells_t ntx2 = (na[1] + T - 1u) / T; + const ncells_t ntx3 = (na[2] + T - 1u) / T; + const auto tile_of = [&](int a, int b, int c) -> ncells_t { + return ((static_cast(a) / T) * ntx2 + + (static_cast(b) / T)) * + ntx3 + + (static_cast(c) / T); + }; + + bool seen_dead = false; + bool have_prev = false; + ncells_t prev_tile = 0u; + npart_t n_alive_obs = 0u; for (auto p { 0u }; p < 66u; ++p) { - if (p < 13u) { - raise::ErrorIf(weight_h(p) != 4.0, "error in sorting particles", HERE); - } else if (p < 26u) { - raise::ErrorIf(weight_h(p) != 1.0, "error in sorting particles", HERE); - } else if (p < 39u) { - raise::ErrorIf(weight_h(p) != 2.0, "error in sorting particles", HERE); - } else if (p < 46u) { - raise::ErrorIf(weight_h(p) != 0.0, "error in sorting particles", HERE); - } else if (p < 59u) { - raise::ErrorIf(weight_h(p) != 3.0, "error in sorting particles", HERE); - } else { - raise::ErrorIf(weight_h(p) != -1.0, "error in sorting particles", HERE); + if (tag_h(p) != ntt::ParticleTag::alive) { + seen_dead = true; + raise::ErrorIf(weight_h(p) != -1.0, + "dead particle has unexpected weight", + HERE); + continue; } + raise::ErrorIf(seen_dead, + "alive particle after a dead one (not sorted to prefix)", + HERE); + const auto tile = tile_of(i1_h(p), i2_h(p), i3_h(p)); + raise::ErrorIf(have_prev && (tile < prev_tile), + "alive particles not sorted by tile index", + 
HERE); + prev_tile = tile; + have_prev = true; + ++n_alive_obs; } + raise::ErrorIf(n_alive_obs != 59u, + "wrong number of alive particles after sort", + HERE); } } catch (const std::exception& e) { diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp index 94dc51d0e..9ccecc732 100644 --- a/tests/framework/sort_by_key.cpp +++ b/tests/framework/sort_by_key.cpp @@ -99,6 +99,9 @@ auto main(int argc, char* argv[]) -> int { #endif #if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) test_one_backend("Thrust", ::sort::backend::Thrust {}); +#endif +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + test_one_backend("Rocthrust", ::sort::backend::Rocthrust {}); #endif } catch (std::exception& e) { std::cerr << e.what() << std::endl; From ed70dbd24512efb3c483f9773f67fb865eb44f97 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 12:48:28 -0400 Subject: [PATCH 14/33] reduced exchange for filters --- src/engines/srpic/currents.h | 116 +++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index b8cfe63ab..30eb9466d 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -27,6 +27,8 @@ #include "kernels/currents_deposit.hpp" #include "kernels/digital_filter.hpp" +#include + namespace ntt { namespace srpic { @@ -202,9 +204,11 @@ namespace ntt { Domain& domain, const SimulationParams& params) { logger::Checkpoint("Launching currents filtering kernels", HERE); - auto range = srpic::RangeWithAxisBCs(domain); const auto nfilter = params.template get( "algorithms.current_filters"); + if (nfilter == 0u) { + return; + } tuple_t size; if constexpr (M::Dim == Dim::_1D || M::Dim == Dim::_2D || M::Dim == Dim::_3D) { size[0] = domain.mesh.n_active(in::x1); @@ -215,17 +219,105 @@ namespace ntt { if constexpr (M::Dim == Dim::_3D) { size[2] = domain.mesh.n_active(in::x3); } - // !TODO: this needs to be done more 
efficiently - for (auto i { 0u }; i < nfilter; ++i) { - Kokkos::deep_copy(domain.fields.buff, domain.fields.cur); - Kokkos::parallel_for("CurrentsFilter", - range, - kernel::DigitalFilter_kernel( - domain.fields.cur, - domain.fields.buff, - size, - domain.mesh.flds_bc())); - metadomain.CommunicateFields(domain, Comm::J); + + // The filter ping-pongs `cur` <-> scratch `buff`: one up-front copy + // seeds `buff` with valid ghost cells (the kernel writes only the + // cells in its launch range), then each pass filters the input into + // the other buffer and swaps the View handles so `cur` again names the + // result. `buff` is pure scratch (also reused by CommunicateFields), so + // the permanent handle swap is transparent and `cur` always names the + // result. The single seeding copy preserves physical-boundary ghosts + // (conductor/match/atmosphere/...), which neither the kernel nor the + // MPI exchange refresh. + Kokkos::deep_copy(domain.fields.buff, domain.fields.cur); + + const auto flds_bc = domain.mesh.flds_bc(); + + if constexpr (M::CoordType == Coord::Cartesian) { + // Reduced-exchange ghost-margin scheme. One halo exchange refreshes + // N_GHOSTS ghost layers — enough for N_GHOSTS passes of the 3-point + // binomial if each pass also recomputes the inner ghost layers it + // will need next. We therefore extend the launch range by a shrinking + // margin `m` into the ghost zone, but only on comm-refreshed sides + // (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell is + // interior physics. Physical-boundary ghosts are never written or + // refreshed, exactly as in the per-pass loop, so the result is + // identical for every BC — while doing one exchange per N_GHOSTS + // passes instead of one per pass. (Entering the loop the ghosts are + // valid to distance N_GHOSTS: srpic.hpp runs CommunicateFields(J) + // immediately before CurrentsFilter.) 
+ const int G = static_cast(N_GHOSTS); + const auto comm_side = [](FldsBC b) { + return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC); + }; + bool ext_lo[3] = { false, false, false }; + bool ext_hi[3] = { false, false, false }; + for (auto d { 0 }; d < static_cast(M::Dim); ++d) { + ext_lo[d] = comm_side(flds_bc[d].first); + ext_hi[d] = comm_side(flds_bc[d].second); + } + const auto make_range = [&](int m) -> range_t { + const auto ml = [&](int d) -> ncells_t { + return ext_lo[d] ? static_cast(m) : 0u; + }; + const auto mh = [&](int d) -> ncells_t { + return ext_hi[d] ? static_cast(m) : 0u; + }; + if constexpr (M::Dim == Dim::_1D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0) }, + { domain.mesh.i_max(in::x1) + mh(0) }); + } else if constexpr (M::Dim == Dim::_2D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1) }, + { domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1) }); + } else { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1), + domain.mesh.i_min(in::x3) - ml(2) }, + { domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1), + domain.mesh.i_max(in::x3) + mh(2) }); + } + }; + int m = G - 1; + for (auto i { 0u }; i < nfilter; ++i) { + Kokkos::parallel_for( + "CurrentsFilter", + make_range(m), + kernel::DigitalFilter_kernel( + domain.fields.buff, + domain.fields.cur, + size, + flds_bc)); + std::swap(domain.fields.cur, domain.fields.buff); + --m; + if (m < 0 or i == nfilter - 1u) { + // refresh ghosts to distance G (and leave them valid for the + // downstream field solver after the final pass) + metadomain.CommunicateFields(domain, Comm::J); + m = G - 1; + } + } + } else { + // Non-Cartesian (axis BCs need the +1 range fixup): keep the + // per-pass exchange cadence, still ping-ponging the buffers. 
+ const auto range = srpic::RangeWithAxisBCs(domain); + for (auto i { 0u }; i < nfilter; ++i) { + Kokkos::parallel_for( + "CurrentsFilter", + range, + kernel::DigitalFilter_kernel( + domain.fields.buff, + domain.fields.cur, + size, + flds_bc)); + std::swap(domain.fields.cur, domain.fields.buff); + metadomain.CommunicateFields(domain, Comm::J); + } } } From 314d1424c2d2927a119a8918a45dfce8aa9bb501 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 12:49:15 -0400 Subject: [PATCH 15/33] moved tag_offsets_h outside of loop to avoid multiple device-host copies --- src/framework/containers/particles_comm.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/framework/containers/particles_comm.cpp b/src/framework/containers/particles_comm.cpp index 0fa140f93..6ede3abf0 100644 --- a/src/framework/containers/particles_comm.cpp +++ b/src/framework/containers/particles_comm.cpp @@ -266,6 +266,11 @@ namespace ntt { auto iteration = 0; auto current_received = 0; + // `tag_offsets` is the same for every direction; mirror it to the host + // once here instead of re-copying it inside the direction loop. 
+ auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets); + Kokkos::deep_copy(tag_offsets_h, tag_offsets); + for (const auto& direction : dirs_to_comm) { const auto send_rank = send_ranks.at(direction); const auto recv_rank = recv_ranks.at(direction); @@ -291,9 +296,6 @@ namespace ntt { npart_send_in * NPLDS_I }; } - auto tag_offsets_h = Kokkos::create_mirror_view(tag_offsets); - Kokkos::deep_copy(tag_offsets_h, tag_offsets); - npart_t idx_offset = npart_dead; if (tag_send > 2) { idx_offset += tag_offsets_h(tag_send - 3); From 8047a118816d27f46a577a27235ac19ac8f740b7 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 12:49:52 -0400 Subject: [PATCH 16/33] batched async communication --- src/framework/domain/comm_mpi.hpp | 170 +++++++++++++++++++++++ src/framework/domain/metadomain_comm.cpp | 66 +++++++++ 2 files changed, 236 insertions(+) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index 52103c170..668efdd75 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -365,6 +365,176 @@ namespace comm { } } + // ---- batched, non-blocking field halo exchange ------------------------ // + + // Per-direction parameters, collected once by the caller (GetSendRecvParams) + // and reused for every field communicated in the same call. + template + struct FieldCommDir { + int send_rank { -1 }, recv_rank { -1 }; + unsigned int send_ind { 0 }, recv_ind { 0 }; + std::vector send_slice {}, recv_slice {}; + int tag { 0 }; + }; + + /** + * @brief Non-additive (ghost-overwrite) halo exchange of one field across + * all directions at once. + * @details Packs every send, posts all `MPI_Irecv` then all `MPI_Isend` + * (unique per-direction tags), a single `MPI_Waitall`, then unpacks + * — overlapping the round-trips instead of serializing one blocking + * `MPI_Sendrecv` per direction. Self-communication (periodic single + * domain) is a local copy. 
Slicing/packing mirrors + * `comm::CommunicateField`. The tag pairs A's send in a direction + * with B's receive in the same direction (the matching neighbor), + * so it is unique per (rank-pair, direction). + */ + template + inline void CommunicateFieldBatched(unsigned int my_idx, + ndfield_t& fld, + const std::vector>& dirs, + const cell_range_t& comps) { + static constexpr unsigned short Dp1 = static_cast(D) + 1; + using buf_t = ndarray_t; + const ncells_t ncomp = comps.second - comps.first; + const auto ndirs = dirs.size(); + + const auto is_self = [&](const FieldCommDir& dd) { + return (dd.send_ind == my_idx) && (dd.recv_ind == my_idx); + }; + const auto ext = [](const std::vector& sl, int d) -> ncells_t { + return sl[d].second - sl[d].first; + }; + const auto make_buf = [&](const char* lbl, + const std::vector& sl) -> buf_t { + if constexpr (D == Dim::_1D) { + return buf_t(lbl, ext(sl, 0), ncomp); + } else if constexpr (D == Dim::_2D) { + return buf_t(lbl, ext(sl, 0), ext(sl, 1), ncomp); + } else { + return buf_t(lbl, ext(sl, 0), ext(sl, 1), ext(sl, 2), ncomp); + } + }; + const auto count = [](const buf_t& b) -> int { + ncells_t n = 1; + for (auto d { 0 }; d < static_cast(Dp1); ++d) { + n *= static_cast(b.extent(d)); + } + return static_cast(n); + }; + const auto pack = [&](buf_t& b, const std::vector& sl) { + if constexpr (D == Dim::_1D) { + Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], comps)); + } else if constexpr (D == Dim::_2D) { + Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], comps)); + } else { + Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], sl[2], comps)); + } + }; + const auto unpack = [&](const buf_t& b, const std::vector& sl) { + if constexpr (D == Dim::_1D) { + Kokkos::deep_copy(Kokkos::subview(fld, sl[0], comps), b); + } else if constexpr (D == Dim::_2D) { + Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], comps), b); + } else { + Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], sl[2], comps), b); + } + }; + const 
auto self_copy = [&](const std::vector& ssl, + const std::vector& rsl) { + if constexpr (D == Dim::_1D) { + Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], comps), + Kokkos::subview(fld, ssl[0], comps)); + } else if constexpr (D == Dim::_2D) { + Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], comps), + Kokkos::subview(fld, ssl[0], ssl[1], comps)); + } else { + Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], rsl[2], comps), + Kokkos::subview(fld, ssl[0], ssl[1], ssl[2], comps)); + } + }; + + std::vector send_buf(ndirs), recv_buf(ndirs); +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + using hbuf_t = typename buf_t::host_mirror_type; + std::vector send_h(ndirs), recv_h(ndirs); +#endif + + // phase 1: self copies, pack sends, allocate recvs + for (auto i { 0u }; i < ndirs; ++i) { + const auto& dd = dirs[i]; + if (is_self(dd)) { + self_copy(dd.send_slice, dd.recv_slice); + continue; + } + if (dd.recv_rank >= 0) { + recv_buf[i] = make_buf("recv_fld", dd.recv_slice); +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + recv_h[i] = Kokkos::create_mirror_view(recv_buf[i]); +#endif + } + if (dd.send_rank >= 0) { + send_buf[i] = make_buf("send_fld", dd.send_slice); + pack(send_buf[i], dd.send_slice); +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + send_h[i] = Kokkos::create_mirror_view(send_buf[i]); + Kokkos::deep_copy(send_h[i], send_buf[i]); +#endif + } + } + // drain packs (and device->host staging) before MPI reads the buffers + Kokkos::fence("CommunicateFieldBatched: pre-MPI"); + + // phase 2: post all recvs, then all sends + std::vector reqs; + reqs.reserve(2 * ndirs); + for (auto i { 0u }; i < ndirs; ++i) { + const auto& dd = dirs[i]; + if (is_self(dd) || dd.recv_rank < 0) { + continue; + } + reqs.emplace_back(); +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + MPI_Irecv(recv_h[i].data(), count(recv_buf[i]), mpi::get_type(), + dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); +#else + MPI_Irecv(recv_buf[i].data(), 
count(recv_buf[i]), mpi::get_type(), + dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); +#endif + } + for (auto i { 0u }; i < ndirs; ++i) { + const auto& dd = dirs[i]; + if (is_self(dd) || dd.send_rank < 0) { + continue; + } + reqs.emplace_back(); +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + MPI_Isend(send_h[i].data(), count(send_buf[i]), mpi::get_type(), + dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); +#else + MPI_Isend(send_buf[i].data(), count(send_buf[i]), mpi::get_type(), + dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); +#endif + } + + // phase 3: complete all transfers + if (not reqs.empty()) { + MPI_Waitall(static_cast(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE); + } + + // phase 4: unpack received ghosts + for (auto i { 0u }; i < ndirs; ++i) { + const auto& dd = dirs[i]; + if (is_self(dd) || dd.recv_rank < 0) { + continue; + } +#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) + Kokkos::deep_copy(recv_buf[i], recv_h[i]); +#endif + unpack(recv_buf[i], dd.recv_slice); + } + } + } // namespace comm #endif // FRAMEWORK_DOMAIN_COMM_MPI_HPP diff --git a/src/framework/domain/metadomain_comm.cpp b/src/framework/domain/metadomain_comm.cpp index f8c0f60d7..d6061d7bf 100644 --- a/src/framework/domain/metadomain_comm.cpp +++ b/src/framework/domain/metadomain_comm.cpp @@ -273,6 +273,71 @@ namespace ntt { comp_range_cur = cell_range_t(cur::jx1, cur::jx3 + 1); } // traverse in all directions and send/recv the fields +#if defined(MPI_ENABLED) + // Batched non-blocking exchange: collect the per-direction params once, + // then overlap all directions for each field (post all Irecv/Isend with + // per-direction tags + one Waitall) instead of a blocking Sendrecv per + // direction. The #else branch is the single-rank non-MPI path. 
+ { + std::vector> dirs; + dirs.reserve(dir::Directions::all.size()); + for (auto& direction : dir::Directions::all) { + const auto [send_params, recv_params] = GetSendRecvParams(this, + domain, + direction, + false); + const auto [send_indrank, send_slice] = send_params; + const auto [recv_indrank, recv_slice] = recv_params; + const auto [send_ind, send_rank] = send_indrank; + const auto [recv_ind, recv_rank] = recv_indrank; + if (send_rank < 0 and recv_rank < 0) { + continue; + } + comm::FieldCommDir fcd; + fcd.send_rank = send_rank; + fcd.recv_rank = recv_rank; + fcd.send_ind = send_ind; + fcd.recv_ind = recv_ind; + fcd.send_slice = send_slice; + fcd.recv_slice = recv_slice; + fcd.tag = static_cast(mpi::PrtlSendTag::dir2tag(direction)); + dirs.push_back(std::move(fcd)); + } + if (comm_em) { + comm::CommunicateFieldBatched(domain.index(), + domain.fields.em, + dirs, + comp_range_fld); + } + if constexpr (S == SimEngine::GRPIC) { + if (comm_aux) { + comm::CommunicateFieldBatched(domain.index(), + domain.fields.aux, + dirs, + comp_range_fld); + } + if (comm_em0) { + comm::CommunicateFieldBatched(domain.index(), + domain.fields.em0, + dirs, + comp_range_fld); + } + if (comm_j) { + comm::CommunicateFieldBatched(domain.index(), + domain.fields.cur0, + dirs, + comp_range_cur); + } + } else { + if (comm_j) { + comm::CommunicateFieldBatched(domain.index(), + domain.fields.cur, + dirs, + comp_range_cur); + } + } + } +#else for (auto& direction : dir::Directions::all) { const auto [send_params, recv_params] = GetSendRecvParams(this, domain, direction, false); @@ -364,6 +429,7 @@ namespace ntt { } } } +#endif // MPI_ENABLED } template From 50369176dde282993a75a1c73bd89b097ffcc38d Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 16:45:53 -0400 Subject: [PATCH 17/33] removed async comm because of bandwidth issue --- src/framework/domain/comm_mpi.hpp | 170 ----------------------- src/framework/domain/metadomain_comm.cpp | 66 --------- 2 files changed, 236 
deletions(-) diff --git a/src/framework/domain/comm_mpi.hpp b/src/framework/domain/comm_mpi.hpp index 668efdd75..52103c170 100644 --- a/src/framework/domain/comm_mpi.hpp +++ b/src/framework/domain/comm_mpi.hpp @@ -365,176 +365,6 @@ namespace comm { } } - // ---- batched, non-blocking field halo exchange ------------------------ // - - // Per-direction parameters, collected once by the caller (GetSendRecvParams) - // and reused for every field communicated in the same call. - template - struct FieldCommDir { - int send_rank { -1 }, recv_rank { -1 }; - unsigned int send_ind { 0 }, recv_ind { 0 }; - std::vector send_slice {}, recv_slice {}; - int tag { 0 }; - }; - - /** - * @brief Non-additive (ghost-overwrite) halo exchange of one field across - * all directions at once. - * @details Packs every send, posts all `MPI_Irecv` then all `MPI_Isend` - * (unique per-direction tags), a single `MPI_Waitall`, then unpacks - * — overlapping the round-trips instead of serializing one blocking - * `MPI_Sendrecv` per direction. Self-communication (periodic single - * domain) is a local copy. Slicing/packing mirrors - * `comm::CommunicateField`. The tag pairs A's send in a direction - * with B's receive in the same direction (the matching neighbor), - * so it is unique per (rank-pair, direction). 
- */ - template - inline void CommunicateFieldBatched(unsigned int my_idx, - ndfield_t& fld, - const std::vector>& dirs, - const cell_range_t& comps) { - static constexpr unsigned short Dp1 = static_cast(D) + 1; - using buf_t = ndarray_t; - const ncells_t ncomp = comps.second - comps.first; - const auto ndirs = dirs.size(); - - const auto is_self = [&](const FieldCommDir& dd) { - return (dd.send_ind == my_idx) && (dd.recv_ind == my_idx); - }; - const auto ext = [](const std::vector& sl, int d) -> ncells_t { - return sl[d].second - sl[d].first; - }; - const auto make_buf = [&](const char* lbl, - const std::vector& sl) -> buf_t { - if constexpr (D == Dim::_1D) { - return buf_t(lbl, ext(sl, 0), ncomp); - } else if constexpr (D == Dim::_2D) { - return buf_t(lbl, ext(sl, 0), ext(sl, 1), ncomp); - } else { - return buf_t(lbl, ext(sl, 0), ext(sl, 1), ext(sl, 2), ncomp); - } - }; - const auto count = [](const buf_t& b) -> int { - ncells_t n = 1; - for (auto d { 0 }; d < static_cast(Dp1); ++d) { - n *= static_cast(b.extent(d)); - } - return static_cast(n); - }; - const auto pack = [&](buf_t& b, const std::vector& sl) { - if constexpr (D == Dim::_1D) { - Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], comps)); - } else if constexpr (D == Dim::_2D) { - Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], comps)); - } else { - Kokkos::deep_copy(b, Kokkos::subview(fld, sl[0], sl[1], sl[2], comps)); - } - }; - const auto unpack = [&](const buf_t& b, const std::vector& sl) { - if constexpr (D == Dim::_1D) { - Kokkos::deep_copy(Kokkos::subview(fld, sl[0], comps), b); - } else if constexpr (D == Dim::_2D) { - Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], comps), b); - } else { - Kokkos::deep_copy(Kokkos::subview(fld, sl[0], sl[1], sl[2], comps), b); - } - }; - const auto self_copy = [&](const std::vector& ssl, - const std::vector& rsl) { - if constexpr (D == Dim::_1D) { - Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], comps), - Kokkos::subview(fld, ssl[0], comps)); - } 
else if constexpr (D == Dim::_2D) { - Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], comps), - Kokkos::subview(fld, ssl[0], ssl[1], comps)); - } else { - Kokkos::deep_copy(Kokkos::subview(fld, rsl[0], rsl[1], rsl[2], comps), - Kokkos::subview(fld, ssl[0], ssl[1], ssl[2], comps)); - } - }; - - std::vector send_buf(ndirs), recv_buf(ndirs); -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - using hbuf_t = typename buf_t::host_mirror_type; - std::vector send_h(ndirs), recv_h(ndirs); -#endif - - // phase 1: self copies, pack sends, allocate recvs - for (auto i { 0u }; i < ndirs; ++i) { - const auto& dd = dirs[i]; - if (is_self(dd)) { - self_copy(dd.send_slice, dd.recv_slice); - continue; - } - if (dd.recv_rank >= 0) { - recv_buf[i] = make_buf("recv_fld", dd.recv_slice); -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - recv_h[i] = Kokkos::create_mirror_view(recv_buf[i]); -#endif - } - if (dd.send_rank >= 0) { - send_buf[i] = make_buf("send_fld", dd.send_slice); - pack(send_buf[i], dd.send_slice); -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - send_h[i] = Kokkos::create_mirror_view(send_buf[i]); - Kokkos::deep_copy(send_h[i], send_buf[i]); -#endif - } - } - // drain packs (and device->host staging) before MPI reads the buffers - Kokkos::fence("CommunicateFieldBatched: pre-MPI"); - - // phase 2: post all recvs, then all sends - std::vector reqs; - reqs.reserve(2 * ndirs); - for (auto i { 0u }; i < ndirs; ++i) { - const auto& dd = dirs[i]; - if (is_self(dd) || dd.recv_rank < 0) { - continue; - } - reqs.emplace_back(); -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - MPI_Irecv(recv_h[i].data(), count(recv_buf[i]), mpi::get_type(), - dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); -#else - MPI_Irecv(recv_buf[i].data(), count(recv_buf[i]), mpi::get_type(), - dd.recv_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); -#endif - } - for (auto i { 0u }; i < ndirs; ++i) { - const auto& dd = dirs[i]; - if (is_self(dd) || dd.send_rank < 0) { 
- continue; - } - reqs.emplace_back(); -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - MPI_Isend(send_h[i].data(), count(send_buf[i]), mpi::get_type(), - dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); -#else - MPI_Isend(send_buf[i].data(), count(send_buf[i]), mpi::get_type(), - dd.send_rank, dd.tag, MPI_COMM_WORLD, &reqs.back()); -#endif - } - - // phase 3: complete all transfers - if (not reqs.empty()) { - MPI_Waitall(static_cast(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE); - } - - // phase 4: unpack received ghosts - for (auto i { 0u }; i < ndirs; ++i) { - const auto& dd = dirs[i]; - if (is_self(dd) || dd.recv_rank < 0) { - continue; - } -#if defined(DEVICE_ENABLED) && !defined(GPU_AWARE_MPI) - Kokkos::deep_copy(recv_buf[i], recv_h[i]); -#endif - unpack(recv_buf[i], dd.recv_slice); - } - } - } // namespace comm #endif // FRAMEWORK_DOMAIN_COMM_MPI_HPP diff --git a/src/framework/domain/metadomain_comm.cpp b/src/framework/domain/metadomain_comm.cpp index d6061d7bf..f8c0f60d7 100644 --- a/src/framework/domain/metadomain_comm.cpp +++ b/src/framework/domain/metadomain_comm.cpp @@ -273,71 +273,6 @@ namespace ntt { comp_range_cur = cell_range_t(cur::jx1, cur::jx3 + 1); } // traverse in all directions and send/recv the fields -#if defined(MPI_ENABLED) - // Batched non-blocking exchange: collect the per-direction params once, - // then overlap all directions for each field (post all Irecv/Isend with - // per-direction tags + one Waitall) instead of a blocking Sendrecv per - // direction. The #else branch is the single-rank non-MPI path. 
- { - std::vector> dirs; - dirs.reserve(dir::Directions::all.size()); - for (auto& direction : dir::Directions::all) { - const auto [send_params, recv_params] = GetSendRecvParams(this, - domain, - direction, - false); - const auto [send_indrank, send_slice] = send_params; - const auto [recv_indrank, recv_slice] = recv_params; - const auto [send_ind, send_rank] = send_indrank; - const auto [recv_ind, recv_rank] = recv_indrank; - if (send_rank < 0 and recv_rank < 0) { - continue; - } - comm::FieldCommDir fcd; - fcd.send_rank = send_rank; - fcd.recv_rank = recv_rank; - fcd.send_ind = send_ind; - fcd.recv_ind = recv_ind; - fcd.send_slice = send_slice; - fcd.recv_slice = recv_slice; - fcd.tag = static_cast(mpi::PrtlSendTag::dir2tag(direction)); - dirs.push_back(std::move(fcd)); - } - if (comm_em) { - comm::CommunicateFieldBatched(domain.index(), - domain.fields.em, - dirs, - comp_range_fld); - } - if constexpr (S == SimEngine::GRPIC) { - if (comm_aux) { - comm::CommunicateFieldBatched(domain.index(), - domain.fields.aux, - dirs, - comp_range_fld); - } - if (comm_em0) { - comm::CommunicateFieldBatched(domain.index(), - domain.fields.em0, - dirs, - comp_range_fld); - } - if (comm_j) { - comm::CommunicateFieldBatched(domain.index(), - domain.fields.cur0, - dirs, - comp_range_cur); - } - } else { - if (comm_j) { - comm::CommunicateFieldBatched(domain.index(), - domain.fields.cur, - dirs, - comp_range_cur); - } - } - } -#else for (auto& direction : dir::Directions::all) { const auto [send_params, recv_params] = GetSendRecvParams(this, domain, direction, false); @@ -429,7 +364,6 @@ namespace ntt { } } } -#endif // MPI_ENABLED } template From 88914209ae20bff930cf6bedacb0653053fa81f3 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 16:51:00 -0400 Subject: [PATCH 18/33] bugfix in current deposit --- src/kernels/currents_deposit.hpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/kernels/currents_deposit.hpp 
b/src/kernels/currents_deposit.hpp index b92b02423..98fa0b661 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -1042,8 +1042,12 @@ namespace kernel { [&](int g_i1, int comp, real_t v) { if (to_scratch) { Kokkos::atomic_add(&scr(g_i1 - origin_J1_low, comp), v); - //} else if (g_i1 >= 0 and g_i1 < j_ext1) { - } else { + } else if (g_i1 >= 0 and g_i1 < j_ext1) { + // Bounds-clip the escape-valve write against J's storage, + // exactly as the cooperative flush does. Cells past the + // ghost stripe are re-supplied by SynchronizeFields(J); an + // unclipped write here faults the GPU (an escaped boundary + // particle's stencil can reach past j_ext1). Kokkos::atomic_add(&J(g_i1, comp), v); } }); @@ -1115,7 +1119,11 @@ namespace kernel { Kokkos::atomic_add( &scr(g_i1 - origin_J1_low, g_i2 - origin_J2_low, comp), v); - } else { + } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and + g_i2 < j_ext2) { + // Bounds-clip as the cooperative flush does; an unclipped + // escape-valve write faults the GPU when an escaped boundary + // particle's stencil reaches past j_ext. Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); } }); @@ -1195,7 +1203,11 @@ namespace kernel { g_i3 - origin_J3_low, comp), v); - } else { + } else if (g_i1 >= 0 and g_i1 < j_ext1 and g_i2 >= 0 and + g_i2 < j_ext2 and g_i3 >= 0 and g_i3 < j_ext3) { + // Bounds-clip as the cooperative flush does; an unclipped + // escape-valve write faults the GPU when an escaped boundary + // particle's stencil reaches past j_ext. 
Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); } }); From 4f3a7625ab055fec718fdac32bb1f588c980777c Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Sun, 21 Jun 2026 19:38:24 -0400 Subject: [PATCH 19/33] double-buffer radix sort to drop the N-sized temp (fixes device OOM at scale) --- src/global/utils/sort_dispatch.h | 74 +++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h index cfc389464..27a20a910 100644 --- a/src/global/utils/sort_dispatch.h +++ b/src/global/utils/sort_dispatch.h @@ -137,8 +137,8 @@ namespace ntt::sort_helpers { n, KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); - // Out-of-place radix sort bounded to the significant key bits. The - // _out buffers and temp storage are transient (freed at scope exit). + // Radix sort bounded to the significant key bits. The _out buffers and + // temp storage are transient (freed at scope exit). array_t keys_out("tile_keys_sorted", n); prtl_perm_t perm_out("tile_perm_sorted", n); const int end_bit = static_cast(significant_bits(n_bins)); @@ -146,13 +146,20 @@ namespace ntt::sort_helpers { exec.fence("sort_by_key_dispatch Thrust: pre-sort"); auto stream = exec.cuda_stream(); + // DoubleBuffer radix sort: cub ping-pongs between the supplied + // (current, alternate) buffer pairs, so `temp_bytes` holds only the + // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes). + // Nearly halves the sort's transient memory vs the plain out-of-place + // form, which matters at high npart where the N-sized temp can fail to + // allocate (device OOM at scale). 
+ cub::DoubleBuffer d_keys(keys.data(), keys_out.data()); + cub::DoubleBuffer d_perm(perm.data(), perm_out.data()); + std::size_t temp_bytes = 0; auto err = cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, - keys.data(), - keys_out.data(), - perm.data(), - perm_out.data(), + d_keys, + d_perm, n, 0, end_bit, @@ -164,10 +171,8 @@ namespace ntt::sort_helpers { (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes); err = cub::DeviceRadixSort::SortPairs(temp.data(), temp_bytes, - keys.data(), - keys_out.data(), - perm.data(), - perm_out.data(), + d_keys, + d_perm, n, 0, end_bit, @@ -177,10 +182,16 @@ namespace ntt::sort_helpers { HERE); exec.fence("sort_by_key_dispatch Thrust: post-sort"); - // Publish sorted keys back in place; swap the sorted permutation in. - auto keys_nc = keys; // non-const handle aliasing the same storage - Kokkos::deep_copy(keys_nc, keys_out); - perm = perm_out; + // Publish results from whichever buffer cub left as Current() (depends on + // the pass count): copy sorted keys back into `keys`' storage if they + // ended up in the alternate, and point `perm` at its current buffer. + if (d_keys.Current() != keys.data()) { + auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + } + if (d_perm.Current() == perm_out.data()) { + perm = perm_out; + } } #endif @@ -210,13 +221,20 @@ namespace ntt::sort_helpers { exec.fence("sort_by_key_dispatch Rocthrust: pre-sort"); auto stream = exec.hip_stream(); + // double_buffer radix sort: rocprim ping-pongs between the supplied + // (current, alternate) buffer pairs, so `temp_storage` holds only the + // histograms (~MB) instead of an internal N-sized alternate (~8*N bytes, + // the dominant `rocprim_radix_temp`). This nearly halves the sort's + // transient memory vs the plain out-of-place form, which matters at high + // npart where that N-sized temp can fail to allocate (device OOM at scale). 
+ rocprim::double_buffer d_keys(keys.data(), keys_out.data()); + rocprim::double_buffer d_perm(perm.data(), perm_out.data()); + std::size_t temp_bytes = 0; auto err = rocprim::radix_sort_pairs(nullptr, temp_bytes, - keys.data(), - keys_out.data(), - perm.data(), - perm_out.data(), + d_keys, + d_perm, static_cast(n), 0u, end_bit, @@ -228,10 +246,8 @@ namespace ntt::sort_helpers { (temp_bytes == 0u) ? std::size_t { 1 } : temp_bytes); err = rocprim::radix_sort_pairs(temp.data(), temp_bytes, - keys.data(), - keys_out.data(), - perm.data(), - perm_out.data(), + d_keys, + d_perm, static_cast(n), 0u, end_bit, @@ -241,9 +257,17 @@ namespace ntt::sort_helpers { HERE); exec.fence("sort_by_key_dispatch Rocthrust: post-sort"); - auto keys_nc = keys; // non-const handle aliasing the same storage - Kokkos::deep_copy(keys_nc, keys_out); - perm = perm_out; + // Publish results from whichever buffer rocprim left as `current()` + // (depends on the pass count). If the sorted keys ended up in the + // alternate, copy them back into `keys`' storage so downstream + // (compute_tile_offsets) sees them; point `perm` at its current buffer. 
+ if (d_keys.current() != keys.data()) { + auto keys_nc = keys; // non-const handle aliasing the same storage + Kokkos::deep_copy(keys_nc, keys_out); + } + if (d_perm.current() == perm_out.data()) { + perm = perm_out; + } } #endif From 94ba9cdb3e484dbffed0a681bb1222d873b207af Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Wed, 24 Jun 2026 18:26:32 -0400 Subject: [PATCH 20/33] fix printing of total particles --- src/global/utils/diag.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/global/utils/diag.cpp b/src/global/utils/diag.cpp index 4cf763faf..604a872ae 100644 --- a/src/global/utils/diag.cpp +++ b/src/global/utils/diag.cpp @@ -17,6 +17,7 @@ #include #endif // MPI_ENABLED +#include #include #include #include @@ -26,8 +27,8 @@ namespace diag { auto npart_stats(npart_t npart, npart_t maxnpart) - -> std::vector> { - auto stats = std::vector>(); + -> std::vector> { + auto stats = std::vector>(); const auto percentage = [](npart_t part, npart_t maxpart) -> unsigned short { return static_cast( 100.0f * static_cast(part) / static_cast(maxpart)); @@ -59,9 +60,9 @@ namespace diag { if (rank != MPI_ROOT_RANK) { return stats; } - const npart_t tot_npart = std::accumulate(mpi_npart.begin(), - mpi_npart.end(), - static_cast(0)); + const std::size_t tot_npart = std::accumulate(mpi_npart.begin(), + mpi_npart.end(), + static_cast(0)); const npart_t max_idx = std::distance( mpi_npart.begin(), std::max_element(mpi_npart.begin(), mpi_npart.end())); From 9a4333e1798a6965054dc80407ac64f7ff3774e8 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Wed, 24 Jun 2026 18:47:03 -0400 Subject: [PATCH 21/33] use a persistent buffer per sort to reduce memory overhead --- src/framework/containers/particles.h | 19 +-- src/framework/containers/particles_sort.cpp | 131 ++++++++++++++------ 2 files changed, 101 insertions(+), 49 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 511a07a62..c3fa93137 
100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -311,14 +311,17 @@ namespace ntt { private: /** * @brief Apply a particle-index permutation (built by oneDPL/Thrust - * sort_by_key) to the SoA member arrays. Each member is - * gathered into a fresh full-capacity buffer whose handle is - * then swapped in (no copy-back), one buffer at a time, fenced - * before the old storage is released. The *_prev arrays are - * intentionally not permuted (overwritten by the next push - * before any read). Only compiled when a vendor sort backend - * is enabled; the BinSort path applies the permutation in - * place via `sorter.sort(view)` instead. + * sort_by_key) to the SoA member arrays. Members are gathered + * through `perm` into a reusable `n`-sized scratch buffer + * (one per member type, shared across members of that type) + * and copied back in place, so the large persistent member + * arrays keep their storage/address and the gather makes a + * handful of transient allocations instead of one maxnpart + * buffer per member. The *_prev arrays are intentionally not + * permuted (overwritten by the next push before any read). + * Only compiled when a vendor sort backend is enabled; the + * BinSort path applies the permutation in place via + * `sorter.sort(view)` instead. */ void apply_permutation_to_soa(const prtl_perm_t& perm); diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 3db6c1438..8261fe5b4 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -474,53 +474,73 @@ namespace ntt { #if defined(TEAM_POLICY_USE_VENDOR_SORT) namespace permute_helpers { - // Permute a 1D SoA member array `arr` by `perm`. Gathers into a - // fresh buffer allocated at the member's full capacity (maxnpart), - // then swaps the View handle in. 
This avoids the redundant copy-back - // pass of the old gather-then-deep_copy approach (~2x less HBM - // traffic). Allocating at full capacity preserves the member's spare - // room for injection; the untouched tail [n, capacity) is - // zero-initialized by Kokkos (cleaner than the stale values the old - // deep_copy left there). The fence drains the gather (which reads the - // old storage) before the swap drops the last reference to it. + // Permute a 1D SoA member `arr` by `perm` in place, using a + // caller-owned reusable `scratch` buffer. Gathers the live prefix + // `arr[perm[0..n)]` into `scratch[0..n)`, then copies it back into + // `arr[0..n)`. Unlike the old swap-the-handle approach, `arr` keeps + // its original storage and full maxnpart capacity, so the large + // persistent member arrays never change address between sorts (the + // dominant source of allocator churn / fragmentation on ROCm), and + // `scratch` is shared across every member of the same type within one + // sort -- one transient allocation per type group instead of a fresh + // maxnpart buffer per member. `scratch` is sized to the live count + // `n` (not maxnpart), which also lowers the gather's peak transient. + // The tail `arr[n, capacity)` is left untouched (stale, never read: + // consumers iterate `[0, npart())`). Cost vs the swap: one extra + // copy-back pass (~2x HBM traffic), negligible when sorting is a + // small fraction of the step. The fences keep the shared scratch from + // being overwritten by the next member's gather before this member's + // copy-back has drained it. 
template - inline void permute_1d_swap(V& arr, + inline void permute_1d_into(V& arr, + const V& scratch, const prtl_perm_t& perm, npart_t n) { if (n == 0u) { return; } - V buf(arr.label(), arr.extent(0)); auto perm_v = perm; auto arr_v = arr; + auto buf_v = scratch; Kokkos::parallel_for( "Permute1D", n, - KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); }); - Kokkos::fence("permute_1d_swap: end"); - arr = buf; + KOKKOS_LAMBDA(const npart_t p) { buf_v(p) = arr_v(perm_v(p)); }); + Kokkos::fence("permute_1d_into: gather"); + Kokkos::deep_copy( + Kokkos::subview(arr, std::make_pair(static_cast(0), n)), + Kokkos::subview(scratch, std::make_pair(static_cast(0), n))); + Kokkos::fence("permute_1d_into: copy-back"); } - // 2D analogue for `pld_r` / `pld_i`. + // 2D analogue for `pld_r` / `pld_i` (`scratch` is `(>= n, ncols)`). template - inline void permute_2d_swap(V& arr, + inline void permute_2d_into(V& arr, + const V& scratch, const prtl_perm_t& perm, npart_t n, npart_t ncols) { if (n == 0u or ncols == 0u) { return; } - V buf(arr.label(), arr.extent(0), arr.extent(1)); auto perm_v = perm; auto arr_v = arr; + auto buf_v = scratch; Kokkos::parallel_for( "Permute2D", CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), KOKKOS_LAMBDA(const npart_t p, const npart_t l) { - buf(p, l) = arr_v(perm_v(p), l); + buf_v(p, l) = arr_v(perm_v(p), l); }); - Kokkos::fence("permute_2d_swap: end"); - arr = buf; + Kokkos::fence("permute_2d_into: gather"); + Kokkos::deep_copy( + Kokkos::subview(arr, + std::make_pair(static_cast(0), n), + Kokkos::ALL), + Kokkos::subview(scratch, + std::make_pair(static_cast(0), n), + Kokkos::ALL)); + Kokkos::fence("permute_2d_into: copy-back"); } } // namespace permute_helpers @@ -532,8 +552,8 @@ namespace ntt { return; } - using permute_helpers::permute_1d_swap; - using permute_helpers::permute_2d_swap; + using permute_helpers::permute_1d_into; + using permute_helpers::permute_2d_into; // The *_prev arrays (i{1,2,3}_prev, dx{1,2,3}_prev) are 
intentionally // NOT permuted. SortSpatially runs at the very end of the step loop @@ -551,31 +571,60 @@ namespace ntt { // prev field saved to the checkpoint differs from the old code. // Permuting prev would therefore reorder data that is overwritten // before it is ever observed. - if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { - permute_1d_swap(i1, perm, n); - permute_1d_swap(dx1, perm, n); + // + // Each block below allocates a single `n`-sized scratch buffer that + // is reused for every member of that type, then freed before the next + // block allocates its own. The whole gather thus makes one transient + // allocation per type group (int / prtldx_t / real_t / short / each + // payload) instead of one fresh maxnpart buffer per member, and peak + // transient is a single n-sized scratch at a time. + { + array_t scratch { "perm_scratch_int", n }; + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(i1, scratch, perm, n); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(i2, scratch, perm, n); + } + if constexpr (D == Dim::_3D) { + permute_1d_into(i3, scratch, perm, n); + } } - if constexpr (D == Dim::_2D or D == Dim::_3D) { - permute_1d_swap(i2, perm, n); - permute_1d_swap(dx2, perm, n); + { + array_t scratch { "perm_scratch_prtldx", n }; + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(dx1, scratch, perm, n); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_into(dx2, scratch, perm, n); + } + if constexpr (D == Dim::_3D) { + permute_1d_into(dx3, scratch, perm, n); + } } - if constexpr (D == Dim::_3D) { - permute_1d_swap(i3, perm, n); - permute_1d_swap(dx3, perm, n); - } - permute_1d_swap(ux1, perm, n); - permute_1d_swap(ux2, perm, n); - permute_1d_swap(ux3, perm, n); - permute_1d_swap(weight, perm, n); - permute_1d_swap(tag, perm, n); - if constexpr (D == Dim::_2D and C != Coord::Cartesian) { - permute_1d_swap(phi, perm, n); + { + 
array_t scratch { "perm_scratch_real", n }; + permute_1d_into(ux1, scratch, perm, n); + permute_1d_into(ux2, scratch, perm, n); + permute_1d_into(ux3, scratch, perm, n); + permute_1d_into(weight, scratch, perm, n); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + permute_1d_into(phi, scratch, perm, n); + } + } + { + array_t scratch { "perm_scratch_tag", n }; + permute_1d_into(tag, scratch, perm, n); } if (npld_r() > 0) { - permute_2d_swap(pld_r, perm, n, static_cast(npld_r())); + const auto ncols = static_cast(npld_r()); + array_t scratch { "perm_scratch_pld_r", n, ncols }; + permute_2d_into(pld_r, scratch, perm, n, ncols); } if (npld_i() > 0) { - permute_2d_swap(pld_i, perm, n, static_cast(npld_i())); + const auto ncols = static_cast(npld_i()); + array_t scratch { "perm_scratch_pld_i", n, ncols }; + permute_2d_into(pld_i, scratch, perm, n, ncols); } } #endif // TEAM_POLICY_USE_VENDOR_SORT From 216940ce14c55be31f9851ab6217fa11ca85e2d9 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Wed, 24 Jun 2026 18:47:17 -0400 Subject: [PATCH 22/33] added the option to explicitly disable vendor sort --- CMakeLists.txt | 98 ++++++++++++++++++++--------------- cmake/defaults.cmake | 13 +++++ cmake/report.cmake | 10 ++++ src/global/utils/reporter.cpp | 7 +++ 4 files changed, 87 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8409209b..b9bbb1767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,10 @@ set(team_policy_drift ${default_team_policy_drift} CACHE STRING "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.") +set(vendor_sort + ${default_vendor_sort} + CACHE BOOL + "Use the vendor sort_by_key (oneDPL/Thrust/rocThrust) for the team_policy spatial sort when available. 
OFF forces the Kokkos::BinSort fallback, which sorts each SoA member in place (lower peak memory, no maxnpart gather buffer) at the cost of sort speed.") # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) @@ -169,53 +173,65 @@ if(${team_policy}) # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case). add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}") - # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically - # when found; falls back to Kokkos::BinSort otherwise. - if("${Kokkos_DEVICES}" MATCHES "SYCL") - find_package(oneDPL QUIET) - if(oneDPL_FOUND) - message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key") - add_compile_options("-D ONEDPL_ENABLED") - set(DEPENDENCIES ${DEPENDENCIES} oneDPL) - else() - message(STATUS "team_policy: oneDPL not found; using BinSort fallback " - "for SYCL sort_by_key") + # Vendor sort: oneDPL on SYCL, Thrust on CUDA, rocThrust/rocprim on HIP. + # When `vendor_sort` is ON (default) the available library is detected + # and used; the spatial sort then builds a single permutation that + # gathers all SoA members. When `vendor_sort` is OFF, or no library is + # found, the code falls back to Kokkos::BinSort, which sorts each member + # in place -- lower peak memory and no maxnpart gather buffer, at the + # cost of sort speed (negligible when sorting is a small fraction of the + # step). The `vendor_sort` knob lets you force the BinSort fallback even + # when a vendor library is present. 
+ if(${vendor_sort}) + if("${Kokkos_DEVICES}" MATCHES "SYCL") + find_package(oneDPL QUIET) + if(oneDPL_FOUND) + message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key") + add_compile_options("-D ONEDPL_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} oneDPL) + else() + message(STATUS "team_policy: oneDPL not found; using BinSort fallback " + "for SYCL sort_by_key") + endif() endif() - endif() - if("${Kokkos_DEVICES}" MATCHES "CUDA") - find_package(Thrust QUIET) - if(Thrust_FOUND) - message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key") - add_compile_options("-D THRUST_ENABLED") - else() - message(STATUS "team_policy: Thrust not found; using BinSort fallback " - "for CUDA sort_by_key") + if("${Kokkos_DEVICES}" MATCHES "CUDA") + find_package(Thrust QUIET) + if(Thrust_FOUND) + message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key") + add_compile_options("-D THRUST_ENABLED") + else() + message(STATUS "team_policy: Thrust not found; using BinSort fallback " + "for CUDA sort_by_key") + endif() endif() - endif() - if("${Kokkos_DEVICES}" MATCHES "HIP") - # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's - # bounded-bit radix sort directly (rocprim is rocThrust's own - # dependency, so its headers come in transitively; we find it - # explicitly to keep the include path robust). This builds a single - # permutation that gathers all SoA members, instead of the legacy - # per-member Kokkos::BinSort path which allocates a fresh - # `sorted_values` buffer for every member every step (the dominant - # source of allocator churn / fragmentation on ROCm). 
- find_package(rocthrust QUIET) - if(rocthrust_FOUND) - message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") - add_compile_options("-D ROCTHRUST_ENABLED") - set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) - find_package(rocprim QUIET) - if(rocprim_FOUND) - set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim) + if("${Kokkos_DEVICES}" MATCHES "HIP") + # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's + # bounded-bit radix sort directly (rocprim is rocThrust's own + # dependency, so its headers come in transitively; we find it + # explicitly to keep the include path robust). This builds a single + # permutation that gathers all SoA members, instead of the legacy + # per-member Kokkos::BinSort path which allocates a fresh + # `sorted_values` buffer for every member every step (the dominant + # source of allocator churn / fragmentation on ROCm). + find_package(rocthrust QUIET) + if(rocthrust_FOUND) + message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") + add_compile_options("-D ROCTHRUST_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) + find_package(rocprim QUIET) + if(rocprim_FOUND) + set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim) + endif() + else() + message(STATUS "team_policy: rocThrust not found; using BinSort " + "fallback for HIP sort_by_key") endif() - else() - message(STATUS "team_policy: rocThrust not found; using BinSort " - "fallback for HIP sort_by_key") endif() + else() + message(STATUS "team_policy: vendor_sort=OFF; forcing Kokkos::BinSort " + "fallback for spatial sort_by_key") endif() endif() diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index 01427921f..c03ebaf81 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -104,6 +104,19 @@ else() endif() set_property(CACHE default_team_policy PROPERTY TYPE BOOL) +if(DEFINED ENV{Entity_ENABLE_VENDOR_SORT}) + set(default_vendor_sort + $ENV{Entity_ENABLE_VENDOR_SORT} + CACHE INTERNAL + "Default flag for vendor sort_by_key 
(oneDPL/Thrust/rocThrust)") +else() + set(default_vendor_sort + ON + CACHE INTERNAL + "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)") +endif() +set_property(CACHE default_vendor_sort PROPERTY TYPE BOOL) + set(default_team_policy_tile_size 8 CACHE INTERNAL "Default tile edge length in cells for team_policy") diff --git a/cmake/report.cmake b/cmake/report.cmake index d90dfb085..d972fafc4 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -150,6 +150,15 @@ if(${team_policy}) "${Blue}" TEAM_POLICY_DRIFT_REPORT 46) + printchoices( + "Vendor sort" + "vendor_sort" + "${ON_OFF_VALUES}" + ${vendor_sort} + ON + "${Green}" + VENDOR_SORT_REPORT + 46) endif() printchoices( "Debug mode" @@ -230,6 +239,7 @@ string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") if(${team_policy}) string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n") string(APPEND REPORT_TEXT " " ${TEAM_POLICY_DRIFT_REPORT} "\n") + string(APPEND REPORT_TEXT " " ${VENDOR_SORT_REPORT} "\n") endif() string( diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp index a4b10eee6..6113a89c7 100644 --- a/src/global/utils/reporter.cpp +++ b/src/global/utils/reporter.cpp @@ -253,6 +253,13 @@ namespace reporter { #if defined(TEAM_POLICY) AddParam(report, 4, "TEAM_POLICY", "%s", "ON"); + #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)) + AddParam(report, 4, "VENDOR_SORT", "%s", "ON"); + #else + AddParam(report, 4, "VENDOR_SORT", "%s", "OFF (BinSort)"); + #endif #else AddParam(report, 4, "TEAM_POLICY", "%s", "OFF"); #endif From af942c82eeda02c30e15004069caf83b57583dd4 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Wed, 24 Jun 2026 19:08:50 -0400 Subject: [PATCH 23/33] remove dead particles after every sort --- src/framework/containers/particles.h | 8 ++++- src/framework/containers/particles_sort.cpp | 35 +++++++++++++++------ 2 files 
changed, 33 insertions(+), 10 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index c3fa93137..fe4a0b17f 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -322,8 +322,14 @@ namespace ntt { * Only compiled when a vendor sort backend is enabled; the * BinSort path applies the permutation in place via * `sorter.sort(view)` instead. + * @param perm Permutation: sorted position -> pre-sort slot index. + * @param n Number of leading (alive) particles to gather. Pass + * `npart_partitioned` so only the alive set is moved into + * `[0, n)` (the dead were binned to the sentinel tile and sort + * to the tail); the caller then drops the dead tail via + * `set_npart(n)`. */ - void apply_permutation_to_soa(const prtl_perm_t& perm); + void apply_permutation_to_soa(const prtl_perm_t& perm, npart_t n); public: #endif diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 8261fe5b4..5e9ae4147 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -319,12 +319,12 @@ namespace ntt { const auto slice = prtl_slice_t(0, npart_local); #if defined(TEAM_POLICY_USE_VENDOR_SORT) - // Vendor path: produce an explicit permutation via sort_by_key, - // then apply it to each SoA member by gathering into a fresh - // full-capacity buffer and swapping the View handle in (no - // copy-back). The *_prev arrays are skipped — see + // Vendor path: produce an explicit permutation via sort_by_key, then + // apply it to each SoA member by gathering the alive prefix through a + // reusable scratch buffer (one per member type, sized to the alive + // count, copied back in place). The *_prev arrays are skipped — see // apply_permutation_to_soa. Peak transient = one - // `maxnpart × sizeof(member)` buffer at a time. + // `npart_partitioned × sizeof(member)` scratch at a time. 
prtl_perm_t perm { "tile_perm", npart_local }; #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) sort_helpers::sort_by_key_dispatch(tile_indices, @@ -349,7 +349,14 @@ namespace ntt { compute_tile_offsets(tile_indices, total_tiles, npart_local); tile_indices = array_t {}; Kokkos::fence("SortSpatially: pre-gather drain"); - apply_permutation_to_soa(perm); + // Gather only the alive particles. The sort binned dead particles to + // the sentinel tile, so they occupy [npart_partitioned, npart_local) + // and `perm[0, npart_partitioned)` lists the alive slots in tile + // order. Restricting the gather to the alive count drops the dead in + // the same pass (no separate compaction), skips work on dead slots, + // and sizes the gather scratch to the alive count. The dead tail is + // released below via `set_npart`. + apply_permutation_to_soa(perm, m_tile_layout.npart_partitioned); #else // BinSort path: same mechanism as legacy SortSpatially (BinSort // allocates one temp View per `sorter.sort(view)` call and frees @@ -411,6 +418,16 @@ namespace ntt { m_tile_layout.tile_perm = prtl_perm_t {}; m_is_sorted = true; + // Compact-on-sort: drop the dead tail now instead of waiting for the + // periodic RemoveDead. The sort parked dead particles in the sentinel + // bin at [npart_partitioned, npart()), and the alive set is exactly + // [0, npart_partitioned) — gathered into tile order above (vendor) or + // sorted in place (BinSort). Shrinking npart() here keeps it, and + // every subsequent sort's transient buffers, tracking the alive count + // instead of ratcheting up with dead slots between clearing intervals. + // (RemoveDead remains the compactor when spatial sorting is disabled.) 
+ set_npart(m_tile_layout.npart_partitioned); + Kokkos::fence("SortSpatially: end of team_policy path"); #else // !TEAM_POLICY — legacy in-place BinSort by global cell index const auto nx2 = grid.n_active(in::x2); @@ -546,8 +563,8 @@ namespace ntt { } // namespace permute_helpers template - void Particles::apply_permutation_to_soa(const prtl_perm_t& perm) { - const auto n = npart(); + void Particles::apply_permutation_to_soa(const prtl_perm_t& perm, + npart_t n) { if (n == 0u) { return; } @@ -632,7 +649,7 @@ namespace ntt { #if defined(TEAM_POLICY_USE_VENDOR_SORT) #define APPLY_PERM_INSTANTIATE(D, C) \ template void Particles::apply_permutation_to_soa( \ - const prtl_perm_t&); + const prtl_perm_t&, npart_t); #else #define APPLY_PERM_INSTANTIATE(D, C) #endif From e1de8ae56656b286bcfcfce474d7a17af8555a6a Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Wed, 24 Jun 2026 19:09:15 -0400 Subject: [PATCH 24/33] test update --- tests/framework/particles_sort.cpp | 44 ++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tests/framework/particles_sort.cpp b/tests/framework/particles_sort.cpp index 4e4881d98..a2b82456e 100644 --- a/tests/framework/particles_sort.cpp +++ b/tests/framework/particles_sort.cpp @@ -112,16 +112,30 @@ auto main(int argc, char* argv[]) -> int { // SortSpatially is order-by-tile, not order-by-cell: assert the // invariants that hold for any tile size rather than a hardwired // permutation. (1) alive particles form a prefix sorted by - // non-decreasing tile index; (2) dead particles (weight == -1) form - // the suffix; (3) every SoA member is permuted by the *same* - // permutation, so each alive slot still satisfies pld == f(weight); - // (4) no alive particle is lost. Only [0, npart) is defined after a - // sort — the swap-gather zero-fills [npart, maxnpart). 
+ // non-decreasing tile index; (2) every SoA member is permuted by the + // *same* permutation, so each alive slot still satisfies + // pld == f(weight); (3) no alive particle is lost. Only [0, npart()) + // is defined after a sort. The team_policy path compacts — it drops + // the dead, so npart() equals the alive count and [0, npart()) is + // entirely alive; the legacy (non-team) path keeps the dead as a + // weight == -1 suffix, leaving npart() unchanged. Iterating + // [0, npart()) exercises both: the prefix-sorted / no-alive-after-dead + // checks below hold either way. +#if defined(TEAM_POLICY) + raise::ErrorIf(prtls.npart() != 59u, + "team_policy sort must compact: npart() should equal " + "the alive count", + HERE); +#else + raise::ErrorIf(prtls.npart() != 66u, + "legacy sort should leave npart() unchanged", + HERE); +#endif bool seen_dead = false; bool have_prev = false; ncells_t prev_tile = 0u; npart_t n_alive_obs = 0u; - for (auto p { 0u }; p < 66u; ++p) { + for (auto p { 0u }; p < prtls.npart(); ++p) { if (tag_h(p) != ntt::ParticleTag::alive) { seen_dead = true; raise::ErrorIf(weight_h(p) != -1.0, @@ -244,8 +258,10 @@ auto main(int argc, char* argv[]) -> int { Kokkos::deep_copy(weight_h, prtls.weight); // Same invariants as the 2D block (no payloads here): alive prefix - // sorted by non-decreasing tile index, dead (weight == -1) suffix, - // alive count preserved. T = 1 reproduces the legacy per-cell order. + // sorted by non-decreasing tile index, alive count preserved. The + // team_policy path compacts the dead away (npart() == alive count); + // the legacy path keeps them as a weight == -1 suffix. T = 1 + // reproduces the legacy per-cell order. 
#if defined(TEAM_POLICY) const ncells_t T = static_cast(TEAM_POLICY_TILE_SIZE); #else @@ -261,11 +277,21 @@ auto main(int argc, char* argv[]) -> int { (static_cast(c) / T); }; +#if defined(TEAM_POLICY) + raise::ErrorIf(prtls.npart() != 59u, + "team_policy sort must compact: npart() should equal " + "the alive count", + HERE); +#else + raise::ErrorIf(prtls.npart() != 66u, + "legacy sort should leave npart() unchanged", + HERE); +#endif bool seen_dead = false; bool have_prev = false; ncells_t prev_tile = 0u; npart_t n_alive_obs = 0u; - for (auto p { 0u }; p < 66u; ++p) { + for (auto p { 0u }; p < prtls.npart(); ++p) { if (tag_h(p) != ntt::ParticleTag::alive) { seen_dead = true; raise::ErrorIf(weight_h(p) != -1.0, From 592ff89012eaf3276bc6dc491193df9c329765e8 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Thu, 25 Jun 2026 11:11:47 -0400 Subject: [PATCH 25/33] generalized reduced exchange for current filters on any coordinate system --- src/engines/srpic/currents.h | 167 ++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 79 deletions(-) diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 30eb9466d..dbb482e83 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -233,90 +233,99 @@ namespace ntt { const auto flds_bc = domain.mesh.flds_bc(); - if constexpr (M::CoordType == Coord::Cartesian) { - // Reduced-exchange ghost-margin scheme. One halo exchange refreshes - // N_GHOSTS ghost layers — enough for N_GHOSTS passes of the 3-point - // binomial if each pass also recomputes the inner ghost layers it - // will need next. We therefore extend the launch range by a shrinking - // margin `m` into the ghost zone, but only on comm-refreshed sides - // (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell is - // interior physics. 
Physical-boundary ghosts are never written or - // refreshed, exactly as in the per-pass loop, so the result is - // identical for every BC — while doing one exchange per N_GHOSTS - // passes instead of one per pass. (Entering the loop the ghosts are - // valid to distance N_GHOSTS: srpic.hpp runs CommunicateFields(J) - // immediately before CurrentsFilter.) - const int G = static_cast(N_GHOSTS); - const auto comm_side = [](FldsBC b) { - return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC); + // Reduced-exchange ghost-margin scheme (all coordinate types). One halo + // exchange refreshes N_GHOSTS ghost layers — enough for N_GHOSTS passes + // of the 3-point binomial if each pass also recomputes the inner ghost + // layers it will need next. We therefore extend the launch range by a + // shrinking margin `m` into the ghost zone, but only on comm-refreshed + // sides (PERIODIC self-wrap or SYNC inter-domain), where the ghost cell + // is interior physics. Physical-boundary ghosts are never written or + // refreshed, exactly as in the per-pass loop, so the result is identical + // for every BC — while doing one exchange per N_GHOSTS passes instead of + // one per pass. (Entering the loop the ghosts are valid to distance + // N_GHOSTS: srpic.hpp runs CommunicateFields(J) immediately before + // CurrentsFilter.) + // + // Non-Cartesian axis: the theta (x2) direction is self-contained in the + // filter kernel — the axis branches only ever read/write within + // [i2_min, i2_max] and never cross the axis — so the axis needs no halo + // exchange at all (the axis current fold is done once by + // SynchronizeFields(J) before this function). The single coordinate + // dependency is that the axis cell sits at i_max(x2), one past the active + // range, and must be filtered on every pass. We therefore add a fixed +1 + // to the x2 upper bound when that side is AXIS — a physical boundary, so + // never the shrinking comm margin. 
This folds the old RangeWithAxisBCs + // fixup into make_range, letting the same loop serve every CoordType. + const int G = static_cast(N_GHOSTS); + const auto comm_side = [](FldsBC b) { + return (b == FldsBC::PERIODIC) or (b == FldsBC::SYNC); + }; + bool ext_lo[3] = { false, false, false }; + bool ext_hi[3] = { false, false, false }; + for (auto d { 0 }; d < static_cast(M::Dim); ++d) { + ext_lo[d] = comm_side(flds_bc[d].first); + ext_hi[d] = comm_side(flds_bc[d].second); + } + // AXIS at the upper x2 boundary needs the axis cell (i_max(x2)) included + // every pass; matches srpic::RangeWithAxisBCs. (The lower-x2 axis cell is + // already i_min(x2), so no fixup is needed there.) + bool axis_hi_x2 = false; + if constexpr (M::CoordType != Coord::Cartesian and + (M::Dim == Dim::_2D or M::Dim == Dim::_3D)) { + axis_hi_x2 = (flds_bc[1].second == FldsBC::AXIS); + } + const auto make_range = [&](int m) -> range_t { + const auto ml = [&](int d) -> ncells_t { + return ext_lo[d] ? static_cast(m) : 0u; }; - bool ext_lo[3] = { false, false, false }; - bool ext_hi[3] = { false, false, false }; - for (auto d { 0 }; d < static_cast(M::Dim); ++d) { - ext_lo[d] = comm_side(flds_bc[d].first); - ext_hi[d] = comm_side(flds_bc[d].second); - } - const auto make_range = [&](int m) -> range_t { - const auto ml = [&](int d) -> ncells_t { - return ext_lo[d] ? static_cast(m) : 0u; - }; - const auto mh = [&](int d) -> ncells_t { - return ext_hi[d] ? 
static_cast(m) : 0u; - }; - if constexpr (M::Dim == Dim::_1D) { - return CreateRangePolicy( - { domain.mesh.i_min(in::x1) - ml(0) }, - { domain.mesh.i_max(in::x1) + mh(0) }); - } else if constexpr (M::Dim == Dim::_2D) { - return CreateRangePolicy( - { domain.mesh.i_min(in::x1) - ml(0), - domain.mesh.i_min(in::x2) - ml(1) }, - { domain.mesh.i_max(in::x1) + mh(0), - domain.mesh.i_max(in::x2) + mh(1) }); - } else { - return CreateRangePolicy( - { domain.mesh.i_min(in::x1) - ml(0), - domain.mesh.i_min(in::x2) - ml(1), - domain.mesh.i_min(in::x3) - ml(2) }, - { domain.mesh.i_max(in::x1) + mh(0), - domain.mesh.i_max(in::x2) + mh(1), - domain.mesh.i_max(in::x3) + mh(2) }); + const auto mh = [&](int d) -> ncells_t { + if (ext_hi[d]) { + return static_cast(m); } - }; - int m = G - 1; - for (auto i { 0u }; i < nfilter; ++i) { - Kokkos::parallel_for( - "CurrentsFilter", - make_range(m), - kernel::DigitalFilter_kernel( - domain.fields.buff, - domain.fields.cur, - size, - flds_bc)); - std::swap(domain.fields.cur, domain.fields.buff); - --m; - if (m < 0 or i == nfilter - 1u) { - // refresh ghosts to distance G (and leave them valid for the - // downstream field solver after the final pass) - metadomain.CommunicateFields(domain, Comm::J); - m = G - 1; + // axis cell fixup (x2 == dimension index 1); mutually exclusive with + // the comm margin since AXIS is not a comm side + if (d == 1 and axis_hi_x2) { + return 1u; } + return 0u; + }; + if constexpr (M::Dim == Dim::_1D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0) }, + { domain.mesh.i_max(in::x1) + mh(0) }); + } else if constexpr (M::Dim == Dim::_2D) { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1) }, + { domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1) }); + } else { + return CreateRangePolicy( + { domain.mesh.i_min(in::x1) - ml(0), + domain.mesh.i_min(in::x2) - ml(1), + domain.mesh.i_min(in::x3) - ml(2) }, + { 
domain.mesh.i_max(in::x1) + mh(0), + domain.mesh.i_max(in::x2) + mh(1), + domain.mesh.i_max(in::x3) + mh(2) }); } - } else { - // Non-Cartesian (axis BCs need the +1 range fixup): keep the - // per-pass exchange cadence, still ping-ponging the buffers. - const auto range = srpic::RangeWithAxisBCs(domain); - for (auto i { 0u }; i < nfilter; ++i) { - Kokkos::parallel_for( - "CurrentsFilter", - range, - kernel::DigitalFilter_kernel( - domain.fields.buff, - domain.fields.cur, - size, - flds_bc)); - std::swap(domain.fields.cur, domain.fields.buff); + }; + int m = G - 1; + for (auto i { 0u }; i < nfilter; ++i) { + Kokkos::parallel_for( + "CurrentsFilter", + make_range(m), + kernel::DigitalFilter_kernel( + domain.fields.buff, + domain.fields.cur, + size, + flds_bc)); + std::swap(domain.fields.cur, domain.fields.buff); + --m; + if (m < 0 or i == nfilter - 1u) { + // refresh ghosts to distance G (and leave them valid for the + // downstream field solver after the final pass) metadomain.CommunicateFields(domain, Comm::J); + m = G - 1; } } } From 6e1e23e29b82e05a3c2593657925af239af36e30 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Thu, 25 Jun 2026 18:37:44 +0000 Subject: [PATCH 26/33] bugfix --- src/engines/srpic/currents.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index dbb482e83..23c57a903 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -47,6 +47,7 @@ namespace ntt { dt)); } +#if defined(TEAM_POLICY) /** * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). 
* @@ -116,6 +117,7 @@ namespace ntt { Kokkos::Experimental::contribute(cur_nc, scatter_cur); } } +#endif // TEAM_POLICY template void CurrentsDeposit(Domain& domain, From ff3ec19bcd463024fd5f78979d5135c120792228 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Thu, 25 Jun 2026 23:08:19 +0000 Subject: [PATCH 27/33] move definition of `compute_tile_offsets` to `public` so it compiles under nvcc --- src/framework/containers/particles.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index fe4a0b17f..396b5aec8 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -99,18 +99,6 @@ namespace ntt { // vendor libraries detected by CMake. TileLayout m_tile_layout {}; -#if defined(TEAM_POLICY) - // Build m_tile_layout.tile_offsets / npart_partitioned from the - // already-sorted tile-index keys. A separate member function (not a - // lambda local to SortSpatially) so the inner device kernel is not an - // extended __device__ lambda nested inside another lambda — which - // nvcc forbids. Lets the vendor path run the offsets pass and then - // release the keys before the SoA gather allocates its buffers. 
+ // NOTE: must be public — nvcc forbids an extended __host__ __device__ + // lambda inside a member function with private/protected access. + void compute_tile_offsets(const array_t& tile_indices, + ncells_t total_tiles, + npart_t npart_local); +#endif + [[nodiscard]] auto memory_footprint() const -> std::size_t { std::size_t footprint = 0; From 03ccd200e25baae4064e1a3e8e29ad5b2b0502f6 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 26 Jun 2026 01:02:05 +0000 Subject: [PATCH 28/33] added `team_policy_team_size` as a tunable runtime parameter --- input.example.toml | 8 +++++ src/engines/engine.hpp | 13 ++++--- src/engines/reporter.cpp | 15 ++++++++ src/engines/srpic/currents.h | 48 ++++++++++++++++++++++--- src/framework/parameters/algorithms.cpp | 7 ++++ src/framework/parameters/algorithms.h | 1 + src/global/defaults.h | 2 ++ 7 files changed, 85 insertions(+), 9 deletions(-) diff --git a/input.example.toml b/input.example.toml index 741d5c2d2..87ca48f3c 100644 --- a/input.example.toml +++ b/input.example.toml @@ -301,6 +301,14 @@ # @type: bool # @default: true enable = "" + # team_policy tiled-deposit work-group (team) size + # @type: uint [>= 0] + # @default: 0 + # @note: 0 keeps Kokkos::AUTO (backend occupancy heuristic); a positive + # value overrides it, clamped to the backend/scratch maximum at + # launch. Only used in `team_policy=ON` builds. 
Pick a multiple of + # the device subgroup width for best occupancy (see ideal_tile_size.py) + team_policy_team_size = "" # @inferred: # - order diff --git a/src/engines/engine.hpp b/src/engines/engine.hpp index b20e163ca..057bfcb5a 100644 --- a/src/engines/engine.hpp +++ b/src/engines/engine.hpp @@ -78,10 +78,11 @@ namespace ntt { Metadomain m_metadomain; user::PGen m_pgen; - const bool is_resuming; - const simtime_t runtime; - const real_t dt; - const timestep_t max_steps; + const bool is_resuming; + const simtime_t runtime; + const real_t dt; + const std::size_t team_policy_team_size; + const timestep_t max_steps; const timestep_t start_step; const simtime_t start_time; simtime_t time; @@ -109,6 +110,8 @@ namespace ntt { , is_resuming { m_params.get("checkpoint.is_resuming") } , runtime { m_params.get("simulation.runtime") } , dt { m_params.get("algorithms.timestep.dt") } + , team_policy_team_size { m_params.get( + "algorithms.deposit.team_policy_team_size") } , max_steps { static_cast(runtime / dt) } , start_step { m_params.get("checkpoint.start_step") } , start_time { m_params.get("checkpoint.start_time") } @@ -127,6 +130,8 @@ namespace ntt { auto parameters = prm::Parameters {}; parameters.set("dt", static_cast(dt)); parameters.set("time", static_cast(time)); + parameters.set("team_policy_team_size", + static_cast(team_policy_team_size)); return parameters; } }; diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp index 3caeda5de..fcd46fb81 100644 --- a/src/engines/reporter.cpp +++ b/src/engines/reporter.cpp @@ -34,6 +34,21 @@ namespace ntt { reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string()); #if defined(TEAM_POLICY) reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE); + #if defined(TEAM_POLICY_DRIFT) + reporter::AddParam(report, 4, "Halo drift", "%d", TEAM_POLICY_DRIFT); + #endif + if (params.template get( + "algorithms.deposit.team_policy_team_size") == 0u) { + reporter::AddParam(report, 4, "Team size", 
"%s", "AUTO (Kokkos)"); + } else { + reporter::AddParam( + report, + 4, + "Team size", + "%d (requested; clamped to backend max at launch)", + static_cast(params.template get( + "algorithms.deposit.team_policy_team_size"))); + } #endif reporter::AddParam(report, 4, "Metric", "%s", M.to_string()); #if SHAPE_ORDER == 0 diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 23c57a903..74946ed68 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -66,7 +66,8 @@ namespace ntt { void CallDepositKernelTiled(const Particles& species, const M& local_metric, const ndfield_t& cur, - real_t dt) { + real_t dt, + int team_size_req) { static_assert(O <= 11u, "Shape order must be <= 11"); constexpr unsigned short T = static_cast( TEAM_POLICY_TILE_SIZE); @@ -86,11 +87,41 @@ namespace ntt { dt, layout, species.npart() }; + const auto scratch = Kokkos::PerTeam( + decltype(deposit_kernel)::scratch_bytes()); + + // Team (work-group) size. The default (team_size_req == 0) leaves + // Kokkos::AUTO, which sizes the team from the backend occupancy + // heuristic. A positive `algorithms.deposit.team_policy_team_size` + // overrides it, clamped to the scratch/backend-feasible maximum so an + // over-large request cannot abort the launch (Kokkos errors when + // team_size > team_size_max). No portable subgroup rounding is applied; + // pick a multiple of the device subgroup width (printed per arch by + // ideal_tile_size.py) for the best occupancy. 
Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), Kokkos::AUTO); - policy.set_scratch_size( - 0, - Kokkos::PerTeam(decltype(deposit_kernel)::scratch_bytes())); + policy.set_scratch_size(0, scratch); + if (team_size_req > 0) { + const int ts_max = policy.team_size_max(deposit_kernel, + Kokkos::ParallelForTag {}); + int ts = team_size_req; + if (ts > ts_max) { + raise::Warning( + fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds " + "the tiled-deposit maximum %d on this backend; clamping " + "to %d", + team_size_req, + ts_max, + ts_max), + HERE); + ts = ts_max; + } + policy = Kokkos::TeamPolicy<>(static_cast(layout.ntiles_total), ts); + policy.set_scratch_size(0, scratch); + logger::Checkpoint( + fmt::format("Tiled deposit: explicit team size %d", ts), + HERE); + } Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); // Particles appended since the last sort (injection / MPI receive on a @@ -126,6 +157,12 @@ namespace ntt { Kokkos::deep_copy(domain.fields.cur, ZERO); #if defined(TEAM_POLICY) + // Optional runtime override for the tiled-deposit team (work-group) size; + // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the + // launcher (see CallDepositKernelTiled). + const auto team_size_req = static_cast( + engine_params.get("team_policy_team_size", + std::optional { 0u })); // Tiled deposit. 
Correctness no longer depends on the SoA being in a // "sorted" state at deposit time — the tiled kernel handles a stale @@ -176,7 +213,8 @@ namespace ntt { CallDepositKernelTiled(species, domain.mesh.metric, domain.fields.cur, - dt); + dt, + team_size_req); } } #else diff --git a/src/framework/parameters/algorithms.cpp b/src/framework/parameters/algorithms.cpp index 4766db965..87c77fbd9 100644 --- a/src/framework/parameters/algorithms.cpp +++ b/src/framework/parameters/algorithms.cpp @@ -33,6 +33,11 @@ namespace ntt { deposit_enable = toml::find_or(toml_data, "algorithms", "deposit", "enable", true); deposit_order = static_cast(SHAPE_ORDER); + deposit_team_policy_team_size = toml::find_or(toml_data, + "algorithms", + "deposit", + "team_policy_team_size", + defaults::team_policy_team_size); fieldsolver_enable = toml::find_or(toml_data, "algorithms", @@ -140,6 +145,8 @@ namespace ntt { params->set("algorithms.deposit.enable", deposit_enable.value()); params->set("algorithms.deposit.order", deposit_order.value()); + params->set("algorithms.deposit.team_policy_team_size", + deposit_team_policy_team_size.value()); params->set("algorithms.fieldsolver.enable", fieldsolver_enable.value()); for (const auto& [key, value] : fieldsolver_stencil_coeffs.value()) { diff --git a/src/framework/parameters/algorithms.h b/src/framework/parameters/algorithms.h index 97edb244a..c46f480fe 100644 --- a/src/framework/parameters/algorithms.h +++ b/src/framework/parameters/algorithms.h @@ -34,6 +34,7 @@ namespace ntt { std::optional deposit_enable; std::optional deposit_order; + std::optional deposit_team_policy_team_size; std::optional fieldsolver_enable; std::optional> fieldsolver_stencil_coeffs; diff --git a/src/global/defaults.h b/src/global/defaults.h index e1387677e..c4f9b67f4 100644 --- a/src/global/defaults.h +++ b/src/global/defaults.h @@ -22,6 +22,8 @@ namespace ntt::defaults { const unsigned short current_filters = 0; + const std::size_t team_policy_team_size = 0; + const 
std::string em_pusher = "Boris"; const std::string ph_pusher = "Photon"; const timestep_t clear_interval = 100; From ca7307dd7ee3b19e0997dedbc6787131ed9556ed Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 26 Jun 2026 02:38:14 +0000 Subject: [PATCH 29/33] add script to compute ideal tile size --- ideal_tile_size.py | 955 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 955 insertions(+) create mode 100644 ideal_tile_size.py diff --git a/ideal_tile_size.py b/ideal_tile_size.py new file mode 100644 index 000000000..e216f5981 --- /dev/null +++ b/ideal_tile_size.py @@ -0,0 +1,955 @@ +#!/usr/bin/env python3 +"""Recommend the team tile size (T_TILE) for entity's tiled current-deposit kernel. + +Each GPU work-group (team) owns a TE^dim scratch tile in shared memory (SLM on Intel, +LDS on AMD, shared mem on NVIDIA), accumulates its particles' currents into it, then +flushes once to global memory. With + + TE = T_TILE + 2*HALO + HALO = stencil_reach + drift (stencil_reach = shape_order for Esirkepov, 2 for the + O==0 zigzag deposit; drift = the compile-time + `team_policy_drift` CMake knob, NOT the runtime + spatial_sorting_interval -- see currents_deposit.hpp) + +the tile size is squeezed by three competing pressures: + + * Shared-memory capacity (HARD): TE^dim * ncomp * sizeof(real) must fit, ideally with + several work-groups resident per compute unit so latency is hidden. Binds in 3D / + double precision / on AMD's 64 KiB LDS. + * Halo overhead (push LARGER): zero-fill + flush sweep the whole TE^dim tile, so a tiny + tile is almost all halo (1-(T/TE)^dim wasted). Big HALO (infrequent sorts) makes this + worse and forces larger tiles. + * Particles per tile (push SMALLER): ppc*T^dim particles all atomic-add into one fixed + scratch tile -> SLM-atomic contention and load imbalance grow with tile size. 
+ +Recommendation = the largest tile that respects the particle budget and shared-memory +residency; if that tile would be mostly halo, it is grown (toward lower halo) up to the +shared-memory limit. This is a first-order model -- confirm by sweeping the entity knobs + -D team_policy_tile_size= -D team_policy_drift= and re-profiling (see roofline/). +The team (work-group) size defaults to Kokkos::AUTO; override it at runtime with the + [algorithms.deposit] team_policy_team_size = (0 = AUTO) +toml knob -- clamped to the backend maximum at launch (engines/srpic/currents.h). + +Two ways to drive it: + + * Interactive TUI (no arguments): + python ideal_tile_size.py + + * Scriptable CLI (any argument): + python ideal_tile_size.py pvc --dim 2 --ppc 16 + python ideal_tile_size.py amd --dim 3 --ppc 64 + python ideal_tile_size.py all --dim 2 --ppc 16 --drift 4 # infrequent sorting +""" +import argparse +import curses +import os +import sys +from typing import Callable, List, Optional, Tuple + +# Shared-memory budget is the load-bearing number. Per compute unit (Xe-core / SM / CU); +# smem_wg_max is the largest single-work-group allocation. 
+ARCH = { + "pvc": dict(label="Intel Data Center GPU Max 1550 (PVC, Xe-HPC), per tile", + smem_cu=128 * 1024, smem_wg_max=128 * 1024, + subgroup=32, max_wg=1024, n_cu=64, cu="Xe-core"), + "a100": dict(label="NVIDIA A100 (Ampere)", + smem_cu=164 * 1024, smem_wg_max=163 * 1024, + subgroup=32, max_wg=1024, n_cu=108, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "h100": dict(label="NVIDIA H100 (Hopper)", + smem_cu=228 * 1024, smem_wg_max=227 * 1024, + subgroup=32, max_wg=1024, n_cu=132, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "gh200": dict(label="NVIDIA GH200 Grace Hopper (Hopper H100/H200 GPU)", + smem_cu=228 * 1024, smem_wg_max=227 * 1024, + subgroup=32, max_wg=1024, n_cu=132, cu="SM", + note="shared mem >48 KiB/block needs opt-in (cudaFuncAttributeMaxDynamicSharedMemorySize)"), + "mi250x": dict(label="AMD Instinct MI250X (CDNA2), per GCD", + smem_cu=64 * 1024, smem_wg_max=64 * 1024, + subgroup=64, max_wg=1024, n_cu=110, cu="CU"), + "mi300x": dict(label="AMD Instinct MI300X (CDNA3)", + smem_cu=64 * 1024, smem_wg_max=64 * 1024, + subgroup=64, max_wg=1024, n_cu=304, cu="CU"), +} +ALIAS = {"nvidia": "h100", "amd": "mi300x", "intel": "pvc", "mi250": "mi250x", "mi300": "mi300x", + "gracehopper": "gh200", "grace-hopper": "gh200", "gh200x": "gh200"} +PRECISION = {"single": 4, "double": 8} + +# arch choices offered in the TUI (canonical keys plus the meta-target "all") +ARCH_CHOICES = list(ARCH.keys()) + ["all"] +# "all" expands to one representative of each vendor (matches the CLI behaviour) +ALL_ARCHS = ["pvc", "nvidia", "amd"] + + +# ============================ +# core model (shared by TUI + CLI) +# ============================ + +class Settings: + """Tunable inputs for the tile-size model. 
+ + Attribute names match the argparse dest names so `recommend`/`report_lines` accept + either a Settings instance (TUI) or an argparse namespace (CLI) interchangeably. + """ + + def __init__(self): + self.arch = "pvc" # an ARCH key or "all" + self.dim = 2 # 1 / 2 / 3 + self.ppc = 16.0 # particles per cell (per species) + self.shape_order = 2 # entity shape_order + self.precision = "single" # single / double + self.components = 3 # current-field components (J has 3) + self.drift = 1 # team_policy_drift: cells of drift the scratch halo absorbs + # (compile-time CMake knob, independent of spatial_sorting_interval) + self.target_resident = 2 # work-groups resident per compute unit + self.npart_cap = 1600.0 # particle-per-tile budget (contention / load-balance proxy) + self.halo_max = 0.70 # halo fraction above which the tile is grown + self.grid = 0 # cells per dim (0 disables the GPU-fill check) + self.balance_factor = 4 # min tiles per compute unit + self.min_tile = 4 # entity's team_policy_tile_sizes list starts at 4 + self.max_tile = 64 + + +def resolve_arch(name): + key = ALIAS.get(name.lower(), name.lower()) + if key not in ARCH: + raise SystemExit("unknown arch '%s'; choose from %s (or aliases %s)" + % (name, ", ".join(ARCH), ", ".join(ALIAS))) + return key, ARCH[key] + + +def recommend(hw, p): + """p: Settings or argparse namespace. Returns dict with rows, chosen row, binding.""" + # Matches DepositCurrentsTiled_kernel: STENCIL_REACH = O for Esirkepov (O>=1), + # 2 for the O==0 zigzag deposit; HALO = STENCIL_REACH + TEAM_POLICY_DRIFT. 
+ stencil_reach = 2 if p.shape_order == 0 else p.shape_order + halo = stencil_reach + p.drift + real = PRECISION[p.precision] + atoms_pp = p.components * (p.shape_order + 1) ** p.dim # ~ useful atomics / particle + + rows = [] + for T in range(p.min_tile, p.max_tile + 1, 2): # entity uses even tile sizes + TE = T + 2 * halo + scratch = TE ** p.dim * p.components * real + npart = p.ppc * T ** p.dim + halo_frac = 1.0 - (T / TE) ** p.dim + ovhd = (2.0 * p.components / p.ppc) * (TE / T) ** p.dim / atoms_pp # zero+flush vs deposit + resident = int(hw["smem_cu"] // scratch) if scratch else 0 + ntiles = (p.grid / T) ** p.dim if p.grid else None + rows.append(dict(T=T, TE=TE, scratch=scratch, npart=npart, halo_frac=halo_frac, + ovhd=ovhd, resident=resident, ntiles=ntiles, + ok_cap=scratch <= hw["smem_wg_max"])) + + capfeas = [r for r in rows if r["ok_cap"]] + if not capfeas: + return dict(halo=halo, rows=rows, chosen=None, binding=None, grown=False) + + def largest(pred, default): + ts = [r["T"] for r in capfeas if pred(r)] + return max(ts) if ts else default + + T_cap = max(r["T"] for r in capfeas) + T_res = largest(lambda r: r["resident"] >= p.target_resident, p.min_tile) + T_np = largest(lambda r: r["npart"] <= p.npart_cap, p.min_tile) + T_bal = largest(lambda r: r["ntiles"] is None or r["ntiles"] >= p.balance_factor * hw["n_cu"], p.min_tile) + bounds = {"shared-memory capacity": T_cap, "shared-memory residency": T_res, + "GPU fill (too few tiles)": T_bal, "particle budget per tile": T_np} + + chosen_T = max(min(bounds.values()), p.min_tile) + binding = min(bounds, key=lambda k: bounds[k]) + + # If the particle-budget pick is mostly halo, grow the tile to cut halo, but never + # past what shared memory / GPU-fill allow (that just trades halo for contention). 
+ grown = False + cur = next(r for r in capfeas if r["T"] == chosen_T) + if cur["halo_frac"] > p.halo_max: + halo_ok = [r["T"] for r in capfeas if r["halo_frac"] <= p.halo_max] + ceil_T = min(T_res, T_bal, T_cap) + if halo_ok: + target = max(min(halo_ok), chosen_T) # smallest tile that clears halo_max + new_T = min(max(target, chosen_T), ceil_T) + if new_T > chosen_T: + chosen_T, grown = new_T, True + binding = ("halo overhead" if min(halo_ok) <= ceil_T + else min({k: v for k, v in bounds.items() + if k != "particle budget per tile"}, + key=lambda k: bounds[k])) + else: + new_T = min(ceil_T, T_cap) # can't clear halo_max at all -> go as big as SLM allows + if new_T > chosen_T: + chosen_T, grown = new_T, True + binding = "shared-memory residency" + + chosen = next(r for r in capfeas if r["T"] == chosen_T) + return dict(halo=halo, rows=rows, chosen=chosen, binding=binding, grown=grown, atoms_pp=atoms_pp) + + +def kib(b): + return "%.1f" % (b / 1024.0) + + +def report_lines(name, key, hw, p, res): + """Build the recommendation report as a list of text lines (no printing).""" + L = [] + L.append("=" * 80) + L.append("%s [preset: %s]" % (hw["label"], key)) + L.append(" dim=%d ppc=%g shape_order=%d precision=%s(%dB) J-components=%d" + % (p.dim, p.ppc, p.shape_order, p.precision, PRECISION[p.precision], p.components)) + reach = 2 if p.shape_order == 0 else p.shape_order + reach_kind = "zigzag" if p.shape_order == 0 else "Esirkepov O" + L.append(" HALO = stencil_reach + drift = %d + %d = %d -> TE = T_TILE + %d" + " (reach %d = %s; drift = team_policy_drift)" + % (reach, p.drift, res["halo"], 2 * res["halo"], reach, reach_kind)) + L.append(" shared mem %s KiB/%s (budget %s KiB for %d resident WGs); subgroup=%d, n_cu=%d" + % (kib(hw["smem_cu"]), hw["cu"], kib(hw["smem_cu"] / p.target_resident), + p.target_resident, hw["subgroup"], hw["n_cu"])) + if hw.get("note"): + L.append(" note: %s" % hw["note"]) + L.append("-" * 80) + L.append(" T_TILE TE scratch resWG part/tile halo% 
zero+flush%") + for r in res["rows"]: + if not r["ok_cap"]: + continue + mark = " <== recommended" if r is res["chosen"] else "" + L.append(" %3d %4d %6s K %4d %9.0f %4.0f %7.1f%s" + % (r["T"], r["TE"], kib(r["scratch"]), r["resident"], r["npart"], + 100 * r["halo_frac"], 100 * r["ovhd"], mark)) + L.append("-" * 80) + + c = res["chosen"] + if c is None: + L.append(" INFEASIBLE: even T_TILE=%d does not fit %s KiB shared memory." + % (p.min_tile, kib(hw["smem_wg_max"]))) + L.append(" -> sort more often (smaller drift), use precision single, lower shape_order,") + L.append(" or use a non-tiled (global-atomic / ScatterView) deposit on this arch.") + return L + L.append(" RECOMMENDED T_TILE = %d (limited by: %s%s)" + % (c["T"], res["binding"], "; tile grown to reduce halo" if res["grown"] else "")) + L.append(" %.1f KiB scratch/team, %d work-groups resident/%s, %.0f particles/team, %.0f%% halo" + % (c["scratch"] / 1024.0, c["resident"], hw["cu"], c["npart"], 100 * c["halo_frac"])) + team = min(hw["max_wg"], 256 - 256 % hw["subgroup"]) + extra = "" if c["T"] <= 16 else " (entity's team_policy_tile_sizes list stops at 16; extend it)" + L.append(" entity build: -D team_policy=ON -D team_policy_tile_size=%d -D team_policy_drift=%d%s" + % (min(c["T"], 16), p.drift, extra)) + L.append(" team (work-group) size: Kokkos::AUTO by default; to override, set in the toml") + L.append(" [algorithms.deposit] team_policy_team_size = %d (0 = AUTO; keep a multiple of" + % team) + L.append(" subgroup=%d), then sweep around it and re-profile" % hw["subgroup"]) + # contextual guidance + if c["halo_frac"] > p.halo_max: + if p.drift > 1: + L.append(" !! %.0f%% of the tile is halo, inflated by team_policy_drift=%d; lower it " + "(and sort at least that often via spatial_sorting_interval)" + % (100 * c["halo_frac"], p.drift)) + else: + L.append(" !! %.0f%% halo is intrinsic at this size (shared memory caps the tile here)" + % (100 * c["halo_frac"])) + if c["npart"] > p.npart_cap: + L.append(" !! 
%.0f particles/team exceeds the %.0f budget -> watch SLM-atomic contention" + % (c["npart"], p.npart_cap)) + if c["resident"] < p.target_resident: + L.append(" !! only %d work-group(s) resident/%s -> limited latency hiding" + % (c["resident"], hw["cu"])) + return L + + +def archs_for(name): + """Expand a setting/arg value into the list of arch names to report on.""" + return ALL_ARCHS if name.lower() == "all" else [name] + + +def build_report(p): + """Run the model for the selected arch(es) and return the full report as lines.""" + lines = [] + for a in archs_for(p.arch): + try: + key, hw = resolve_arch(a) + except SystemExit as e: + lines.append(str(e)) + continue + lines.extend(report_lines(a, key, hw, p, recommend(hw, p))) + lines.append("=" * 80) + return lines + + +# ============================ +# colors: edit these +# ============================ + +COLOR_TITLE_FG = curses.COLOR_BLUE +COLOR_TEXT_FG = curses.COLOR_WHITE +COLOR_SELECTED_FG = curses.COLOR_WHITE +COLOR_SELECTED_BG = curses.COLOR_BLACK +COLOR_HINT_FG = curses.COLOR_YELLOW +COLOR_OK_FG = curses.COLOR_GREEN +COLOR_ERR_FG = curses.COLOR_RED +COLOR_KEY_FG = curses.COLOR_MAGENTA +COLOR_DIM_FG = curses.COLOR_CYAN + +PAIR_TITLE = 1 +PAIR_TEXT = 2 +PAIR_SELECTED = 3 +PAIR_HINT = 4 +PAIR_OK = 5 +PAIR_ERR = 6 +PAIR_KEY = 7 +PAIR_DIM = 8 + + +class MenuItem: + def __init__(self, label, hint="", right=None, on_enter=None, + on_space=None, disabled=None): + self.label = label + self.hint = hint + self.right = right + self.on_enter = on_enter + self.on_space = on_space + self.disabled = disabled + + +# ============================ +# TUI +# ============================ + +class App: + def __init__(self, stdscr): + self.stdscr = stdscr + self.s = Settings() + + self.state = "mainmenu" + self.stack: List[Tuple[str, int]] = [] + self.selected = 0 + self.scroll = 0 + self.message = "use arrows or j/k" + + self._init_curses() + + def _init_curses(self) -> None: + curses.curs_set(0) + self.stdscr.keypad(True) + 
curses.noecho() + curses.cbreak() + + if curses.has_colors(): + curses.start_color() + curses.use_default_colors() + curses.init_pair(PAIR_TITLE, COLOR_TITLE_FG, -1) + curses.init_pair(PAIR_TEXT, COLOR_TEXT_FG, -1) + curses.init_pair(PAIR_SELECTED, COLOR_SELECTED_FG, COLOR_SELECTED_BG) + curses.init_pair(PAIR_HINT, COLOR_HINT_FG, -1) + curses.init_pair(PAIR_OK, COLOR_OK_FG, -1) + curses.init_pair(PAIR_ERR, COLOR_ERR_FG, -1) + curses.init_pair(PAIR_KEY, COLOR_KEY_FG, -1) + curses.init_pair(PAIR_DIM, COLOR_DIM_FG, -1) + + def cp(self, pair_id: int) -> int: + return curses.color_pair(pair_id) if curses.has_colors() else 0 + + # ----- formatting helpers ----- + + def arch_label(self) -> str: + if self.s.arch == "all": + return "all (%s)" % " + ".join(ALL_ARCHS) + try: + return ARCH[resolve_arch(self.s.arch)[0]]["label"] + except SystemExit: + return self.s.arch + + # ----- nav stack ----- + + def push(self, st: str) -> None: + self.stack.append((self.state, self.selected)) + self.state = st + self.selected = 0 + self.scroll = 0 + self.message = "" + + def pop(self) -> None: + if self.stack: + self.state, self.selected = self.stack.pop() + else: + self.state, self.selected = "mainmenu", 0 + self.scroll = 0 + self.message = "" + + # ----- drawing ----- + + def add(self, y: int, x: int, s: str, attr: int = 0) -> None: + try: + self.stdscr.addstr(y, x, s, attr) + except curses.error: + pass + + def hline(self, y: int) -> None: + _, w = self.stdscr.getmaxyx() + try: + self.stdscr.hline(y, 0, curses.ACS_HLINE, max(0, w - 1)) + except curses.error: + pass + + def draw_keybar(self, y: int, x: int, pairs: List[Tuple[str, str]]) -> None: + cur_x = x + for key, action in pairs: + self.add(y, cur_x, key, self.cp(PAIR_KEY) | curses.A_BOLD) + cur_x += len(key) + self.add(y, cur_x, " ", self.cp(PAIR_DIM)) + cur_x += 1 + self.add(y, cur_x, action, self.cp(PAIR_HINT)) + cur_x += len(action) + self.add(y, cur_x, " ", self.cp(PAIR_DIM)) + cur_x += 3 + + def breadcrumb(self) -> str: + 
return { + "mainmenu": "mainmenu", + "arch": "mainmenu > architecture", + "physics": "mainmenu > physics & particles", + "tuning": "mainmenu > tuning knobs", + }.get(self.state, "mainmenu") + + def draw_menu(self, title: str, prompt: str, items: List[MenuItem]) -> None: + self.stdscr.erase() + h, w = self.stdscr.getmaxyx() + + self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD) + bc = self.breadcrumb() + self.add(0, max(2, w - 2 - len(bc)), bc, self.cp(PAIR_DIM)) + + self.draw_keybar( + 1, + 2, + [ + ("up/dn/j/k", "move"), + ("enter", "select"), + ("space", "toggle/cycle"), + ("b", "back"), + ("q", "quit"), + ], + ) + self.hline(2) + + status1 = ("arch: %s dim: %d precision: %s ppc: %g" + % (self.s.arch, self.s.dim, self.s.precision, self.s.ppc)) + status2 = ("shape_order: %d drift: %d components: %d tile range: %d-%d" + % (self.s.shape_order, self.s.drift, self.s.components, + self.s.min_tile, self.s.max_tile)) + self.add(3, 2, status1[: w - 4], self.cp(PAIR_TEXT)) + self.add(4, 2, status2[: w - 4], self.cp(PAIR_TEXT)) + self.hline(5) + + self.add(6, 2, prompt[: w - 4], self.cp(PAIR_TEXT) | curses.A_BOLD) + + list_y = 8 + footer_h = 3 + view_h = max(1, h - list_y - footer_h) + n = len(items) + + if n == 0: + self.add(list_y, 2, "(empty)", self.cp(PAIR_HINT)) + else: + self.selected = max(0, min(self.selected, n - 1)) + + if self.selected < self.scroll: + self.scroll = self.selected + if self.selected >= self.scroll + view_h: + self.scroll = self.selected - view_h + 1 + self.scroll = max(0, min(self.scroll, max(0, n - view_h))) + + shown = items[self.scroll : self.scroll + view_h] + + for i, it in enumerate(shown): + idx = self.scroll + i + sel = idx == self.selected + dis = bool(it.disabled and it.disabled()) + + row_attr = ( + self.cp(PAIR_SELECTED) | curses.A_BOLD + if sel + else (self.cp(PAIR_DIM) if dis else self.cp(PAIR_TEXT)) + ) + self.add(list_y + i, 2, (" %s" % it.label)[: w - 4], row_attr) + + if it.right: + rt = (it.right() or "").strip() + if 
rt: + rt = rt[: max(0, w - 6)] + x = max(2, w - 2 - len(rt)) + rt_attr = ( + row_attr + if sel + else (self.cp(PAIR_HINT) if not dis else self.cp(PAIR_DIM)) + ) + self.add(list_y + i, x, rt, rt_attr) + + if sel and it.hint: + self.add( + list_y + i, + min(w - 4, 30), + (" %s" % it.hint)[: w - 4], + self.cp(PAIR_HINT), + ) + + self.hline(h - 3) + msg = self.message or "" + if msg: + is_err = msg.startswith("error") + attr = (self.cp(PAIR_ERR) if is_err else self.cp(PAIR_OK)) | curses.A_BOLD + self.add(h - 2, 2, msg[: w - 4], attr) + self.stdscr.refresh() + + # ----- modals ----- + + def input_box(self, title: str, prompt: str, initial: str) -> Optional[str]: + h, w = self.stdscr.getmaxyx() + win_h, win_w = 9, min(86, max(46, w - 6)) + top, left = max(0, (h - win_h) // 2), max(0, (w - win_w) // 2) + + win = curses.newwin(win_h, win_w, top, left) + win.keypad(True) + win.border() + + win.addstr(1, 2, title[: win_w - 4], self.cp(PAIR_TITLE) | curses.A_BOLD) + win.addstr(2, 2, prompt[: win_w - 4], self.cp(PAIR_TEXT)) + + buf = list(initial) + curses.curs_set(1) + + while True: + win.addstr(4, 2, " " * (win_w - 4), self.cp(PAIR_TEXT)) + text = "".join(buf) + if len(text) > win_w - 4: + text = text[-(win_w - 4) :] + win.addstr(4, 2, text, self.cp(PAIR_TEXT) | curses.A_BOLD) + win.addstr(6, 2, "enter=ok esc=cancel", self.cp(PAIR_DIM)) + win.refresh() + + ch = win.getch() + if ch == 27: + curses.curs_set(0) + return None + if ch in (curses.KEY_ENTER, 10, 13): + curses.curs_set(0) + return "".join(buf).strip() + if ch in (curses.KEY_BACKSPACE, 127, 8): + if buf: + buf.pop() + elif 32 <= ch <= 126: + buf.append(chr(ch)) + + # ----- value editors ----- + + def cycle_attr(self, attr: str, options: list) -> None: + cur = getattr(self.s, attr) + if cur not in options: + setattr(self.s, attr, options[0]) + else: + setattr(self.s, attr, options[(options.index(cur) + 1) % len(options)]) + + def edit_int(self, label: str, attr: str, minv: Optional[int] = None) -> None: + val = 
self.input_box(label, "enter an integer:", str(getattr(self.s, attr))) + if val is None or val == "": + return + try: + n = int(val) + except ValueError: + self.message = "error: '%s' is not an integer" % val + return + if minv is not None and n < minv: + self.message = "error: %s must be >= %d" % (label, minv) + return + setattr(self.s, attr, n) + self.message = "%s = %d" % (label, n) + + def edit_float(self, label: str, attr: str, + minv: Optional[float] = None, maxv: Optional[float] = None) -> None: + val = self.input_box(label, "enter a number:", str(getattr(self.s, attr))) + if val is None or val == "": + return + try: + x = float(val) + except ValueError: + self.message = "error: '%s' is not a number" % val + return + if minv is not None and x < minv: + self.message = "error: %s must be >= %g" % (label, minv) + return + if maxv is not None and x > maxv: + self.message = "error: %s must be <= %g" % (label, maxv) + return + setattr(self.s, attr, x) + self.message = "%s = %g" % (label, x) + + # ----- report pager ----- + + def _pager_attr(self, ln: str) -> int: + s = ln.strip() + if "RECOMMENDED" in ln: + return self.cp(PAIR_OK) | curses.A_BOLD + if "<== recommended" in ln: + return self.cp(PAIR_OK) + if "INFEASIBLE" in ln or "unknown arch" in ln: + return self.cp(PAIR_ERR) | curses.A_BOLD + if s.startswith("!!"): + return self.cp(PAIR_HINT) + if s.startswith("T_TILE"): + return self.cp(PAIR_TITLE) | curses.A_BOLD + if set(s) <= {"=", "-"} and s: + return self.cp(PAIR_DIM) + return self.cp(PAIR_TEXT) + + def pager(self, title: str, lines: List[str]) -> None: + top = 0 + note = "" + while True: + self.stdscr.erase() + h, w = self.stdscr.getmaxyx() + + self.add(0, 2, title, self.cp(PAIR_TITLE) | curses.A_BOLD) + self.draw_keybar( + 1, + 2, + [ + ("up/dn/j/k", "scroll"), + ("PgUp/PgDn", "page"), + ("g/G", "top/end"), + ("w", "save"), + ("b/q", "back"), + ], + ) + self.hline(2) + + list_y = 4 + view_h = max(1, h - list_y - 2) + n = len(lines) + top = max(0, min(top, 
max(0, n - view_h))) + + for i, ln in enumerate(lines[top : top + view_h]): + self.add(list_y + i, 2, ln[: w - 3], self._pager_attr(ln)) + + self.hline(h - 2) + footer = "line %d-%d / %d" % (top + 1, min(top + view_h, n), n) + if note: + footer += " " + note + self.add(h - 1, 2, footer[: w - 4], self.cp(PAIR_DIM)) + self.stdscr.refresh() + + ch = self.stdscr.getch() + if ch in (ord("q"), ord("Q"), ord("b"), 8, 127): + return + if ch in (curses.KEY_UP, ord("k"), ord("K")): + top -= 1 + elif ch in (curses.KEY_DOWN, ord("j"), ord("J")): + top += 1 + elif ch in (curses.KEY_PPAGE,): + top -= view_h + elif ch in (curses.KEY_NPAGE, ord(" ")): + top += view_h + elif ch in (ord("g"),): + top = 0 + elif ch in (ord("G"),): + top = n + elif ch in (ord("w"), ord("W")): + note = self._save_report(lines) + + def _save_report(self, lines: List[str]) -> str: + path = os.path.join(os.getcwd(), "ideal_tile_size_report.txt") + try: + with open(path, "w") as f: + f.write("\n".join(lines) + "\n") + return "saved to %s" % path + except OSError as e: + return "save failed: %s" % e + + def do_compute(self) -> None: + self.pager("recommendation [%s]" % self.s.arch, build_report(self.s)) + + def reset(self) -> None: + self.s = Settings() + self.message = "reset to defaults" + + # ----- menus ----- + + def menu_main(self) -> Tuple[str, str, List[MenuItem]]: + return ( + "entity tile-size advisor", + "main menu:", + [ + MenuItem( + "architecture", + "choose the target GPU (or 'all')", + right=self.arch_label, + on_enter=lambda: self.push("arch"), + ), + MenuItem( + "physics & particles", + "dim, ppc, shape order, precision, ...", + right=lambda: "dim %d / ppc %g / so %d / %s" + % (self.s.dim, self.s.ppc, self.s.shape_order, self.s.precision), + on_enter=lambda: self.push("physics"), + ), + MenuItem( + "tuning knobs", + "residency, budgets, tile range, ...", + right=lambda: "res %d / cap %g / halo %.2f" + % (self.s.target_resident, self.s.npart_cap, self.s.halo_max), + on_enter=lambda: 
self.push("tuning"), + ), + MenuItem( + "compute recommendation", + "run the model and show the report", + on_enter=self.do_compute, + ), + MenuItem("reset to defaults", "", on_enter=self.reset), + MenuItem("exit", "", on_enter=lambda: setattr(self, "state", "exit")), + ], + ) + + def menu_arch(self) -> Tuple[str, str, List[MenuItem]]: + def choose(name: str): + self.s.arch = name + self.pop() + + def label(name: str) -> str: + mark = "(*)" if name == self.s.arch else "( )" + return "%s %s" % (mark, name) + + def right(name: str) -> str: + if name == "all": + return " + ".join(ALL_ARCHS) + return ARCH[name]["label"] + + items = [ + MenuItem( + label(a), + "select target", + right=(lambda a=a: right(a)), + on_enter=(lambda a=a: choose(a)), + ) + for a in ARCH_CHOICES + ] + items.append(MenuItem("back", "return", on_enter=self.pop)) + return ("architecture", "pick the target GPU:", items) + + def menu_physics(self) -> Tuple[str, str, List[MenuItem]]: + def cyc_dim(): + self.cycle_attr("dim", [1, 2, 3]) + + def cyc_prec(): + self.cycle_attr("precision", ["single", "double"]) + + return ( + "physics & particles", + "set physical inputs:", + [ + MenuItem( + "dim", + "space cycles: 1 / 2 / 3", + right=lambda: str(self.s.dim), + on_enter=cyc_dim, + on_space=cyc_dim, + ), + MenuItem( + "ppc", + "particles per cell (per species)", + right=lambda: "%g" % self.s.ppc, + on_enter=lambda: self.edit_float("ppc", "ppc", minv=0.0), + ), + MenuItem( + "shape_order", + "entity particle shape order", + right=lambda: str(self.s.shape_order), + on_enter=lambda: self.edit_int("shape_order", "shape_order", minv=0), + ), + MenuItem( + "precision", + "space cycles: single / double", + right=lambda: self.s.precision, + on_enter=cyc_prec, + on_space=cyc_prec, + ), + MenuItem( + "components", + "current-field components (J has 3)", + right=lambda: str(self.s.components), + on_enter=lambda: self.edit_int("components", "components", minv=1), + ), + MenuItem( + "drift", + "team_policy_drift CMake 
knob: cells the scratch halo absorbs (>= spatial_sorting_interval)", + right=lambda: str(self.s.drift), + on_enter=lambda: self.edit_int("drift", "drift", minv=0), + ), + MenuItem("back", "return", on_enter=self.pop), + ], + ) + + def menu_tuning(self) -> Tuple[str, str, List[MenuItem]]: + return ( + "tuning knobs", + "set the model's budgets and ranges:", + [ + MenuItem( + "target_resident", + "work-groups resident per compute unit", + right=lambda: str(self.s.target_resident), + on_enter=lambda: self.edit_int("target_resident", "target_resident", minv=1), + ), + MenuItem( + "npart_cap", + "particle-per-tile budget (contention proxy)", + right=lambda: "%g" % self.s.npart_cap, + on_enter=lambda: self.edit_float("npart_cap", "npart_cap", minv=1.0), + ), + MenuItem( + "halo_max", + "halo fraction above which the tile is grown (0..1)", + right=lambda: "%.2f" % self.s.halo_max, + on_enter=lambda: self.edit_float("halo_max", "halo_max", minv=0.0, maxv=1.0), + ), + MenuItem( + "grid", + "cells per dim (0 disables GPU-fill check)", + right=lambda: str(self.s.grid), + on_enter=lambda: self.edit_int("grid", "grid", minv=0), + ), + MenuItem( + "balance_factor", + "min tiles per compute unit", + right=lambda: str(self.s.balance_factor), + on_enter=lambda: self.edit_int("balance_factor", "balance_factor", minv=1), + ), + MenuItem( + "min_tile", + "smallest T_TILE to consider", + right=lambda: str(self.s.min_tile), + on_enter=lambda: self.edit_int("min_tile", "min_tile", minv=1), + ), + MenuItem( + "max_tile", + "largest T_TILE to consider", + right=lambda: str(self.s.max_tile), + on_enter=lambda: self.edit_int("max_tile", "max_tile", minv=1), + ), + MenuItem("back", "return", on_enter=self.pop), + ], + ) + + def get_menu(self) -> Tuple[str, str, List[MenuItem]]: + if self.state == "mainmenu": + return self.menu_main() + if self.state == "arch": + return self.menu_arch() + if self.state == "physics": + return self.menu_physics() + if self.state == "tuning": + return 
self.menu_tuning() + self.state = "mainmenu" + return self.menu_main() + + # ----- navigation ----- + + def is_disabled(self, it: MenuItem) -> bool: + return bool(it.disabled and it.disabled()) + + def move_sel(self, items: List[MenuItem], delta: int) -> None: + if not items: + return + n = len(items) + start = self.selected + for _ in range(n): + self.selected = (self.selected + delta) % n + if not self.is_disabled(items[self.selected]): + return + self.selected = start + + def activate(self, items: List[MenuItem], enter: bool) -> None: + if not items: + return + it = items[self.selected] + if self.is_disabled(it): + self.message = "error: option disabled." + return + fn = it.on_enter if enter else it.on_space + if fn: + fn() + + # ----- loop ----- + + def run(self) -> None: + while True: + if self.state == "exit": + return + + title, prompt, items = self.get_menu() + self.draw_menu(title, prompt, items) + + ch = self.stdscr.getch() + + if ch in (ord("q"), ord("Q")): + self.state = "exit" + continue + if ch in (ord("b"), 8, 127): + self.pop() + continue + if ch in (curses.KEY_UP, ord("k"), ord("K")): + self.move_sel(items, -1) + continue + if ch in (curses.KEY_DOWN, ord("j"), ord("J")): + self.move_sel(items, +1) + continue + if ch in (curses.KEY_ENTER, 10, 13): + self.activate(items, enter=True) + continue + if ch == ord(" "): + self.activate(items, enter=False) + continue + + +def run_tui() -> int: + try: + curses.wrapper(lambda stdscr: App(stdscr).run()) + except KeyboardInterrupt: + return 130 + return 0 + + +# ============================ +# CLI (preserved for scripting / sweeps) +# ============================ + +def run_cli(argv) -> int: + ap = argparse.ArgumentParser(description="Recommend entity team tile size (T_TILE).") + ap.add_argument("arch", help="pvc | nvidia | amd (or a100/h100/gh200/mi250x/mi300x, or 'all')") + ap.add_argument("--dim", type=int, default=2, choices=(1, 2, 3)) + ap.add_argument("--ppc", type=float, default=16.0, help="particles per 
cell (per species)") + ap.add_argument("--shape-order", type=int, default=2, help="particle shape order (entity shape_order)") + ap.add_argument("--precision", choices=("single", "double"), default="single") + ap.add_argument("--components", type=int, default=3, help="current-field components (J has 3)") + ap.add_argument("--drift", type=int, default=1, + help="team_policy_drift CMake knob (compile-time): cells of drift the scratch " + "halo absorbs; size it >= spatial_sorting_interval") + ap.add_argument("--target-resident", type=int, default=2, help="work-groups resident per compute unit") + ap.add_argument("--npart-cap", type=float, default=1600, + help="particle-per-tile budget (SLM-atomic-contention / load-balance proxy)") + ap.add_argument("--halo-max", type=float, default=0.70, help="halo fraction above which the tile is grown") + ap.add_argument("--grid", type=int, default=0, help="cells per dim (optional; enables a GPU-fill check)") + ap.add_argument("--balance-factor", type=int, default=4, help="min tiles per compute unit") + ap.add_argument("--min-tile", type=int, default=4, + help="smallest T_TILE to consider (entity's team_policy_tile_sizes starts at 4)") + ap.add_argument("--max-tile", type=int, default=64) + p = ap.parse_args(argv) + + for ln in build_report(p): + print(ln) + return 0 + + +def main() -> int: + # no arguments -> interactive TUI; any argument -> scriptable CLI + if len(sys.argv) > 1: + return run_cli(sys.argv[1:]) + return run_tui() + + +if __name__ == "__main__": + raise SystemExit(main()) From 61ebebfffdf38c91456311087b94b96a20917d99 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 26 Jun 2026 03:03:23 +0000 Subject: [PATCH 30/33] team policy for GRPIC --- src/engines/grpic/currents.h | 228 ++++++++++++++++++++++++++++++----- src/engines/grpic/grpic.hpp | 3 +- 2 files changed, 200 insertions(+), 31 deletions(-) diff --git a/src/engines/grpic/currents.h b/src/engines/grpic/currents.h index 11c847533..cb4032f54 100644 --- 
a/src/engines/grpic/currents.h +++ b/src/engines/grpic/currents.h @@ -2,6 +2,8 @@ * @file engines/grpic/currents.h * @brief Current deposition and filtering routines for the GRPIC engine * @implements + * - ntt::grpic::CallDepositKernel<> -> void (flat path) + * - ntt::grpic::CallDepositKernelTiled<> -> void (TEAM_POLICY) * - ntt::grpic::CurrentsDeposit<> -> void * - ntt::grpic::CurrentsFilter<> -> void * @namespaces: @@ -12,8 +14,11 @@ #define ENGINES_GRPIC_CURRENTS_H #include "enums.h" +#include "global.h" +#include "arch/kokkos_aliases.h" #include "traits/metric.h" +#include "utils/error.h" #include "utils/log.h" #include "utils/param_container.h" @@ -26,13 +31,198 @@ namespace ntt { namespace grpic { + template + void CallDepositKernel(const Particles& species, + const M& local_metric, + const scatter_ndfield_t& scatter_cur, + real_t dt) { + Kokkos::parallel_for("CurrentsDeposit", + species.rangeActiveParticles(), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + } + +#if defined(TEAM_POLICY) + /** + * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). + * + * Identical in structure to the SRPIC launcher (`engines/srpic/currents.h`): + * iterates over `tile_layout.ntiles_total` teams; each team accumulates its + * tile's particle contributions in SLM scratch and atomically flushes to the + * global J (here `cur0`, the GRPIC half-step current). Requires the species + * to have been sorted with `team_policy` enabled (`tile_layout` populated by + * `SortSpatially`). + * + * The deposit body (`kernel::DepositOneParticle`) is + * the same shared math used by the flat path — it already carries the GR + * velocity-recovery branch — so the only engine-specific differences from + * SRPIC are the `SimEngine::GRPIC` tag and the `cur0` target. 
+ * + * Falls back to the flat kernel for the tail `[npart_partitioned, npart)` + * exactly as SRPIC does; see the per-step coverage note in + * `kernels/currents_deposit.hpp`. + */ + template + void CallDepositKernelTiled(const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt, + int team_size_req) { + static_assert(O <= 11u, "Shape order must be <= 11"); + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + const auto& layout = species.tile_layout(); + raise::ErrorIf(layout.ntiles_total == 0u, + "CallDepositKernelTiled: tile_layout has 0 tiles — call " + "SortSpatially before CurrentsDeposit", + HERE); + raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u, + "CallDepositKernelTiled: tile_offsets size inconsistent " + "with ntiles_total", + HERE); + + auto deposit_kernel = + kernel::DepositCurrentsTiled_kernel { + cur, species, local_metric, (real_t)(species.charge()), + dt, layout, species.npart() + }; + + const auto scratch = Kokkos::PerTeam( + decltype(deposit_kernel)::scratch_bytes()); + + // Team (work-group) size. The default (team_size_req == 0) leaves + // Kokkos::AUTO, which sizes the team from the backend occupancy + // heuristic. A positive `algorithms.deposit.team_policy_team_size` + // overrides it, clamped to the scratch/backend-feasible maximum so an + // over-large request cannot abort the launch (Kokkos errors when + // team_size > team_size_max). No portable subgroup rounding is applied; + // pick a multiple of the device subgroup width (printed per arch by + // ideal_tile_size.py) for the best occupancy. 
+ Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, scratch); + if (team_size_req > 0) { + const int ts_max = policy.team_size_max(deposit_kernel, + Kokkos::ParallelForTag {}); + int ts = team_size_req; + if (ts > ts_max) { + raise::Warning( + fmt::format("algorithms.deposit.team_policy_team_size = %d exceeds " + "the tiled-deposit maximum %d on this backend; clamping " + "to %d", + team_size_req, + ts_max, + ts_max), + HERE); + ts = ts_max; + } + policy = Kokkos::TeamPolicy<>(static_cast(layout.ntiles_total), ts); + policy.set_scratch_size(0, scratch); + logger::Checkpoint( + fmt::format("Tiled deposit: explicit team size %d", ts), + HERE); + } + Kokkos::parallel_for("CurrentsDepositTiled", policy, deposit_kernel); + + // Particles appended since the last sort (injection / MPI receive on a + // no-sort step) live past the partition and are not visited by any team + // above. Deposit that tail [npart_partitioned, npart) with the flat + // scatter-view kernel so every active particle is deposited exactly + // once. The range is empty when the species was just sorted (the + // every-step-sorted common case), so this is a no-op there. + if (species.npart() > layout.npart_partitioned) { + // `cur` is a const ref; take a non-const View handle (shallow copy, + // shares storage) so the scatter view can contribute back into it. 
+ auto cur_nc = cur; + auto scatter_cur = Kokkos::Experimental::create_scatter_view(cur_nc); + Kokkos::parallel_for( + "CurrentsDepositTiledTail", + CreateParticleRangePolicy({ layout.npart_partitioned }, + { species.npart() }), + kernel::DepositCurrents_kernel( + scatter_cur, + species, + local_metric, + (real_t)(species.charge()), + dt)); + Kokkos::Experimental::contribute(cur_nc, scatter_cur); + } + } +#endif // TEAM_POLICY + template void CurrentsDeposit(Domain& domain, const prm::Parameters& engine_params) { + const auto dt = engine_params.get("dt"); + // GRPIC deposits the half-step current into `cur0` (the engine no longer + // pre-zeros it — this is the single source of truth, matching SRPIC). + Kokkos::deep_copy(domain.fields.cur0, ZERO); + +#if defined(TEAM_POLICY) + // Optional runtime override for the tiled-deposit team (work-group) size; + // 0 (default) keeps Kokkos::AUTO. Clamped to the backend max in the + // launcher (see CallDepositKernelTiled). + const auto team_size_req = static_cast( + engine_params.get("team_policy_team_size", + std::optional { 0u })); + + // Tiled deposit. Correctness no longer depends on the SoA being in a + // "sorted" state at deposit time — the tiled kernel handles a stale + // partition per-particle (escape valve for drifted particles, dead-tag + // clamp, and the launcher's flat tail pass for appended particles). The + // only case the tiled kernel cannot serve is the very first step, before + // any SortSpatially has populated a layout; that species takes the flat + // scatter-view path for that step alone. See engines/srpic/currents.h and + // kernels/currents_deposit.hpp for the full coverage argument. 
+ for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + const auto& layout = species.tile_layout(); + if (layout.ntiles_total == 0u or layout.tile_offsets.extent(0) == 0u) { + logger::Checkpoint( + fmt::format("Launching currents deposit (flat, no sort yet) for " + "%d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + auto scatter_cur0 = Kokkos::Experimental::create_scatter_view( + domain.fields.cur0); + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur0, + dt); + Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0); + } else { + logger::Checkpoint( + fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + CallDepositKernelTiled(species, + domain.mesh.metric, + domain.fields.cur0, + dt, + team_size_req); + } + } +#else auto scatter_cur0 = Kokkos::Experimental::create_scatter_view( domain.fields.cur0); - const auto dt = engine_params.get("dt"); for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } logger::Checkpoint( fmt::format("Launching currents deposit kernel for %d [%s] : %lu %f", species.index(), @@ -40,36 +230,14 @@ namespace ntt { species.npart(), (double)species.charge()), HERE); - if (species.npart() == 0 || cmp::AlmostZero(species.charge())) { - continue; - } - Kokkos::parallel_for("CurrentsDeposit", - species.rangeActiveParticles(), - kernel::DepositCurrents_kernel( - scatter_cur0, - species.i1, - species.i2, - species.i3, - species.i1_prev, - species.i2_prev, - species.i3_prev, - species.dx1, - species.dx2, - species.dx3, - species.dx1_prev, - species.dx2_prev, - species.dx3_prev, - species.ux1, - 
species.ux2, - species.ux3, - species.phi, - species.weight, - species.tag, - domain.mesh.metric, - (real_t)(species.charge()), - dt)); + + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur0, + dt); } Kokkos::Experimental::contribute(domain.fields.cur0, scatter_cur0); +#endif } template @@ -103,4 +271,4 @@ namespace ntt { } // namespace grpic } // namespace ntt -#endif // ENGINES_GRPIC_CURRENTS_H \ No newline at end of file +#endif // ENGINES_GRPIC_CURRENTS_H diff --git a/src/engines/grpic/grpic.hpp b/src/engines/grpic/grpic.hpp index fc2daa4de..621592955 100644 --- a/src/engines/grpic/grpic.hpp +++ b/src/engines/grpic/grpic.hpp @@ -416,7 +416,8 @@ namespace ntt { */ if (deposit_enabled) { timers.start("CurrentDeposit"); - Kokkos::deep_copy(dom.fields.cur0, ZERO); + // `cur0` is zeroed inside grpic::CurrentsDeposit (matching SRPIC), + // so no pre-zero is needed here. grpic::CurrentsDeposit(dom, this->engineParams()); timers.stop("CurrentDeposit"); From 210454b150ce7674985c030076b3429ca746722e Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 26 Jun 2026 14:03:51 +0000 Subject: [PATCH 31/33] bugfix in deposit tests --- tests/kernels/deposit.cpp | 18 ++++++++++++------ tests/kernels/deposit_tiled.cpp | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/tests/kernels/deposit.cpp b/tests/kernels/deposit.cpp index 24f544202..10095d304 100644 --- a/tests/kernels/deposit.cpp +++ b/tests/kernels/deposit.cpp @@ -142,15 +142,21 @@ void testDeposit(const std::vector& res, auto J_scat = Kokkos::Experimental::create_scatter_view(J); + // The deposit kernel now takes a `ParticleArrays` SoA struct instead of + // the individual per-component arrays. Pack the per-test arrays into one; + // payload (pld_*) members stay default (unused here). 
+ ParticleArrays pa; + pa.i1 = i1, pa.i2 = i2, pa.i3 = i3; + pa.i1_prev = i1_prev, pa.i2_prev = i2_prev, pa.i3_prev = i3_prev; + pa.dx1 = dx1, pa.dx2 = dx2, pa.dx3 = dx3; + pa.dx1_prev = dx1_prev, pa.dx2_prev = dx2_prev, pa.dx3_prev = dx3_prev; + pa.ux1 = ux1, pa.ux2 = ux2, pa.ux3 = ux3; + pa.phi = phi, pa.weight = weight, pa.tag = tag; + // clang-format off Kokkos::parallel_for("CurrentsDeposit", 10, kernel::DepositCurrents_kernel(J_scat, - i1, i2, i3, - i1_prev, i2_prev, i3_prev, - dx1, dx2, dx3, - dx1_prev, dx2_prev, dx3_prev, - ux1, ux2, ux3, - phi, weight, tag, + pa, metric, charge, inv_dt)); // clang-format on diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp index 3cc2f62b0..1b6237ae4 100644 --- a/tests/kernels/deposit_tiled.cpp +++ b/tests/kernels/deposit_tiled.cpp @@ -339,6 +339,22 @@ namespace { // boundary. template void run_drift_case() { + // This case deposits boundary-adjacent particles (cells touching the + // ghost stripe), so an order-O stencil must fit inside the field's + // N_GHOSTS ghost layers. N_GHOSTS is a compile-time constant fixed by + // the build's SHAPE_ORDER ((SHAPE_ORDER+1)/2 + 1); a build whose ghost + // width is smaller than order O requires would deposit outside the + // field -- silent on GPU (no Kokkos View bounds guard; the overshoot + // cells carry zero shape-weight so results still match) but heap + // corruption on a host/SERIAL build. Skip those orders here; build at + // the matching SHAPE_ORDER to drift-test higher orders. The equivalence + // ("X-1") cases above stay interior, so they exercise all orders. 
+ if constexpr ((O + 1u) / 2u + 1u > N_GHOSTS) { + std::cerr << "deposit_tiled[drift] SKIP O=" << O << " T_TILE=" << T_TILE + << " (needs N_GHOSTS>=" << ((O + 1u) / 2u + 1u) + << ", build has " << N_GHOSTS << ")\n"; + return; + } using metric_t = metric::Minkowski; constexpr unsigned short nx1 = 50u, nx2 = 50u; metric_t metric { { nx1, nx2 }, { { 0.0, 55.0 }, { 0.0, 55.0 } }, {} }; From e9cfa06efbd7a975d98644a5d5cd0955c3d0bd45 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Fri, 26 Jun 2026 14:23:32 +0000 Subject: [PATCH 32/33] added explicit charge conservation test to the tiled deposit test --- tests/kernels/deposit_tiled.cpp | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp index 1b6237ae4..f058d6f30 100644 --- a/tests/kernels/deposit_tiled.cpp +++ b/tests/kernels/deposit_tiled.cpp @@ -162,6 +162,55 @@ namespace { << " T_TILE=" << T_TILE << " max_diff=" << max_diff << '\n'; } + // Intrinsic charge-conservation check on a single deposited J field. + // Esirkepov/zigzag deposits satisfy the discrete continuity equation, so + // the spatial sum of the discrete divergence div.J = dJx/dx + dJy/dy + // vanishes whenever the summation region encloses every particle's full + // stencil (J == 0 on the region's outer boundary). This is evaluated on + // J_tiled ALONE -- it does not compare against the flat reference -- so it + // certifies the per-particle escape valve deposits each drifted particle's + // stencil as one coherent unit: no cell dropped, duplicated, or split + // between SLM scratch and global J. (The run_drift_case order guard keeps + // every stencil inside [0, j_ext), so the extreme ghost cells stay zero and + // the telescoping boundary flux is genuinely zero rather than clipped.) + // Accumulated in double regardless of build precision to keep the + // tolerance tight. 
+ void check_charge_conservation(const ndfield_t& J, + unsigned short O, + unsigned short T_TILE, + const char* label) { + auto h = Kokkos::create_mirror_view(J); + Kokkos::deep_copy(h, J); + + double sum_div = 0.0; // Sum over the field of div.J (jx1 -> dx, jx2 -> dy). + double abs_tot = 0.0; // Total |J|, sets the relative tolerance scale. + for (ncells_t i = 1; i < h.extent(0); ++i) { + for (ncells_t j = 1; j < h.extent(1); ++j) { + sum_div += (static_cast(h(i, j, 0)) - + static_cast(h(i - 1, j, 0))) + + (static_cast(h(i, j, 1)) - + static_cast(h(i, j - 1, 1))); + } + } + for (ncells_t i = 0; i < h.extent(0); ++i) { + for (ncells_t j = 0; j < h.extent(1); ++j) { + abs_tot += std::fabs(static_cast(h(i, j, 0))) + + std::fabs(static_cast(h(i, j, 1))); + } + } + const double tol = 1.0e-5 * (abs_tot > 1.0 ? abs_tot : 1.0); + if (std::fabs(sum_div) > tol) { + std::cerr << "deposit_tiled[" << label + << "] CHARGE NON-CONSERVED for O=" << O << " T_TILE=" << T_TILE + << " : sum(div.J)=" << sum_div << " tol=" << tol + << " (abs_tot=" << abs_tot << ")\n"; + throw std::logic_error( + "DepositCurrentsTiled_kernel charge non-conservation"); + } + std::cerr << "deposit_tiled[" << label << "] charge-conserved O=" << O + << " T_TILE=" << T_TILE << " sum(div.J)=" << sum_div << '\n'; + } + template void run_one_case() { using metric_t = metric::Minkowski; @@ -484,6 +533,10 @@ namespace { } compare_J_fields(J_flat, J_tiled, O, T_TILE, "drift"); + // Self-contained conservation check on the escape-valve output: the + // drifted particles all take the per-particle global-J path, so this + // certifies that path is charge-conserving without leaning on J_flat. 
+ check_charge_conservation(J_tiled, O, T_TILE, "drift"); } template From ca3f162fa0bff91ffea83fcf55f6aaa50e987a50 Mon Sep 17 00:00:00 2001 From: Ludwig Boess Date: Mon, 29 Jun 2026 14:49:09 -0400 Subject: [PATCH 33/33] bugfix in for sort includes --- src/global/utils/sort_dispatch.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h index 27a20a910..432b1c1b9 100644 --- a/src/global/utils/sort_dispatch.h +++ b/src/global/utils/sort_dispatch.h @@ -38,6 +38,21 @@ #include #include +// Entity's Kokkos alias macros (arch/kokkos_aliases.h) define bare words such +// as `Function`, `Inline`, `Lambda` and `ClassLambda`. These collide with +// template-parameter and member names used inside the vendor sort headers +// (rocPRIM, cub, oneDPL) and corrupt their parsing (e.g. rocPRIM's +// `template`). Suspend the aliases across the +// vendor includes only, then restore them for the rest of the translation unit. +#pragma push_macro("Function") +#pragma push_macro("Inline") +#pragma push_macro("Lambda") +#pragma push_macro("ClassLambda") +#undef Function +#undef Inline +#undef Lambda +#undef ClassLambda + #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) #include #include @@ -49,6 +64,11 @@ #include #endif +#pragma pop_macro("ClassLambda") +#pragma pop_macro("Lambda") +#pragma pop_macro("Inline") +#pragma pop_macro("Function") + #include #include #include