Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
a016b6b
version bump
haykh May 18, 2026
c84edca
Merge branch 'master' into 1.5.0rc
haykh May 18, 2026
00fa0bf
port: team policy and vendor specific sort
LudwigBoess May 19, 2026
04d403c
removed redundant inner loop over species
LudwigBoess May 19, 2026
5830c07
frontier-specific memory pool allocation
LudwigBoess May 19, 2026
6e42c8a
version in cmake
haykh May 21, 2026
cd6c8c7
Merge branch 'master' into dev/team_policy
LudwigBoess May 26, 2026
cf14e9f
support more tile sizes
LudwigBoess May 26, 2026
787aa04
removed persistent sort scratch to reduce memory overhead
LudwigBoess May 26, 2026
827cf26
added team policy reporting
LudwigBoess Jun 2, 2026
4b81914
explicitly bind GPU Transport Layer for GPU aware MPI on Frontier
LudwigBoess Jun 2, 2026
a603ecb
minor refactor
haykh Jun 3, 2026
f80e561
Merge branch 'dev/team_policy' of github.com:entity-toolkit/entity in…
haykh Jun 3, 2026
b5e05ad
compile-time sorting interval with team policies and sorting speedup
LudwigBoess Jun 19, 2026
4f4409d
fix test for tiled deposit
LudwigBoess Jun 19, 2026
4dfaba9
replaced compile-time sort interval with compile-time drift halo size
LudwigBoess Jun 20, 2026
cd1be6b
AMD-specific sorting improvements
LudwigBoess Jun 20, 2026
ed70dbd
reduced exchange for filters
LudwigBoess Jun 21, 2026
314d142
moved tag_offsets_h outside of loop to avoid multiple device-host copies
LudwigBoess Jun 21, 2026
8047a11
batched async communication
LudwigBoess Jun 21, 2026
5036917
removed async comm because of bandwidth issue
LudwigBoess Jun 21, 2026
8891420
bugfix in current deposit
LudwigBoess Jun 21, 2026
4f3a762
double-buffer radix sort to drop the N-sized temp (fixes device OOM …
LudwigBoess Jun 21, 2026
94ba9cd
fix printing of total particles
LudwigBoess Jun 24, 2026
9a4333e
use a persistent buffer per sort to reduce memory overhead
LudwigBoess Jun 24, 2026
216940c
added the option to explicitly disable vendor sort
LudwigBoess Jun 24, 2026
af942c8
remove dead particles after every sort
LudwigBoess Jun 24, 2026
e1de8ae
test update
LudwigBoess Jun 24, 2026
592ff89
generalized reduced exchange for current filters on any coordinate sy…
LudwigBoess Jun 25, 2026
9602eab
Merge pull request #212 from entity-toolkit/dev/frontier_mpi
LudwigBoess Jun 25, 2026
e2ff66d
Merge branch '1.5.0rc' into dev/team_policy
LudwigBoess Jun 25, 2026
6e1e23e
bugfix
LudwigBoess Jun 25, 2026
ff3ec19
move definition of `compute_tile_offsets` to `public` so it compiles…
LudwigBoess Jun 25, 2026
03ccd20
added `team_policy_team_size` as a tunable runtime parameter
LudwigBoess Jun 26, 2026
ca7307d
add script to compute ideal tile size
LudwigBoess Jun 26, 2026
61ebebf
team policy for GRPIC
LudwigBoess Jun 26, 2026
210454b
bugfix in deposit tests
LudwigBoess Jun 26, 2026
e9cfa06
added explicit charge conservation test to the tiled deposit test
LudwigBoess Jun 26, 2026
c6e47fb
Merge pull request #209 from entity-toolkit/dev/team_policy
LudwigBoess Jun 26, 2026
ca3f162
bugfix in for sort includes
LudwigBoess Jun 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 156 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set(PROJECT_NAME entity)

project(
${PROJECT_NAME}
VERSION 1.4.2
VERSION 1.5.0
LANGUAGES CXX C)
add_compile_options("-D ENTITY_VERSION=\"${PROJECT_VERSION}\"")
set(hash_cmd "git diff --quiet src/ && echo $(git rev-parse HEAD) ")
Expand Down Expand Up @@ -58,6 +58,24 @@ set(gpu_aware_mpi
${default_gpu_aware_mpi}
CACHE BOOL "Enable GPU-aware MPI")

# ---------------------------- team_policy options -------------------------- #
# User-facing cache options for the team_policy kernels. Each option takes its
# initial value from the matching `default_*` variable.

# Master switch for the tile-blocked deposit/pusher kernels.
set(team_policy
    ${default_team_policy}
    CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")

# Accepted tile edge lengths. Defined before `team_policy_tile_size` so the
# list can be attached to that option as its set of allowed values.
set(team_policy_tile_sizes
    "4;6;8;10;12;14;16"
    CACHE STRING "team_policy tile-size choices")

# Tile edge length; must be one of `team_policy_tile_sizes`.
set(team_policy_tile_size
    ${default_team_policy_tile_size}
    CACHE STRING "team_policy tile edge length in cells")
# Present the permitted values as a drop-down in cmake-gui / ccmake instead of
# a free-text field.
set_property(CACHE team_policy_tile_size PROPERTY STRINGS
                                                  ${team_policy_tile_sizes})

# Scratch halo drift for the tiled deposit (see the help string).
set(team_policy_drift
    ${default_team_policy_drift}
    CACHE STRING
          "team_policy tiled-deposit scratch halo drift in cells (max cells a particle may move between two sorts). Sizes the deposit scratch halo only; the sort cadence is set at runtime via spatial_sorting_interval. Default 1.")

# Choice between the vendor sort_by_key and the Kokkos::BinSort fallback.
set(vendor_sort
    ${default_vendor_sort}
    CACHE BOOL
          "Use the vendor sort_by_key (oneDPL/Thrust/rocThrust) for the team_policy spatial sort when available. OFF forces the Kokkos::BinSort fallback, which sorts each SoA member in place (lower peak memory, no maxnpart gather buffer) at the cost of sort speed.")

# -------------------------- Compilation settings -------------------------- #
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand Down Expand Up @@ -136,6 +154,87 @@ else()
set(DEVICE_ENABLED OFF)
endif()

# ------------------------------ team_policy wiring ------------------------ #
# Turns the team_policy cache options into compile definitions and, when
# `vendor_sort` is ON, looks for a vendor sort library matching the active
# Kokkos device backend.
if(${team_policy})
  # Only tile sizes listed in `team_policy_tile_sizes` are accepted.
  list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
  if(_tps_idx EQUAL -1)
    message(FATAL_ERROR
            "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
            "got '${team_policy_tile_size}'${ColorReset}")
  endif()

  # The drift is pasted verbatim into a preprocessor definition below, so an
  # empty or non-numeric value would otherwise only surface as a compile
  # error far from its cause. Fail at configure time instead.
  if(NOT "${team_policy_drift}" MATCHES "^[0-9]+$")
    message(FATAL_ERROR
            "${Red}team_policy_drift must be a non-negative integer, "
            "got '${team_policy_drift}'${ColorReset}")
  endif()

  add_compile_options("-D TEAM_POLICY")
  add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")

  # Compile-time tiled-deposit scratch halo drift. Sizes the halo so a
  # particle that drifts up to DRIFT cells between two sorts still deposits
  # inside its tile scratch; particles drifting further take the
  # per-particle global-J escape valve (correct, only slower). This is
  # independent of the sort cadence, which is set at runtime via
  # `spatial_sorting_interval`. Defaults to 1 (the sorted-every-step case).
  add_compile_options("-D TEAM_POLICY_DRIFT=${team_policy_drift}")

  # Vendor sort: oneDPL on SYCL, Thrust on CUDA, rocThrust/rocprim on HIP.
  # When `vendor_sort` is ON (default) the available library is detected
  # and used; the spatial sort then builds a single permutation that
  # gathers all SoA members. When `vendor_sort` is OFF, or no library is
  # found, the code falls back to Kokkos::BinSort, which sorts each member
  # in place -- lower peak memory and no maxnpart gather buffer, at the
  # cost of sort speed (negligible when sorting is a small fraction of the
  # step). The `vendor_sort` knob lets you force the BinSort fallback even
  # when a vendor library is present.
  if(${vendor_sort})
    if("${Kokkos_DEVICES}" MATCHES "SYCL")
      find_package(oneDPL QUIET)
      if(oneDPL_FOUND)
        message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
        add_compile_options("-D ONEDPL_ENABLED")
        set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
      else()
        message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
                       "for SYCL sort_by_key")
      endif()
    endif()

    if("${Kokkos_DEVICES}" MATCHES "CUDA")
      # Only the compile definition is set here; nothing is appended to
      # DEPENDENCIES for Thrust.
      find_package(Thrust QUIET)
      if(Thrust_FOUND)
        message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
        add_compile_options("-D THRUST_ENABLED")
      else()
        message(STATUS "team_policy: Thrust not found; using BinSort fallback "
                       "for CUDA sort_by_key")
      endif()
    endif()

    if("${Kokkos_DEVICES}" MATCHES "HIP")
      # rocThrust ships with ROCm. The HIP sort_by_key path uses rocprim's
      # bounded-bit radix sort directly (rocprim is rocThrust's own
      # dependency, so its headers come in transitively; we find it
      # explicitly to keep the include path robust). This builds a single
      # permutation that gathers all SoA members, instead of the legacy
      # per-member Kokkos::BinSort path which allocates a fresh
      # `sorted_values` buffer for every member every step (the dominant
      # source of allocator churn / fragmentation on ROCm).
      find_package(rocthrust QUIET)
      if(rocthrust_FOUND)
        message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
        add_compile_options("-D ROCTHRUST_ENABLED")
        set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
        # rocprim is optional here: it is linked explicitly only when its
        # package is found.
        find_package(rocprim QUIET)
        if(rocprim_FOUND)
          set(DEPENDENCIES ${DEPENDENCIES} roc::rocprim)
        endif()
      else()
        message(STATUS "team_policy: rocThrust not found; using BinSort "
                       "fallback for HIP sort_by_key")
      endif()
    endif()
  else()
    message(STATUS "team_policy: vendor_sort=OFF; forcing Kokkos::BinSort "
                   "fallback for spatial sort_by_key")
  endif()
endif()

# MPI
if(${mpi})
find_or_fetch_dependency(MPI FALSE REQUIRED)
Expand All @@ -145,6 +244,62 @@ if(${mpi})
if(${DEVICE_ENABLED})
if(${gpu_aware_mpi})
add_compile_options("-D GPU_AWARE_MPI")

# Cray GPU Transport Layer (GTL) handling.
#
# GPU-aware Cray MPICH (e.g. on Frontier) can only handle device pointers
# when the GTL library is linked. The Cray compiler wrappers (cc/CC) add it
# automatically, but this build drives hipcc/nvcc directly, so
# find_package(MPI) only finds base libmpi and the GTL is left out; an
# MPI_Sendrecv on a device pointer then fails with "OFI ... Bad address".
# The library is therefore added explicitly here.
#
# The Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
# (e.g. amd_gfx90a -> -lmpi_gtl_hsa). If they are absent this is not a Cray
# MPICH build and nothing extra is needed.

# Candidate <accel> suffixes for the active Kokkos device backend, in the
# order they are probed; empty when no known backend matches.
set(_gtl_accels "")
if("${Kokkos_DEVICES}" MATCHES "HIP")
  set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
  set(_gtl_accels nvidia90 nvidia80 nvidia70)
elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
  set(_gtl_accels ponteVecchio)
endif()

# Take directory and library name from the first accelerator whose
# PE_MPICH_GTL_DIR_<accel> yields a usable directory.
set(_gtl_dir "")
set(_gtl_libflag "")
foreach(_gtl_accel IN LISTS _gtl_accels)
  if(_gtl_dir)
    break()
  endif()
  if(DEFINED ENV{PE_MPICH_GTL_DIR_${_gtl_accel}})
    # The Cray-provided values carry linker-flag prefixes: drop the leading
    # "-L" from the directory and the leading "-l" from the library.
    string(REGEX REPLACE "^-L" "" _gtl_dir
                         "$ENV{PE_MPICH_GTL_DIR_${_gtl_accel}}")
    string(REGEX REPLACE "^-l" "" _gtl_libflag
                         "$ENV{PE_MPICH_GTL_LIBS_${_gtl_accel}}")
  endif()
endforeach()

if(NOT _gtl_dir OR NOT _gtl_libflag)
  message(STATUS
          "GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
          "implementation is GPU-aware without an extra transport library.")
else()
  # Search only the Cray-provided directory, never the system defaults.
  find_library(MPI_GTL_LIBRARY
               NAMES ${_gtl_libflag}
               HINTS "${_gtl_dir}"
               NO_DEFAULT_PATH)
  if(NOT MPI_GTL_LIBRARY)
    # The environment advertises a GTL but the library is missing: abort
    # rather than produce a binary that fails at runtime.
    message(FATAL_ERROR
            "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
            "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
            "GPU-aware MPI will crash at runtime without it. Make sure the "
            "craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
            "${ColorReset}")
  else()
    message(STATUS
            "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
    set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
  endif()
endif()
endif()
else()
set(gpu_aware_mpi
Expand Down
33 changes: 33 additions & 0 deletions cmake/defaults.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,36 @@ else()
endif()

set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)

# Default for `team_policy`: OFF, unless the environment variable
# Entity_ENABLE_TEAM_POLICY is defined, in which case its value is used.
set(_team_policy_fallback OFF)
if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
  set(_team_policy_fallback $ENV{Entity_ENABLE_TEAM_POLICY})
endif()
set(default_team_policy
    ${_team_policy_fallback}
    CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
set_property(CACHE default_team_policy PROPERTY TYPE BOOL)

# Default for `vendor_sort`: ON, unless the environment variable
# Entity_ENABLE_VENDOR_SORT is defined, in which case its value is used.
set(_vendor_sort_fallback ON)
if(DEFINED ENV{Entity_ENABLE_VENDOR_SORT})
  set(_vendor_sort_fallback $ENV{Entity_ENABLE_VENDOR_SORT})
endif()
set(default_vendor_sort
    ${_vendor_sort_fallback}
    CACHE INTERNAL
          "Default flag for vendor sort_by_key (oneDPL/Thrust/rocThrust)")
set_property(CACHE default_vendor_sort PROPERTY TYPE BOOL)

# Fixed defaults with no environment override: tile edge length ...
set(default_team_policy_tile_size
    8
    CACHE INTERNAL "Default tile edge length in cells for team_policy")

# ... and tiled-deposit scratch halo drift.
set(default_team_policy_drift
    1
    CACHE INTERNAL
          "Default tiled-deposit scratch halo drift for team_policy (cells between sorts)")
45 changes: 45 additions & 0 deletions cmake/report.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,44 @@ if(${mpi} AND ${DEVICE_ENABLED})
GPU_AWARE_MPI_REPORT
46)
endif()
# Report rows for the team_policy options. The fifth argument is the option's
# default; it is taken from the `default_*` variables so the report stays in
# step with cmake/defaults.cmake (including environment overrides) rather than
# repeating literal values. The defaults are quoted so an empty value cannot
# shift the remaining positional arguments.
printchoices(
  "Team Policy"
  "team_policy"
  "${ON_OFF_VALUES}"
  ${team_policy}
  "${default_team_policy}"
  "${Green}"
  TEAM_POLICY_REPORT
  46)
# The dependent options are only reported when team_policy is enabled.
if(${team_policy})
  printchoices(
    "Team Tile Size"
    "team_policy_tile_size"
    "${team_policy_tile_sizes}"
    ${team_policy_tile_size}
    "${default_team_policy_tile_size}"
    "${Blue}"
    TEAM_POLICY_TILE_SIZE_REPORT
    46)
  # The drift is a free integer, so the only "choice" listed is its current
  # value.
  printchoices(
    "Team Deposit Drift"
    "team_policy_drift"
    "${team_policy_drift}"
    ${team_policy_drift}
    "${default_team_policy_drift}"
    "${Blue}"
    TEAM_POLICY_DRIFT_REPORT
    46)
  printchoices(
    "Vendor sort"
    "vendor_sort"
    "${ON_OFF_VALUES}"
    ${vendor_sort}
    "${default_vendor_sort}"
    "${Green}"
    VENDOR_SORT_REPORT
    46)
endif()
printchoices(
"Debug mode"
"DEBUG"
Expand Down Expand Up @@ -197,6 +235,13 @@ if(${mpi} AND ${DEVICE_ENABLED})
string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n")
endif()

# Add the team_policy rows to the report: the switch itself always, its
# dependent options only when it is enabled.
string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n")
if(${team_policy})
  # One call appends all three rows; string(APPEND) concatenates its inputs
  # in order, so the resulting text is the same as three separate appends.
  string(APPEND REPORT_TEXT
         " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n"
         " " ${TEAM_POLICY_DRIFT_REPORT} "\n"
         " " ${VENDOR_SORT_REPORT} "\n")
endif()

string(
APPEND
REPORT_TEXT
Expand Down
Loading
Loading