diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index e2b9c8c731..aef9b0e853 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -127,11 +127,65 @@ class Summa const ordinal_type right_stride_local_; ///< stride for local right row iterators + // 3-d (h-grouped) grid information. The world's first + // proc_h_ * proc_h_stride ranks are partitioned into proc_h_ contiguous + // groups; slab h belongs to group h % proc_h_, and each group runs its + // own 2-d SUMMA grid (proc_grid_ is this rank's GROUP-LOCAL grid, + // constructed over the group's rank interval). proc_h_ == 1 is the + // ordinary shared-grid batched contraction. + const ordinal_type proc_h_; ///< Number of slab (h) groups + const ordinal_type + proc_h_stride_; ///< World ranks per slab group (the + ///< group of slab h spans world ranks + ///< [(h % proc_h_) * proc_h_stride_, ...)) + const ordinal_type first_slab_; ///< This rank's group's first slab (== its + ///< group index), or nh_ if this rank is + ///< in no group (idle for this eval) + const ordinal_type my_slabs_; ///< Number of slabs of this rank's group + + /// \return the world rank that owns result tile \p i: the within-group + /// owner (from the group-local process grid) shifted by the world-rank + /// offset of the group that owns \p i's slab. For proc_h_ == 1 the offset + /// is 0 and this is the ordinary cyclic owner. + ProcessID result_tile_owner(const ordinal_type i) const { + const ordinal_type source_index = DistEvalImpl_::perm_index_to_source(i); + // owner is independent of slab index *within a group* + const ordinal_type slab_index = source_index % result_slab_size_; + const ordinal_type tile_row = slab_index / proc_grid_.cols(); + const ordinal_type tile_col = slab_index % proc_grid_.cols(); + const ordinal_type proc_row = tile_row % proc_grid_.proc_rows(); + const ordinal_type proc_col = tile_col % proc_grid_.proc_cols(); + const ProcessID within_group = proc_row * proc_grid_.proc_cols() + proc_col; + // shift by the offset of the group that owns this tile's slab + const ordinal_type slab = source_index / result_slab_size_; + const ordinal_type group = (proc_h_ > 1ul) ? (slab % proc_h_) : 0ul; + return ProcessID(group * proc_h_stride_) + within_group; + } + /// \return the slab index of SUMMA step \p s ordinal_type step_h(const ordinal_type s) const { return s / k_; } /// \return the within-slab inner-dimension index of SUMMA step \p s ordinal_type step_k(const ordinal_type s) const { return s % k_; } + /// \return the smallest SUMMA step >= \p s that belongs to one of this + /// rank's group's slabs, or nsteps_ if there is none + ordinal_type next_step(ordinal_type s) const { + if (proc_h_ == 1ul) return std::min(s, nsteps_); + if (first_slab_ >= nh_) return nsteps_; // not in any group + while (s < nsteps_ && (step_h(s) % proc_h_) != first_slab_) + s = (step_h(s) + 1ul) * k_; // jump to the start of the next slab + return std::min(s, nsteps_); + } + + /// \return this rank's group-local ordinal of slab \p h (which must + /// belong to this rank's group) + ordinal_type slab_ord(const ordinal_type h) const { + return (h - first_slab_) / proc_h_; + } + + /// \return the number of SUMMA steps of this rank's group's slabs + ordinal_type my_steps() const { return my_slabs_ * k_; } + typedef Future right_future; ///< Future to a right-hand argument tile typedef Future @@ -225,7 +279,7 @@ class Summa /// \param end The end of the row or column range /// \param stride The row or column index stride /// \param k The broadcast group index - /// \param max_group_size The maximum number of processes in the result + /// \param max_proc_h_stride The maximum number of processes in the result /// group, which is equal to the number of process in this process row or /// column as defined by \c proc_grid_. /// \param key_offset The key that will be used to identify the process group @@ -238,21 +292,21 @@ class Summa const std::vector& process_mask, ordinal_type index, const ordinal_type end, const ordinal_type stride, - const ordinal_type max_group_size, + const ordinal_type max_proc_h_stride, const ordinal_type k, const ordinal_type key_offset, const ProcMap& proc_map) const { // Generate the list of processes in rank_row - std::vector proc_list(max_group_size, -1); + std::vector proc_list(max_proc_h_stride, -1); // Flag the root processes of the broadcast, which may not be included // by shape. - ordinal_type p = k % max_group_size; + ordinal_type p = k % max_proc_h_stride; proc_list[p] = proc_map(p); ordinal_type count = 1ul; // Flag all processes that have non-zero tiles - for (p = 0ul; (index < end) && (count < max_group_size); - index += stride, p = (p + 1u) % max_group_size) { + for (p = 0ul; (index < end) && (count < max_proc_h_stride); + index += stride, p = (p + 1u) % max_proc_h_stride) { if ((proc_list[p] != -1) || (shape.is_zero(index)) || !process_mask.at(p)) continue; @@ -734,7 +788,7 @@ class Summa // broadcast root (i.e. within-slab k congruent to rank_col mod Pcols) const ordinal_type Pcols = proc_grid_.proc_cols(); - for (; s < end; ++s) { + for (s = next_step(s); s < end; s = next_step(s + 1ul)) { const ordinal_type k = step_k(s); if (k % Pcols != static_cast(proc_grid_.rank_col())) continue; @@ -787,7 +841,7 @@ class Summa // broadcast root (i.e. within-slab k congruent to rank_row mod Prows) const ordinal_type Prows = proc_grid_.proc_rows(); - for (; s < end; ++s) { + for (s = next_step(s); s < end; s = next_step(s + 1ul)) { const ordinal_type k = step_k(s); if (k % Prows != static_cast(proc_grid_.rank_row())) continue; @@ -847,9 +901,9 @@ class Summa /// \return The first step, greater than or equal to \c s with non-zero /// tiles, or \c nsteps_ if none is found. ordinal_type iterate_row(ordinal_type s) const { - // Iterate over steps until a non-zero tile is found or the end of the - // matrix is reached. - for (; s < nsteps_; ++s) { + // Iterate over this rank's group's steps until a non-zero tile is found + // or the end of the matrix is reached. + for (s = next_step(s); s < nsteps_; s = next_step(s + 1ul)) { // Search for non-zero tiles in row k of slab h of right ordinal_type i = step_h(s) * right_slab_size_ + step_k(s) * proc_grid_.cols(); @@ -871,9 +925,9 @@ class Summa /// \return The first step, greater than or equal to \c s, that contains /// a non-zero tile. If no non-zero tile is not found, return \c nsteps_. ordinal_type iterate_col(ordinal_type s) const { - // Iterate over steps until a non-zero tile is found or the end of the - // matrix is reached. - for (; s < nsteps_; ++s) { + // Iterate over this rank's group's steps until a non-zero tile is found + // or the end of the matrix is reached. + for (s = next_step(s); s < nsteps_; s = next_step(s + 1ul)) { // Search column k of slab h for non-zero tiles const ordinal_type base = step_h(s) * left_slab_size_; for (ordinal_type i = base + left_start_local_ + step_k(s); @@ -963,11 +1017,17 @@ class Summa return tile_count; } else { // Construct static broadcast groups for dense arguments - // (key space [0, 2*nsteps_) is reserved for the sparse per-step groups) - const madness::DistributedID col_did(DistEvalImpl_::id(), 2ul * nsteps_); + // (key space [0, 2*nsteps_) is reserved for the sparse per-step groups, + // whose keys h*k_ and h*k_+nsteps_ are disjoint across h-groups; the + // two static keys are offset PAST that range and made group-unique so + // that two different groups' single-grid static groups never claim the + // same DistributedID with inconsistent membership) + const std::size_t static_key_base = 2ul * nsteps_ + 2ul * first_slab_; + const madness::DistributedID col_did(DistEvalImpl_::id(), + static_key_base); col_group_ = proc_grid_.make_col_group(col_did); const madness::DistributedID row_did(DistEvalImpl_::id(), - 2ul * nsteps_ + 1ul); + static_key_base + 1ul); row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE @@ -985,12 +1045,12 @@ class Summa #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE // Allocate memory for the reduce pair tasks (one per local result tile - // per slab). + // per slab of this rank's group). std::allocator> alloc; - reduce_tasks_ = alloc.allocate(nh_ * proc_grid_.local_size()); + reduce_tasks_ = alloc.allocate(my_slabs_ * proc_grid_.local_size()); // Iterate over all local tiles - const ordinal_type n = nh_ * proc_grid_.local_size(); + const ordinal_type n = my_slabs_ * proc_grid_.local_size(); for (ordinal_type t = 0ul; t < n; ++t) { // Initialize the reduction task ReducePairTask* MADNESS_RESTRICT const reduce_task = @@ -1019,9 +1079,9 @@ class Summa if (k_ == 0) return 0; // Allocate memory for the reduce pair tasks (one per local result tile - // per slab). + // per slab of this rank's group). std::allocator> alloc; - reduce_tasks_ = alloc.allocate(nh_ * proc_grid_.local_size()); + reduce_tasks_ = alloc.allocate(my_slabs_ * proc_grid_.local_size()); // Initialize iteration variables const ordinal_type col_stride = // The stride to iterate down a column @@ -1030,10 +1090,11 @@ class Summa proc_grid_.proc_cols(); // Iterate over all local tiles, slab by slab (the block-cyclic phase - // restarts at every slab: the owner of tile (h,i,j) does not depend on h) + // restarts at every slab: within a group, the owner of tile (h,i,j) + // does not depend on h) ordinal_type tile_count = 0ul; ReducePairTask* MADNESS_RESTRICT reduce_task = reduce_tasks_; - for (ordinal_type h = 0ul; h < nh_; ++h) { + for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) { const ordinal_type slab_base = h * result_slab_size_; ordinal_type row_start = slab_base + proc_grid_.rank_row() * proc_grid_.cols(); @@ -1103,7 +1164,7 @@ class Summa // Iterate over all local tiles, slab by slab ReducePairTask* reduce_task = reduce_tasks_; - for (ordinal_type h = 0ul; h < nh_; ++h) { + for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) { const ordinal_type slab_base = h * result_slab_size_; ordinal_type row_start = slab_base + proc_grid_.rank_row() * proc_grid_.cols(); @@ -1126,7 +1187,7 @@ class Summa // Deallocate the memory for the reduce pair tasks. std::allocator>().deallocate( - reduce_tasks_, nh_ * proc_grid_.local_size()); + reduce_tasks_, my_slabs_ * proc_grid_.local_size()); } /// Set the result tiles and destroy reduce tasks @@ -1145,7 +1206,7 @@ class Summa // Iterate over all local tiles, slab by slab ReducePairTask* reduce_task = reduce_tasks_; - for (ordinal_type h = 0ul; h < nh_; ++h) { + for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) { const ordinal_type slab_base = h * result_slab_size_; ordinal_type row_start = slab_base + proc_grid_.rank_row() * proc_grid_.cols(); @@ -1177,7 +1238,7 @@ class Summa } // Deallocate the memory for the reduce pair tasks. std::allocator>().deallocate( - reduce_tasks_, nh_ * proc_grid_.local_size()); + reduce_tasks_, my_slabs_ * proc_grid_.local_size()); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE ss << "}\n"; @@ -1229,9 +1290,10 @@ class Summa const std::vector& col, const std::vector& row, madness::TaskInterface* const task) { - // The reduce tasks of slab h occupy - // [h * local_size, (h+1) * local_size) - const ordinal_type slab_offset = step_h(s) * proc_grid_.local_size(); + // The reduce tasks of this group's slab h occupy + // [slab_ord(h) * local_size, (slab_ord(h)+1) * local_size) + const ordinal_type slab_offset = + slab_ord(step_h(s)) * proc_grid_.local_size(); // Iterate over the row for (ordinal_type i = 0ul; i < col.size(); ++i) { @@ -1266,9 +1328,10 @@ class Summa const std::vector& col, const std::vector& row, madness::TaskInterface* const task) { - // The reduce tasks of slab h occupy - // [h * local_size, (h+1) * local_size) - const ordinal_type slab_offset = step_h(s) * proc_grid_.local_size(); + // The reduce tasks of this group's slab h occupy + // [slab_ord(h) * local_size, (slab_ord(h)+1) * local_size) + const ordinal_type slab_offset = + slab_ord(step_h(s)) * proc_grid_.local_size(); // Iterate over the row for (ordinal_type i = 0ul; i < col.size(); ++i) { @@ -1452,8 +1515,12 @@ class Summa template void make_next_step_tasks(Derived* task, ordinal_type depth) { - // Set the depth to be no greater than the maximum number steps - if (depth > owner_->nsteps_) depth = owner_->nsteps_; + // Set the depth to be no greater than the number of SUMMA steps this + // rank's group actually executes. In the 2-d (proc_h_ == 1) case this is + // nsteps_ (my_slabs_ == nh_); in the 3-d (proc_h_ > 1) case my_steps() < + // nsteps_, and clamping to nsteps_ would pre-spawn surplus step tasks + // that all resolve to the terminating step (k_ == nsteps_). + if (depth > owner_->my_steps()) depth = owner_->my_steps(); // Spawn n=depth step tasks for (; depth > 0ul; --depth) { @@ -1540,13 +1607,14 @@ class Summa public: DenseStepTask(const std::shared_ptr& owner, const ordinal_type depth) - : StepTask(owner, owner->nsteps_ + 1ul), k_(0) { + : StepTask(owner, owner->my_steps() + 1ul), k_(owner->next_step(0ul)) { StepTask::make_next_step_tasks(this, depth); - StepTask::spawn_get_row_col_tasks(k_); + if (k_ < owner_->nsteps_) StepTask::spawn_get_row_col_tasks(k_); } DenseStepTask(DenseStepTask* const parent, const int ndep) - : StepTask(parent, ndep), k_(parent->k_ + 1ul) { + : StepTask(parent, ndep), + k_(parent->owner_->next_step(parent->k_ + 1ul)) { // Spawn tasks to get k-th row and column tiles if (k_ < owner_->nsteps_) StepTask::spawn_get_row_col_tasks(k_); } @@ -1671,7 +1739,8 @@ class Summa const trange_type trange, const shape_type& shape, const std::shared_ptr& pmap, const Perm& perm, const op_type& op, const ordinal_type k, const ProcGrid& proc_grid, - const ordinal_type nh = 1ul) + const ordinal_type nh = 1ul, const ordinal_type proc_h = 1ul, + const ordinal_type proc_h_stride = 0ul) : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), left_(left), right_(right), @@ -1685,6 +1754,11 @@ class Summa left_slab_size_(left.size() / nh), right_slab_size_(right.size() / nh), result_slab_size_(proc_grid.rows() * proc_grid.cols()), + proc_h_(proc_h), + proc_h_stride_(proc_h_stride), + first_slab_(compute_first_slab(world, nh, proc_h, proc_h_stride)), + my_slabs_(first_slab_ < nh ? (nh - first_slab_ + proc_h - 1ul) / proc_h + : 0ul), reduce_tasks_(NULL), left_start_local_(proc_grid_.rank_row() * k), left_end_(left.size() / nh), @@ -1703,6 +1777,19 @@ class Summa TA_ASSERT(nh_ > 0); TA_ASSERT(left.size() % nh_ == 0); TA_ASSERT(right.size() % nh_ == 0); + TA_ASSERT(proc_h_ > 0); + TA_ASSERT(proc_h_ == 1ul || proc_h_stride > 0ul); + TA_ASSERT(proc_h_ <= nh_); + } + + /// \return this rank's group's first slab (== its group index), or + /// \p nh if this rank is outside the grouped rank interval + static ordinal_type compute_first_slab(World& world, const ordinal_type nh, + const ordinal_type proc_h, + const ordinal_type proc_h_stride) { + if (proc_h == 1ul) return 0ul; + const auto rank = ordinal_type(world.rank()); + return (rank < proc_h * proc_h_stride) ? (rank / proc_h_stride) : nh; } virtual ~Summa() {} @@ -1717,18 +1804,11 @@ class Summa TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); - const ordinal_type source_index = DistEvalImpl_::perm_index_to_source(i); - - // Compute tile coordinate in tile grid (the owner of a tile is - // independent of its slab index) - const ordinal_type slab_index = source_index % result_slab_size_; - const ordinal_type tile_row = slab_index / proc_grid_.cols(); - const ordinal_type tile_col = slab_index % proc_grid_.cols(); - // Compute process coordinate of tile in the process grid - const ordinal_type proc_row = tile_row % proc_grid_.proc_rows(); - const ordinal_type proc_col = tile_col % proc_grid_.proc_cols(); - // Compute the process that owns tile - const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col; + // The process that owns tile i: the within-group cyclic owner shifted by + // the world-rank offset of the tile's slab group (see + // result_tile_owner). For proc_h_ == 1 this is the ordinary cyclic + // owner over the whole world. + const ProcessID source = result_tile_owner(i); const madness::DistributedID key(DistEvalImpl_::id(), i); return TensorImpl_::world().gop.template recv(source, key); diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index fdf5b0da82..1229b00a94 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -185,6 +185,9 @@ class ContEngine : public BinaryEngine { ///< from the canonical result layout, so ///< the evaluated result is re-permuted ///< by a streaming unary eval + size_type proc_h_ = 1; ///< process-grid extent along the slab (h) + ///< axis of the 3-d grid (# of h-planes) + size_type proc_h_stride_ = 0; ///< ranks per h-plane (0 = ungrouped 2-d) static unsigned int find(const BipartiteIndexList& indices, const std::string& index_label, unsigned int i, @@ -851,22 +854,81 @@ class ContEngine : public BinaryEngine { world, (pmap ? pmap : policy::default_pmap(*world, n_slabs_ * M * N))); } else { - // Construct the per-slab process grid. - proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); - - // Initialize children with slab-replicated SUMMA phase maps - left_.init_distribution( - world, std::make_shared( - *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_)); - right_.init_distribution( - world, std::make_shared( - *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_)); - - // Initialize the process map if not already defined - if (!pmap) - pmap = std::make_shared( - *world, proc_grid_.make_pmap(), n_slabs_); - ExprEngine_::init_distribution(world, pmap); + // Choose the process-grid extent proc_h_ along the slab (h) axis of + // the 3-d grid: ranks beyond one-per-result-tile are useless to a + // single slab's 2-d SUMMA, so the surplus is spread over the slab + // (communication-free) dimension instead -- slab h goes to plane + // h % proc_h_ of proc_h_stride_ = P / proc_h_ contiguous ranks (the + // division-remainder ranks idle for this evaluation). For a + // no-external product (M == N == 1) this degenerates to an effectively + // 1-d grid over the slabs; for an ordinary contraction (n_slabs_ == 1) + // it is a pure 2-d grid (proc_h_ == 1). + // TODO: co-optimize proc_h_ with the 2-d (proc_r, proc_c) aspect ratio + // using the h-, left-external-, and right-external-mode element + // extents (and a per-rank memory bound), rather than the current + // greedy tile-count heuristic. + const size_type P = world->size(); + proc_h_ = 1ul; + if (n_slabs_ > 1ul && P > 1ul) { + const size_type p2d_cap = std::min(P, M * N); + proc_h_ = std::min(n_slabs_, + std::max(1ul, P / p2d_cap)); + } + // keep the invariant proc_h_ == 1 => proc_h_stride_ == 0 (the ungrouped + // 2-d case) so downstream logic can key off either field; for grouped + // grids it is the per-plane world-rank count P / proc_h_. + proc_h_stride_ = (proc_h_ == 1ul) ? 0ul : P / proc_h_; + + if (proc_h_ == 1ul) { + // Construct the per-slab process grid over the whole world. + proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); + + // Initialize children with slab-replicated SUMMA phase maps + left_.init_distribution( + world, std::make_shared( + *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_)); + right_.init_distribution( + world, std::make_shared( + *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_)); + + // Initialize the process map if not already defined + if (!pmap) + pmap = std::make_shared( + *world, proc_grid_.make_pmap(), n_slabs_); + ExprEngine_::init_distribution(world, pmap); + } else { + // Construct this rank's GROUP-LOCAL per-slab process grid (ranks + // outside the grouped prefix of the world construct a valid + // not-in-grid instance). The grid shape is a pure function of + // (proc_h_stride_, M, N, m, n), so it is congruent across all groups, + // and the CyclicPmap factories below emit GROUP-LOCAL owners in + // [0, proc_h_stride_) which the h-grouped SlabbedPmap offsets by each + // slab's group. + const size_type rank = world->rank(); + const bool in_groups = rank < proc_h_ * proc_h_stride_; + const ProcessID grid_offset = + in_groups ? ProcessID((rank / proc_h_stride_) * proc_h_stride_) + : ProcessID(0); + proc_grid_ = TiledArray::detail::ProcGrid( + *world, TiledArray::detail::rank_subset, grid_offset, + proc_h_stride_, M, N, m, n); + + left_.init_distribution( + world, std::make_shared( + *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_, + proc_h_, proc_h_stride_)); + right_.init_distribution( + world, std::make_shared( + *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_, + proc_h_, proc_h_stride_)); + + // Initialize the process map if not already defined + if (!pmap) + pmap = std::make_shared( + *world, proc_grid_.make_pmap(), n_slabs_, proc_h_, + proc_h_stride_); + ExprEngine_::init_distribution(world, pmap); + } } } @@ -914,7 +976,8 @@ class ContEngine : public BinaryEngine { if (!general_repermute_) { std::shared_ptr pimpl = std::make_shared( left, right, *world_, trange_, shape_, pmap_, perm_, - batched_op_type(op_, n_fused_modes_), K_, proc_grid_, n_slabs_); + batched_op_type(op_, n_fused_modes_), K_, proc_grid_, n_slabs_, + proc_h_, proc_h_stride_); return dist_eval_type(pimpl); } @@ -935,12 +998,16 @@ class ContEngine : public BinaryEngine { // the inner Summa's result placement must be slab-replicated (the owner // of a tile independent of its slab index), regardless of the // (target-layout) pmap the consumer supplied for this node - auto canonical_pmap = std::make_shared( - *world_, proc_grid_.make_pmap(), n_slabs_); + auto canonical_pmap = + proc_h_ == 1ul ? std::make_shared( + *world_, proc_grid_.make_pmap(), n_slabs_) + : std::make_shared( + *world_, proc_grid_.make_pmap(), n_slabs_, proc_h_, + proc_h_stride_); std::shared_ptr pimpl = std::make_shared( left, right, *world_, canonical_trange, canonical_shape, canonical_pmap, BipartitePermutation{}, batched_op_type(op_, n_fused_modes_), K_, - proc_grid_, n_slabs_); + proc_grid_, n_slabs_, proc_h_, proc_h_stride_); dist_eval_type canonical(pimpl); typedef TiledArray::detail::UnaryEvalImpl< diff --git a/src/TiledArray/pmap/slabbed_pmap.h b/src/TiledArray/pmap/slabbed_pmap.h index a345eccb6a..c4e6d9b4b5 100644 --- a/src/TiledArray/pmap/slabbed_pmap.h +++ b/src/TiledArray/pmap/slabbed_pmap.h @@ -44,6 +44,16 @@ namespace detail { /// (Hadamard/batch) modes as their leading dimensions, every slab is /// distributed identically over the same 2-d process grid, and the slab /// index never participates in inter-process communication patterns. +/// The 3-d-grid variant adds a process dimension \c proc_h along the slab +/// (h) axis: the world's first `proc_h * proc_h_stride` ranks are partitioned +/// into `proc_h` contiguous h-planes of `proc_h_stride` ranks each; slab +/// \f$ h \f$ belongs to plane \f$ h \% proc\_h \f$, and within its plane is +/// distributed by the base map, whose owners must then be PLANE-LOCAL ranks +/// in \f$ [0, proc\_h\_stride) \f$: the owner of index \f$ o \f$ is +/// \f$ (h \% proc\_h) \cdot proc\_h\_stride + base(o \% S) \f$. +/// `proc_h == 1` reduces to the slab-replicated map above (and is constructed +/// via the 2-argument-base constructor, which delegates locality to the base +/// map). class SlabbedPmap : public Pmap { protected: // Import Pmap protected variables @@ -56,6 +66,14 @@ class SlabbedPmap : public Pmap { const std::shared_ptr base_; ///< The per-slab base map const size_type slab_size_; ///< The number of indices per slab const size_type nslabs_; ///< The number of slabs + const size_type proc_h_ = 1; ///< Process-grid extent along the slab + ///< (h) axis (the number of h-planes) + const size_type proc_h_stride_ = 0; ///< Ranks per h-plane (0 = the whole + ///< world; base owners are world ranks + ///< and locality delegates to the base) + + /// \return whether this map distributes the slab axis over a process plane + bool hgrouped() const { return proc_h_stride_ != 0ul; } public: typedef Pmap::size_type size_type; ///< Size type @@ -76,6 +94,49 @@ class SlabbedPmap : public Pmap { this->local_size_ = base_->local_size() * nslabs_; } + /// Construct an h-grouped (3-d grid) slabbed process map + + /// \param world The world where the tiles will be mapped + /// \param base The base process map, defined over one slab; its owners + /// must be GROUP-LOCAL ranks in [0, proc_h_stride) + /// \param nslabs The number of slabs + /// \param proc_h The number of slab groups (slab h -> group h % proc_h) + /// \param proc_h_stride The number of (contiguous) world ranks per group + SlabbedPmap(World& world, std::shared_ptr base, + const size_type nslabs, const size_type proc_h, + const size_type proc_h_stride) + : Pmap(world, base->size() * nslabs), + base_(std::move(base)), + slab_size_(base_->size()), + nslabs_(nslabs), + proc_h_(proc_h), + proc_h_stride_(proc_h_stride) { + TA_ASSERT(base_); + TA_ASSERT(nslabs_ > 0ul); + TA_ASSERT(proc_h_ > 0ul); + TA_ASSERT(proc_h_stride_ > 0ul); + TA_ASSERT(proc_h_ * proc_h_stride_ <= size_type(world.size())); + + // this rank's group and group-local rank; ranks beyond the grouped + // prefix of the world own nothing + const size_type rank = world.rank(); + if (rank < proc_h_ * proc_h_stride_) { + const size_type my_group = rank / proc_h_stride_; + const size_type my_group_rank = rank % proc_h_stride_; + // count of my group's slabs: h in [0, nslabs) with h % proc_h == + // my_group + const size_type my_slabs = + (nslabs_ / proc_h_) + (my_group < (nslabs_ % proc_h_) ? 1u : 0u); + // count of slab indices the base assigns to my group-local rank + size_type base_local = 0ul; + for (size_type j = 0ul; j < slab_size_; ++j) + if (base_->owner(j) == my_group_rank) ++base_local; + this->local_size_ = my_slabs * base_local; + } else { + this->local_size_ = 0ul; + } + } + virtual ~SlabbedPmap() {} /// \return the per-slab base map @@ -84,6 +145,8 @@ class SlabbedPmap : public Pmap { size_type slab_size() const { return slab_size_; } /// \return the number of slabs size_type nslabs() const { return nslabs_; } + /// \return the number of slab (h) groups + size_type proc_h() const { return proc_h_; } /// Maps \c tile to the process that owns it @@ -91,7 +154,9 @@ class SlabbedPmap : public Pmap { /// \return Process that logically owns \c tile virtual size_type owner(const size_type tile) const { TA_ASSERT(tile < size_); - return base_->owner(tile % slab_size_); + if (!hgrouped()) return base_->owner(tile % slab_size_); + const size_type group = (tile / slab_size_) % proc_h_; + return group * proc_h_stride_ + base_->owner(tile % slab_size_); } /// Check that the tile is owned by this process @@ -99,10 +164,13 @@ class SlabbedPmap : public Pmap { /// \param tile The tile to be checked /// \return \c true if \c tile is owned by this process, otherwise \c false virtual bool is_local(const size_type tile) const { - return base_->is_local(tile % slab_size_); + if (!hgrouped()) return base_->is_local(tile % slab_size_); + return owner(tile) == size_type(rank_); } - virtual bool known_local_size() const { return base_->known_local_size(); } + virtual bool known_local_size() const { + return hgrouped() ? true : base_->known_local_size(); + } virtual const_iterator begin() const { return Iterator(*this, 0ul, size_, 0ul, /*checking=*/true); diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h index cd15c1b73e..758e7fcefb 100644 --- a/src/TiledArray/proc_grid.h +++ b/src/TiledArray/proc_grid.h @@ -55,6 +55,14 @@ namespace detail { /// \f] /// where the positive, real root of \f$P_{\rm{row}}\f$ give the optimal /// optimal communication time. + +/// Tag to disambiguate the rank-subset ProcGrid constructor from the +/// (same-arity) test-only constructor. +struct rank_subset_t { + explicit rank_subset_t() = default; +}; +inline constexpr rank_subset_t rank_subset{}; + class ProcGrid { public: typedef uint_fast32_t size_type; @@ -71,9 +79,12 @@ class ProcGrid { ///< may be less than the number of processes in world. ProcessID rank_row_; ///< This process's row in the process grid ProcessID rank_col_; ///< This process's column in the process grid - size_type local_rows_; ///< The number of local element rows - size_type local_cols_; ///< The number of local element columns - size_type local_size_; ///< Number of local elements + ProcessID rank_offset_ = 0; ///< World rank of the grid's first process + ///< (nonzero for a grid over a contiguous + ///< subset of the world's ranks) + size_type local_rows_; ///< The number of local element rows + size_type local_cols_; ///< The number of local element columns + size_type local_size_; ///< Number of local elements /// Compute the number of process rows that minimizes communication @@ -180,14 +191,16 @@ class ProcGrid { proc_cols_ = 1u; proc_size_ = 1u; - // Set this process rank - rank_row_ = 0; - rank_col_ = 0; + if (rank < proc_size_) { + // Set this process rank + rank_row_ = 0; + rank_col_ = 0; - // Set local counts - local_rows_ = rows_; - local_cols_ = cols_; - local_size_ = size_; + // Set local counts + local_rows_ = rows_; + local_cols_ = cols_; + local_size_ = size_; + } } else if (size_ <= nprocs) { // Max one tile per process @@ -291,6 +304,48 @@ class ProcGrid { init(world_->rank(), world_->size(), row_size, col_size); } + /// Construct a process grid over a contiguous subset of the world's ranks + + /// The grid spans world ranks [rank_offset, rank_offset + nprocs); ranks + /// outside that interval construct a valid "not in the grid" instance + /// (zero local sizes, empty groups). Used by the h-grouped (3-d) batched + /// Summa, where each fused-index slab group runs its own 2-d grid. + /// \param world The world where the process grid will live + /// \param rank_offset The world rank of the grid's first process + /// \param nprocs The number of processes spanned by the grid + /// \param rows The number of tile rows + /// \param cols The number of tile columns + /// \param row_size The number of element rows + /// \param col_size The number of element columns + ProcGrid(World& world, rank_subset_t, const ProcessID rank_offset, + const size_type nprocs, const size_type rows, const size_type cols, + const std::size_t row_size, const std::size_t col_size) + : world_(&world), + rows_(rows), + cols_(cols), + size_(rows_ * cols_), + proc_rows_(0ul), + proc_cols_(0ul), + proc_size_(0ul), + rank_row_(-1), + rank_col_(-1), + rank_offset_(rank_offset), + local_rows_(0ul), + local_cols_(0ul), + local_size_(0ul) { + TA_ASSERT(rank_offset >= 0); + TA_ASSERT(nprocs >= 1u); + TA_ASSERT(rank_offset + nprocs <= size_type(world.size())); + const auto world_rank = world.rank(); + // out-of-grid ranks pass rank == nprocs, which every init() branch + // treats as "not in the grid" + const size_type rank = (world_rank >= rank_offset && + world_rank < rank_offset + ProcessID(nprocs)) + ? world_rank - rank_offset + : nprocs; + init(rank, nprocs, row_size, col_size); + } + #ifdef TILEDARRAY_ENABLE_TEST_PROC_GRID // Note: The following function is here for testing purposes only. It // has the same functionality as the default constructor above, except the @@ -350,6 +405,7 @@ class ProcGrid { proc_size_(other.proc_size_), rank_row_(other.rank_row_), rank_col_(other.rank_col_), + rank_offset_(other.rank_offset_), local_rows_(other.local_rows_), local_cols_(other.local_cols_), local_size_(other.local_size_) {} @@ -367,6 +423,7 @@ class ProcGrid { proc_size_ = other.proc_size_; rank_row_ = other.rank_row_; rank_col_ = other.rank_col_; + rank_offset_ = other.rank_offset_; local_rows_ = other.local_rows_; local_cols_ = other.local_cols_; local_size_ = other.local_size_; @@ -447,7 +504,7 @@ class ProcGrid { // Populate the row process list size_type p = rank_row_ * proc_cols_; const size_type row_end = p + proc_cols_; - for (; p < row_end; ++p) proc_list.push_back(p); + for (; p < row_end; ++p) proc_list.push_back(p + rank_offset_); // Construct the group group = madness::Group(*world_, proc_list, did); @@ -472,7 +529,7 @@ class ProcGrid { // Populate the column process list for (size_type p = rank_col_; p < proc_size_; p += proc_cols_) - proc_list.push_back(p); + proc_list.push_back(p + rank_offset_); // Construct the group if (proc_list.size() != 0) @@ -489,7 +546,7 @@ class ProcGrid { /// (row,rank_col) ProcessID map_row(const size_type row) const { TA_ASSERT(row < proc_rows_); - return rank_col_ + row * proc_cols_; + return rank_col_ + row * proc_cols_ + rank_offset_; } /// Map a column to the process in this process's row @@ -499,7 +556,7 @@ class ProcGrid { /// (rank_row,col) ProcessID map_col(const size_type col) const { TA_ASSERT(col < proc_cols_); - return rank_row_ * proc_cols_ + col; + return rank_row_ * proc_cols_ + col + rank_offset_; } /// Construct a cyclic process diff --git a/tests/general_product.cpp b/tests/general_product.cpp index 58976f87dc..26316784d8 100644 --- a/tests/general_product.cpp +++ b/tests/general_product.cpp @@ -8,6 +8,8 @@ #include "tiledarray.h" #include "unit_test_config.h" +#include + BOOST_AUTO_TEST_SUITE(general_product_suite, TA_UT_LABEL_SERIAL) namespace TA = TiledArray; @@ -1342,3 +1344,146 @@ BOOST_AUTO_TEST_CASE(einsum_expression_route_matches_legacy) { } BOOST_AUTO_TEST_SUITE_END() + +// Distributed (np > 1) coverage of general products. Unlike +// general_product_suite (serial-labeled: classification/optimizer unit tests +// + np=1 evaluation), this suite carries NO label, so the CI harness runs it +// at BOTH np=1 and np=2 (see tests/CMakeLists.txt: np-1 excludes @distributed, +// np-2 excludes @serial). It exercises the batched Summa across ranks -- a +// path the serial suite never covered. Most cases differential-test the +// expression route against the legacy sub-World einsum oracle; the exception +// is dist_inner_node_thc, which checks a single fused expression against a +// reference built from explicit binary intermediates (the same expression +// machinery), validating that nested general products at inner nodes agree +// with the step-by-step evaluation. +// +// Most cases evaluate via the ordinary 2-d (proc_h == 1) batched Summa; +// dist_no_externals_3d_grid exercises the 3-d (proc_h > 1) grid, where the +// heuristic spreads ranks over the slab dimension because M == N == 1. +BOOST_AUTO_TEST_SUITE(general_product_distributed_suite) + +// the fixtures/helpers live in general_product_suite's anonymous namespace +using general_product_suite::diff_norm; +using general_product_suite::diff_norm_sp; +using general_product_suite::ForceLegacyEinsum; +using general_product_suite::make_patterned_array; +using general_product_suite::make_patterned_sparse_array; +using general_product_suite::make_patterned_tot_array; +using general_product_suite::TArrayToT; +using general_product_suite::tot_max_abs_diff; + +BOOST_AUTO_TEST_CASE(dist_dense) { + // batched general product: b fused, j contracted, i/k free + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr_a{{0, 2, 4}, {0, 2, 3}, {0, 2, 5}}; // b, i, j + TA::TiledRange tr_b{{0, 2, 4}, {0, 2, 5}, {0, 3, 4}}; // b, j, k + auto a = make_patterned_array(world, tr_a, 1.0); + auto b = make_patterned_array(world, tr_b, 2.0); + TA::TArrayD c; + BOOST_REQUIRE_NO_THROW(c("b,i,k") = a("b,i,j") * b("b,j,k")); + auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k"), "b,i,k"); + BOOST_CHECK_SMALL(diff_norm(c, c_ref, "b,i,k"), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_sparse) { + // block-sparse batched general product (SparseShape::gemm_batched + + // the sparse (h,k)-keyed Summa groups, across ranks) + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr_a{{0, 2, 5}, {0, 3, 4}, {0, 2, 6, 7}}; // b, i, j + TA::TiledRange tr_b{{0, 2, 5}, {0, 2, 6, 7}, {0, 4, 5}}; // b, j, k + auto a = make_patterned_sparse_array(world, tr_a, 1.0, 3); + auto b = make_patterned_sparse_array(world, tr_b, 2.0, 4); + TA::TSpArrayD c; + BOOST_REQUIRE_NO_THROW(c("b,i,k") = a("b,i,j") * b("b,j,k")); + auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k"), "b,i,k"); + BOOST_CHECK_SMALL(diff_norm_sp(c, c_ref, "b,i,k"), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_tot_mixed) { + // mixed T x ToT general product (inner Scale) across ranks + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr_a{{0, 2, 4}, {0, 2, 3}, {0, 2, 5}}; // b, i, j + TA::TiledRange tr_b{{0, 2, 4}, {0, 2, 5}, {0, 3, 4}}; // b, j, k + auto a = make_patterned_array(world, tr_a, 1.0); + auto b = make_patterned_tot_array(world, tr_b, {3}, 2.0); + TArrayToT c; + BOOST_REQUIRE_NO_THROW(c("b,i,k;m") = a("b,i,j") * b("b,j,k;m")); + auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k;m"), "b,i,k;m"); + BOOST_CHECK_SMALL(tot_max_abs_diff(c, c_ref), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_no_externals_dense) { + // no-external general product (Hadamard reduction shape) across ranks; + // the synthetic unit left-external mode handling under distribution + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr{{0, 2, 4}, {0, 3, 5}}; // i, j + auto a = make_patterned_array(world, tr, 1.0); + auto b = make_patterned_array(world, tr, 2.0); + TA::TArrayD c; + BOOST_REQUIRE_NO_THROW(c("i") = a("i,j") * b("i,j")); + auto c_ref = TA::einsum(a("i,j"), b("i,j"), "i"); + BOOST_CHECK_SMALL(diff_norm(c, c_ref, "i"), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_no_externals_tot) { + // no-external ToT general product across ranks + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr{{0, 3, 5}, {0, 2, 4}, {0, 2, 3}}; // x, i, j + auto a = make_patterned_tot_array(world, tr, {2}, 1.0); + auto b = make_patterned_tot_array(world, tr, {3}, 2.0); + TArrayToT c; + BOOST_REQUIRE_NO_THROW(c("i,j;a,b") = a("x,i,j;a") * b("x,i,j;b")); + auto c_ref = TA::einsum(a("x,i,j;a"), b("x,i,j;b"), "i,j;a,b"); + BOOST_CHECK_SMALL(tot_max_abs_diff(c, c_ref), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_inner_node_thc) { + // THC reconstruction in one expression (general products at inner nodes) + // across ranks + auto& world = TA::get_default_world(); + TA::TiledRange tr_x{{0, 2, 4}, {0, 3, 5}}; // orbital x auxiliary + TA::TiledRange tr_z{{0, 3, 5}, {0, 3, 5}}; // auxiliary x auxiliary + auto x = make_patterned_array(world, tr_x, 1.0); + auto z = make_patterned_array(world, tr_z, 2.0); + TA::TArrayD g; + BOOST_REQUIRE_NO_THROW(g("p,q,r,s") = x("p,r1") * x("q,r1") * z("r1,r2") * + x("r,r2") * x("s,r2")); + TA::TArrayD i1, i2, i3, g_ref; + i1("r1,p,q") = x("p,r1") * x("q,r1"); + i2("p,q,r2") = i1("r1,p,q") * z("r1,r2"); + i3("r2,p,q,r") = i2("p,q,r2") * x("r,r2"); + g_ref("p,q,r,s") = i3("r2,p,q,r") * x("s,r2"); + BOOST_CHECK_SMALL(diff_norm(g, g_ref, "p,q,r,s"), 1e-10); +} + +BOOST_AUTO_TEST_CASE(dist_no_externals_3d_grid) { + // a no-external product with several slabs: at np>1 the heuristic engages + // the 3-d grid (M=N=1 ⇒ proc_h > 1, the ranks spread over the slab axis), + // so the result tiles distribute across the h-planes rather than piling on + // one rank. Exercises the cross-plane result-tile transfer. + auto& world = TA::get_default_world(); + ForceLegacyEinsum legacy_oracle; + TA::TiledRange tr{{0, 2, 4, 6}, {0, 3, 5}}; // i (3 tiles), j + auto a = make_patterned_array(world, tr, 1.0); + auto b = make_patterned_array(world, tr, 2.0); + TA::TArrayD c; + BOOST_REQUIRE_NO_THROW(c("i") = a("i,j") * b("i,j")); + auto c_ref = TA::einsum(a("i,j"), b("i,j"), "i"); + BOOST_CHECK_SMALL(diff_norm(c, c_ref, "i"), 1e-10); + + // at np>1 the no-external result must NOT all live on one rank (the + // degeneracy the 3-d grid fixes) + if (world.size() > 1) { + std::set owners; + for (std::size_t o = 0; o < c.trange().tiles_range().volume(); ++o) + owners.insert(c.pmap()->owner(o)); + BOOST_CHECK_GT(owners.size(), 1u); + } +} + +BOOST_AUTO_TEST_SUITE_END()