diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h
index e2b9c8c731..aef9b0e853 100644
--- a/src/TiledArray/dist_eval/contraction_eval.h
+++ b/src/TiledArray/dist_eval/contraction_eval.h
@@ -127,11 +127,65 @@ class Summa
   const ordinal_type
       right_stride_local_;  ///< stride for local right row iterators
 
+  // 3-d (h-grouped) grid information. The world's first
+  // proc_h_ * proc_h_stride ranks are partitioned into proc_h_ contiguous
+  // groups; slab h belongs to group h % proc_h_, and each group runs its
+  // own 2-d SUMMA grid (proc_grid_ is this rank's GROUP-LOCAL grid,
+  // constructed over the group's rank interval). proc_h_ == 1 is the
+  // ordinary shared-grid batched contraction.
+  const ordinal_type proc_h_;  ///< Number of slab (h) groups
+  const ordinal_type
+      proc_h_stride_;              ///< World ranks per slab group (the
+                                   ///< group of slab h spans world ranks
+                                   ///< [(h % proc_h_) * proc_h_stride_, ...))
+  const ordinal_type first_slab_;  ///< This rank's group's first slab (== its
+                                   ///< group index), or nh_ if this rank is
+                                   ///< in no group (idle for this eval)
+  const ordinal_type my_slabs_;    ///< Number of slabs of this rank's group
+
+  /// \return the world rank that owns result tile \p i: the within-group
+  /// owner (from the group-local process grid) shifted by the world-rank
+  /// offset of the group that owns \p i's slab. For proc_h_ == 1 the offset
+  /// is 0 and this is the ordinary cyclic owner.
+  ProcessID result_tile_owner(const ordinal_type i) const {
+    const ordinal_type source_index = DistEvalImpl_::perm_index_to_source(i);
+    // owner is independent of slab index *within a group*
+    const ordinal_type slab_index = source_index % result_slab_size_;
+    const ordinal_type tile_row = slab_index / proc_grid_.cols();
+    const ordinal_type tile_col = slab_index % proc_grid_.cols();
+    const ordinal_type proc_row = tile_row % proc_grid_.proc_rows();
+    const ordinal_type proc_col = tile_col % proc_grid_.proc_cols();
+    const ProcessID within_group = proc_row * proc_grid_.proc_cols() + proc_col;
+    // shift by the offset of the group that owns this tile's slab
+    const ordinal_type slab = source_index / result_slab_size_;
+    const ordinal_type group = (proc_h_ > 1ul) ? (slab % proc_h_) : 0ul;
+    return ProcessID(group * proc_h_stride_) + within_group;
+  }
+
   /// \return the slab index of SUMMA step \p s
   ordinal_type step_h(const ordinal_type s) const { return s / k_; }
   /// \return the within-slab inner-dimension index of SUMMA step \p s
   ordinal_type step_k(const ordinal_type s) const { return s % k_; }
 
+  /// \return the smallest SUMMA step >= \p s that belongs to one of this
+  /// rank's group's slabs, or nsteps_ if there is none
+  ordinal_type next_step(ordinal_type s) const {
+    if (proc_h_ == 1ul) return std::min(s, nsteps_);
+    if (first_slab_ >= nh_) return nsteps_;  // not in any group
+    while (s < nsteps_ && (step_h(s) % proc_h_) != first_slab_)
+      s = (step_h(s) + 1ul) * k_;  // jump to the start of the next slab
+    return std::min(s, nsteps_);
+  }
+
+  /// \return this rank's group-local ordinal of slab \p h (which must
+  /// belong to this rank's group)
+  ordinal_type slab_ord(const ordinal_type h) const {
+    return (h - first_slab_) / proc_h_;
+  }
+
+  /// \return the number of SUMMA steps of this rank's group's slabs
+  ordinal_type my_steps() const { return my_slabs_ * k_; }
+
   typedef Future<typename right_type::eval_type>
       right_future;  ///< Future to a right-hand argument tile
   typedef Future<typename left_type::eval_type>
@@ -225,7 +279,7 @@ class Summa
   /// \param end The end of the row or column range
   /// \param stride The row or column index stride
   /// \param k The broadcast group index
-  /// \param max_group_size The maximum number of processes in the result
+  /// \param max_proc_h_stride The maximum number of processes in the result
   /// group, which is equal to the number of process in this process row or
   /// column as defined by \c proc_grid_.
   /// \param key_offset The key that will be used to identify the process group
@@ -238,21 +292,21 @@ class Summa
                             const std::vector<bool>& process_mask,
                             ordinal_type index, const ordinal_type end,
                             const ordinal_type stride,
-                            const ordinal_type max_group_size,
+                            const ordinal_type max_proc_h_stride,
                             const ordinal_type k, const ordinal_type key_offset,
                             const ProcMap& proc_map) const {
     // Generate the list of processes in rank_row
-    std::vector<ProcessID> proc_list(max_group_size, -1);
+    std::vector<ProcessID> proc_list(max_proc_h_stride, -1);
 
     // Flag the root processes of the broadcast, which may not be included
     // by shape.
-    ordinal_type p = k % max_group_size;
+    ordinal_type p = k % max_proc_h_stride;
     proc_list[p] = proc_map(p);
     ordinal_type count = 1ul;
 
     // Flag all processes that have non-zero tiles
-    for (p = 0ul; (index < end) && (count < max_group_size);
-         index += stride, p = (p + 1u) % max_group_size) {
+    for (p = 0ul; (index < end) && (count < max_proc_h_stride);
+         index += stride, p = (p + 1u) % max_proc_h_stride) {
       if ((proc_list[p] != -1) || (shape.is_zero(index)) || !process_mask.at(p))
         continue;
 
@@ -734,7 +788,7 @@ class Summa
     // broadcast root (i.e. within-slab k congruent to rank_col mod Pcols)
     const ordinal_type Pcols = proc_grid_.proc_cols();
 
-    for (; s < end; ++s) {
+    for (s = next_step(s); s < end; s = next_step(s + 1ul)) {
       const ordinal_type k = step_k(s);
       if (k % Pcols != static_cast<ordinal_type>(proc_grid_.rank_col()))
         continue;
@@ -787,7 +841,7 @@ class Summa
     // broadcast root (i.e. within-slab k congruent to rank_row mod Prows)
     const ordinal_type Prows = proc_grid_.proc_rows();
 
-    for (; s < end; ++s) {
+    for (s = next_step(s); s < end; s = next_step(s + 1ul)) {
       const ordinal_type k = step_k(s);
       if (k % Prows != static_cast<ordinal_type>(proc_grid_.rank_row()))
         continue;
@@ -847,9 +901,9 @@ class Summa
   /// \return The first step, greater than or equal to \c s with non-zero
   /// tiles, or \c nsteps_ if none is found.
   ordinal_type iterate_row(ordinal_type s) const {
-    // Iterate over steps until a non-zero tile is found or the end of the
-    // matrix is reached.
-    for (; s < nsteps_; ++s) {
+    // Iterate over this rank's group's steps until a non-zero tile is found
+    // or the end of the matrix is reached.
+    for (s = next_step(s); s < nsteps_; s = next_step(s + 1ul)) {
       // Search for non-zero tiles in row k of slab h of right
       ordinal_type i =
           step_h(s) * right_slab_size_ + step_k(s) * proc_grid_.cols();
@@ -871,9 +925,9 @@ class Summa
   /// \return The first step, greater than or equal to \c s, that contains
   /// a non-zero tile. If no non-zero tile is not found, return \c nsteps_.
   ordinal_type iterate_col(ordinal_type s) const {
-    // Iterate over steps until a non-zero tile is found or the end of the
-    // matrix is reached.
-    for (; s < nsteps_; ++s) {
+    // Iterate over this rank's group's steps until a non-zero tile is found
+    // or the end of the matrix is reached.
+    for (s = next_step(s); s < nsteps_; s = next_step(s + 1ul)) {
       // Search column k of slab h for non-zero tiles
       const ordinal_type base = step_h(s) * left_slab_size_;
       for (ordinal_type i = base + left_start_local_ + step_k(s);
@@ -963,11 +1017,17 @@ class Summa
       return tile_count;
     } else {
       // Construct static broadcast groups for dense arguments
-      // (key space [0, 2*nsteps_) is reserved for the sparse per-step groups)
-      const madness::DistributedID col_did(DistEvalImpl_::id(), 2ul * nsteps_);
+      // (key space [0, 2*nsteps_) is reserved for the sparse per-step groups,
+      // whose keys h*k_ and h*k_+nsteps_ are disjoint across h-groups; the
+      // two static keys are offset PAST that range and made group-unique so
+      // that two different groups' single-grid static groups never claim the
+      // same DistributedID with inconsistent membership)
+      const std::size_t static_key_base = 2ul * nsteps_ + 2ul * first_slab_;
+      const madness::DistributedID col_did(DistEvalImpl_::id(),
+                                           static_key_base);
       col_group_ = proc_grid_.make_col_group(col_did);
       const madness::DistributedID row_did(DistEvalImpl_::id(),
-                                           2ul * nsteps_ + 1ul);
+                                           static_key_base + 1ul);
       row_group_ = proc_grid_.make_row_group(row_did);
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
@@ -985,12 +1045,12 @@ class Summa
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
       // Allocate memory for the reduce pair tasks (one per local result tile
-      // per slab).
+      // per slab of this rank's group).
       std::allocator<ReducePairTask<op_type>> alloc;
-      reduce_tasks_ = alloc.allocate(nh_ * proc_grid_.local_size());
+      reduce_tasks_ = alloc.allocate(my_slabs_ * proc_grid_.local_size());
 
       // Iterate over all local tiles
-      const ordinal_type n = nh_ * proc_grid_.local_size();
+      const ordinal_type n = my_slabs_ * proc_grid_.local_size();
       for (ordinal_type t = 0ul; t < n; ++t) {
         // Initialize the reduction task
         ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
@@ -1019,9 +1079,9 @@ class Summa
     if (k_ == 0) return 0;
 
     // Allocate memory for the reduce pair tasks (one per local result tile
-    // per slab).
+    // per slab of this rank's group).
     std::allocator<ReducePairTask<op_type>> alloc;
-    reduce_tasks_ = alloc.allocate(nh_ * proc_grid_.local_size());
+    reduce_tasks_ = alloc.allocate(my_slabs_ * proc_grid_.local_size());
 
     // Initialize iteration variables
     const ordinal_type col_stride =  // The stride to iterate down a column
@@ -1030,10 +1090,11 @@ class Summa
         proc_grid_.proc_cols();
 
     // Iterate over all local tiles, slab by slab (the block-cyclic phase
-    // restarts at every slab: the owner of tile (h,i,j) does not depend on h)
+    // restarts at every slab: within a group, the owner of tile (h,i,j)
+    // does not depend on h)
     ordinal_type tile_count = 0ul;
     ReducePairTask<op_type>* MADNESS_RESTRICT reduce_task = reduce_tasks_;
-    for (ordinal_type h = 0ul; h < nh_; ++h) {
+    for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) {
       const ordinal_type slab_base = h * result_slab_size_;
       ordinal_type row_start =
           slab_base + proc_grid_.rank_row() * proc_grid_.cols();
@@ -1103,7 +1164,7 @@ class Summa
 
     // Iterate over all local tiles, slab by slab
     ReducePairTask<op_type>* reduce_task = reduce_tasks_;
-    for (ordinal_type h = 0ul; h < nh_; ++h) {
+    for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) {
       const ordinal_type slab_base = h * result_slab_size_;
       ordinal_type row_start =
           slab_base + proc_grid_.rank_row() * proc_grid_.cols();
@@ -1126,7 +1187,7 @@ class Summa
 
     // Deallocate the memory for the reduce pair tasks.
     std::allocator<ReducePairTask<op_type>>().deallocate(
-        reduce_tasks_, nh_ * proc_grid_.local_size());
+        reduce_tasks_, my_slabs_ * proc_grid_.local_size());
   }
 
   /// Set the result tiles and destroy reduce tasks
@@ -1145,7 +1206,7 @@ class Summa
 
     // Iterate over all local tiles, slab by slab
     ReducePairTask<op_type>* reduce_task = reduce_tasks_;
-    for (ordinal_type h = 0ul; h < nh_; ++h) {
+    for (ordinal_type h = first_slab_; h < nh_; h += proc_h_) {
       const ordinal_type slab_base = h * result_slab_size_;
       ordinal_type row_start =
           slab_base + proc_grid_.rank_row() * proc_grid_.cols();
@@ -1177,7 +1238,7 @@ class Summa
     }
     // Deallocate the memory for the reduce pair tasks.
     std::allocator<ReducePairTask<op_type>>().deallocate(
-        reduce_tasks_, nh_ * proc_grid_.local_size());
+        reduce_tasks_, my_slabs_ * proc_grid_.local_size());
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE
     ss << "}\n";
@@ -1229,9 +1290,10 @@ class Summa
                 const std::vector<col_datum>& col,
                 const std::vector<row_datum>& row,
                 madness::TaskInterface* const task) {
-    // The reduce tasks of slab h occupy
-    // [h * local_size, (h+1) * local_size)
-    const ordinal_type slab_offset = step_h(s) * proc_grid_.local_size();
+    // The reduce tasks of this group's slab h occupy
+    // [slab_ord(h) * local_size, (slab_ord(h)+1) * local_size)
+    const ordinal_type slab_offset =
+        slab_ord(step_h(s)) * proc_grid_.local_size();
 
     // Iterate over the row
     for (ordinal_type i = 0ul; i < col.size(); ++i) {
@@ -1266,9 +1328,10 @@ class Summa
                 const std::vector<col_datum>& col,
                 const std::vector<row_datum>& row,
                 madness::TaskInterface* const task) {
-    // The reduce tasks of slab h occupy
-    // [h * local_size, (h+1) * local_size)
-    const ordinal_type slab_offset = step_h(s) * proc_grid_.local_size();
+    // The reduce tasks of this group's slab h occupy
+    // [slab_ord(h) * local_size, (slab_ord(h)+1) * local_size)
+    const ordinal_type slab_offset =
+        slab_ord(step_h(s)) * proc_grid_.local_size();
 
     // Iterate over the row
     for (ordinal_type i = 0ul; i < col.size(); ++i) {
@@ -1452,8 +1515,12 @@ class Summa
 
     template <typename Derived>
     void make_next_step_tasks(Derived* task, ordinal_type depth) {
-      // Set the depth to be no greater than the maximum number steps
-      if (depth > owner_->nsteps_) depth = owner_->nsteps_;
+      // Set the depth to be no greater than the number of SUMMA steps this
+      // rank's group actually executes. In the 2-d (proc_h_ == 1) case this is
+      // nsteps_ (my_slabs_ == nh_); in the 3-d (proc_h_ > 1) case my_steps() <
+      // nsteps_, and clamping to nsteps_ would pre-spawn surplus step tasks
+      // that all resolve to the terminating step (k_ == nsteps_).
+      if (depth > owner_->my_steps()) depth = owner_->my_steps();
 
       // Spawn n=depth step tasks
       for (; depth > 0ul; --depth) {
@@ -1540,13 +1607,14 @@ class Summa
    public:
     DenseStepTask(const std::shared_ptr<Summa_>& owner,
                   const ordinal_type depth)
-        : StepTask(owner, owner->nsteps_ + 1ul), k_(0) {
+        : StepTask(owner, owner->my_steps() + 1ul), k_(owner->next_step(0ul)) {
       StepTask::make_next_step_tasks(this, depth);
-      StepTask::spawn_get_row_col_tasks(k_);
+      if (k_ < owner_->nsteps_) StepTask::spawn_get_row_col_tasks(k_);
     }
 
     DenseStepTask(DenseStepTask* const parent, const int ndep)
-        : StepTask(parent, ndep), k_(parent->k_ + 1ul) {
+        : StepTask(parent, ndep),
+          k_(parent->owner_->next_step(parent->k_ + 1ul)) {
       // Spawn tasks to get k-th row and column tiles
       if (k_ < owner_->nsteps_) StepTask::spawn_get_row_col_tasks(k_);
     }
@@ -1671,7 +1739,8 @@ class Summa
         const trange_type trange, const shape_type& shape,
         const std::shared_ptr<const pmap_interface>& pmap, const Perm& perm,
         const op_type& op, const ordinal_type k, const ProcGrid& proc_grid,
-        const ordinal_type nh = 1ul)
+        const ordinal_type nh = 1ul, const ordinal_type proc_h = 1ul,
+        const ordinal_type proc_h_stride = 0ul)
       : DistEvalImpl_(world, trange, shape, pmap, outer(perm)),
         left_(left),
         right_(right),
@@ -1685,6 +1754,11 @@ class Summa
         left_slab_size_(left.size() / nh),
         right_slab_size_(right.size() / nh),
         result_slab_size_(proc_grid.rows() * proc_grid.cols()),
+        proc_h_(proc_h),
+        proc_h_stride_(proc_h_stride),
+        first_slab_(compute_first_slab(world, nh, proc_h, proc_h_stride)),
+        my_slabs_(first_slab_ < nh ? (nh - first_slab_ + proc_h - 1ul) / proc_h
+                                   : 0ul),
         reduce_tasks_(NULL),
         left_start_local_(proc_grid_.rank_row() * k),
         left_end_(left.size() / nh),
@@ -1703,6 +1777,19 @@ class Summa
     TA_ASSERT(nh_ > 0);
     TA_ASSERT(left.size() % nh_ == 0);
     TA_ASSERT(right.size() % nh_ == 0);
+    TA_ASSERT(proc_h_ > 0);
+    TA_ASSERT(proc_h_ == 1ul || proc_h_stride > 0ul);
+    TA_ASSERT(proc_h_ <= nh_);
+  }
+
+  /// \return this rank's group's first slab (== its group index), or
+  /// \p nh if this rank is outside the grouped rank interval
+  static ordinal_type compute_first_slab(World& world, const ordinal_type nh,
+                                         const ordinal_type proc_h,
+                                         const ordinal_type proc_h_stride) {
+    if (proc_h == 1ul) return 0ul;
+    const auto rank = ordinal_type(world.rank());
+    return (rank < proc_h * proc_h_stride) ? (rank / proc_h_stride) : nh;
   }
 
   virtual ~Summa() {}
@@ -1717,18 +1804,11 @@ class Summa
     TA_ASSERT(TensorImpl_::is_local(i));
     TA_ASSERT(!TensorImpl_::is_zero(i));
 
-    const ordinal_type source_index = DistEvalImpl_::perm_index_to_source(i);
-
-    // Compute tile coordinate in tile grid (the owner of a tile is
-    // independent of its slab index)
-    const ordinal_type slab_index = source_index % result_slab_size_;
-    const ordinal_type tile_row = slab_index / proc_grid_.cols();
-    const ordinal_type tile_col = slab_index % proc_grid_.cols();
-    // Compute process coordinate of tile in the process grid
-    const ordinal_type proc_row = tile_row % proc_grid_.proc_rows();
-    const ordinal_type proc_col = tile_col % proc_grid_.proc_cols();
-    // Compute the process that owns tile
-    const ProcessID source = proc_row * proc_grid_.proc_cols() + proc_col;
+    // The process that owns tile i: the within-group cyclic owner shifted by
+    // the world-rank offset of the tile's slab group (see
+    // result_tile_owner). For proc_h_ == 1 this is the ordinary cyclic
+    // owner over the whole world.
+    const ProcessID source = result_tile_owner(i);
 
     const madness::DistributedID key(DistEvalImpl_::id(), i);
     return TensorImpl_::world().gop.template recv<value_type>(source, key);
diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index fdf5b0da82..1229b00a94 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -185,6 +185,9 @@ class ContEngine : public BinaryEngine<Derived> {
                                     ///< from the canonical result layout, so
                                     ///< the evaluated result is re-permuted
                                     ///< by a streaming unary eval
+  size_type proc_h_ = 1;            ///< process-grid extent along the slab (h)
+                                    ///< axis of the 3-d grid (# of h-planes)
+  size_type proc_h_stride_ = 0;     ///< ranks per h-plane (0 = ungrouped 2-d)
 
   static unsigned int find(const BipartiteIndexList& indices,
                            const std::string& index_label, unsigned int i,
@@ -851,22 +854,81 @@ class ContEngine : public BinaryEngine<Derived> {
           world,
           (pmap ? pmap : policy::default_pmap(*world, n_slabs_ * M * N)));
     } else {
-      // Construct the per-slab process grid.
-      proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n);
-
-      // Initialize children with slab-replicated SUMMA phase maps
-      left_.init_distribution(
-          world, std::make_shared<TiledArray::detail::SlabbedPmap>(
-                     *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_));
-      right_.init_distribution(
-          world, std::make_shared<TiledArray::detail::SlabbedPmap>(
-                     *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_));
-
-      // Initialize the process map if not already defined
-      if (!pmap)
-        pmap = std::make_shared<TiledArray::detail::SlabbedPmap>(
-            *world, proc_grid_.make_pmap(), n_slabs_);
-      ExprEngine_::init_distribution(world, pmap);
+      // Choose the process-grid extent proc_h_ along the slab (h) axis of
+      // the 3-d grid: ranks beyond one-per-result-tile are useless to a
+      // single slab's 2-d SUMMA, so the surplus is spread over the slab
+      // (communication-free) dimension instead -- slab h goes to plane
+      // h % proc_h_ of proc_h_stride_ = P / proc_h_ contiguous ranks (the
+      // division-remainder ranks idle for this evaluation). For a
+      // no-external product (M == N == 1) this degenerates to an effectively
+      // 1-d grid over the slabs; for an ordinary contraction (n_slabs_ == 1)
+      // it is a pure 2-d grid (proc_h_ == 1).
+      // TODO: co-optimize proc_h_ with the 2-d (proc_r, proc_c) aspect ratio
+      //       using the h-, left-external-, and right-external-mode element
+      //       extents (and a per-rank memory bound), rather than the current
+      //       greedy tile-count heuristic.
+      const size_type P = world->size();
+      proc_h_ = 1ul;
+      if (n_slabs_ > 1ul && P > 1ul) {
+        const size_type p2d_cap = std::min<size_type>(P, M * N);
+        proc_h_ = std::min<size_type>(n_slabs_,
+                                      std::max<size_type>(1ul, P / p2d_cap));
+      }
+      // keep the invariant proc_h_ == 1 => proc_h_stride_ == 0 (the ungrouped
+      // 2-d case) so downstream logic can key off either field; for grouped
+      // grids it is the per-plane world-rank count P / proc_h_.
+      proc_h_stride_ = (proc_h_ == 1ul) ? 0ul : P / proc_h_;
+
+      if (proc_h_ == 1ul) {
+        // Construct the per-slab process grid over the whole world.
+        proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n);
+
+        // Initialize children with slab-replicated SUMMA phase maps
+        left_.init_distribution(
+            world, std::make_shared<TiledArray::detail::SlabbedPmap>(
+                       *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_));
+        right_.init_distribution(
+            world, std::make_shared<TiledArray::detail::SlabbedPmap>(
+                       *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_));
+
+        // Initialize the process map if not already defined
+        if (!pmap)
+          pmap = std::make_shared<TiledArray::detail::SlabbedPmap>(
+              *world, proc_grid_.make_pmap(), n_slabs_);
+        ExprEngine_::init_distribution(world, pmap);
+      } else {
+        // Construct this rank's GROUP-LOCAL per-slab process grid (ranks
+        // outside the grouped prefix of the world construct a valid
+        // not-in-grid instance). The grid shape is a pure function of
+        // (proc_h_stride_, M, N, m, n), so it is congruent across all groups,
+        // and the CyclicPmap factories below emit GROUP-LOCAL owners in
+        // [0, proc_h_stride_) which the h-grouped SlabbedPmap offsets by each
+        // slab's group.
+        const size_type rank = world->rank();
+        const bool in_groups = rank < proc_h_ * proc_h_stride_;
+        const ProcessID grid_offset =
+            in_groups ? ProcessID((rank / proc_h_stride_) * proc_h_stride_)
+                      : ProcessID(0);
+        proc_grid_ = TiledArray::detail::ProcGrid(
+            *world, TiledArray::detail::rank_subset, grid_offset,
+            proc_h_stride_, M, N, m, n);
+
+        left_.init_distribution(
+            world, std::make_shared<TiledArray::detail::SlabbedPmap>(
+                       *world, proc_grid_.make_row_phase_pmap(K_), n_slabs_,
+                       proc_h_, proc_h_stride_));
+        right_.init_distribution(
+            world, std::make_shared<TiledArray::detail::SlabbedPmap>(
+                       *world, proc_grid_.make_col_phase_pmap(K_), n_slabs_,
+                       proc_h_, proc_h_stride_));
+
+        // Initialize the process map if not already defined
+        if (!pmap)
+          pmap = std::make_shared<TiledArray::detail::SlabbedPmap>(
+              *world, proc_grid_.make_pmap(), n_slabs_, proc_h_,
+              proc_h_stride_);
+        ExprEngine_::init_distribution(world, pmap);
+      }
     }
   }
 
@@ -914,7 +976,8 @@ class ContEngine : public BinaryEngine<Derived> {
     if (!general_repermute_) {
       std::shared_ptr<impl_type> pimpl = std::make_shared<impl_type>(
           left, right, *world_, trange_, shape_, pmap_, perm_,
-          batched_op_type(op_, n_fused_modes_), K_, proc_grid_, n_slabs_);
+          batched_op_type(op_, n_fused_modes_), K_, proc_grid_, n_slabs_,
+          proc_h_, proc_h_stride_);
       return dist_eval_type(pimpl);
     }
 
@@ -935,12 +998,16 @@ class ContEngine : public BinaryEngine<Derived> {
     // the inner Summa's result placement must be slab-replicated (the owner
     // of a tile independent of its slab index), regardless of the
     // (target-layout) pmap the consumer supplied for this node
-    auto canonical_pmap = std::make_shared<TiledArray::detail::SlabbedPmap>(
-        *world_, proc_grid_.make_pmap(), n_slabs_);
+    auto canonical_pmap =
+        proc_h_ == 1ul ? std::make_shared<TiledArray::detail::SlabbedPmap>(
+                             *world_, proc_grid_.make_pmap(), n_slabs_)
+                       : std::make_shared<TiledArray::detail::SlabbedPmap>(
+                             *world_, proc_grid_.make_pmap(), n_slabs_, proc_h_,
+                             proc_h_stride_);
     std::shared_ptr<impl_type> pimpl = std::make_shared<impl_type>(
         left, right, *world_, canonical_trange, canonical_shape, canonical_pmap,
         BipartitePermutation{}, batched_op_type(op_, n_fused_modes_), K_,
-        proc_grid_, n_slabs_);
+        proc_grid_, n_slabs_, proc_h_, proc_h_stride_);
     dist_eval_type canonical(pimpl);
 
     typedef TiledArray::detail::UnaryEvalImpl<
diff --git a/src/TiledArray/pmap/slabbed_pmap.h b/src/TiledArray/pmap/slabbed_pmap.h
index a345eccb6a..c4e6d9b4b5 100644
--- a/src/TiledArray/pmap/slabbed_pmap.h
+++ b/src/TiledArray/pmap/slabbed_pmap.h
@@ -44,6 +44,16 @@ namespace detail {
 /// (Hadamard/batch) modes as their leading dimensions, every slab is
 /// distributed identically over the same 2-d process grid, and the slab
 /// index never participates in inter-process communication patterns.
+/// The 3-d-grid variant adds a process dimension \c proc_h along the slab
+/// (h) axis: the world's first `proc_h * proc_h_stride` ranks are partitioned
+/// into `proc_h` contiguous h-planes of `proc_h_stride` ranks each; slab
+/// \f$ h \f$ belongs to plane \f$ h \% proc\_h \f$, and within its plane is
+/// distributed by the base map, whose owners must then be PLANE-LOCAL ranks
+/// in \f$ [0, proc\_h\_stride) \f$: the owner of index \f$ o \f$ is
+/// \f$ (h \% proc\_h) \cdot proc\_h\_stride + base(o \% S) \f$.
+/// `proc_h == 1` reduces to the slab-replicated map above (and is constructed
+/// via the 2-argument-base constructor, which delegates locality to the base
+/// map).
 class SlabbedPmap : public Pmap {
  protected:
   // Import Pmap protected variables
@@ -56,6 +66,14 @@ class SlabbedPmap : public Pmap {
   const std::shared_ptr<const Pmap> base_;  ///< The per-slab base map
   const size_type slab_size_;               ///< The number of indices per slab
   const size_type nslabs_;                  ///< The number of slabs
+  const size_type proc_h_ = 1;         ///< Process-grid extent along the slab
+                                       ///< (h) axis (the number of h-planes)
+  const size_type proc_h_stride_ = 0;  ///< Ranks per h-plane (0 = the whole
+                                       ///< world; base owners are world ranks
+                                       ///< and locality delegates to the base)
+
+  /// \return whether this map distributes the slab axis over a process plane
+  bool hgrouped() const { return proc_h_stride_ != 0ul; }
 
  public:
   typedef Pmap::size_type size_type;  ///< Size type
@@ -76,6 +94,49 @@ class SlabbedPmap : public Pmap {
     this->local_size_ = base_->local_size() * nslabs_;
   }
 
+  /// Construct an h-grouped (3-d grid) slabbed process map
+
+  /// \param world The world where the tiles will be mapped
+  /// \param base The base process map, defined over one slab; its owners
+  ///        must be GROUP-LOCAL ranks in [0, proc_h_stride)
+  /// \param nslabs The number of slabs
+  /// \param proc_h The number of slab groups (slab h -> group h % proc_h)
+  /// \param proc_h_stride The number of (contiguous) world ranks per group
+  SlabbedPmap(World& world, std::shared_ptr<const Pmap> base,
+              const size_type nslabs, const size_type proc_h,
+              const size_type proc_h_stride)
+      : Pmap(world, base->size() * nslabs),
+        base_(std::move(base)),
+        slab_size_(base_->size()),
+        nslabs_(nslabs),
+        proc_h_(proc_h),
+        proc_h_stride_(proc_h_stride) {
+    TA_ASSERT(base_);
+    TA_ASSERT(nslabs_ > 0ul);
+    TA_ASSERT(proc_h_ > 0ul);
+    TA_ASSERT(proc_h_stride_ > 0ul);
+    TA_ASSERT(proc_h_ * proc_h_stride_ <= size_type(world.size()));
+
+    // this rank's group and group-local rank; ranks beyond the grouped
+    // prefix of the world own nothing
+    const size_type rank = world.rank();
+    if (rank < proc_h_ * proc_h_stride_) {
+      const size_type my_group = rank / proc_h_stride_;
+      const size_type my_group_rank = rank % proc_h_stride_;
+      // count of my group's slabs: h in [0, nslabs) with h % proc_h ==
+      // my_group
+      const size_type my_slabs =
+          (nslabs_ / proc_h_) + (my_group < (nslabs_ % proc_h_) ? 1u : 0u);
+      // count of slab indices the base assigns to my group-local rank
+      size_type base_local = 0ul;
+      for (size_type j = 0ul; j < slab_size_; ++j)
+        if (base_->owner(j) == my_group_rank) ++base_local;
+      this->local_size_ = my_slabs * base_local;
+    } else {
+      this->local_size_ = 0ul;
+    }
+  }
+
   virtual ~SlabbedPmap() {}
 
   /// \return the per-slab base map
@@ -84,6 +145,8 @@ class SlabbedPmap : public Pmap {
   size_type slab_size() const { return slab_size_; }
   /// \return the number of slabs
   size_type nslabs() const { return nslabs_; }
+  /// \return the number of slab (h) groups
+  size_type proc_h() const { return proc_h_; }
 
   /// Maps \c tile to the process that owns it
 
@@ -91,7 +154,9 @@ class SlabbedPmap : public Pmap {
   /// \return Process that logically owns \c tile
   virtual size_type owner(const size_type tile) const {
     TA_ASSERT(tile < size_);
-    return base_->owner(tile % slab_size_);
+    if (!hgrouped()) return base_->owner(tile % slab_size_);
+    const size_type group = (tile / slab_size_) % proc_h_;
+    return group * proc_h_stride_ + base_->owner(tile % slab_size_);
   }
 
   /// Check that the tile is owned by this process
@@ -99,10 +164,13 @@ class SlabbedPmap : public Pmap {
   /// \param tile The tile to be checked
   /// \return \c true if \c tile is owned by this process, otherwise \c false
   virtual bool is_local(const size_type tile) const {
-    return base_->is_local(tile % slab_size_);
+    if (!hgrouped()) return base_->is_local(tile % slab_size_);
+    return owner(tile) == size_type(rank_);
   }
 
-  virtual bool known_local_size() const { return base_->known_local_size(); }
+  virtual bool known_local_size() const {
+    return hgrouped() ? true : base_->known_local_size();
+  }
 
   virtual const_iterator begin() const {
     return Iterator(*this, 0ul, size_, 0ul, /*checking=*/true);
diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h
index cd15c1b73e..758e7fcefb 100644
--- a/src/TiledArray/proc_grid.h
+++ b/src/TiledArray/proc_grid.h
@@ -55,6 +55,14 @@ namespace detail {
 /// \f]
 /// where the positive, real root of \f$P_{\rm{row}}\f$ give the optimal
 /// optimal communication time.
+
+/// Tag to disambiguate the rank-subset ProcGrid constructor from the
+/// (same-arity) test-only constructor.
+struct rank_subset_t {
+  explicit rank_subset_t() = default;
+};
+inline constexpr rank_subset_t rank_subset{};
+
 class ProcGrid {
  public:
   typedef uint_fast32_t size_type;
@@ -71,9 +79,12 @@ class ProcGrid {
                         ///<  may be less than the number of processes in world.
   ProcessID rank_row_;  ///< This process's row in the process grid
   ProcessID rank_col_;  ///< This process's column in the process grid
-  size_type local_rows_;  ///< The number of local element rows
-  size_type local_cols_;  ///< The number of local element columns
-  size_type local_size_;  ///< Number of local elements
+  ProcessID rank_offset_ = 0;  ///< World rank of the grid's first process
+                               ///< (nonzero for a grid over a contiguous
+                               ///< subset of the world's ranks)
+  size_type local_rows_;       ///< The number of local element rows
+  size_type local_cols_;       ///< The number of local element columns
+  size_type local_size_;       ///< Number of local elements
 
   /// Compute the number of process rows that minimizes communication
 
@@ -180,14 +191,16 @@ class ProcGrid {
       proc_cols_ = 1u;
       proc_size_ = 1u;
 
-      // Set this process rank
-      rank_row_ = 0;
-      rank_col_ = 0;
+      if (rank < proc_size_) {
+        // Set this process rank
+        rank_row_ = 0;
+        rank_col_ = 0;
 
-      // Set local counts
-      local_rows_ = rows_;
-      local_cols_ = cols_;
-      local_size_ = size_;
+        // Set local counts
+        local_rows_ = rows_;
+        local_cols_ = cols_;
+        local_size_ = size_;
+      }
 
     } else if (size_ <= nprocs) {  // Max one tile per process
 
@@ -291,6 +304,48 @@ class ProcGrid {
     init(world_->rank(), world_->size(), row_size, col_size);
   }
 
+  /// Construct a process grid over a contiguous subset of the world's ranks
+
+  /// The grid spans world ranks [rank_offset, rank_offset + nprocs); ranks
+  /// outside that interval construct a valid "not in the grid" instance
+  /// (zero local sizes, empty groups). Used by the h-grouped (3-d) batched
+  /// Summa, where each fused-index slab group runs its own 2-d grid.
+  /// \param world The world where the process grid will live
+  /// \param rank_offset The world rank of the grid's first process
+  /// \param nprocs The number of processes spanned by the grid
+  /// \param rows The number of tile rows
+  /// \param cols The number of tile columns
+  /// \param row_size The number of element rows
+  /// \param col_size The number of element columns
+  ProcGrid(World& world, rank_subset_t, const ProcessID rank_offset,
+           const size_type nprocs, const size_type rows, const size_type cols,
+           const std::size_t row_size, const std::size_t col_size)
+      : world_(&world),
+        rows_(rows),
+        cols_(cols),
+        size_(rows_ * cols_),
+        proc_rows_(0ul),
+        proc_cols_(0ul),
+        proc_size_(0ul),
+        rank_row_(-1),
+        rank_col_(-1),
+        rank_offset_(rank_offset),
+        local_rows_(0ul),
+        local_cols_(0ul),
+        local_size_(0ul) {
+    TA_ASSERT(rank_offset >= 0);
+    TA_ASSERT(nprocs >= 1u);
+    TA_ASSERT(rank_offset + nprocs <= size_type(world.size()));
+    const auto world_rank = world.rank();
+    // out-of-grid ranks pass rank == nprocs, which every init() branch
+    // treats as "not in the grid"
+    const size_type rank = (world_rank >= rank_offset &&
+                            world_rank < rank_offset + ProcessID(nprocs))
+                               ? world_rank - rank_offset
+                               : nprocs;
+    init(rank, nprocs, row_size, col_size);
+  }
+
 #ifdef TILEDARRAY_ENABLE_TEST_PROC_GRID
   // Note: The following function is here for testing purposes only. It
   // has the same functionality as the default constructor above, except the
@@ -350,6 +405,7 @@ class ProcGrid {
         proc_size_(other.proc_size_),
         rank_row_(other.rank_row_),
         rank_col_(other.rank_col_),
+        rank_offset_(other.rank_offset_),
         local_rows_(other.local_rows_),
         local_cols_(other.local_cols_),
         local_size_(other.local_size_) {}
@@ -367,6 +423,7 @@ class ProcGrid {
     proc_size_ = other.proc_size_;
     rank_row_ = other.rank_row_;
     rank_col_ = other.rank_col_;
+    rank_offset_ = other.rank_offset_;
     local_rows_ = other.local_rows_;
     local_cols_ = other.local_cols_;
     local_size_ = other.local_size_;
@@ -447,7 +504,7 @@ class ProcGrid {
       // Populate the row process list
       size_type p = rank_row_ * proc_cols_;
       const size_type row_end = p + proc_cols_;
-      for (; p < row_end; ++p) proc_list.push_back(p);
+      for (; p < row_end; ++p) proc_list.push_back(p + rank_offset_);
 
       // Construct the group
       group = madness::Group(*world_, proc_list, did);
@@ -472,7 +529,7 @@ class ProcGrid {
 
       // Populate the column process list
       for (size_type p = rank_col_; p < proc_size_; p += proc_cols_)
-        proc_list.push_back(p);
+        proc_list.push_back(p + rank_offset_);
 
       // Construct the group
       if (proc_list.size() != 0)
@@ -489,7 +546,7 @@ class ProcGrid {
   /// (row,rank_col)
   ProcessID map_row(const size_type row) const {
     TA_ASSERT(row < proc_rows_);
-    return rank_col_ + row * proc_cols_;
+    return rank_col_ + row * proc_cols_ + rank_offset_;
   }
 
   /// Map a column to the process in this process's row
@@ -499,7 +556,7 @@ class ProcGrid {
   /// (rank_row,col)
   ProcessID map_col(const size_type col) const {
     TA_ASSERT(col < proc_cols_);
-    return rank_row_ * proc_cols_ + col;
+    return rank_row_ * proc_cols_ + col + rank_offset_;
   }
 
   /// Construct a cyclic process
diff --git a/tests/general_product.cpp b/tests/general_product.cpp
index 58976f87dc..26316784d8 100644
--- a/tests/general_product.cpp
+++ b/tests/general_product.cpp
@@ -8,6 +8,8 @@
 #include "tiledarray.h"
 #include "unit_test_config.h"
 
+#include <set>
+
 BOOST_AUTO_TEST_SUITE(general_product_suite, TA_UT_LABEL_SERIAL)
 
 namespace TA = TiledArray;
@@ -1342,3 +1344,146 @@ BOOST_AUTO_TEST_CASE(einsum_expression_route_matches_legacy) {
 }
 
 BOOST_AUTO_TEST_SUITE_END()
+
+// Distributed (np > 1) coverage of general products. Unlike
+// general_product_suite (serial-labeled: classification/optimizer unit tests
+// + np=1 evaluation), this suite carries NO label, so the CI harness runs it
+// at BOTH np=1 and np=2 (see tests/CMakeLists.txt: np-1 excludes @distributed,
+// np-2 excludes @serial). It exercises the batched Summa across ranks -- a
+// path the serial suite never covered. Most cases differential-test the
+// expression route against the legacy sub-World einsum oracle; the exception
+// is dist_inner_node_thc, which checks a single fused expression against a
+// reference built from explicit binary intermediates (the same expression
+// machinery), validating that nested general products at inner nodes agree
+// with the step-by-step evaluation.
+//
+// Most cases evaluate via the ordinary 2-d (proc_h == 1) batched Summa;
+// dist_no_externals_3d_grid exercises the 3-d (proc_h > 1) grid, where the
+// heuristic spreads ranks over the slab dimension because M == N == 1.
+BOOST_AUTO_TEST_SUITE(general_product_distributed_suite)
+
+// the fixtures/helpers live in general_product_suite's anonymous namespace
+using general_product_suite::diff_norm;
+using general_product_suite::diff_norm_sp;
+using general_product_suite::ForceLegacyEinsum;
+using general_product_suite::make_patterned_array;
+using general_product_suite::make_patterned_sparse_array;
+using general_product_suite::make_patterned_tot_array;
+using general_product_suite::TArrayToT;
+using general_product_suite::tot_max_abs_diff;
+
+BOOST_AUTO_TEST_CASE(dist_dense) {
+  // batched general product: b fused, j contracted, i/k free
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr_a{{0, 2, 4}, {0, 2, 3}, {0, 2, 5}};  // b, i, j
+  TA::TiledRange tr_b{{0, 2, 4}, {0, 2, 5}, {0, 3, 4}};  // b, j, k
+  auto a = make_patterned_array(world, tr_a, 1.0);
+  auto b = make_patterned_array(world, tr_b, 2.0);
+  TA::TArrayD c;
+  BOOST_REQUIRE_NO_THROW(c("b,i,k") = a("b,i,j") * b("b,j,k"));
+  auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k"), "b,i,k");
+  BOOST_CHECK_SMALL(diff_norm(c, c_ref, "b,i,k"), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_sparse) {
+  // block-sparse batched general product (SparseShape::gemm_batched +
+  // the sparse (h,k)-keyed Summa groups, across ranks)
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr_a{{0, 2, 5}, {0, 3, 4}, {0, 2, 6, 7}};  // b, i, j
+  TA::TiledRange tr_b{{0, 2, 5}, {0, 2, 6, 7}, {0, 4, 5}};  // b, j, k
+  auto a = make_patterned_sparse_array(world, tr_a, 1.0, 3);
+  auto b = make_patterned_sparse_array(world, tr_b, 2.0, 4);
+  TA::TSpArrayD c;
+  BOOST_REQUIRE_NO_THROW(c("b,i,k") = a("b,i,j") * b("b,j,k"));
+  auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k"), "b,i,k");
+  BOOST_CHECK_SMALL(diff_norm_sp(c, c_ref, "b,i,k"), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_tot_mixed) {
+  // mixed T x ToT general product (inner Scale) across ranks
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr_a{{0, 2, 4}, {0, 2, 3}, {0, 2, 5}};  // b, i, j
+  TA::TiledRange tr_b{{0, 2, 4}, {0, 2, 5}, {0, 3, 4}};  // b, j, k
+  auto a = make_patterned_array(world, tr_a, 1.0);
+  auto b = make_patterned_tot_array(world, tr_b, {3}, 2.0);
+  TArrayToT c;
+  BOOST_REQUIRE_NO_THROW(c("b,i,k;m") = a("b,i,j") * b("b,j,k;m"));
+  auto c_ref = TA::einsum(a("b,i,j"), b("b,j,k;m"), "b,i,k;m");
+  BOOST_CHECK_SMALL(tot_max_abs_diff(c, c_ref), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_no_externals_dense) {
+  // no-external general product (Hadamard reduction shape) across ranks;
+  // the synthetic unit left-external mode handling under distribution
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr{{0, 2, 4}, {0, 3, 5}};  // i, j
+  auto a = make_patterned_array(world, tr, 1.0);
+  auto b = make_patterned_array(world, tr, 2.0);
+  TA::TArrayD c;
+  BOOST_REQUIRE_NO_THROW(c("i") = a("i,j") * b("i,j"));
+  auto c_ref = TA::einsum(a("i,j"), b("i,j"), "i");
+  BOOST_CHECK_SMALL(diff_norm(c, c_ref, "i"), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_no_externals_tot) {
+  // no-external ToT general product across ranks
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr{{0, 3, 5}, {0, 2, 4}, {0, 2, 3}};  // x, i, j
+  auto a = make_patterned_tot_array(world, tr, {2}, 1.0);
+  auto b = make_patterned_tot_array(world, tr, {3}, 2.0);
+  TArrayToT c;
+  BOOST_REQUIRE_NO_THROW(c("i,j;a,b") = a("x,i,j;a") * b("x,i,j;b"));
+  auto c_ref = TA::einsum(a("x,i,j;a"), b("x,i,j;b"), "i,j;a,b");
+  BOOST_CHECK_SMALL(tot_max_abs_diff(c, c_ref), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_inner_node_thc) {
+  // THC reconstruction in one expression (general products at inner nodes)
+  // across ranks
+  auto& world = TA::get_default_world();
+  TA::TiledRange tr_x{{0, 2, 4}, {0, 3, 5}};  // orbital x auxiliary
+  TA::TiledRange tr_z{{0, 3, 5}, {0, 3, 5}};  // auxiliary x auxiliary
+  auto x = make_patterned_array(world, tr_x, 1.0);
+  auto z = make_patterned_array(world, tr_z, 2.0);
+  TA::TArrayD g;
+  BOOST_REQUIRE_NO_THROW(g("p,q,r,s") = x("p,r1") * x("q,r1") * z("r1,r2") *
+                                        x("r,r2") * x("s,r2"));
+  TA::TArrayD i1, i2, i3, g_ref;
+  i1("r1,p,q") = x("p,r1") * x("q,r1");
+  i2("p,q,r2") = i1("r1,p,q") * z("r1,r2");
+  i3("r2,p,q,r") = i2("p,q,r2") * x("r,r2");
+  g_ref("p,q,r,s") = i3("r2,p,q,r") * x("s,r2");
+  BOOST_CHECK_SMALL(diff_norm(g, g_ref, "p,q,r,s"), 1e-10);
+}
+
+BOOST_AUTO_TEST_CASE(dist_no_externals_3d_grid) {
+  // a no-external product with several slabs: at np>1 the heuristic engages
+  // the 3-d grid (M=N=1 ⇒ proc_h > 1, the ranks spread over the slab axis),
+  // so the result tiles distribute across the h-planes rather than piling on
+  // one rank. Exercises the cross-plane result-tile transfer.
+  auto& world = TA::get_default_world();
+  ForceLegacyEinsum legacy_oracle;
+  TA::TiledRange tr{{0, 2, 4, 6}, {0, 3, 5}};  // i (3 tiles), j
+  auto a = make_patterned_array(world, tr, 1.0);
+  auto b = make_patterned_array(world, tr, 2.0);
+  TA::TArrayD c;
+  BOOST_REQUIRE_NO_THROW(c("i") = a("i,j") * b("i,j"));
+  auto c_ref = TA::einsum(a("i,j"), b("i,j"), "i");
+  BOOST_CHECK_SMALL(diff_norm(c, c_ref, "i"), 1e-10);
+
+  // at np>1 the no-external result must NOT all live on one rank (the
+  // degeneracy the 3-d grid fixes)
+  if (world.size() > 1) {
+    std::set<std::size_t> owners;
+    for (std::size_t o = 0; o < c.trange().tiles_range().volume(); ++o)
+      owners.insert(c.pmap()->owner(o));
+    BOOST_CHECK_GT(owners.size(), 1u);
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END()