WithAutonomi · mickvandijke · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -41,8 +41,11 @@ mimalloc = "0.1"
 # with ant-protocol's re-exports.
 ant-protocol = "2.2.0"
 
-# Core (provides EVERYTHING: networking, DHT, security, trust, storage)
-saorsa-core = "0.26.0"
+# Core (provides EVERYTHING: networking, DHT, security, trust, storage).
+# Pinned to WithAutonomi/saorsa-core PR #119 (trust quarantine thresholds);
+# the matching `[patch.crates-io]` below redirects ant-protocol's transitive
+# saorsa-core to the same source so Cargo unifies on one copy.
+saorsa-core = { git = "https://github.com/WithAutonomi/saorsa-core", branch = "feat/trust-quarantine-thresholds" }
 saorsa-pqc = "0.5"
 
 # Payment verification - autonomi network lookup + EVM payment
@@ -196,3 +199,11 @@ unused_async = "allow"
 cognitive_complexity = "allow"
 # Allow non-const functions during initial development (may need runtime features later)
 missing_const_for_fn = "allow"
+
+[patch.crates-io]
+# Redirect the saorsa-core that ant-protocol (and other crates) pull from
+# crates.io onto the same git source as the node's direct dependency,
+# eliminating the duplicate 0.26.0 copies (crates.io vs git) that otherwise
+# collide on shared types such as `saorsa_core::address::MultiAddr`.
+# Tracks WithAutonomi/saorsa-core PR #119 (trust quarantine thresholds).
+saorsa-core = { git = "https://github.com/WithAutonomi/saorsa-core", branch = "feat/trust-quarantine-thresholds" }
diff --git a/docs/adr/ADR-0003-full-node-detection-and-eviction.md b/docs/adr/ADR-0003-full-node-detection-and-eviction.md
@@ -141,6 +141,46 @@ mis-attributing the resulting replication failures to honest nodes.
   strangles the fallback it is meant to coexist with. Express both against the same
   width and choose them together.
 
+### Fresh-replication admission during convergence
+
+A healthy node that *should* hold a chunk can transiently fail to recognise its own
+responsibility: while full nodes closer to the key still sit in its routing table
+(not yet detected and evicted), they push it past the narrow
+`storage_admission_width` (close group + small margin) in its *own* view. With that
+narrow window it would then **reject** a fresh-replication offer it ought to accept —
+so the replication path that is meant to heal coverage instead refuses it, stalling
+convergence. This is also inconsistent: the same node already accepts a **client
+PUT** for that key within the wider `K_BUCKET_SIZE` window (both the payment
+issuer-closeness check and the self-closeness gate above use it), so a key could land
+on the node directly from a client but not via replication.
+
+**Decision:** widen the **fresh-replication accept** admission to the K-wide
+paid-close-group neighbourhood (`paid_list_close_group_size`, equal to
+`K_BUCKET_SIZE` = 20) — the same window client PUTs use — so transient view-skew
+from un-evicted full nodes no longer causes spurious rejection. A node accepts a
+fresh offer when it is within its own local `paid_list_close_group_size`-closest to
+the key. Two properties keep this safe:
+
+- **No extra storage from the accept side.** The sender still fans out only to its
+  `close_group_size` targets, so widening the *receiver's* accept window adds no
+  stores — it only stops those targets from rejecting due to view-skew.
+- **Retention stays narrow.** Long-term retention/pruning keeps using
+  `storage_admission_width`, so steady-state replication is still ≈ K. Any transient
+  over-coverage is reclaimed once the close group converges (full nodes evicted → the
+  node's view tightens → it sits correctly inside or outside the narrow window), and
+  the multi-day prune hysteresis comfortably spans the minutes-long eviction window,
+  so a legitimate replica is never pruned mid-convergence.
+
+**Sender side (deliberately not widened).** The mirror case — the *sender's* own
+close group being mostly full, so its `close_group_size` offers never reach the
+healthy nodes ranked beyond the full ones — is left to the existing convergence loop:
+the possession check evicts the full nodes, the close group tightens to healthy
+members, and the responsible-chunk repair (neighbour sync) fetches the chunk to the
+nodes that become responsible. Widening the sender fan-out to `K_BUCKET_SIZE` was
+considered and rejected — it would triple fresh-replication fan-out on *every* write
+and hold large transient over-replication for the full prune-hysteresis window, for a
+case the repair path already heals.
+
 ## Consequences
 
 ### Positive
@@ -205,7 +245,11 @@ How we will know this decision remains correct:
   present/absent; the adaptive timeout grace tracks widespread timeouts but never
   deterministic failures; the node emits into saorsa-core such that an
   evicted-for-fullness node can re-enter after it frees capacity (integration); the
-  self-closeness gate width is ≥ the fallback ceiling.
+  self-closeness gate width is ≥ the fallback ceiling; a healthy node with un-evicted
+  full nodes ahead of it in its routing table (so it ranks outside
+  `storage_admission_width` but within `K_BUCKET_SIZE`) still **accepts** a
+  fresh-replication offer for the key, while retention/pruning stays scoped to
+  `storage_admission_width`.
 - **Re-open triggers:** revisit thresholds if false positives appear; revisit the
   near-capacity degradation if the network approaches global capacity.
 

diff --git a/src/replication/config.rs b/src/replication/config.rs
@@ -47,6 +47,47 @@ pub const NEIGHBOR_SYNC_SCOPE: usize = 20;
 /// round.
 pub const NEIGHBOR_SYNC_PEER_COUNT: usize = 4;
 
+/// Best-effort delivery retries for a fresh-replication push, per peer.
+///
+/// ADR-0003: on a transport/send failure the offer is retried up to this many
+/// times so a transient hiccup does not silently drop it. This is delivery
+/// assurance only — possession is judged separately by the delayed possession
+/// check, which still penalises a close peer that lacks the chunk even if the
+/// push never reached it.
+pub const FRESH_REPLICATION_DELIVERY_MAX_RETRIES: u32 = 2;
+
+const POSSESSION_CHECK_DELAY_MIN_SECS: u64 = 5 * 60;
+const POSSESSION_CHECK_DELAY_MAX_SECS: u64 = 15 * 60;
+const POSSESSION_CHECK_TIMEOUT_SECS: u64 = 15;
+const POSSESSION_CHECK_RETRY_BACKOFF_SECS: u64 = 30;
+
+/// Lower bound of the delay before a fresh-replication possession check runs
+/// (ADR-0003).
+///
+/// The delay lets replication settle so an honest peer still mid-store is not
+/// judged prematurely, and makes the check unpredictable to the peer.
+pub const POSSESSION_CHECK_DELAY_MIN: Duration =
+    Duration::from_secs(POSSESSION_CHECK_DELAY_MIN_SECS);
+
+/// Upper bound of the possession-check delay (ADR-0003).
+pub const POSSESSION_CHECK_DELAY_MAX: Duration =
+    Duration::from_secs(POSSESSION_CHECK_DELAY_MAX_SECS);
+
+/// Per-peer request timeout for a single possession probe (ADR-0003).
+pub const POSSESSION_CHECK_TIMEOUT: Duration = Duration::from_secs(POSSESSION_CHECK_TIMEOUT_SECS);
+
+/// Maximum possession-probe attempts per peer before giving up without a
+/// verdict (ADR-0003).
+///
+/// A peer unreachable at check time is re-attempted under this bounded grace
+/// and is never penalised as absent.
+pub const POSSESSION_CHECK_MAX_ATTEMPTS: u32 = 3;
+
+/// Backoff between possession-probe attempts when a peer yields no verdict
+/// (ADR-0003).
+pub const POSSESSION_CHECK_RETRY_BACKOFF: Duration =
+    Duration::from_secs(POSSESSION_CHECK_RETRY_BACKOFF_SECS);
+
 /// Width used when deciding whether this node may locally store or retain a
 /// chunk.
 #[must_use]
@@ -428,6 +469,13 @@ pub struct ReplicationConfig {
     /// Seconds to wait for `DhtNetworkEvent::BootstrapComplete` before
     /// proceeding with bootstrap sync (covers bootstrap nodes with no peers).
     pub bootstrap_complete_timeout_secs: u64,
+    /// Lower bound of the delay before a fresh-replication possession check
+    /// runs (ADR-0003). Defaults to [`POSSESSION_CHECK_DELAY_MIN`]; tests
+    /// shorten it so the scheduled check fires quickly.
+    pub possession_check_delay_min: Duration,
+    /// Upper bound of the possession-check delay window (ADR-0003). Defaults
+    /// to [`POSSESSION_CHECK_DELAY_MAX`].
+    pub possession_check_delay_max: Duration,
 }
 
 impl Default for ReplicationConfig {
@@ -454,6 +502,8 @@ impl Default for ReplicationConfig {
             verification_request_timeout: VERIFICATION_REQUEST_TIMEOUT,
             fetch_request_timeout: FETCH_REQUEST_TIMEOUT,
             bootstrap_complete_timeout_secs: BOOTSTRAP_COMPLETE_TIMEOUT_SECS,
+            possession_check_delay_min: POSSESSION_CHECK_DELAY_MIN,
+            possession_check_delay_max: POSSESSION_CHECK_DELAY_MAX,
         }
     }
 }

diff --git a/src/replication/fresh.rs b/src/replication/fresh.rs
@@ -14,7 +14,9 @@ use saorsa_core::P2PNode;
 use tokio::sync::Semaphore;
 
 use crate::ant_protocol::XorName;
-use crate::replication::config::{ReplicationConfig, REPLICATION_PROTOCOL_ID};
+use crate::replication::config::{
+    ReplicationConfig, FRESH_REPLICATION_DELIVERY_MAX_RETRIES, REPLICATION_PROTOCOL_ID,
+};
 use crate::replication::paid_list::PaidList;
 use crate::replication::protocol::{
     FreshReplicationOffer, PaidNotify, ReplicationMessage, ReplicationMessageBody,
@@ -36,9 +38,10 @@ pub struct FreshWriteEvent {
 
 /// Execute fresh replication for a newly accepted record.
 ///
-/// Sends fresh offers to close group members and `PaidNotify` to
-/// `PaidCloseGroup`. Both are fire-and-forget (no ack tracking or retry per
-/// Section 6.1, rule 8).
+/// Sends fresh offers to close group members (with bounded delivery retries,
+/// ADR-0003) and `PaidNotify` to `PaidCloseGroup`. Returns the close-group
+/// peers responsible for the key (excluding self) so the caller can schedule
+/// the delayed possession check; `PaidNotify` remains fire-and-forget.
 ///
 /// The `send_semaphore` limits how many outbound chunk transfers can be
 /// in-flight concurrently across the entire replication engine, preventing
@@ -51,7 +54,7 @@ pub async fn replicate_fresh(
     paid_list: &Arc<PaidList>,
     config: &ReplicationConfig,
     send_semaphore: &Arc<Semaphore>,
-) {
+) -> Vec<PeerId> {
     let self_id = *p2p_node.peer_id();
 
     // Rule 6: Node that validates PoP adds K to PaidForList(self).
@@ -88,11 +91,15 @@ pub async fn replicate_fresh(
             "Failed to encode FreshReplicationOffer for {}",
             hex::encode(key),
         );
-        return;
+        return Vec::new();
     };
+    // Share one encoded copy across the per-peer send tasks so a retry only
+    // re-materialises the buffer for the (consuming) send call, keeping the
+    // common single-attempt path at one clone per peer.
+    let encoded = Arc::new(encoded);
     for peer in &target_peers {
         let p2p = Arc::clone(p2p_node);
-        let data = encoded.clone();
+        let data = Arc::clone(&encoded);
         let peer_id = *peer;
         let sem = Arc::clone(send_semaphore);
         tokio::spawn(async move {
@@ -103,11 +110,37 @@ pub async fn replicate_fresh(
                 "Replication send permit acquired for peer {peer_id} ({} available)",
                 sem.available_permits()
             );
-            if let Err(e) = p2p
-                .send_message(&peer_id, REPLICATION_PROTOCOL_ID, data, &[])
-                .await
-            {
-                debug!("Failed to send fresh offer to {peer_id}: {e}");
+            // ADR-0003: best-effort delivery. Retry the push up to
+            // FRESH_REPLICATION_DELIVERY_MAX_RETRIES times on a transport
+            // failure so a transient hiccup doesn't silently drop the offer.
+            // Possession is judged separately by the delayed possession check.
+            let mut attempt = 0u32;
+            loop {
+                match p2p
+                    .send_message(
+                        &peer_id,
+                        REPLICATION_PROTOCOL_ID,
+                        data.as_ref().clone(),
+                        &[],
+                    )
+                    .await
+                {
+                    Ok(()) => break,
+                    Err(e) => {
+                        if attempt >= FRESH_REPLICATION_DELIVERY_MAX_RETRIES {
+                            debug!(
+                                "Failed to send fresh offer to {peer_id} after {} attempts: {e}",
+                                attempt + 1
+                            );
+                            break;
+                        }
+                        attempt += 1;
+                        debug!(
+                            "Retrying fresh offer to {peer_id} (attempt {}): {e}",
+                            attempt + 1
+                        );
+                    }
+                }
             }
         });
     }
@@ -122,6 +155,8 @@ pub async fn replicate_fresh(
         hex::encode(key),
         target_peers.len()
     );
+
+    target_peers
 }
 
 /// Send `PaidNotify(K)` to every peer in `PaidCloseGroup(K)` (fire-and-forget).