From 5f29f543605f7070e34728d0a2606c10941fd941 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 17:16:02 -0600
Subject: [PATCH 1/2] feat(bench): corpus flywheel wiring + the primed-vs-cold
 A/B
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AgenticOptions gains corpus/corpusTags — the analyst's observe() pass now appends
trace-derived facts (the flywheel write side) with zero extra LLM calls; priming (the
read side) is the caller folding corpus.query() facts into the task systemPrompt.

eops-corpus-ab.mts: THE across-run experiment (layer-across-run.md). Two arms over the
same task stream, same order, canonical depth: cold (fresh every run) vs primed
(query top-k facts before, observe-append after). Equal compute by construction.
Reports paired lift, the SLOPE (first-half vs second-half — the flywheel signature is
a growing advantage), fact-injection counts, and a frozen holdout (fresh tasks, corpus
read-only). Smoke verified: run 1 wrote 3 facts, run 2 injected them.
---
 bench/src/agentic.ts         |  16 +++-
 bench/src/eops-corpus-ab.mts | 150 +++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+), 2 deletions(-)
 create mode 100644 bench/src/eops-corpus-ab.mts

diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts
index cb8eb7e..ed66544 100644
--- a/bench/src/agentic.ts
+++ b/bench/src/agentic.ts
@@ -30,6 +30,7 @@ import {
 } from '@tangle-network/agent-runtime/loops'
 import type {
   Agent,
+  Corpus,
   AgentSpec,
   Budget,
   ExecutorContext,
@@ -92,6 +93,12 @@ export interface AgenticOptions {
   /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
    *  prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
   analystInstruction?: string
+  /** Across-run learning: when set, the analyst's observe() pass appends trace-derived
+   *  facts here (the flywheel write side). Priming (the read side) is the caller's move —
+   *  query the corpus and fold facts into the task's systemPrompt before runAgentic. */
+  corpus?: Corpus
+  /** Tags written onto learned facts (and used by the caller's priming query). */
+  corpusTags?: string[]
 }
 
 // ── The unit: one agentic shot (a bounded tool loop) over a handle ───────────────
@@ -189,8 +196,13 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
     .slice(0, 7000)
   const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model })
   const obs = await observe(
-    { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed' },
-    { chat, model: opts.model, ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}) },
+    { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed', runId: task.id },
+    {
+      chat,
+      model: opts.model,
+      ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}),
+      ...(opts.corpus ? { corpus: opts.corpus, tags: opts.corpusTags ?? [] } : {}),
+    },
   )
   // The steer = the analyst's recommended actions for the agent. Empty ⇒ nothing left to do.
   const steer = obs.findings
diff --git a/bench/src/eops-corpus-ab.mts b/bench/src/eops-corpus-ab.mts
new file mode 100644
index 0000000..78234c3
--- /dev/null
+++ b/bench/src/eops-corpus-ab.mts
@@ -0,0 +1,150 @@
+/**
+ * The corpus flywheel A/B — primed-vs-cold at equal compute. THE across-run experiment
+ * (docs/research/layer-across-run.md): does run N+1 improve because the system learned
+ * from run N?
+ *
+ * Two arms over the SAME task stream, same order, canonical depth (Supervisor + observe):
+ *   cold    — every run fresh (the baseline as measured at +16.4pp vs breadth).
+ *   primed  — before each run, query the corpus (trace-derived facts accumulated by the
+ *             analyst's observe() pass on PRIOR runs) and fold the top-k into the task's
+ *             systemPrompt; after each run, observe() appends new facts. Zero extra LLM
+ *             calls (the analyst already runs); priming is prompt text — equal compute.
+ *
+ * Reported: per-position scores, paired lift, the SLOPE (first-half vs second-half lift —
+ * the flywheel signature is a GROWING advantage), fact uptake counts, and a frozen
+ * HOLDOUT: a disjoint slice run primed-from-the-accumulated-corpus vs cold (does the
+ * learned knowledge transfer to fresh tasks?).
+ *
+ * Falsifiers designed in (layer-across-run.md): context pollution (cap k, report dose),
+ * stale/instance facts (the gym DB resets per task — only PROCEDURAL facts can help),
+ * judge leakage (observe() is structurally trace-only), worker disregard (uptake column).
+ *
+ *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
+ *   EOPS_GYM_DBS_DIR=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts
+ */
+import { FileCorpus } from '@tangle-network/agent-runtime/loops'
+import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
+import { type PairedLift, pairedLift } from './stats.mts'
+
+function must(name: string): string {
+  const v = process.env[name]
+  if (!v) throw new Error(`env ${name} is required`)
+  return v
+}
+
+async function loadItsmTasks(n: number, offset = 0): Promise<AgenticTask[]> {
+  const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')}&config=oracle&split=itsm&offset=${offset}&length=${n}`
+  const res = await fetch(url)
+  if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}`)
+  const body = (await res.json()) as { rows?: Array<{ row: Parameters<typeof eopsTaskFromRow>[0] }> }
+  return (body.rows ?? []).slice(0, n).map(({ row }) => eopsTaskFromRow(row))
+}
+
+const tags = ['eops', 'itsm', 'corpus-ab']
+const pct = (x: number) => `${(x * 100).toFixed(0)}%`
+const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
+
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 16)
+  const holdoutN = Number(process.env.HOLDOUT ?? 4)
+  const kFacts = Number(process.env.K_FACTS ?? 3)
+  const maxShots = Number(process.env.MAXSHOTS ?? 3)
+  const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro'
+  const corpusPath = process.env.CORPUS ?? `/tmp/eops-corpus-ab-${Date.now()}.jsonl`
+  const opts: AgenticOptions = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey: must('TANGLE_API_KEY'),
+    model,
+    innerTurns: Number(process.env.INNER_TURNS ?? 4),
+    temperature: 0.7,
+  }
+  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
+  const corpus = new FileCorpus(corpusPath)
+
+  const stream = await loadItsmTasks(n)
+  console.error(`=== corpus A/B · primed-vs-cold · stream n=${stream.length} + holdout ${holdoutN} · ${model} · k=${kFacts} facts ===`)
+  console.error(`    corpus: ${corpusPath}\n`)
+
+  /** Top-k trace-derived facts → a prime block for the worker's system prompt. */
+  async function primeBlock(): Promise<{ text: string; count: number }> {
+    const facts = await corpus.query({ tags: ['audience:agent'], limit: kFacts })
+    if (facts.length === 0) return { text: '', count: 0 }
+    const lines = facts.map((f) => `- ${f.claim}${f.rationale ? ` (${f.rationale.slice(0, 120)})` : ''}`)
+    return {
+      text: `\n\nLEARNINGS FROM PRIOR RUNS (apply where relevant):\n${lines.join('\n')}`,
+      count: facts.length,
+    }
+  }
+
+  async function runArm(task: AgenticTask, primed: boolean): Promise<{ score: number; facts: number } | null> {
+    try {
+      if (!primed) {
+        const r = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots })
+        return { score: r.score, facts: 0 }
+      }
+      const prime = await primeBlock()
+      const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` }
+      const r = await runAgentic({ ...opts, corpus, corpusTags: tags, surface, task: primedTask, mode: 'depth', budget: maxShots })
+      return { score: r.score, facts: prime.count }
+    } catch (e) {
+      console.error(`     SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 60) : e})`)
+      return null
+    }
+  }
+
+  // The stream: per task, cold first (no corpus contact), then primed (reads + writes).
+  const rows: Array<{ cold: number; primed: number; facts: number }> = []
+  for (let i = 0; i < stream.length; i += 1) {
+    const task = stream[i] as AgenticTask
+    const cold = await runArm(task, false)
+    const primed = await runArm(task, true)
+    if (!cold || !primed) continue
+    rows.push({ cold: cold.score, primed: primed.score, facts: primed.facts })
+    console.error(`  [${i + 1}/${stream.length}] ${task.id.slice(-12)}: cold ${pct(cold.score)}  primed ${pct(primed.score)}  (facts injected: ${primed.facts})`)
+  }
+
+  const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0)
+  const lift = pairedLift(rows.map((r) => r.cold), rows.map((r) => r.primed))
+  const half = Math.floor(rows.length / 2)
+  const firstHalf = mean(rows.slice(0, half).map((r) => r.primed - r.cold))
+  const secondHalf = mean(rows.slice(half).map((r) => r.primed - r.cold))
+
+  // Frozen holdout: fresh tasks, primed from the ACCUMULATED corpus (read-only) vs cold.
+  let holdout: { lift: PairedLift; n: number } | undefined
+  if (holdoutN > 0) {
+    console.error(`\n▶ holdout (${holdoutN} disjoint tasks, corpus read-only)…`)
+    const htasks = await loadItsmTasks(holdoutN, stream.length)
+    const hrows: Array<{ cold: number; primed: number }> = []
+    for (const task of htasks) {
+      const cold = await runArm(task, false)
+      if (!cold) continue
+      // read-only priming: query + inject, but do NOT pass the corpus (no writes).
+      try {
+        const prime = await primeBlock()
+        const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` }
+        const r = await runAgentic({ ...opts, surface, task: primedTask, mode: 'depth', budget: maxShots })
+        hrows.push({ cold: cold.score, primed: r.score })
+        console.error(`   ${task.id.slice(-12)}: cold ${pct(cold.score)}  primed ${pct(r.score)}`)
+      } catch {
+        /* skip */
+      }
+    }
+    if (hrows.length >= 2) holdout = { lift: pairedLift(hrows.map((r) => r.cold), hrows.map((r) => r.primed)), n: hrows.length }
+  }
+
+  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.')
+  console.error(`\n${'='.repeat(74)}`)
+  console.error(`CORPUS A/B RESULT · stream n=${rows.length} · ${model} · k=${kFacts}`)
+  console.error('='.repeat(74))
+  console.error(`  cold   ${pct(mean(rows.map((r) => r.cold)))}    primed ${pct(mean(rows.map((r) => r.primed)))}`)
+  console.error(`  primed − cold (paired, B=10000)  ${pp(lift.point)}  CI [${pp(lift.low)}, ${pp(lift.high)}]  disc=${lift.discordant}  ${sig(lift)}`)
+  console.error(`  SLOPE: first-half lift ${pp(firstHalf)} → second-half lift ${pp(secondHalf)}  ${secondHalf > firstHalf ? '(growing — the flywheel signature)' : '(not growing)'}`)
+  if (holdout) console.error(`  HOLDOUT (${holdout.n} fresh tasks, accumulated corpus): ${pp(holdout.lift.point)}  CI [${pp(holdout.lift.low)}, ${pp(holdout.lift.high)}]  ${sig(holdout.lift)}`)
+  console.error(`  corpus facts accumulated: see ${corpusPath}`)
+}
+
+main().catch((e) => {
+  console.error(`eops-corpus-ab: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
+  process.exit(1)
+})

From d879ac93bb2ecba27d10f14a0f8cbc2c95f61072 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 17:20:57 -0600
Subject: [PATCH 2/2] =?UTF-8?q?feat(runtime):=20publish=20the=20optimizati?=
 =?UTF-8?q?on=20suite=20=E2=80=94=20Environment/Strategy/defineStrategy/ru?=
 =?UTF-8?q?nBenchmark?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes packaging gap G1 (docs/research/product-integration-playbook.md): the suite
graduates from bench/ R&D into the published package, importable by any product.

- src/runtime/strategy.ts — the domain-blind core moved from bench/src/agentic.ts:
  the AgenticSurface seam (Environment), the canonical depth/breadth drivers
  (Supervisor + observe() — the +16.4pp configuration), open Strategy + sample/refine,
  defineStrategy with the shot()/critique() steps, adaptiveRefine, runAgentic, and the
  corpus flywheel threading (AgenticOptions.corpus → the analyst's observe() appends
  trace-derived facts).
- src/runtime/run-benchmark.ts — runBenchmark/printBenchmarkReport over an Environment;
  the paired lift now uses agent-eval's pairedBootstrap (the substrate's stats) instead
  of a bench-local copy.
- Exported via /loops. bench consumers (agentic-eops, agentic-run, eops-gepa,
  eops-corpus-ab, examples/strategy-demo) now import from the package; bench/src/
  agentic.ts + run-benchmark.mts deleted (-1000 LOC of R&D duplication).

Verified: typecheck clean both packages; 680 tests pass; strategy-demo runs all four
strategies end-to-end through the published exports.
---
 bench/src/agentic-eops.ts                     |   2 +-
 bench/src/agentic-run.mts                     |   2 +-
 bench/src/eops-corpus-ab.mts                  |   3 +-
 bench/src/eops-gepa.mts                       |   3 +-
 bench/src/examples/strategy-demo.mts          |   3 +-
 src/runtime/index.ts                          |  33 +++
 .../runtime/run-benchmark.ts                  | 108 +++++---
 .../src/agentic.ts => src/runtime/strategy.ts | 243 +++++++++++++-----
 8 files changed, 287 insertions(+), 110 deletions(-)
 rename bench/src/run-benchmark.mts => src/runtime/run-benchmark.ts (53%)
 rename bench/src/agentic.ts => src/runtime/strategy.ts (79%)

diff --git a/bench/src/agentic-eops.ts b/bench/src/agentic-eops.ts
index 86c82b9..1cd7870 100644
--- a/bench/src/agentic-eops.ts
+++ b/bench/src/agentic-eops.ts
@@ -7,7 +7,7 @@
  * AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change.
  */
 
-import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
 import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent'
 
 interface EopsMeta {
diff --git a/bench/src/agentic-run.mts b/bench/src/agentic-run.mts
index d3a7e17..b5f4fba 100644
--- a/bench/src/agentic-run.mts
+++ b/bench/src/agentic-run.mts
@@ -7,7 +7,7 @@
  *   TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts
  */
 
-import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticTask, runAgentic } from '@tangle-network/agent-runtime/loops'
 import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
 
 const must = (k: string): string => {
diff --git a/bench/src/eops-corpus-ab.mts b/bench/src/eops-corpus-ab.mts
index 78234c3..2b48daa 100644
--- a/bench/src/eops-corpus-ab.mts
+++ b/bench/src/eops-corpus-ab.mts
@@ -22,8 +22,7 @@
  *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
  *   EOPS_GYM_DBS_DIR=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts
  */
-import { FileCorpus } from '@tangle-network/agent-runtime/loops'
-import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticTask, FileCorpus, runAgentic } from '@tangle-network/agent-runtime/loops'
 import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
 import { type PairedLift, pairedLift } from './stats.mts'
 
diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts
index 2306e06..f0eb591 100644
--- a/bench/src/eops-gepa.mts
+++ b/bench/src/eops-gepa.mts
@@ -18,8 +18,7 @@
  */
 import { readFileSync, writeFileSync } from 'node:fs'
 import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval'
-import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops'
-import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticTask, defaultAnalystInstruction, runAgentic } from '@tangle-network/agent-runtime/loops'
 import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
 import { type RouterConfig, routerChatWithUsage } from './router-client'
 
diff --git a/bench/src/examples/strategy-demo.mts b/bench/src/examples/strategy-demo.mts
index 63edcb0..22e3fd5 100644
--- a/bench/src/examples/strategy-demo.mts
+++ b/bench/src/examples/strategy-demo.mts
@@ -14,8 +14,7 @@
  *   2. pick strategies — pass [sample, refine, adaptiveRefine].
  *   3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony.
  */
-import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic'
-import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts'
+import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, type Environment, printBenchmarkReport, refine, runBenchmark, sample } from '@tangle-network/agent-runtime/loops'
 
 // ── 1. Implement an Environment (the only thing a new domain writes) ──────────────
 // A toy: the agent must drive a counter to exactly the target using the increment tool.
diff --git a/src/runtime/index.ts b/src/runtime/index.ts
index 21e779f..c94d7a0 100644
--- a/src/runtime/index.ts
+++ b/src/runtime/index.ts
@@ -152,6 +152,14 @@ export type {
   WidenSpec,
 } from './personify/wave-types'
 export { reportLoopUsage, type UsageSink } from './report-usage'
+export {
+  type BenchmarkConfig,
+  type BenchmarkLift,
+  type BenchmarkReport,
+  type Environment,
+  printBenchmarkReport,
+  runBenchmark,
+} from './run-benchmark'
 export type { RunLoopOptions } from './run-loop'
 export { createSandboxForSpec, defaultSelectWinner, runLoop } from './run-loop'
 export { acquireSandbox } from './sandbox-acquire'
@@ -176,6 +184,31 @@ export {
   type SandboxRun,
   type TurnResult,
 } from './sandbox-run'
+// The optimization suite: a domain = an Environment (5 hooks); a Strategy = how the
+// budget is spent to beat its check. Built-ins `sample`/`refine`; author your own with
+// `defineStrategy` (compose shot() + critique(), zero Supervisor ceremony); compare
+// with runBenchmark. The depth/breadth drivers are the reference implementations.
+export {
+  type AgenticOptions,
+  type AgenticRunResult,
+  type AgenticSurface,
+  type AgenticTask,
+  type AgenticTool,
+  type ArtifactHandle,
+  adaptiveRefine,
+  breadthDriver,
+  defineStrategy,
+  depthDriver,
+  type RunAgenticOptions,
+  refine,
+  runAgentic,
+  type ShotSpec,
+  type Strategy,
+  type StrategyCtx,
+  type StrategyResult,
+  type SurfaceScore,
+  sample,
+} from './strategy'
 export {
   type BudgetPool,
   type BudgetReadout,
diff --git a/bench/src/run-benchmark.mts b/src/runtime/run-benchmark.ts
similarity index 53%
rename from bench/src/run-benchmark.mts
rename to src/runtime/run-benchmark.ts
index 4027c05..8dd8c9b 100644
--- a/bench/src/run-benchmark.mts
+++ b/src/runtime/run-benchmark.ts
@@ -1,32 +1,33 @@
 /**
  * runBenchmark — the packaged optimization suite. Define a domain by implementing an
- * Environment (open / tools / call / score / close); get the optimization strategies
- * compared, scored by your own deployable check, with a paired-bootstrap report — FREE.
+ * `Environment` (open / tools / call / score / close); get the optimization strategies
+ * compared, scored by your own deployable check, with a paired-bootstrap report — free.
  *
- * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An
- * optimization STRATEGY is how you spend the budget to beat the check. Two primitives:
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
+ * is how you spend the budget to beat the check. Two built-ins:
  *
- *   sample  — N independent attempts, keep the best-verifying one.   ("best-of-N" / resample)
+ *   sample  — N independent attempts, keep the best-verifying one.   (best-of-N / resample)
  *   refine  — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
  *
- * Both run at equal budget; the headline is the paired lift of refine over sample.
- * (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.)
- *
- * Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic
- * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the
- * worker (the model), and can drop to runAgentic / the Supervisor for new strategies.
+ * Both run at equal budget through the Supervisor's conserved pool; the headline is the
+ * paired lift of refine over sample. Author your own strategy with `defineStrategy`.
  */
-import { type AgenticOptions, type AgenticSurface, type AgenticTask, refine, runAgentic, sample, type Strategy } from './agentic'
-import { type PairedLift, pairedLift, pool } from './stats.mts'
+
+import { pairedBootstrap } from '@tangle-network/agent-eval'
+import {
+  type AgenticOptions,
+  type AgenticSurface,
+  type AgenticTask,
+  refine,
+  runAgentic,
+  type Strategy,
+  sample,
+} from './strategy'
 
 /** A checkable task domain — implement these 5 hooks and the suite does the rest. The
  *  same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
 export type Environment = AgenticSurface
 
-// Strategy is the OPEN extension point (re-exported from agentic): pass the built-ins or
-// author your own (implement Strategy.driver returning an Agent). See `refine`/`sample`.
-export { refine, sample, type Strategy } from './agentic'
-
 export interface BenchmarkConfig {
   /** The task domain (5 hooks). */
   environment: Environment
@@ -43,17 +44,47 @@ export interface BenchmarkConfig {
   concurrency?: number
 }
 
+export interface BenchmarkLift {
+  /** Mean of paired deltas (refine − sample). */
+  mean: number
+  low: number
+  high: number
+  n: number
+}
+
 export interface BenchmarkReport {
   n: number
   excluded: number
   /** Mean verifier score per strategy (keyed by strategy.name, 0..1). */
   perStrategy: Record<string, number>
-  /** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */
-  refineVsSample?: PairedLift
+  /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
+  refineVsSample?: BenchmarkLift
 }
 
-/** Run the requested strategies over the tasks, scored by the Environment's own check,
- *  and return the per-strategy means + the paired-bootstrap lift of refine over sample.
+/** Bounded-concurrency map preserving order; a worker that throws resolves its slot to null. */
+async function pool<T, R>(
+  items: readonly T[],
+  limit: number,
+  fn: (item: T, i: number) => Promise<R | null>,
+): Promise<Array<R | null>> {
+  const out: Array<R | null> = new Array(items.length).fill(null)
+  let next = 0
+  const workers = Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, async () => {
+    while (next < items.length) {
+      const i = next
+      next += 1
+      try {
+        out[i] = await fn(items[i] as T, i)
+      } catch {
+        out[i] = null
+      }
+    }
+  })
+  await Promise.all(workers)
+  return out
+}
+
+/** Run the requested strategies over the tasks, scored by the Environment's own check.
  *  Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */
 export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> {
   const strategies = cfg.strategies ?? [sample, refine]
@@ -62,15 +93,17 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 
   const rows = await pool(cfg.tasks, concurrency, async (task) => {
     const scores: Record<string, number> = {}
-    try {
-      for (const s of strategies) {
-        const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget })
-        scores[s.name] = r.score
-      }
-      return scores
-    } catch {
-      return null // transient infra on this task — exclude it
+    for (const s of strategies) {
+      const r = await runAgentic({
+        ...cfg.worker,
+        surface: cfg.environment,
+        task,
+        strategy: s,
+        budget,
+      })
+      scores[s.name] = r.score
     }
+    return scores
   })
 
   const ok = rows.filter((r): r is Record<string, number> => r !== null)
@@ -80,8 +113,12 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 
   const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy }
   const names = strategies.map((s) => s.name)
-  if (names.includes('refine') && names.includes('sample')) {
-    report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0))
+  if (names.includes('refine') && names.includes('sample') && ok.length >= 2) {
+    const b = pairedBootstrap(
+      ok.map((r) => r.sample ?? 0),
+      ok.map((r) => r.refine ?? 0),
+    )
+    report.refineVsSample = { mean: b.mean, low: b.low, high: b.high, n: b.n }
   }
   return report
 }
@@ -90,11 +127,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 export function printBenchmarkReport(report: BenchmarkReport): void {
   const pct = (x: number) => `${(x * 100).toFixed(1)}%`
   const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
-  console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`)
-  for (const [s, v] of Object.entries(report.perStrategy)) console.log(`  ${s.padEnd(8)} ${pct(v ?? 0)}`)
+  console.log(
+    `\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`,
+  )
+  for (const [s, v] of Object.entries(report.perStrategy))
+    console.log(`  ${s.padEnd(8)} ${pct(v ?? 0)}`)
   const l = report.refineVsSample
   if (l) {
     const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.'
-    console.log(`  refine − sample: ${pp(l.point)}  CI [${pp(l.low)}, ${pp(l.high)}]  (${sig})`)
+    console.log(`  refine − sample: ${pp(l.mean)}  CI [${pp(l.low)}, ${pp(l.high)}]  (${sig})`)
   }
 }
diff --git a/bench/src/agentic.ts b/src/runtime/strategy.ts
similarity index 79%
rename from bench/src/agentic.ts
rename to src/runtime/strategy.ts
index ed66544..98f0af1 100644
--- a/bench/src/agentic.ts
+++ b/src/runtime/strategy.ts
@@ -22,27 +22,24 @@
  */
 
 import { createChatClient } from '@tangle-network/agent-eval'
-import {
-  createSupervisor,
-  InMemoryResultBlobStore,
-  InMemorySpawnJournal,
-  observe,
-} from '@tangle-network/agent-runtime/loops'
+import { InMemoryResultBlobStore, InMemorySpawnJournal } from '../durable/spawn-journal'
+import { observe } from './observe'
+import type { Outcome } from './personify/types'
+import type { Corpus } from './personify/wave-types'
+import { createSupervisor } from './supervise/supervisor'
 import type {
   Agent,
-  Corpus,
   AgentSpec,
   Budget,
-  ExecutorContext,
-  ExecutorRegistry,
   Executor,
+  ExecutorContext,
   ExecutorFactory,
+  ExecutorRegistry,
   ExecutorResult,
-  Outcome,
   Scope,
   Settled,
   Spend,
-} from '@tangle-network/agent-runtime/loops'
+} from './supervise/types'
 
 // ── The general surface seam (the only thing a new benchmark implements) ─────────
 
@@ -132,7 +129,7 @@ const taskNudge =
  *  `surface.call`, carrying `messages`. Returns the updated conversation + counts. */
 async function runShot(
   surface: AgenticSurface,
-  task: AgenticTask,
+  _task: AgenticTask,
   handle: ArtifactHandle,
   tools: AgenticTool[],
   messages: Msg[],
@@ -146,15 +143,27 @@ async function runShot(
     const res = await fetch(`${opts.routerBaseUrl.replace(/\/$/, '')}/chat/completions`, {
       method: 'POST',
       headers: { 'content-type': 'application/json', authorization: `Bearer ${opts.routerKey}` },
-      body: JSON.stringify({ model: opts.model, messages, tools, tool_choice: 'auto', temperature: opts.temperature ?? 0.7 }),
+      body: JSON.stringify({
+        model: opts.model,
+        messages,
+        tools,
+        tool_choice: 'auto',
+        temperature: opts.temperature ?? 0.7,
+      }),
     })
     if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`)
     completions += 1
-    const data = (await res.json()) as { choices?: Array<{ message?: { content?: string; tool_calls?: ToolCall[] } }> }
+    const data = (await res.json()) as {
+      choices?: Array<{ message?: { content?: string; tool_calls?: ToolCall[] } }>
+    }
     const msg = data.choices?.[0]?.message
     if (!msg) break
     const calls = msg.tool_calls ?? []
-    messages.push({ role: 'assistant', content: msg.content ?? '', ...(calls.length ? { tool_calls: calls } : {}) })
+    messages.push({
+      role: 'assistant',
+      content: msg.content ?? '',
+      ...(calls.length ? { tool_calls: calls } : {}),
+    })
     if (calls.length === 0) break
     for (const call of calls) {
       toolCalls += 1
@@ -189,14 +198,27 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
     .filter((m) => m.role === 'assistant' || m.role === 'tool')
     .map((m) => {
       if (m.role === 'tool') return `RESULT ${String(m.content).slice(0, 280)}`
-      const calls = (m.tool_calls as ToolCall[] | undefined)?.map((c) => `${c.function.name}(${c.function.arguments})`).join(', ')
+      const calls = (m.tool_calls as ToolCall[] | undefined)
+        ?.map((c) => `${c.function.name}(${c.function.arguments})`)
+        .join(', ')
       return calls ? `CALL ${calls}` : `SAY ${String(m.content).slice(0, 200)}`
     })
     .join('\n')
     .slice(0, 7000)
-  const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model })
+  const chat = createChatClient({
+    transport: 'router',
+    apiKey: opts.routerKey,
+    baseUrl: opts.routerBaseUrl,
+    defaultModel: opts.model,
+  })
   const obs = await observe(
-    { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed', runId: task.id },
+    {
+      task: task.userPrompt,
+      output: trajectory,
+      trace: messages,
+      outcome: 'failed',
+      runId: task.id,
+    },
     {
       chat,
       model: opts.model,
@@ -224,7 +246,12 @@ interface ShotResult {
   toolErrors: number
 }
 
-const spend = (iterations: number): Spend => ({ iterations, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 })
+const spend = (iterations: number): Spend => ({
+  iterations,
+  tokens: { input: 0, output: 0 },
+  usd: 0,
+  ms: 0,
+})
 
 /** Resolve a shot: if `handle` given, operate on the SHARED artifact (depth); else open+score+close
  *  an OWN artifact (breadth). Always scores the artifact's final state as the deployable verdict. */
@@ -246,7 +273,14 @@ function shotExecutor(surface: AgenticSurface, opts: AgenticOptions): Executor<u
         const shot = await runShot(surface, t.task, handle, tools, messages, opts)
         const s = await surface.score(t.task, handle)
         const score = s.total > 0 ? s.passes / s.total : 0
-        const out: ShotResult = { messages: shot.messages, score, passes: s.passes, total: s.total, completions: shot.completions, toolErrors: shot.toolErrors }
+        const out: ShotResult = {
+          messages: shot.messages,
+          score,
+          passes: s.passes,
+          total: s.total,
+          completions: shot.completions,
+          toolErrors: shot.toolErrors,
+        }
         artifact = {
           outRef: `shot:${handle.id}:${shot.completions}:${s.passes}/${s.total}`,
           out,
@@ -330,10 +364,18 @@ export interface AgenticRunResult {
   shots: number
 }
 
-const perChild = (innerTurns: number): Budget => ({ maxIterations: innerTurns + 1, maxTokens: 1_000_000 })
+const perChild = (innerTurns: number): Budget => ({
+  maxIterations: innerTurns + 1,
+  maxTokens: 1_000_000,
+})
 
 /** DEPTH: one persistent artifact, carried across analyst-steered shots. */
-export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent<unknown, Outcome<unknown>> {
+export function depthDriver(
+  surface: AgenticSurface,
+  task: AgenticTask,
+  opts: AgenticOptions,
+  cfg: { maxShots: number },
+): Agent<unknown, Outcome<unknown>> {
   const innerTurns = opts.innerTurns ?? 4
   let pendingSteer: string | undefined // analyst-derived steer carried between shots
   return {
@@ -348,7 +390,10 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag
         for (shots = 0; shots < cfg.maxShots; shots += 1) {
           const child = leaf(`shot:${shots}`, 'shot')
           const steer = shots === 0 ? undefined : pendingSteer
-          const res = scope.spawn(child, { task, handle, messages, steer } as ShotTask, { budget: perChild(innerTurns), label: `shot:${shots}` })
+          const res = scope.spawn(child, { task, handle, messages, steer } as ShotTask, {
+            budget: perChild(innerTurns),
+            label: `shot:${shots}`,
+          })
           if (!res.ok) break
           const settled = await drainOne(scope)
           if (settled.kind === 'down') break
@@ -359,7 +404,11 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag
           if (out.score >= 1 || shots === cfg.maxShots - 1) break
           // Analyst reads the trajectory (firewalled) → steer the resumed session.
           const aChild = leaf(`analyst:${shots}`, 'analyst')
-          const aRes = scope.spawn(aChild, { task, messages }, { budget: perChild(1), label: `analyst:${shots}` })
+          const aRes = scope.spawn(
+            aChild,
+            { task, messages },
+            { budget: perChild(1), label: `analyst:${shots}` },
+          )
           if (!aRes.ok) break
           const aSettled = await drainOne(scope)
           completions += 1
@@ -370,7 +419,17 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag
         }
         const final = await surface.score(task, handle)
         const score = final.total > 0 ? final.passes / final.total : 0
-        return { kind: 'done', deliverable: { mode: 'depth', score, resolved: final.total > 0 && final.passes === final.total, completions, progression, shots: shots + 1 } }
+        return {
+          kind: 'done',
+          deliverable: {
+            mode: 'depth',
+            score,
+            resolved: final.total > 0 && final.passes === final.total,
+            completions,
+            progression,
+            shots: shots + 1,
+          },
+        }
       } finally {
         await surface.close(handle)
       }
@@ -379,14 +438,22 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag
 }
 
 /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
-export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent<unknown, Outcome<unknown>> {
+export function breadthDriver(
+  _surface: AgenticSurface,
+  task: AgenticTask,
+  opts: AgenticOptions,
+  cfg: { width: number },
+): Agent<unknown, Outcome<unknown>> {
   const innerTurns = opts.innerTurns ?? 4
   return {
     name: 'breadth',
     async act(_t, scope): Promise<Outcome<unknown>> {
       let opened = 0
       for (let k = 0; k < cfg.width; k += 1) {
-        const res = scope.spawn(leaf(`rollout:${k}`, 'shot'), { task } as ShotTask, { budget: perChild(innerTurns), label: `rollout:${k}` })
+        const res = scope.spawn(leaf(`rollout:${k}`, 'shot'), { task } as ShotTask, {
+          budget: perChild(innerTurns),
+          label: `rollout:${k}`,
+        })
         if (res.ok) opened += 1
       }
       if (opened === 0) return { kind: 'blocked', blockers: ['breadth: pool admitted no rollout'] }
@@ -403,7 +470,17 @@ export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts:
         progression.push(best)
       }
       if (best < 0) return { kind: 'blocked', blockers: ['breadth: every rollout went down'] }
-      return { kind: 'done', deliverable: { mode: 'breadth', score: best, resolved: bestResolved, completions, progression, shots: opened } }
+      return {
+        kind: 'done',
+        deliverable: {
+          mode: 'breadth',
+          score: best,
+          resolved: bestResolved,
+          completions,
+          progression,
+          shots: opened,
+        },
+      }
     },
   }
 }
@@ -420,7 +497,12 @@ export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts:
  */
 export interface Strategy {
   readonly name: string
-  driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>
+  driver(
+    surface: AgenticSurface,
+    task: AgenticTask,
+    opts: AgenticOptions,
+    budget: number,
+  ): Agent<unknown, Outcome<unknown>>
 }
 
 export const sample: Strategy = {
@@ -468,7 +550,10 @@ export interface StrategyCtx {
 }
 
 /** Author a Strategy from the composable steps — the open, compact way. */
-export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy {
+export function defineStrategy(
+  name: string,
+  run: (ctx: StrategyCtx) => Promise<StrategyResult>,
+): Strategy {
   return {
     name,
     driver: (surface, task, opts, budget) => ({
@@ -485,7 +570,16 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<
           async shot(spec) {
             const child = leaf(`shot:${seq}`, 'shot')
             seq += 1
-            const res = scope.spawn(child, { task, handle: spec?.handle, messages: spec?.messages, steer: spec?.steer } as ShotTask, { budget: perChild(innerTurns), label: child.name })
+            const res = scope.spawn(
+              child,
+              {
+                task,
+                handle: spec?.handle,
+                messages: spec?.messages,
+                steer: spec?.steer,
+              } as ShotTask,
+              { budget: perChild(innerTurns), label: child.name },
+            )
             if (!res.ok) return null
             const settled = await drainOne(scope)
             return settled.kind === 'down' ? null : (settled.out as unknown as ShotResult)
@@ -493,7 +587,11 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<
           async critique(messages) {
             const child = leaf(`analyst:${seq}`, 'analyst')
             seq += 1
-            const res = scope.spawn(child, { task, messages }, { budget: perChild(1), label: child.name })
+            const res = scope.spawn(
+              child,
+              { task, messages },
+              { budget: perChild(1), label: child.name },
+            )
             if (!res.ok) return null
             const settled = await drainOne(scope)
             if (settled.kind === 'down') return null
@@ -513,42 +611,45 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<
  *  — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best
  *  checkpoint across all lines), the deployable metric. This is the "experts build BETTER
  *  optimizations" path: a new technique, compact, with zero Supervisor ceremony. */
-export const adaptiveRefine = defineStrategy('adaptiveRefine', async ({ surface, task, budget, shot, critique }) => {
-  let handle = await surface.open(task)
-  const progression: number[] = []
-  let messages: Msg[] | undefined
-  let steer: string | undefined
-  let completions = 0
-  let best = -1
-  let shots = 0
-  try {
-    for (shots = 0; shots < budget; shots += 1) {
-      const out = await shot({ handle, messages, steer })
-      if (!out) break
-      completions += out.completions
-      progression.push(out.score)
-      if (out.score >= 1) break
-      if (out.score <= best) {
-        // Stuck: steering isn't improving this line — abandon it, restart fresh.
-        await surface.close(handle)
-        handle = await surface.open(task)
-        messages = undefined
-        steer = undefined
-        continue
+export const adaptiveRefine = defineStrategy(
+  'adaptiveRefine',
+  async ({ surface, task, budget, shot, critique }) => {
+    let handle = await surface.open(task)
+    const progression: number[] = []
+    let messages: Msg[] | undefined
+    let steer: string | undefined
+    let completions = 0
+    let best = -1
+    let shots = 0
+    try {
+      for (shots = 0; shots < budget; shots += 1) {
+        const out = await shot({ handle, messages, steer })
+        if (!out) break
+        completions += out.completions
+        progression.push(out.score)
+        if (out.score >= 1) break
+        if (out.score <= best) {
+          // Stuck: steering isn't improving this line — abandon it, restart fresh.
+          await surface.close(handle)
+          handle = await surface.open(task)
+          messages = undefined
+          steer = undefined
+          continue
+        }
+        best = out.score
+        messages = out.messages
+        const findings = await critique(out.messages)
+        completions += 1
+        if (!findings) break
+        steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`
       }
-      best = out.score
-      messages = out.messages
-      const findings = await critique(out.messages)
-      completions += 1
-      if (!findings) break
-      steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`
+      const score = progression.length ? Math.max(...progression) : 0
+      return { score, resolved: score >= 1, completions, progression, shots }
+    } finally {
+      await surface.close(handle)
     }
-    const score = progression.length ? Math.max(...progression) : 0
-    return { score, resolved: score >= 1, completions, progression, shots }
-  } finally {
-    await surface.close(handle)
-  }
-})
+  },
+)
 
 export interface RunAgenticOptions extends AgenticOptions {
   surface: AgenticSurface
@@ -567,7 +668,10 @@ export async function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunRes
   const strategy: Strategy = opts.strategy ?? (opts.mode === 'breadth' ? sample : refine)
   const driver = strategy.driver(opts.surface, opts.task, opts, opts.budget)
   const supervisor = createSupervisor<unknown, Outcome<unknown>>()
-  const root: Budget = opts.rootBudget ?? { maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2), maxTokens: 1_000_000_000 }
+  const root: Budget = opts.rootBudget ?? {
+    maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2),
+    maxTokens: 1_000_000_000,
+  }
   const result = await supervisor.run(driver, undefined, {
     budget: root,
     runId: `agentic:${strategy.name}:${opts.task.id}`,
@@ -577,7 +681,10 @@ export async function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunRes
     maxDepth: 3,
   })
   if (result.kind !== 'winner' || result.out.kind !== 'done') {
-    const reason = result.kind === 'winner' ? `blocked: ${(result.out as { blockers?: string[] }).blockers?.join('; ')}` : `no-winner: ${result.reason}`
+    const reason =
+      result.kind === 'winner'
+        ? `blocked: ${(result.out as { blockers?: string[] }).blockers?.join('; ')}`
+        : `no-winner: ${result.reason}`
     throw new Error(`runAgentic(${strategy.name}) produced no result — ${reason}`)
   }
   return result.out.deliverable as AgenticRunResult