From 1041c54b73e8256cc641e0693bf9aa85f3efee61 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 05:15:14 -0600
Subject: [PATCH 1/8] feat(bench): GEPA over the analyst/steerer prompt on the
 canonical stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The analyst IS the steerer (observe()'s findings → recommended_action → the depth
steer), so optimizing the analyst prompt optimizes the loop. This evolves it with
agent-eval's REAL GEPA primitives (buildReflectionPrompt + parseReflectionResponse
+ paretoFrontier) — no hand-rolled optimizer; there is no turnkey runPromptEvolution
in agent-eval 0.83, only the primitives, so the population loop is thin orchestration
over them.

- observe(): + analystInstruction? override (the analyst prompt is now the GEPA knob);
  defaultAnalystInstruction exported. Firewall stays structural (input has no score).
- agentic.ts: AgenticOptions.analystInstruction threads into the depth steerer.
- eops-gepa.mts: FITNESS = depth-vs-breadth lift on the canonical Supervisor+observe
  gate; breadth computed ONCE per task (breadth has no analyst, so a shared baseline is
  correct, and it halves cost); the failing (low) per-task lifts = the reflection
  gradient. Seeds = observe()'s PROVEN default (the +16.4pp instruction) FIRST, then
  the designer-panel population.

Smoke (N=2, 1 gen) validated the full loop: score → paretoFrontier select → reflect
→ mutate → re-score → pick. Bounded real run (N=6, 2 gens) in flight.
---
 bench/src/agentic.ts    |   5 +-
 bench/src/eops-gepa.mts | 177 ++++++++++++++++++++++++++++++++++++++++
 src/runtime/index.ts    |   1 +
 src/runtime/observe.ts  |  21 +++--
 4 files changed, 197 insertions(+), 7 deletions(-)
 create mode 100644 bench/src/eops-gepa.mts

diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts
index c91f5e3..69b6a48 100644
--- a/bench/src/agentic.ts
+++ b/bench/src/agentic.ts
@@ -89,6 +89,9 @@ export interface AgenticOptions {
   temperature?: number
   /** Turns the agent may take within ONE shot before the driver intervenes. */
   innerTurns?: number
+  /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
+   *  prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
+  analystInstruction?: string
 }
 
 // ── The unit: one agentic shot (a bounded tool loop) over a handle ───────────────
@@ -187,7 +190,7 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions)
   const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model })
   const obs = await observe(
     { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed' },
-    { chat, model: opts.model },
+    { chat, model: opts.model, ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}) },
   )
   // The steer = the analyst's recommended actions for the agent. Empty ⇒ nothing left to do.
   const steer = obs.findings
diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts
new file mode 100644
index 0000000..5ce082a
--- /dev/null
+++ b/bench/src/eops-gepa.mts
@@ -0,0 +1,177 @@
+/**
+ * GEPA over the ANALYST/STEERER prompt — the flywheel, on the CANONICAL loop system.
+ *
+ * The analyst IS the steerer: observe()'s system instruction turns an agent's trace into
+ * the recommended_action that steers the next depth shot. This evolves THAT instruction
+ * against the live EOPS gate, using agent-eval's GEPA primitives (NOT a hand-rolled loop):
+ *   - buildReflectionPrompt / parseReflectionResponse — the reflective mutation (GEPA brain)
+ *   - paretoFrontier — non-dominated selection over [maximize lift, minimize cost]
+ *
+ * FITNESS = the depth-vs-breadth lift on the canonical stack (Supervisor + observe()): for a
+ * candidate analyst instruction, run depth (steered by it) on each task and subtract the
+ * SHARED breadth baseline (computed ONCE per task — breadth has no analyst). The failing
+ * tasks (low per-task lift) are the gradient the reflection reads.
+ *
+ *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
+ *   EOPS_GYM_DBS_DIR=<unzipped gym_dbs.zip> TANGLE_API_KEY=… \
+ *     N=4 GENS=2 CHILDREN=2 MAXSHOTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-gepa.mts
+ */
+import { readFileSync, writeFileSync } from 'node:fs'
+import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval'
+import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops'
+import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
+import { type RouterConfig, routerChatWithUsage } from './router-client'
+
+function must(name: string): string {
+  const v = process.env[name]
+  if (!v) throw new Error(`env ${name} is required`)
+  return v
+}
+
+async function loadItsmTasks(n: number): Promise<AgenticTask[]> {
+  const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')}&config=oracle&split=itsm&offset=0&length=${n}`
+  const res = await fetch(url)
+  if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}`)
+  const body = (await res.json()) as { rows?: Array<{ row: Parameters<typeof eopsTaskFromRow>[0] }> }
+  return (body.rows ?? []).slice(0, n).map(({ row }) => eopsTaskFromRow(row))
+}
+
+interface Candidate {
+  id: string
+  instruction: string
+  gen: number
+  lift?: number
+  cost?: number
+  perTask?: Array<{ id: string; lift: number }>
+}
+
+const pct = (x: number) => `${(x * 100).toFixed(0)}%`
+const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
+
+async function main(): Promise<void> {
+  const n = Number(process.env.N ?? 4)
+  const gens = Number(process.env.GENS ?? 2)
+  const childCount = Number(process.env.CHILDREN ?? 2)
+  const parents = Number(process.env.PARENTS ?? 2)
+  const maxShots = Number(process.env.MAXSHOTS ?? 3)
+  const width = Number(process.env.WIDTH ?? 3)
+  const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro'
+  const routerKey = must('TANGLE_API_KEY')
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const opts: AgenticOptions = { routerBaseUrl, routerKey, model, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7 }
+  const reflectCfg: RouterConfig = { routerBaseUrl, routerKey, model: process.env.REFLECT_MODEL ?? model }
+  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
+
+  const tasks = await loadItsmTasks(n)
+  console.error(`=== GEPA over the analyst prompt · ${tasks.length} EOPS tasks · ${model} · gens=${gens} children=${childCount} ===\n`)
+
+  // Shared breadth baseline per task (no analyst — same for every candidate). Compute ONCE.
+  console.error('▶ computing shared breadth baseline (once per task)…')
+  const breadthByTask = new Map<string, { score: number; comps: number }>()
+  for (const task of tasks) {
+    let breadthScore = 0
+    let cB = 0
+    for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 4); w += 1) {
+      const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 })
+      cB += b.completions
+      if (b.score > breadthScore) breadthScore = b.score
+    }
+    breadthByTask.set(task.id, { score: breadthScore, comps: cB })
+    console.error(`   ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`)
+  }
+
+  // Fitness: depth (steered by the candidate instruction) − the shared breadth baseline.
+  async function fitness(instruction: string): Promise<{ lift: number; cost: number; perTask: Array<{ id: string; lift: number }> }> {
+    let liftSum = 0
+    let cost = 0
+    const perTask: Array<{ id: string; lift: number }> = []
+    for (const task of tasks) {
+      const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots })
+      const b = breadthByTask.get(task.id)?.score ?? 0
+      const taskLift = depth.score - b
+      liftSum += taskLift
+      cost += depth.completions
+      perTask.push({ id: task.id, lift: taskLift })
+    }
+    return { lift: liftSum / tasks.length, cost: cost / tasks.length, perTask }
+  }
+
+  // Seed population: the PROVEN baseline (observe()'s default — the +16.4pp instruction)
+  // FIRST, so GEPA improves from known-good, then the designer-panel steerer prompts.
+  const popSeeds = (JSON.parse(readFileSync('steerers/eops-itsm-population.json', 'utf8')) as Array<{ id: string; systemPrompt: string }>)
+    .slice(0, Math.max(0, Number(process.env.SEEDS ?? 4) - 1))
+    .map((s) => ({ id: `seed:${s.id}`, instruction: s.systemPrompt, gen: 0 }))
+  const pop: Candidate[] = [{ id: 'seed:observe-default', instruction: defaultAnalystInstruction, gen: 0 }, ...popSeeds]
+
+  const objectives = [
+    { name: 'lift', direction: 'maximize' as const, value: (c: Candidate) => c.lift ?? -1 },
+    { name: 'cost', direction: 'minimize' as const, value: (c: Candidate) => c.cost ?? 1e9 },
+  ]
+
+  for (let gen = 0; gen <= gens; gen += 1) {
+    console.error(`\n── generation ${gen} · scoring ${pop.filter((c) => c.lift === undefined).length} new candidate(s)`)
+    for (const c of pop) {
+      if (c.lift !== undefined) continue
+      const f = await fitness(c.instruction)
+      c.lift = f.lift
+      c.cost = f.cost
+      c.perTask = f.perTask
+      console.error(`   ${c.id.padEnd(28)} lift ${pp(c.lift)}  cost ${c.cost.toFixed(1)}`)
+    }
+    const ranked = [...pop].filter((c) => c.lift !== undefined).sort((a, b) => (b.lift ?? -1) - (a.lift ?? -1))
+    console.error(`   gen ${gen} best: ${ranked[0]?.id} @ ${pp(ranked[0]?.lift ?? 0)}`)
+    if (gen === gens) break
+
+    // Pareto-select parents (lift up, cost down), then reflective-mutate each.
+    const frontier = paretoFrontier(pop.filter((c) => c.lift !== undefined), objectives).frontier.slice(0, parents)
+    const children: Candidate[] = []
+    for (const parent of frontier) {
+      const sorted = [...(parent.perTask ?? [])].sort((a, b) => b.lift - a.lift)
+      const top = sorted.slice(0, 2).map((t) => ({ id: t.id.slice(-12), score: t.lift }))
+      const bottom = sorted.slice(-2).map((t) => ({ id: t.id.slice(-12), score: t.lift }))
+      const rp = buildReflectionPrompt({
+        target:
+          'The SYSTEM INSTRUCTION for a trace-analyst that reads an agent\'s tool-call trace on an unfinished IT-ops task and outputs ONE concrete corrective instruction to steer the next attempt. It must never see the grader. payload MUST be the full replacement instruction string.',
+        parentPayload: parent.instruction,
+        topTrials: top,
+        bottomTrials: bottom,
+        childCount,
+        mutationPrimitives: [
+          'Make the diagnosis more specific (name the exact record/field/target value still wrong).',
+          'Add an anti-degradation rule (freeze already-correct records; do not re-touch them).',
+          'Tighten the stop condition (when to declare done vs keep acting).',
+          'Add a verify-before-mutate step (read current state, then change only what is wrong).',
+        ],
+      })
+      const resp = await routerChatWithUsage(reflectCfg, [{ role: 'user', content: rp }], { temperature: 0.8 })
+      const proposals = parseReflectionResponse(resp.content, childCount)
+      for (const p of proposals) {
+        const instruction = typeof p.payload === 'string' ? p.payload : JSON.stringify(p.payload)
+        if (instruction.trim().length < 40) continue // reject degenerate mutations
+        children.push({ id: `g${gen + 1}:${parent.id.replace(/^seed:|^g\d+:/, '')}-${children.length}`, instruction, gen: gen + 1 })
+      }
+    }
+    console.error(`   reflective-mutated ${frontier.length} parent(s) → ${children.length} child(ren)`)
+    // Elitism: carry the frontier forward (already scored) + the new children.
+    pop.length = 0
+    pop.push(...frontier, ...children)
+  }
+
+  const scored = pop.filter((c) => c.lift !== undefined).sort((a, b) => (b.lift ?? -1) - (a.lift ?? -1))
+  const best = scored[0]
+  console.error(`\n${'='.repeat(72)}`)
+  console.error(`GEPA RESULT · ${tasks.length} tasks · ${model}`)
+  console.error('='.repeat(72))
+  for (const c of scored) console.error(`  ${c.id.padEnd(30)} gen${c.gen}  lift ${pp(c.lift ?? 0)}  cost ${(c.cost ?? 0).toFixed(1)}`)
+  console.error(`\n  WINNER: ${best?.id} @ lift ${pp(best?.lift ?? 0)} (gen ${best?.gen})`)
+  const out = { model, tasks: tasks.length, gens, best: { id: best?.id, gen: best?.gen, lift: best?.lift, instruction: best?.instruction }, all: scored.map((c) => ({ id: c.id, gen: c.gen, lift: c.lift, cost: c.cost })) }
+  const outPath = process.env.OUT ?? '/tmp/eops-gepa-result.json'
+  writeFileSync(outPath, JSON.stringify(out, null, 2))
+  console.error(`  best instruction + ranking → ${outPath}`)
+}
+
+main().catch((e) => {
+  console.error(`eops-gepa: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
+  process.exit(1)
+})
diff --git a/src/runtime/index.ts b/src/runtime/index.ts
index 2e50fe7..21e779f 100644
--- a/src/runtime/index.ts
+++ b/src/runtime/index.ts
@@ -63,6 +63,7 @@ export {
 // The third-person observer: a worker's trace → trace-grounded findings, an
 // operator report, and durable corpus facts for the next run (the closed loop).
 export {
+  defaultAnalystInstruction,
   type Observation,
   type ObserveInput,
   type ObserveOptions,
diff --git a/src/runtime/observe.ts b/src/runtime/observe.ts
index 44aaccd..9fb31a4 100644
--- a/src/runtime/observe.ts
+++ b/src/runtime/observe.ts
@@ -45,8 +45,22 @@ export interface ObserveOptions {
   signal?: AbortSignal
   /** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */
   maxTraceLines?: number
+  /** Override the analyst's system instruction — the prompt that turns a trace into
+   *  findings + recommended_actions. The analyst IS the steerer, so this is the knob a
+   *  prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The
+   *  firewall (trace-only, never the verdict) is structural (input has no score), so a
+   *  custom instruction cannot break it. */
+  analystInstruction?: string
 }
 
+/** The default observer instruction — exported so an optimizer can seed its population. */
+export const defaultAnalystInstruction =
+  'You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. ' +
+  'From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, ' +
+  'token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and ' +
+  'whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. ' +
+  'Only claim what the trace shows. No findings if the run was clean.'
+
 export interface Observation {
   findings: AnalystFinding[]
   /** Facts persisted to the corpus (empty when no corpus was supplied). */
@@ -131,12 +145,7 @@ export async function observe(input: ObserveInput, opts: ObserveOptions): Promis
       messages: [
         {
           role: 'system',
-          content:
-            'You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. ' +
-            'From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, ' +
-            'token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and ' +
-            'whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. ' +
-            'Only claim what the trace shows. No findings if the run was clean.',
+          content: opts.analystInstruction ?? defaultAnalystInstruction,
         },
         {
           role: 'user',

From ea4845f37a317f534932043f6dc5e7a4d2055710 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 05:24:29 -0600
Subject: [PATCH 2/8] fix(bench): GEPA harness survives gym/router infra blips
 (skip failed tasks)

The first real run died when the (long-lived) gym container wedged: breadth
baselines returned 0%, then runAgentic threw 'every rollout went down', killing the
whole GEPA run. runAgentic is fail-loud, so the GEPA loop now catches per task: a task
whose rollouts fail is SKIPPED (not fatal), both in the breadth precompute and in the
depth fitness. A candidate's fitness is averaged over the tasks that actually scored;
if none scored it gets a worst-case sentinel (lift -1, cost 1e9). Fails loud only if
<2 tasks survive the breadth baseline (genuine infra-down). Pair with a fresh gym
container + WIDTH<=2.
---
 bench/src/eops-gepa.mts | 46 +++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts
index 5ce082a..821bd59 100644
--- a/bench/src/eops-gepa.mts
+++ b/bench/src/eops-gepa.mts
@@ -67,34 +67,48 @@ async function main(): Promise<void> {
   console.error(`=== GEPA over the analyst prompt · ${tasks.length} EOPS tasks · ${model} · gens=${gens} children=${childCount} ===\n`)
 
   // Shared breadth baseline per task (no analyst — same for every candidate). Compute ONCE.
+  // Resilient: a task whose rollouts all fail (transient gym/router infra) is SKIPPED, not
+  // fatal — runAgentic is fail-loud, so we catch + drop the task and press on.
   console.error('▶ computing shared breadth baseline (once per task)…')
   const breadthByTask = new Map<string, { score: number; comps: number }>()
+  const liveTasks: AgenticTask[] = []
   for (const task of tasks) {
-    let breadthScore = 0
-    let cB = 0
-    for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 4); w += 1) {
-      const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 })
-      cB += b.completions
-      if (b.score > breadthScore) breadthScore = b.score
+    try {
+      let breadthScore = 0
+      let cB = 0
+      for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 4); w += 1) {
+        const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 })
+        cB += b.completions
+        if (b.score > breadthScore) breadthScore = b.score
+      }
+      breadthByTask.set(task.id, { score: breadthScore, comps: cB })
+      liveTasks.push(task)
+      console.error(`   ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`)
+    } catch (e) {
+      console.error(`   ${task.id.slice(-12)}: SKIP (${e instanceof Error ? e.message.slice(0, 70) : e})`)
     }
-    breadthByTask.set(task.id, { score: breadthScore, comps: cB })
-    console.error(`   ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`)
   }
+  if (liveTasks.length < 2) throw new Error(`only ${liveTasks.length} task(s) survived breadth baseline — gym/router infra is down (restart the gym container)`)
 
   // Fitness: depth (steered by the candidate instruction) − the shared breadth baseline.
   async function fitness(instruction: string): Promise<{ lift: number; cost: number; perTask: Array<{ id: string; lift: number }> }> {
     let liftSum = 0
     let cost = 0
     const perTask: Array<{ id: string; lift: number }> = []
-    for (const task of tasks) {
-      const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots })
-      const b = breadthByTask.get(task.id)?.score ?? 0
-      const taskLift = depth.score - b
-      liftSum += taskLift
-      cost += depth.completions
-      perTask.push({ id: task.id, lift: taskLift })
+    let scored = 0
+    for (const task of liveTasks) {
+      try {
+        const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots })
+        const b = breadthByTask.get(task.id)?.score ?? 0
+        liftSum += depth.score - b
+        cost += depth.completions
+        perTask.push({ id: task.id, lift: depth.score - b })
+        scored += 1
+      } catch (e) {
+        console.error(`     depth SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 50) : e})`)
+      }
     }
-    return { lift: liftSum / tasks.length, cost: cost / tasks.length, perTask }
+    return { lift: scored ? liftSum / scored : -1, cost: scored ? cost / scored : 1e9, perTask }
   }
 
   // Seed population: the PROVEN baseline (observe()'s default — the +16.4pp instruction)

From dfca5406efbf009eef622d8edff14cb7d63f7b8f Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 05:31:32 -0600
Subject: [PATCH 3/8] =?UTF-8?q?refactor(bench):=20delete=20eops-gate.mts?=
 =?UTF-8?q?=20=E2=80=94=20the=20throwaway=20flat-loop=20prototype=20(?=
 =?UTF-8?q?=E2=88=92433=20LOC)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It was a dead-end (nothing imports it): a hand-rolled flat loop that BYPASSED the
canonical Supervisor + a second copy of the gym client (6 functions duplicating
gym-agent.ts's 5). Fully superseded by the canonical stack — agentic.ts (domain-blind
depth/breadth/Supervisor/observe, 428 LOC, written ONCE) + the AgenticSurface seam
(agentic-eops.ts, 73 LOC = the entire per-domain slot-in). The +16.4pp result and the
GEPA harness run on the canonical path; this prototype only de-risked the plumbing
(gym standup, router-tools worker, depth-best scoring) and is now dead weight.
---
 bench/src/eops-gate.mts | 433 ----------------------------------------
 1 file changed, 433 deletions(-)
 delete mode 100644 bench/src/eops-gate.mts

diff --git a/bench/src/eops-gate.mts b/bench/src/eops-gate.mts
deleted file mode 100644
index 355c18e..0000000
--- a/bench/src/eops-gate.mts
+++ /dev/null
@@ -1,433 +0,0 @@
-/**
- * EnterpriseOps-Gym depth-vs-breadth gate — the agentic, stateful domain where
- * steering is hypothesized to beat compute (the opposite regime to HumanEval,
- * where breadth/resampling won). The worker is the TOOL-USING router backend
- * (`routerToolLoop`): it calls the gym's live MCP tools, sees the results, and
- * acts — off-box (router inference + host→gym HTTP), no sandbox.
- *
- *   breadth@K — K independent shots, each a short agentic loop on its OWN fresh
- *               seeded DB; keep the best by the deployable verifier (resample).
- *   depth@K   — ONE sustained agentic loop over ONE DB, ~K× the turn budget; the
- *               artifact (DB state) accumulates, so each action conditions the next.
- *
- * Equal compute = equal total inference turns (K·M). Score = the task's own SQL
- * verifiers (deployable check), run on the final DB state. Per-task {0,1} resolved,
- * paired 95% bootstrap CI.
- *
- * Stand up first:
- *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
- *   # gym_dbs.zip from github.com/ServiceNow/EnterpriseOps-Gym (root), unzipped:
- *   export EOPS_GYM_DBS_DIR=/path/to/unzipped/dbs
- *   TANGLE_API_KEY=… N=20 K=3 M=5 WORKER_MODEL=gpt-4o-mini tsx src/eops-gate.mts
- */
-import { readFileSync, writeFileSync } from 'node:fs'
-import { join } from 'node:path'
-import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client'
-import { type PairedLift, pairedLift, pool } from './stats.mts'
-
-function must(name: string): string {
-  const v = process.env[name]
-  if (!v) throw new Error(`env ${name} is required`)
-  return v
-}
-
-const dataset = 'ServiceNow-AI/EnterpriseOps-Gym'
-
-type ComparisonType = 'equals' | 'greater_than' | 'less_than' | 'contains'
-interface Verifier {
-  verifier_type?: string
-  gym_name?: string
-  /** EOPS nests the deterministic check here; comparison_type defaults to 'equals'. */
-  validation_config?: { query?: string; expected_value?: unknown; comparison_type?: ComparisonType }
-}
-interface GymServer {
-  mcp_server_url: string
-  seed_database_file: string
-  context?: Record<string, string>
-}
-export interface EopsTask {
-  taskId: string
-  systemPrompt: string
-  userPrompt: string
-  selectedTools: string[]
-  servers: GymServer[]
-  verifiers: Verifier[]
-}
-
-const asArray = <T,>(v: unknown): T[] => (typeof v === 'string' ? JSON.parse(v) : v) as T[]
-
-/** Pull itsm tasks from the HF rows server (the oracle tool-set config). Fail loud. */
-export async function loadTasks(n: number, offset: number): Promise<EopsTask[]> {
-  const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(dataset)}&config=oracle&split=itsm&offset=${offset}&length=${n}`
-  const res = await fetch(url)
-  if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}: ${url}`)
-  const data = (await res.json()) as { rows?: Array<{ row: Record<string, unknown> }> }
-  const rows = data.rows ?? []
-  if (rows.length === 0) throw new Error('EOPS HF returned 0 rows')
-  return rows.map(({ row }) => ({
-    taskId: String(row.task_id),
-    systemPrompt: String(row.system_prompt ?? ''),
-    userPrompt: String(row.user_prompt ?? ''),
-    selectedTools: asArray<string>(row.selected_tools),
-    servers: asArray<GymServer>(row.gym_servers_config),
-    verifiers: asArray<Verifier>(row.verifiers),
-  }))
-}
-
-// ── gym client (mirrors scripts/enterpriseops_gym_judge.py) ────────────────────
-
-function authHeaders(server: GymServer, dbId: string): Record<string, string> {
-  return { 'content-type': 'application/json', ...(server.context ?? {}), 'x-database-id': dbId }
-}
-
-/** POST and parse a JSON body OR the last `data:` line of an SSE stream (/mcp streams SSE).
- *  Retries a THROWN fetch (transient network / connection-reset / router throttle under
- *  concurrency — surfaces as "fetch failed") with backoff, so a momentary blip doesn't
- *  drop a task. HTTP-status handling stays with the caller (seed retries on 500). */
-async function postJson(url: string, body: unknown, headers: Record<string, string>): Promise<{ status: number; json: unknown }> {
-  let lastErr: unknown
-  for (let attempt = 0; attempt < 4; attempt += 1) {
-    try {
-      const r = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) })
-      const text = await r.text()
-      const dataLines = text.split('\n').filter((l) => l.startsWith('data:')).map((l) => l.slice(5).trim())
-      const payload = dataLines.length ? dataLines[dataLines.length - 1] : text
-      try {
-        return { status: r.status, json: JSON.parse(payload ?? 'null') }
-      } catch {
-        return { status: r.status, json: text }
-      }
-    } catch (err) {
-      lastErr = err
-      await new Promise((res) => setTimeout(res, 1000 * (attempt + 1)))
-    }
-  }
-  throw new Error(`postJson ${url} failed after 4 attempts: ${lastErr instanceof Error ? lastErr.message : String(lastErr)}`)
-}
-
-async function seedDb(server: GymServer, dbsDir: string): Promise<string> {
-  const sql = readFileSync(join(dbsDir, server.seed_database_file), 'utf8')
-  const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/seed-database`
-  // The gym's SQLite exhausts file handles under concurrency ("unable to open
-  // database file", HTTP 500) — TRANSIENT: it clears as sibling DBs are deleted.
-  // Bounded retry with backoff so a momentary limit doesn't drop the task's data.
-  let lastErr = ''
-  for (let attempt = 0; attempt < 5; attempt += 1) {
-    const dbId = `gate_${Math.random().toString(36).slice(2, 12)}`
-    const { status, json } = await postJson(url, { database_id: dbId, name: `gate_${dbId}`, description: 'gate', sql_content: sql }, { 'content-type': 'application/json' })
-    if (status === 200 && (json as { success?: boolean })?.success) return dbId
-    lastErr = `(${status}): ${JSON.stringify(json).slice(0, 160)}`
-    await new Promise((r) => setTimeout(r, 1500 * (attempt + 1)))
-  }
-  throw new Error(`seed-database failed after 5 attempts ${lastErr}`)
-}
-
-async function deleteDb(server: GymServer, dbId: string): Promise<void> {
-  await fetch(`${server.mcp_server_url.replace(/\/$/, '')}/api/delete-database`, {
-    method: 'DELETE',
-    headers: { 'content-type': 'application/json' },
-    body: JSON.stringify({ database_id: dbId }),
-  }).catch(() => {})
-}
-
-/** Coerce an MCP inputSchema to an OpenAI-tool-valid top-level object schema. The
- *  router rejects top-level oneOf/anyOf/allOf/enum/not — keep the properties (nested
- *  combinators are fine) but guarantee a plain `{type:'object'}` head. */
-function sanitizeSchema(s: unknown): { type: 'object'; properties: Record<string, unknown>; required?: string[] } {
-  const o = s && typeof s === 'object' ? (s as Record<string, unknown>) : {}
-  const banned = o.oneOf || o.anyOf || o.allOf || o.not || o.enum
-  if (o.type === 'object' && !banned && o.properties && typeof o.properties === 'object') {
-    return { type: 'object', properties: o.properties as Record<string, unknown>, ...(Array.isArray(o.required) ? { required: o.required as string[] } : {}) }
-  }
-  return { type: 'object', properties: {} }
-}
-
-/** Build OpenAI-shape tool specs for the task's selected tools from the gym's MCP tools/list. */
-async function toolSpecs(server: GymServer, dbId: string, selected: string[]): Promise<ToolSpec[]> {
-  const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp`
-  const { json } = await postJson(url, { jsonrpc: '2.0', id: 1, method: 'tools/list', params: {} }, authHeaders(server, dbId))
-  const all = ((json as { result?: { tools?: Array<{ name: string; description?: string; inputSchema?: unknown }> } }).result?.tools) ?? []
-  const want = new Set(selected)
-  return all
-    .filter((t) => want.has(t.name))
-    .map((t) => ({ type: 'function' as const, function: { name: t.name, description: (t.description ?? '').slice(0, 1000), parameters: sanitizeSchema(t.inputSchema) } }))
-}
-
-async function callTool(server: GymServer, dbId: string, name: string, args: Record<string, unknown>): Promise<string> {
-  const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp`
-  const { json } = await postJson(url, { jsonrpc: '2.0', id: 2, method: 'tools/call', params: { name, arguments: args } }, authHeaders(server, dbId))
-  const result = (json as { result?: { content?: Array<{ text?: string }>; isError?: boolean }; error?: unknown }) ?? {}
-  if (result.error) return `error: ${JSON.stringify(result.error).slice(0, 300)}`
-  const text = result.result?.content?.map((c) => c.text ?? '').join('\n') ?? JSON.stringify(result.result ?? json)
-  return text.slice(0, 1500)
-}
-
-function compare(actual: unknown, expected: unknown, kind: ComparisonType): boolean {
-  const fa = Number(actual)
-  const fe = Number(expected)
-  const numeric = !Number.isNaN(fa) && !Number.isNaN(fe)
-  if (kind === 'equals') return numeric ? fa === fe : String(actual) === String(expected)
-  if (kind === 'greater_than') return numeric && fa > fe
-  if (kind === 'less_than') return numeric && fa < fe
-  if (kind === 'contains') return String(actual).includes(String(expected))
-  throw new Error(`unsupported comparison_type ${kind}`)
-}
-
-/** Run the task's SQL verifiers on the final DB state; resolved = all pass. */
-async function score(server: GymServer, dbId: string, verifiers: Verifier[]): Promise<{ passes: number; total: number; resolved: boolean }> {
-  // Only deterministic database_state verifiers are scoreable (the judge rejects others).
-  const dbv = verifiers.filter((v) => (v.verifier_type ?? 'database_state') === 'database_state' && v.validation_config?.query)
-  let passes = 0
-  for (const v of dbv) {
-    const vc = v.validation_config as NonNullable<Verifier['validation_config']>
-    const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/sql-runner`
-    const { json } = await postJson(url, { query: vc.query, database_id: dbId }, authHeaders(server, dbId))
-    const out = json as { data?: Array<Record<string, unknown>>; rows?: Array<Record<string, unknown>>; error?: unknown }
-    if (out.error) continue
-    const first = (out.data ?? out.rows ?? [])[0]
-    const actual = first && typeof first === 'object' ? Object.values(first)[0] : first
-    if (compare(actual, vc.expected_value, vc.comparison_type ?? 'equals')) passes += 1
-  }
-  return { passes, total: dbv.length, resolved: dbv.length > 0 && passes === dbv.length }
-}
-
-// ── one agentic shot: the tool-using worker acts on a (seeded) DB ──────────────
-
-function shotPrompt(task: EopsTask, steer?: string): string {
-  return [
-    task.userPrompt,
-    '',
-    'Use the available tools to investigate the current state, then take the actions needed to complete the task.',
-    'Inspect before you mutate. When you are confident the task is complete, give a one-line summary and stop calling tools.',
-    ...(steer ? ['', `CORRECTION FROM YOUR PRIOR ATTEMPT: ${steer}`] : []),
-  ].join('\n')
-}
-
-type ToolTrace = Array<{ name: string; args: string; result: string }>
-
-async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbId: string, tools: ToolSpec[], maxTurns: number, steer?: string): Promise<{ toolCalls: number; toolTrace: ToolTrace }> {
-  const r = await routerToolLoop(
-    cfg,
-    task.systemPrompt || 'You are an IT service-management operations agent.',
-    shotPrompt(task, steer),
-    tools,
-    async (name, args) => callTool(server, dbId, name, args as Record<string, unknown>),
-    { maxTurns, temperature: 0.3 },
-  )
-  return { toolCalls: r.toolCalls, toolTrace: r.toolTrace }
-}
-
-type Score = { passes: number; total: number; resolved: boolean }
-const scoreRatio = (x: Score) => x.passes / Math.max(x.total, 1)
-
-const genericNudge =
-  'Re-inspect the current state with the read tools, identify what the task still requires, and complete it. Do not stop until every required change is verified in place.'
-
-/** A depth steerer under test. No template ⇒ the fixed generic nudge. With a template
- *  ⇒ an LLM steerer: {task}/{trace} are substituted; it reads BEHAVIOR only (firewalled). */
-export interface Steerer {
-  id: string
-  systemPrompt?: string
-  userTemplate?: string
-}
-
-function traceSummary(trace: ToolTrace): string {
-  return trace.map((t) => `${t.name}(${t.args.slice(0, 140)}) -> ${t.result.slice(0, 180)}`).join('\n').slice(-4000) || '(no tool calls yet)'
-}
-
-/** The next-shot instruction for a steerer. FIREWALLED: trace only, never verifiers. */
-async function steerInstruction(steerCfg: RouterConfig, steerer: Steerer, task: EopsTask, trace: ToolTrace): Promise<string> {
-  if (!steerer.userTemplate) return genericNudge
-  const user = steerer.userTemplate.replaceAll('{task}', task.userPrompt).replaceAll('{trace}', traceSummary(trace))
-  const r = await routerChatWithUsage(
-    steerCfg,
-    [
-      { role: 'system', content: steerer.systemPrompt ?? 'You are an ITSM operations reviewer. Output one concrete corrective instruction.' },
-      { role: 'user', content: user },
-    ],
-    { temperature: 0.2 },
-  )
-  return r.content.trim()
-}
-
-/** One depth arm: K sequential steered shots over ONE persistent DB, scored after each
- *  shot. Returns the best checkpoint (deployable, symmetric with breadth's best-of-K),
- *  the final state, the trajectory, and tool-call count. */
-async function runDepthArm(
-  cfg: RouterConfig,
-  steerCfg: RouterConfig,
-  steerer: Steerer,
-  task: EopsTask,
-  server: GymServer,
-  dbsDir: string,
-  k: number,
-  m: number,
-): Promise<{ best: Score; final: Score; traj: string; toolCalls: number }> {
-  const dbId = await seedDb(server, dbsDir)
-  let toolCalls = 0
-  try {
-    const tools = await toolSpecs(server, dbId, task.selectedTools)
-    const trace: ToolTrace = []
-    const shots: Score[] = []
-    for (let s = 0; s < k; s += 1) {
-      const steer = s === 0 ? undefined : await steerInstruction(steerCfg, steerer, task, trace)
-      const sr = await runShot(cfg, task, server, dbId, tools, m, steer)
-      toolCalls += sr.toolCalls
-      trace.push(...sr.toolTrace)
-      shots.push(await score(server, dbId, task.verifiers))
-    }
-    const final = shots[shots.length - 1] ?? { passes: 0, total: 1, resolved: false }
-    const best = shots.reduce((a, b) => (scoreRatio(b) > scoreRatio(a) ? b : a), shots[0] ?? final)
-    return { best, final, traj: shots.map((x) => `${x.passes}/${x.total}`).join('→'), toolCalls }
-  } finally {
-    await deleteDb(server, dbId)
-  }
-}
-
-/** The built-in inline analyst (STEER=analyst back-compat / the S1 baseline). */
-const inlineAnalyst: Steerer = {
-  id: 'analyst',
-  systemPrompt:
-    "You are a senior ITSM operations reviewer. You are shown an agent's tool-call trace on a task it has NOT completed. Diagnose precisely what the task still requires and issue ONE concrete corrective instruction — name the specific records, fields, and target values to set. Do not restate the task, do not praise, do not summarize the trace. Output only the single next instruction.",
-  userTemplate: 'TASK:\n{task}\n\nAGENT TRACE SO FAR:\n{trace}\n\nThe single most important still-missing or incorrect step, as one concrete instruction:',
-}
-
-const pct = (x: number) => `${(x * 100).toFixed(1)}%`
-const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
-
-export interface SteererRank {
-  id: string
-  bestRate: number
-  lift: PairedLift
-  liftRes: PairedLift
-  degradation: number
-}
-export interface SteererLoss {
-  steererId: string
-  userPrompt: string
-  breadth: number
-  depthBest: number
-  traj: string
-}
-export interface EvalResult {
-  ok: Array<{ taskId: string; userPrompt: string; breadthR: number; breadthRes: number; perSteerer: Record<string, { bestR: number; finalR: number; bestRes: number; traj: string }> }>
-  excluded: number
-  ranked: SteererRank[]
-  losses: SteererLoss[]
-}
-
-/** The fitness function — runs every steerer as a depth arm against ONE shared breadth
- *  baseline per task, returns ranked lift + the per-steerer LOSSES (tasks where depth-best
- *  < breadth, with the trajectory). The losses are GEPA's reflection fuel. */
-export async function evaluateSteerers(args: {
-  cfg: RouterConfig
-  steerCfg: RouterConfig
-  steerers: Steerer[]
-  tasks: EopsTask[]
-  dbsDir: string
-  k: number
-  m: number
-  concurrency: number
-}): Promise<EvalResult> {
-  const { cfg, steerCfg, steerers, tasks, dbsDir, k, m, concurrency } = args
-  const rows = await pool(tasks, concurrency, async (task, i) => {
-    const server = task.servers[0]
-    if (!server) return null
-    try {
-      const breadthScores: Score[] = []
-      for (let s = 0; s < k; s += 1) {
-        const dbId = await seedDb(server, dbsDir)
-        try {
-          const tools = await toolSpecs(server, dbId, task.selectedTools)
-          await runShot(cfg, task, server, dbId, tools, m)
-          breadthScores.push(await score(server, dbId, task.verifiers))
-        } finally {
-          await deleteDb(server, dbId)
-        }
-      }
-      const breadthBest = breadthScores.reduce((a, b) => (scoreRatio(b) > scoreRatio(a) ? b : a), breadthScores[0] ?? { passes: 0, total: 1, resolved: false })
-      const perSteerer: Record<string, { bestR: number; finalR: number; bestRes: number; traj: string }> = {}
-      const tags: string[] = []
-      for (const st of steerers) {
-        const arm = await runDepthArm(cfg, steerCfg, st, task, server, dbsDir, k, m)
-        perSteerer[st.id] = { bestR: scoreRatio(arm.best), finalR: scoreRatio(arm.final), bestRes: arm.best.resolved ? 1 : 0, traj: arm.traj }
-        tags.push(`${st.id}=${arm.best.passes}/${arm.best.total}`)
-      }
-      process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} | ${tags.join(' ')}\n`)
-      return { taskId: task.taskId, userPrompt: task.userPrompt, breadthR: scoreRatio(breadthBest), breadthRes: breadthBest.resolved ? 1 : 0, perSteerer }
-    } catch (err) {
-      process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: SKIP (${err instanceof Error ? err.message.slice(0, 90) : String(err)})\n`)
-      return null
-    }
-  })
-  const ok = rows.filter((r): r is NonNullable<typeof r> => r !== null)
-  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1)
-  const breadthR = ok.map((r) => r.breadthR)
-  const breadthRes = ok.map((r) => r.breadthRes)
-  const ranked = steerers
-    .map((st) => {
-      const bestR = ok.map((r) => r.perSteerer[st.id]?.bestR ?? 0)
-      const finalR = ok.map((r) => r.perSteerer[st.id]?.finalR ?? 0)
-      const bestRes = ok.map((r) => r.perSteerer[st.id]?.bestRes ?? 0)
-      return { id: st.id, bestRate: rate(bestR), lift: pairedLift(breadthR, bestR), liftRes: pairedLift(breadthRes, bestRes), degradation: rate(bestR) - rate(finalR) }
-    })
-    .sort((a, b) => b.lift.point - a.lift.point)
-  // Losses = the reflection fuel: tasks where a steerer's depth-best lost to breadth.
-  const losses: SteererLoss[] = []
-  for (const r of ok) {
-    for (const st of steerers) {
-      const ps = r.perSteerer[st.id]
-      if (ps && ps.bestR < r.breadthR) losses.push({ steererId: st.id, userPrompt: r.userPrompt, breadth: r.breadthR, depthBest: ps.bestR, traj: ps.traj })
-    }
-  }
-  return { ok, excluded: rows.length - ok.length, ranked, losses }
-}
-
-async function main(): Promise<void> {
-  const n = Number(process.env.N ?? 20)
-  const k = Number(process.env.K ?? 3)
-  const m = Number(process.env.M ?? 5)
-  const offset = Number(process.env.OFFSET ?? 0)
-  const model = process.env.WORKER_MODEL ?? 'gpt-4o-mini'
-  const dbsDir = must('EOPS_GYM_DBS_DIR')
-  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
-  const concurrency = Number(process.env.CONCURRENCY ?? 4)
-  const steerCfg: RouterConfig = { ...cfg, model: process.env.STEER_MODEL ?? model }
-  // The steerer population under test: generic (control) + either a STEERERS_FILE
-  // (JSON array of {id,systemPrompt,userTemplate}) or the built-in inline analyst.
-  const steerers: Steerer[] = [{ id: 'generic' }]
-  if (process.env.STEERERS_FILE) steerers.push(...(JSON.parse(readFileSync(process.env.STEERERS_FILE, 'utf8')) as Steerer[]))
-  else if (process.env.STEER === 'analyst') steerers.push(inlineAnalyst)
-
-  console.log(`=== EOPS steerer sweep · worker=${model} · steerer=${steerCfg.model} · N=${n} K=${k} M=${m} ===`)
-  console.log(`  steerers (depth arms, all vs ONE shared breadth baseline): ${steerers.map((s) => s.id).join(', ')}`)
-  const tasks = await loadTasks(n, offset)
-  console.log(`  loaded ${tasks.length} itsm task(s); each scored depth-BEST (checkpoint) vs breadth best-of-K, conc=${concurrency}\n`)
-
-  const { ok, excluded, ranked } = await evaluateSteerers({ cfg, steerCfg, steerers, tasks, dbsDir, k, m, concurrency })
-  const breadthR = ok.map((r) => r.breadthR)
-  const breadthRes = ok.map((r) => r.breadthRes)
-  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1)
-  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.')
-
-  console.log(`\n${'='.repeat(86)}`)
-  console.log(`RESULTS · EOPS steerer sweep · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · worker=${model} · steerer=${steerCfg.model}`)
-  console.log('='.repeat(86))
-  console.log(`  breadth@${k} (best-of-K, shared baseline): score ${pct(rate(breadthR))}  resolved ${pct(rate(breadthRes))}\n`)
-  console.log(`  ${'steerer'.padEnd(22)} ${'depth-best'.padStart(10)} ${'− breadth'.padStart(10)} ${'95% CI'.padStart(18)} ${'resolved Δ'.padStart(11)} ${'degrade'.padStart(8)}`)
-  console.log(`  ${'-'.repeat(84)}`)
-  for (const r of ranked) {
-    console.log(
-      `  ${r.id.padEnd(22)} ${pct(r.bestRate).padStart(10)} ${pp(r.lift.point).padStart(10)} ${`[${pp(r.lift.low)},${pp(r.lift.high)}]`.padStart(18)} ${pp(r.liftRes.point).padStart(11)} ${pp(r.degradation).padStart(8)}  ${sig(r.lift)}`,
-    )
-  }
-  const best = ranked[0]
-  console.log(`\n  WINNER: ${best?.id} — depth-best beats breadth ${best ? pp(best.lift.point) : 'n/a'} (${best ? sig(best.lift) : ''}). Degradation across steerers shows how much keep-best recovers.`)
-}
-
-if (import.meta.url === `file://${process.argv[1]}`) {
-  main().catch((err) => {
-    console.error(`eops-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`)
-    process.exit(1)
-  })
-}

From 6f3c15deaedcd7096c2696b7d8d4c0d30780f381 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 05:44:27 -0600
Subject: [PATCH 4/8] feat(bench): package the optimization suite
 (runBenchmark) + clarify naming + onboarding fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pieces existed (Supervisor + observe + the depth/breadth strategies) but weren't
wrapped as a usable suite, and the vocabulary was opaque. runBenchmark is the packaged
front door:

  runBenchmark({ environment, tasks, worker, strategies: ['sample','refine'], budget })
    → runs each strategy, scores by the environment's own deployable check, returns the
      per-strategy means + the paired-bootstrap lift of refine over sample. printBenchmarkReport
      gives the verdict. Resilient to per-task failures (e.g. transient infra): a task whose
      run throws is excluded from the report and counted in `excluded`, not fatal.

Naming, made legible (public API; maps to internal depth/breadth — zero churn to the
running internals): a task domain is an `Environment` (the AgenticSurface seam under the
RL/gym-standard name); the strategies are `sample` (best-of-N / resample) and `refine`
(attempt → critic reads trace → steer → repeat), named by what they DO, not the search
tree's shape. Juniors call runBenchmark; seniors customize the hooks (worker.analystInstruction
= the critic, Environment.score = the check) or drop to runAgentic for new strategies.

Onboarding: deleted the orphaned, empty examples/define-loop/ directory (defineLoop was
removed in #194); removed the dead examples/model-resolution entry from docs/concepts.md —
model resolution is now covered by the README entry-point table row.
---
 bench/src/run-benchmark.mts | 98 +++++++++++++++++++++++++++++++++++++
 docs/concepts.md            |  5 +-
 2 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100644 bench/src/run-benchmark.mts

diff --git a/bench/src/run-benchmark.mts b/bench/src/run-benchmark.mts
new file mode 100644
index 0000000..17d6c8d
--- /dev/null
+++ b/bench/src/run-benchmark.mts
@@ -0,0 +1,98 @@
+/**
+ * runBenchmark — the packaged optimization suite. Define a domain by implementing an
+ * Environment (open / tools / call / score / close); get the optimization strategies
+ * compared, scored by your own deployable check, with a paired-bootstrap report — FREE.
+ *
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An
+ * optimization STRATEGY is how you spend the budget to beat the check. Two primitives:
+ *
+ *   sample  — N independent attempts, keep the best-verifying one.   ("best-of-N" / resample)
+ *   refine  — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
+ *
+ * Both run at equal budget; the headline is the paired lift of refine over sample.
+ * (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.)
+ *
+ * Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic
+ * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the
+ * worker (the model), and can drop to runAgentic / the Supervisor for new strategies.
+ */
+import { type AgenticOptions, type AgenticSurface, type AgenticTask, runAgentic } from './agentic'
+import { type PairedLift, pairedLift, pool } from './stats.mts'
+
+/** A checkable task domain — implement these 5 hooks and the suite does the rest. The
+ *  same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
+export type Environment = AgenticSurface
+
+/** How to spend the compute budget to beat the Environment's check. */
+export type Strategy = 'sample' | 'refine'
+const modeForStrategy = { sample: 'breadth', refine: 'depth' } as const
+
+export interface BenchmarkConfig {
+  /** The task domain (5 hooks). */
+  environment: Environment
+  /** The tasks to score across. */
+  tasks: AgenticTask[]
+  /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
+  worker: AgenticOptions
+  /** Which strategies to compare. Default: both. */
+  strategies?: Strategy[]
+  /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
+  budget?: number
+  /** Tasks scored in parallel. Default 3. */
+  concurrency?: number
+}
+
+export interface BenchmarkReport {
+  n: number
+  excluded: number
+  /** Mean verifier score per strategy (0..1). */
+  perStrategy: Partial<Record<Strategy, number>>
+  /** The headline: paired lift of refine over sample (present when both ran). */
+  refineVsSample?: PairedLift
+}
+
+/** Run the requested strategies over the tasks, scored by the Environment's own check,
+ *  and return the per-strategy means + the paired-bootstrap lift of refine over sample.
+ *  Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */
+export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> {
+  const strategies = cfg.strategies ?? ['sample', 'refine']
+  const budget = cfg.budget ?? 3
+  const concurrency = cfg.concurrency ?? 3
+
+  const rows = await pool(cfg.tasks, concurrency, async (task) => {
+    const scores: Partial<Record<Strategy, number>> = {}
+    try {
+      for (const s of strategies) {
+        const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, mode: modeForStrategy[s], budget })
+        scores[s] = r.score
+      }
+      return scores
+    } catch {
+      return null // transient infra on this task — exclude it
+    }
+  })
+
+  const ok = rows.filter((r): r is Partial<Record<Strategy, number>> => r !== null)
+  const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0)
+  const perStrategy: Partial<Record<Strategy, number>> = {}
+  for (const s of strategies) perStrategy[s] = mean(ok.map((r) => r[s] ?? 0))
+
+  const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy }
+  if (strategies.includes('refine') && strategies.includes('sample')) {
+    report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0))
+  }
+  return report
+}
+
+/** Pretty-print a report — the "free optimization" verdict. */
+export function printBenchmarkReport(report: BenchmarkReport): void {
+  const pct = (x: number) => `${(x * 100).toFixed(1)}%`
+  const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
+  console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`)
+  for (const [s, v] of Object.entries(report.perStrategy)) console.log(`  ${s.padEnd(8)} ${pct(v ?? 0)}`)
+  const l = report.refineVsSample
+  if (l) {
+    const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.'
+    console.log(`  refine − sample: ${pp(l.point)}  CI [${pp(l.low)}, ${pp(l.high)}]  (${sig})`)
+  }
+}
diff --git a/docs/concepts.md b/docs/concepts.md
index 4fb338d..3117b77 100644
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -140,6 +140,5 @@ agents because nothing in this list is baked into it.
 2. `examples/sandbox-stream-backend/` — what streaming looks like.
 3. `examples/chat-handler/` — `handleChatTurn` — the centerpiece chat handler.
 4. `examples/runtime-run/` — the production-run row + cost ledger.
-5. `examples/model-resolution/` — pick + validate a model.
-6. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent.
-7. The `README.md` entry-point table — every other primitive, one row each.
+5. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent.
+6. The `README.md` entry-point table — model resolution + every other primitive, one row each.

From e52b8d5e02a608bd3cbf273ccc719de0c527ead3 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 05:53:05 -0600
Subject: [PATCH 5/8] feat(bench): make Strategy a first-class, OPEN
 abstraction (author your own)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The question: once the public vocabulary collapses to `sample`/`refine`, can a dev create
their OWN strategy? Before: no — runAgentic took mode:'depth'|'breadth', a CLOSED enum.
The capability existed internally (a strategy is just a driver Agent), but there was no
public way to pass one in.

Now: `Strategy` is an exported interface —
`{ readonly name: string; driver(surface, task, opts, budget): Agent }`. A strategy builds
the driver Agent the Supervisor runs; author your own by
returning an Agent whose act() spawns shots/analysts via scope.spawn/next/send. `refine`
and `sample` ship as instances AND the reference driver implementations (depthDriver/
breadthDriver) are exported to copy. runAgentic accepts a `strategy` (mode kept for
back-compat); runBenchmark takes `Strategy[]` — pass the built-ins or your own.

What's under the words:
  sample = K independent attempts, keep the best-verifying (best-of-N / resample)
  refine = attempt → observe() reads the trace → steer the next → repeat (iterate)
A multi-agent "team" is just a Strategy whose driver spawns several different agents —
same recursive Agent atom, coordinated over the Scope.
---
 bench/src/agentic.ts        | 47 ++++++++++++++++++++++++++++---------
 bench/src/run-benchmark.mts | 34 ++++++++++++++-------------
 2 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts
index 69b6a48..b46a72c 100644
--- a/bench/src/agentic.ts
+++ b/bench/src/agentic.ts
@@ -320,7 +320,7 @@ export interface AgenticRunResult {
 const perChild = (innerTurns: number): Budget => ({ maxIterations: innerTurns + 1, maxTokens: 1_000_000 })
 
 /** DEPTH: one persistent artifact, carried across analyst-steered shots. */
-function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent<unknown, Outcome<unknown>> {
+export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent<unknown, Outcome<unknown>> {
   const innerTurns = opts.innerTurns ?? 4
   let pendingSteer: string | undefined // analyst-derived steer carried between shots
   return {
@@ -366,7 +366,7 @@ function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOp
 }
 
 /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
-function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent<unknown, Outcome<unknown>> {
+export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent<unknown, Outcome<unknown>> {
   const innerTurns = opts.innerTurns ?? 4
   return {
     name: 'breadth',
@@ -395,26 +395,51 @@ function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: Agentic
   }
 }
 
+/**
+ * A Strategy is HOW you spend the compute budget to beat the Environment's check — it
+ * builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev
+ * authors their own by implementing `driver()` to return an Agent whose `act()` spawns
+ * shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are
+ * the reference implementations to copy:
+ *   sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample).
+ *   refine — attempt → observe() reads the trace → steer the next → repeat (iterate).
+ * (A multi-agent "team" is just a Strategy whose driver spawns several different agents.)
+ */
+export interface Strategy {
+  readonly name: string
+  driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>
+}
+
+export const sample: Strategy = {
+  name: 'sample',
+  driver: (surface, task, opts, budget) => breadthDriver(surface, task, opts, { width: budget }),
+}
+export const refine: Strategy = {
+  name: 'refine',
+  driver: (surface, task, opts, budget) => depthDriver(surface, task, opts, { maxShots: budget }),
+}
+
 export interface RunAgenticOptions extends AgenticOptions {
   surface: AgenticSurface
   task: AgenticTask
-  mode: 'depth' | 'breadth'
-  /** depth: max shots; breadth: rollout width. */
+  /** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */
+  strategy?: Strategy
+  /** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */
+  mode?: 'depth' | 'breadth'
+  /** budget: refine→max shots; sample→rollout width. */
   budget: number
   rootBudget?: Budget
 }
 
-/** Run the chosen driver through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
+/** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
 export async function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunResult> {
-  const driver =
-    opts.mode === 'depth'
-      ? depthDriver(opts.surface, opts.task, opts, { maxShots: opts.budget })
-      : breadthDriver(opts.surface, opts.task, opts, { width: opts.budget })
+  const strategy: Strategy = opts.strategy ?? (opts.mode === 'breadth' ? sample : refine)
+  const driver = strategy.driver(opts.surface, opts.task, opts, opts.budget)
   const supervisor = createSupervisor<unknown, Outcome<unknown>>()
   const root: Budget = opts.rootBudget ?? { maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2), maxTokens: 1_000_000_000 }
   const result = await supervisor.run(driver, undefined, {
     budget: root,
-    runId: `agentic:${opts.mode}:${opts.task.id}`,
+    runId: `agentic:${strategy.name}:${opts.task.id}`,
     journal: new InMemorySpawnJournal(),
     blobs: new InMemoryResultBlobStore(),
     executors: agenticRegistry(opts.surface, opts),
@@ -422,7 +447,7 @@ export async function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunRes
   })
   if (result.kind !== 'winner' || result.out.kind !== 'done') {
     const reason = result.kind === 'winner' ? `blocked: ${(result.out as { blockers?: string[] }).blockers?.join('; ')}` : `no-winner: ${result.reason}`
-    throw new Error(`runAgentic(${opts.mode}) produced no result — ${reason}`)
+    throw new Error(`runAgentic(${strategy.name}) produced no result — ${reason}`)
   }
   return result.out.deliverable as AgenticRunResult
 }
diff --git a/bench/src/run-benchmark.mts b/bench/src/run-benchmark.mts
index 17d6c8d..4027c05 100644
--- a/bench/src/run-benchmark.mts
+++ b/bench/src/run-benchmark.mts
@@ -16,16 +16,16 @@
  * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the
  * worker (the model), and can drop to runAgentic / the Supervisor for new strategies.
  */
-import { type AgenticOptions, type AgenticSurface, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticSurface, type AgenticTask, refine, runAgentic, sample, type Strategy } from './agentic'
 import { type PairedLift, pairedLift, pool } from './stats.mts'
 
 /** A checkable task domain — implement these 5 hooks and the suite does the rest. The
  *  same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
 export type Environment = AgenticSurface
 
-/** How to spend the compute budget to beat the Environment's check. */
-export type Strategy = 'sample' | 'refine'
-const modeForStrategy = { sample: 'breadth', refine: 'depth' } as const
+// Strategy is the OPEN extension point (re-exported from agentic): pass the built-ins or
+// author your own (implement Strategy.driver returning an Agent). See `refine`/`sample`.
+export { refine, sample, type Strategy } from './agentic'
 
 export interface BenchmarkConfig {
   /** The task domain (5 hooks). */
@@ -34,7 +34,8 @@ export interface BenchmarkConfig {
   tasks: AgenticTask[]
   /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
   worker: AgenticOptions
-  /** Which strategies to compare. Default: both. */
+  /** Which strategies to compare. Pass the built-ins (`refine`, `sample`) or your own.
+   *  Default: [sample, refine]. */
   strategies?: Strategy[]
   /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
   budget?: number
@@ -45,9 +46,9 @@ export interface BenchmarkConfig {
 export interface BenchmarkReport {
   n: number
   excluded: number
-  /** Mean verifier score per strategy (0..1). */
-  perStrategy: Partial<Record<Strategy, number>>
-  /** The headline: paired lift of refine over sample (present when both ran). */
+  /** Mean verifier score per strategy (keyed by strategy.name, 0..1). */
+  perStrategy: Record<string, number>
+  /** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */
   refineVsSample?: PairedLift
 }
 
@@ -55,16 +56,16 @@ export interface BenchmarkReport {
  *  and return the per-strategy means + the paired-bootstrap lift of refine over sample.
  *  Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */
 export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> {
-  const strategies = cfg.strategies ?? ['sample', 'refine']
+  const strategies = cfg.strategies ?? [sample, refine]
   const budget = cfg.budget ?? 3
   const concurrency = cfg.concurrency ?? 3
 
   const rows = await pool(cfg.tasks, concurrency, async (task) => {
-    const scores: Partial<Record<Strategy, number>> = {}
+    const scores: Record<string, number> = {}
     try {
       for (const s of strategies) {
-        const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, mode: modeForStrategy[s], budget })
-        scores[s] = r.score
+        const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget })
+        scores[s.name] = r.score
       }
       return scores
     } catch {
@@ -72,13 +73,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
     }
   })
 
-  const ok = rows.filter((r): r is Partial<Record<Strategy, number>> => r !== null)
+  const ok = rows.filter((r): r is Record<string, number> => r !== null)
   const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0)
-  const perStrategy: Partial<Record<Strategy, number>> = {}
-  for (const s of strategies) perStrategy[s] = mean(ok.map((r) => r[s] ?? 0))
+  const perStrategy: Record<string, number> = {}
+  for (const s of strategies) perStrategy[s.name] = mean(ok.map((r) => r[s.name] ?? 0))
 
   const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy }
-  if (strategies.includes('refine') && strategies.includes('sample')) {
+  const names = strategies.map((s) => s.name)
+  if (names.includes('refine') && names.includes('sample')) {
     report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0))
   }
   return report

From 1dfbfd6746daf4b4a72e9dae2a9ba3a6a27a2466 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 06:03:22 -0600
Subject: [PATCH 6/8] =?UTF-8?q?feat(bench):=20defineStrategy=20+=20composa?=
 =?UTF-8?q?ble=20steps=20=E2=80=94=20author=20a=20loop=20in=20~15=20lines?=
 =?UTF-8?q?=20(skillifiable)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original goal: loops compact enough to skillify, so agents author them. A 70-line
Supervisor driver isn't that. This adds the composable LEGO:

  defineStrategy(name, async ({ shot, critique, surface, budget }) => { ...compose... })

A strategy body gets two steps — shot() (one worker attempt over an artifact) and
critique() (the firewalled analyst reads the trace → a steer) — with ZERO Supervisor/
Scope/spawn/leaf/drainOne ceremony (all of it lives inside defineStrategy now). That is
the unit an agent or a skill can emit.

Proof: adaptiveRefine — a NEW strategy (refine, but ABANDON-and-restart when a steered
shot fails to improve = branch-when-stuck, the widen/MCTS idea the depth-stuck failure
motivated), authored entirely from the steps, scored keep-best. ~22 lines of pure
strategy logic, no plumbing.

Behavior-preserving: the proven refine/sample drivers (depthDriver/breadthDriver) are
UNTOUCHED — the +16.4pp result + GEPA stay valid. The steps replicate their exact
spawn/drain pattern, so a step-authored strategy behaves identically. Verified by typecheck
only so far; the live smoke run of adaptiveRefine is pending until the gym is free (the GEPA
run currently has it).
---
 bench/src/agentic.ts | 121 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 1 deletion(-)

diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts
index b46a72c..cb8eb7e 100644
--- a/bench/src/agentic.ts
+++ b/bench/src/agentic.ts
@@ -308,7 +308,8 @@ async function drainOne(scope: Scope<Outcome<unknown>>): Promise<Settled<Outcome
 // ── The result + the two drivers (domain-blind Agents run by the Supervisor) ─────
 
 export interface AgenticRunResult {
-  mode: 'depth' | 'breadth'
+  /** The strategy name (built-in 'depth'/'breadth' or a custom strategy's name). */
+  mode: string
   score: number
   resolved: boolean
   completions: number
@@ -419,6 +420,124 @@ export const refine: Strategy = {
   driver: (surface, task, opts, budget) => depthDriver(surface, task, opts, { maxShots: budget }),
 }
 
+// ── The composable LEGO: author a strategy in ~15 lines from two steps ───────────
+//
+// A strategy body gets `shot()` (run one worker attempt over an artifact) and
+// `critique()` (the firewalled analyst reads the trace → a steer). Compose them — no
+// Supervisor/Scope ceremony. This is the skillifiable unit: an agent can emit a
+// `defineStrategy(name, body)` of a few step-calls; it can't reliably emit a 70-line
+// driver. (depthDriver/breadthDriver are the hand-written reference impls; refine/sample
+// stay on them — proven — while NEW strategies are authored compactly here.)
+
+export interface ShotSpec {
+  /** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */
+  handle?: ArtifactHandle
+  messages?: Msg[]
+  steer?: string
+}
+export interface StrategyResult {
+  score: number
+  resolved: boolean
+  completions: number
+  progression: number[]
+  shots: number
+}
+/** What a strategy body composes with: the domain surface, the task, the options, the budget, the scope, and the two steps. */
+export interface StrategyCtx {
+  readonly surface: AgenticSurface
+  readonly task: AgenticTask
+  readonly opts: AgenticOptions
+  readonly budget: number
+  readonly scope: Scope<Outcome<unknown>>
+  /** Run ONE worker shot; resolves to its scored result, or null if the spawn was refused or the shot went down. */
+  shot(spec?: ShotSpec): Promise<ShotResult | null>
+  /** The firewalled critic reads the trajectory → a steer string; null if the spawn was refused, the analyst went down, or its findings start with COMPLETE. */
+  critique(messages: Msg[]): Promise<string | null>
+}
+
+/** Author a Strategy from the composable steps: `run` receives a StrategyCtx, and the StrategyResult it returns becomes the 'done' deliverable tagged with `mode: name`. */
+export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy {
+  return {
+    name,
+    driver: (surface, task, opts, budget) => ({
+      name,
+      async act(_t, scope): Promise<Outcome<unknown>> {
+        let seq = 0 // shared by shot() and critique(), so every spawned child gets a distinct name
+        const innerTurns = opts.innerTurns ?? 4 // each shot child is budgeted perChild(innerTurns); 4 when unset
+        const ctx: StrategyCtx = {
+          surface,
+          task,
+          opts,
+          budget,
+          scope,
+          async shot(spec) {
+            const child = leaf(`shot:${seq}`, 'shot')
+            seq += 1
+            const res = scope.spawn(child, { task, handle: spec?.handle, messages: spec?.messages, steer: spec?.steer } as ShotTask, { budget: perChild(innerTurns), label: child.name })
+            if (!res.ok) return null // the spawn was refused: report it like a downed shot
+            const settled = await drainOne(scope) // NOTE(review): presumably yields the child just spawned because steps are awaited one at a time — confirm before running steps concurrently
+            return settled.kind === 'down' ? null : (settled.out as unknown as ShotResult)
+          },
+          async critique(messages) {
+            const child = leaf(`analyst:${seq}`, 'analyst')
+            seq += 1
+            const res = scope.spawn(child, { task, messages }, { budget: perChild(1), label: child.name })
+            if (!res.ok) return null // the spawn was refused: no steer available
+            const settled = await drainOne(scope)
+            if (settled.kind === 'down') return null
+            const findings = settled.out as unknown as string
+            return /^\s*COMPLETE\b/i.test(findings) ? null : findings // findings that begin with COMPLETE (case-insensitive) yield no steer
+          },
+        }
+        const r = await run(ctx)
+        return { kind: 'done', deliverable: { mode: name, ...r } }
+      },
+    }),
+  }
+}
+
+/** A NEW strategy, authored from the steps: refine, but when a shot fails to beat the
+ *  best score seen so far it ABANDONS that line and restarts on a fresh artifact
+ *  (branch-when-stuck) — the widen/MCTS idea the depth-stuck failure motivated. Scored
+ *  keep-best (the max score across all lines); `shots` is the number of scored shots.
+ *  Stops early on a perfect score, a downed shot, or a critic that returns no steer. */
+export const adaptiveRefine = defineStrategy('adaptiveRefine', async ({ surface, task, budget, shot, critique }) => {
+  let handle = await surface.open(task)
+  const progression: number[] = []
+  let messages: Msg[] | undefined
+  let steer: string | undefined
+  let completions = 0
+  let best = -1 // -1 so the first shot always registers as an improvement (assumes scores are never negative)
+  // `shots` is reported as progression.length so the shot that ends the loop is counted too.
+  try {
+    for (let i = 0; i < budget; i += 1) {
+      const out = await shot({ handle, messages, steer })
+      if (!out) break
+      completions += out.completions
+      progression.push(out.score)
+      if (out.score >= 1) break
+      if (out.score <= best) {
+        // Stuck: steering isn't improving this line — abandon it, restart fresh.
+        await surface.close(handle)
+        handle = await surface.open(task)
+        messages = undefined
+        steer = undefined
+        continue
+      }
+      best = out.score
+      messages = out.messages
+      const findings = await critique(out.messages)
+      completions += 1 // the critic call counts as one completion
+      if (!findings) break
+      steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`
+    }
+    const score = progression.length ? Math.max(...progression) : 0
+    return { score, resolved: score >= 1, completions, progression, shots: progression.length }
+  } finally {
+    await surface.close(handle)
+  }
+})
+
 export interface RunAgenticOptions extends AgenticOptions {
   surface: AgenticSurface
   task: AgenticTask

From ab137984f9422104d98433a268be7707f7931988 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 06:15:59 -0600
Subject: [PATCH 7/8] =?UTF-8?q?docs(bench):=20strategy-demo=20example=20?=
 =?UTF-8?q?=E2=80=94=20the=20optimization=20suite=20in=203=20layers=20(gym?=
 =?UTF-8?q?-free,=20runnable)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The missing onboarding piece: a runnable demo of the whole suite on a toy "counter"
Environment (needs only a router key — no dataset, no sandbox). Shows all three layers:
  1. runBenchmark(env, …) — default strategies compared, free.
  2. strategies: [sample, refine, adaptiveRefine] — pick, named by behavior.
  3. defineStrategy('doubleCheck', body) — author your own in ~10 lines from shot()+critique(),
     zero Supervisor ceremony. The skillifiable unit.
Verified: runs end-to-end through the canonical Supervisor; all 4 strategies execute and
score via the Environment's own check. README documents the model + the customization hooks.
---
 bench/src/examples/README.md         |  46 ++++++++++
 bench/src/examples/strategy-demo.mts | 120 +++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 bench/src/examples/README.md
 create mode 100644 bench/src/examples/strategy-demo.mts

diff --git a/bench/src/examples/README.md b/bench/src/examples/README.md
new file mode 100644
index 0000000..edf735e
--- /dev/null
+++ b/bench/src/examples/README.md
@@ -0,0 +1,46 @@
+# Strategy demo — the optimization suite in three layers
+
+`strategy-demo.mts` is the smallest end-to-end demonstration of the optimization suite.
+It runs on a toy "counter" `Environment` so it needs only a router key — no benchmark
+dataset, no sandbox, no gym.
+
+```
+dotenvx run -f …/.env.keys -- env WORKER_MODEL=gpt-4o-mini tsx src/examples/strategy-demo.mts
+```
+
+## The model
+
+You have a **task**, a deployable **check**, and a compute **budget**. A *strategy* is
+**how you spend the budget to beat the check**. You implement an `Environment` (5 hooks)
+and get the strategies compared, scored by your own check, for free.
+
+## The three layers (each is a few lines in the demo)
+
+1. **Just run it** — `runBenchmark({ environment, tasks, worker })` compares the default
+   strategies and reports the paired lift. Black box; no vocabulary needed.
+
+2. **Pick strategies** — pass `strategies: [sample, refine, adaptiveRefine]`. Named by
+   what they *do*:
+   - **`sample`** — N independent attempts, keep the best-verifying (best-of-N / resample).
+   - **`refine`** — attempt → a critic reads the trace → steer the next → repeat (iterate).
+   - **`adaptiveRefine`** — refine, but abandon-and-restart a line that stops improving
+     (branch-when-stuck).
+
+3. **Author your own** — `defineStrategy(name, body)`. A strategy body composes two steps
+   — `shot()` (one worker attempt over an artifact) and `critique()` (the firewalled
+   analyst reads the trace → a steer) — with **zero** Supervisor/Scope ceremony. The demo
+   authors `doubleCheck` inline in ~10 lines. This is the unit a skill (or an agent) emits.
+
+## The hooks you customize (world-class-DX surface)
+
+- **the check / verifier** → `Environment.score` (your deployable success criterion)
+- **the critic / steerer** → `worker.analystInstruction` (the analyst prompt; GEPA tunes this)
+- **the worker** → the model (`worker.model`)
+- **the strategy** → `defineStrategy` (or drop to `runAgentic` / the Supervisor for novel topologies)
+
+## Where the real results live
+
+On a trivial task all strategies tie. The differences (e.g. refine/adaptiveRefine beating
+sample on stateful agentic work, +16.4pp on EnterpriseOps-Gym) show on real domains — see
+`bench/HARNESS.md` and `bench/src/agentic-run.mts` (the EOPS Environment), and
+`bench/src/eops-gepa.mts` (GEPA evolving the analyst/critic prompt against the check).
diff --git a/bench/src/examples/strategy-demo.mts b/bench/src/examples/strategy-demo.mts
new file mode 100644
index 0000000..63edcb0
--- /dev/null
+++ b/bench/src/examples/strategy-demo.mts
@@ -0,0 +1,120 @@
+/**
+ * Strategy demo — the optimization suite in three layers, on a toy Environment (no gym).
+ *
+ * The whole idea in one file: you implement an `Environment` (5 hooks: open/tools/call/
+ * score/close), and you get optimization STRATEGIES — sample (best-of-N), refine
+ * (iterate-with-feedback), and any you author — compared and scored by your own check,
+ * for free. This uses a trivial "counter" environment so it runs with just a router key
+ * (no benchmark dataset, no sandbox).
+ *
+ *   dotenvx run -f …/.env.keys -- env WORKER_MODEL=gpt-4o-mini tsx src/examples/strategy-demo.mts
+ *
+ * The three layers shown below:
+ *   1. just run it     — runBenchmark(env, …) compares the default strategies, free.
+ *   2. pick strategies — pass [sample, refine, adaptiveRefine].
+ *   3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony.
+ */
+import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic'
+import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts'
+
+// ── 1. Implement an Environment (the only thing a new domain writes) ──────────────
+// A toy: the agent must drive a counter to exactly the target using the increment tool.
+// score = how close it got. This is the seam every real benchmark (EOPS, a coding repo,
+// a browser task) implements the same way — open a checkable artifact, expose tools,
+// score it. Here the "artifact" is just an in-memory counter.
+
+const target = 5
+const counters = new Map<string, { count: number }>()
+
+const counterEnv: Environment = {
+  name: 'counter',
+  async open(_task) {
+    const id = `counter-${Math.random().toString(36).slice(2, 8)}`
+    counters.set(id, { count: 0 }) // every artifact is its own counter, starting at 0
+    return { id, surface: 'counter' } satisfies ArtifactHandle
+  },
+  async tools() {
+    return [
+      { type: 'function', function: { name: 'increment', description: 'Add 1 to the counter.', parameters: { type: 'object', properties: {} } } },
+      { type: 'function', function: { name: 'read_count', description: 'Read the current counter value.', parameters: { type: 'object', properties: {} } } },
+    ]
+  },
+  async call(handle, name) {
+    const c = counters.get(handle.id)
+    if (!c) return 'ERROR: no such counter'
+    if (name === 'increment') {
+      c.count += 1
+      return `count is now ${c.count}`
+    }
+    if (name === 'read_count') return `count is ${c.count}`
+    return `ERROR: unknown tool ${name}`
+  },
+  // The deployable CHECK: closeness to EXACTLY the target — overshoot loses credit like undershoot. score = passes/total.
+  async score(_task, handle) {
+    const c = counters.get(handle.id)
+    const count = c?.count ?? 0 // a counter that no longer exists scores as 0
+    return { passes: Math.max(0, target - Math.abs(target - count)), total: target, errored: 0 }
+  },
+  async close(handle) {
+    counters.delete(handle.id)
+  },
+}
+
+const task: AgenticTask = {
+  id: 'counter-to-5',
+  systemPrompt: 'You operate a counter with tools.',
+  userPrompt: `Use the increment tool to bring the counter to exactly ${target}. Use read_count to verify before you finish. Reply DONE when the count equals ${target}.`,
+}
+
+// ── 3. Author your OWN strategy in ~10 lines — the lego (no Supervisor ceremony) ──
+// "doubleCheck": attempt → critique → steer the next attempt on the SAME artifact, repeated up to the budget (stops early on a perfect score or when the critic has no findings).
+// A strategy body composes two steps: shot() (one worker attempt) + critique() (the
+// firewalled analyst → a steer). That's it. This is the skillifiable unit.
+const doubleCheck = defineStrategy('doubleCheck', async ({ surface, task: t, budget, shot, critique }) => {
+  const handle = await surface.open(t)
+  const progression: number[] = []
+  let messages: Record<string, unknown>[] | undefined
+  let steer: string | undefined
+  let completions = 0
+  try {
+    for (let i = 0; i < budget; i += 1) {
+      const out = await shot({ handle, messages, steer })
+      if (!out) break
+      completions += out.completions
+      progression.push(out.score)
+      if (out.score >= 1) break
+      messages = out.messages
+      const findings = await critique(out.messages)
+      completions += 1
+      if (!findings) break
+      steer = `Not done yet. ${findings}`
+    }
+    const score = progression.length ? Math.max(...progression) : 0
+    return { score, resolved: score >= 1, completions, progression, shots: progression.length }
+  } finally {
+    await surface.close(handle)
+  }
+})
+
+async function main(): Promise<void> {
+  const worker = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey: process.env.TANGLE_API_KEY ?? '',
+    model: process.env.WORKER_MODEL ?? 'gpt-4o-mini',
+    innerTurns: 6,
+  }
+  if (!worker.routerKey) throw new Error('set TANGLE_API_KEY (the worker calls the router)')
+
+  console.log('Layer 1 — just run it (default strategies, scored by the env\'s own check):')
+  printBenchmarkReport(await runBenchmark({ environment: counterEnv, tasks: [task], worker, budget: 3 }))
+
+  console.log('\nLayer 2+3 — pick the built-ins AND your own authored strategy:')
+  printBenchmarkReport(
+    await runBenchmark({ environment: counterEnv, tasks: [task], worker, budget: 3, strategies: [sample, refine, adaptiveRefine, doubleCheck] }),
+  )
+}
+
+main().catch((e) => {
+  console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
+  process.exit(1)
+})

From 658160bc0176739a49e87e2c2a890330c97b09de Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Tue, 9 Jun 2026 06:25:50 -0600
Subject: [PATCH 8/8] =?UTF-8?q?chore(examples):=20clearer=20names=20?=
 =?UTF-8?q?=E2=80=94=20drop=20the=20confusing=20`with-`=20prefix;=20clarif?=
 =?UTF-8?q?y=20intent?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Disciplined subset of the examples-naming audit (NOT the proposed 01-08 numbering /
.deprecated quarantine — that's churn for throwaway examples and the README already
orders them):
  with-knowledge-readiness → knowledge-gating   (`with-` read as an optional toggle)
  with-intelligence-export → intelligence-export (same)
  agent-into-reviewer      → pipe-into-reviewer  (signals the 2-runtime piping)
KEPT runtime-run (it teaches startRuntimeRun — the name matches the product API) and
agents-of-all-shapes (memorable + has a test). git mv preserves history; README +
docs/concepts + all internal self-references updated; zero stragglers.
---
 docs/concepts.md                                          | 2 +-
 examples/README.md                                        | 8 ++++----
 examples/agents-of-all-shapes/README.md                   | 2 +-
 examples/agents-of-all-shapes/shapes.ts                   | 2 +-
 .../README.md                                             | 4 ++--
 .../intelligence-export.ts}                               | 2 +-
 .../README.md                                             | 4 ++--
 .../knowledge-gating.ts}                                  | 2 +-
 .../{agent-into-reviewer => pipe-into-reviewer}/README.md | 2 +-
 .../pipe-into-reviewer.ts}                                | 2 +-
 10 files changed, 15 insertions(+), 15 deletions(-)
 rename examples/{with-intelligence-export => intelligence-export}/README.md (87%)
 rename examples/{with-intelligence-export/with-intelligence-export.ts => intelligence-export/intelligence-export.ts} (97%)
 rename examples/{with-knowledge-readiness => knowledge-gating}/README.md (91%)
 rename examples/{with-knowledge-readiness/with-knowledge-readiness.ts => knowledge-gating/knowledge-gating.ts} (96%)
 rename examples/{agent-into-reviewer => pipe-into-reviewer}/README.md (93%)
 rename examples/{agent-into-reviewer/agent-into-reviewer.ts => pipe-into-reviewer/pipe-into-reviewer.ts} (98%)

diff --git a/docs/concepts.md b/docs/concepts.md
index 3117b77..00ae5ec 100644
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -140,5 +140,5 @@ agents because nothing in this list is baked into it.
 2. `examples/sandbox-stream-backend/` — what streaming looks like.
 3. `examples/chat-handler/` — `handleChatTurn` — the centerpiece chat handler.
 4. `examples/runtime-run/` — the production-run row + cost ledger.
-5. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent.
+5. `examples/pipe-into-reviewer/` — pipe one runtime's stream into a reviewer agent.
 6. The `README.md` entry-point table — model resolution + every other primitive, one row each.
diff --git a/examples/README.md b/examples/README.md
index aa3e1bd..59c777d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,7 +9,7 @@ Every example imports from `@tangle-network/agent-runtime` (the same surface con
 | # | Example | One sentence |
 |---|---|---|
 | 1 | [`chat-handler/`](./chat-handler/) | `handleChatTurn` — the production chat turn lifecycle every product runs |
-| 2 | [`with-knowledge-readiness/`](./with-knowledge-readiness/) | Same chat handler + `requiredKnowledge` + `decideKnowledgeReadiness` gating |
+| 2 | [`knowledge-gating/`](./knowledge-gating/) | Same chat handler + `requiredKnowledge` + `decideKnowledgeReadiness` gating |
 | 3 | [`sanitized-telemetry-streaming/`](./sanitized-telemetry-streaming/) | Same chat handler + redaction-by-default telemetry collector |
 | 4 | [`runtime-run/`](./runtime-run/) | Same chat handler + `startRuntimeRun` + cost ledger persistence |
 
@@ -38,8 +38,8 @@ These were standalone examples in an earlier release. The patterns are now folde
 - [`openai-stream-backend/`](./openai-stream-backend/) — `createOpenAICompatibleBackend`
 - [`sse-stream/`](./sse-stream/) — SSE helpers for browser routes
 - [`sanitized-telemetry/`](./sanitized-telemetry/) — non-streaming counterpart to `sanitized-telemetry-streaming`
-- [`agent-into-reviewer/`](./agent-into-reviewer/) — pipe one runtime's stream into a reviewer agent (advanced 2-runtime topology)
-- [`with-intelligence-export/`](./with-intelligence-export/) — ship loop traces to Tangle Intelligence (`createOtelExporter` + raw OTLP) for failure-correlation + quality insights
+- [`pipe-into-reviewer/`](./pipe-into-reviewer/) — pipe one runtime's stream into a reviewer agent (advanced 2-runtime topology)
+- [`intelligence-export/`](./intelligence-export/) — ship loop traces to Tangle Intelligence (`createOtelExporter` + raw OTLP) for failure-correlation + quality insights
 
 ## Conventions
 
@@ -54,7 +54,7 @@ From the agent-runtime repo root, in the suggested learning order:
 ```bash
 # Start here
 pnpm tsx examples/chat-handler/chat-handler.ts
-pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts
+pnpm tsx examples/knowledge-gating/knowledge-gating.ts
 pnpm tsx examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts
 pnpm tsx examples/runtime-run/runtime-run.ts
 
diff --git a/examples/agents-of-all-shapes/README.md b/examples/agents-of-all-shapes/README.md
index cfdbc36..29631de 100644
--- a/examples/agents-of-all-shapes/README.md
+++ b/examples/agents-of-all-shapes/README.md
@@ -49,7 +49,7 @@ frameworks already emit them; you add `score`.
 
 | Shape | File | Live wiring |
 |---|---|---|
-| **Tangle runtime / router (tcloud)** | `shapes.ts` → `tangleRuntimeRuns` | `createOtelExporter` + `loopEventToOtelSpan` (see `examples/with-intelligence-export`) |
+| **Tangle runtime / router (tcloud)** | `shapes.ts` → `tangleRuntimeRuns` | `createOtelExporter` + `loopEventToOtelSpan` (see `examples/intelligence-export`) |
 | **OpenAI-compatible** (tcloud / OpenRouter / OpenAI / vLLM) | `shapes.ts` → `openAiCompatibleRuns` | any OpenAI client at the router's `baseURL`; emit a GenAI span per call |
 | **Mastra** | `shapes.ts` → `mastraRuns` | Mastra's native OTLP exporter → `${INTELLIGENCE_BASE}/v1/otlp/v1/traces` |
 | **Claude Agent SDK** | `shapes.ts` → `claudeAgentSdkRuns` | wrap `query()`, one GenAI span per turn from `msg.usage` |
diff --git a/examples/agents-of-all-shapes/shapes.ts b/examples/agents-of-all-shapes/shapes.ts
index 369da2c..4a7bd92 100644
--- a/examples/agents-of-all-shapes/shapes.ts
+++ b/examples/agents-of-all-shapes/shapes.ts
@@ -59,7 +59,7 @@ function batch(spec: BatchSpec): AgentRun[] {
  * 1. Tangle agent-runtime / router (tcloud).
  *
  * LIVE: agent-runtime already emits every loop event; ship them with the
- * built-in exporter (see `examples/with-intelligence-export`):
+ * built-in exporter (see `examples/intelligence-export`):
  *   const exporter = createOtelExporter({ endpoint, headers })
  *   for await (const e of runAgentTaskStream({ task, backend })) {
  *     exporter.exportSpan(loopEventToOtelSpan({ kind: e.type, runId, ... }, traceId))
diff --git a/examples/with-intelligence-export/README.md b/examples/intelligence-export/README.md
similarity index 87%
rename from examples/with-intelligence-export/README.md
rename to examples/intelligence-export/README.md
index b0ddf49..8dbd044 100644
--- a/examples/with-intelligence-export/README.md
+++ b/examples/intelligence-export/README.md
@@ -1,4 +1,4 @@
-# with-intelligence-export
+# intelligence-export
 
 Ship agent-runtime traces to **Tangle Intelligence** and get back insights:
 failure correlations (relative risk + p-value), latency percentiles, and an
@@ -18,5 +18,5 @@ The tenant is resolved from the Bearer key, never the payload. Read insights
 back from the dashboard or `GET /v1/insights/outputs?kind=report`.
 
 ```bash
-TANGLE_API_KEY=sk-tan-... npx tsx examples/with-intelligence-export/with-intelligence-export.ts
+TANGLE_API_KEY=sk-tan-... npx tsx examples/intelligence-export/intelligence-export.ts
 ```
diff --git a/examples/with-intelligence-export/with-intelligence-export.ts b/examples/intelligence-export/intelligence-export.ts
similarity index 97%
rename from examples/with-intelligence-export/with-intelligence-export.ts
rename to examples/intelligence-export/intelligence-export.ts
index 4848c4f..5ea363d 100644
--- a/examples/with-intelligence-export/with-intelligence-export.ts
+++ b/examples/intelligence-export/intelligence-export.ts
@@ -18,7 +18,7 @@
  * Read insights back from the dashboard or `GET /v1/insights/outputs` with
  * the same key. Tenant resolves from the Bearer key, never the payload.
  *
- * Run: TANGLE_API_KEY=sk-tan-... npx tsx examples/with-intelligence-export/with-intelligence-export.ts
+ * Run: TANGLE_API_KEY=sk-tan-... npx tsx examples/intelligence-export/intelligence-export.ts
  */
 import {
   type AgentBackendInput,
diff --git a/examples/with-knowledge-readiness/README.md b/examples/knowledge-gating/README.md
similarity index 91%
rename from examples/with-knowledge-readiness/README.md
rename to examples/knowledge-gating/README.md
index 74a2ce3..3f434e9 100644
--- a/examples/with-knowledge-readiness/README.md
+++ b/examples/knowledge-gating/README.md
@@ -1,4 +1,4 @@
-# with-knowledge-readiness
+# knowledge-gating
 
 A task that declares required knowledge. The runtime scores readiness
 before running the control loop and stops if a blocking requirement is
@@ -9,7 +9,7 @@ instead of failing the run.
 ## Run
 
 ```bash
-pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts
+pnpm tsx examples/knowledge-gating/knowledge-gating.ts
 ```
 
 ## What it shows
diff --git a/examples/with-knowledge-readiness/with-knowledge-readiness.ts b/examples/knowledge-gating/knowledge-gating.ts
similarity index 96%
rename from examples/with-knowledge-readiness/with-knowledge-readiness.ts
rename to examples/knowledge-gating/knowledge-gating.ts
index 7414ccf..1762897 100644
--- a/examples/with-knowledge-readiness/with-knowledge-readiness.ts
+++ b/examples/knowledge-gating/knowledge-gating.ts
@@ -3,7 +3,7 @@
  * control loop runs and gates the task on readiness.
  *
  * Run with:
- *   pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts
+ *   pnpm tsx examples/knowledge-gating/knowledge-gating.ts
  */
 
 import type { KnowledgeRequirement } from '@tangle-network/agent-eval'
diff --git a/examples/agent-into-reviewer/README.md b/examples/pipe-into-reviewer/README.md
similarity index 93%
rename from examples/agent-into-reviewer/README.md
rename to examples/pipe-into-reviewer/README.md
index 9e34505..e4c48d1 100644
--- a/examples/agent-into-reviewer/README.md
+++ b/examples/pipe-into-reviewer/README.md
@@ -19,5 +19,5 @@ task, backend, input })` with the same shape — an async iterable of
 events. The reviewer adapter is unchanged.
 
 ```bash
-pnpm tsx examples/agent-into-reviewer/agent-into-reviewer.ts
+pnpm tsx examples/pipe-into-reviewer/pipe-into-reviewer.ts
 ```
diff --git a/examples/agent-into-reviewer/agent-into-reviewer.ts b/examples/pipe-into-reviewer/pipe-into-reviewer.ts
similarity index 98%
rename from examples/agent-into-reviewer/agent-into-reviewer.ts
rename to examples/pipe-into-reviewer/pipe-into-reviewer.ts
index 3e0ec53..2afa8c1 100644
--- a/examples/agent-into-reviewer/agent-into-reviewer.ts
+++ b/examples/pipe-into-reviewer/pipe-into-reviewer.ts
@@ -10,7 +10,7 @@
  * other agent-runtime call uses.
  *
  * Run with:
- *   pnpm tsx examples/agent-into-reviewer/agent-into-reviewer.ts
+ *   pnpm tsx examples/pipe-into-reviewer/pipe-into-reviewer.ts
  */
 
 import type { AgentAdapter } from '@tangle-network/agent-runtime'