tangle-network · drewstone · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/bench/src/agentic-eops.ts b/bench/src/agentic-eops.ts
@@ -7,7 +7,7 @@
  * AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change.
  */
 
-import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
 import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent'
 
 interface EopsMeta {

diff --git a/bench/src/agentic-run.mts b/bench/src/agentic-run.mts
@@ -7,7 +7,7 @@
  *   TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts
  */
 
-import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticTask, runAgentic } from '@tangle-network/agent-runtime/loops'
 import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
 
 const must = (k: string): string => {

diff --git a/bench/src/eops-corpus-ab.mts b/bench/src/eops-corpus-ab.mts
@@ -0,0 +1,149 @@
+/**
+ * The corpus flywheel A/B — primed-vs-cold at equal compute. THE across-run experiment
+ * (docs/research/layer-across-run.md): does run N+1 improve because the system learned
+ * from run N?
+ *
+ * Two arms over the SAME task stream, same order, canonical depth (Supervisor + observe):
+ *   cold    — every run fresh (the baseline as measured at +16.4pp vs breadth).
+ *   primed  — before each run, query the corpus (trace-derived facts accumulated by the
+ *             analyst's observe() pass on PRIOR runs) and fold the top-k into the task's
+ *             systemPrompt; after each run, observe() appends new facts. Zero extra LLM
+ *             calls (the analyst already runs); priming is prompt text — equal compute.
+ *
+ * Reported: per-position scores, paired lift, the SLOPE (first-half vs second-half lift —
+ * the flywheel signature is a GROWING advantage), fact uptake counts, and a frozen
+ * HOLDOUT: a disjoint slice run primed-from-the-accumulated-corpus vs cold (does the
+ * learned knowledge transfer to fresh tasks?).
+ *
+ * Falsifiers designed in (layer-across-run.md): context pollution (cap k, report dose),
+ * stale/instance facts (the gym DB resets per task — only PROCEDURAL facts can help),
+ * judge leakage (observe() is structurally trace-only), worker disregard (uptake column).
+ *
+ *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
+ *   EOPS_GYM_DBS_DIR=… TANGLE_API_KEY=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts
+ */
+import { type AgenticOptions, type AgenticTask, FileCorpus, runAgentic } from '@tangle-network/agent-runtime/loops'
+import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
+import { type PairedLift, pairedLift } from './stats.mts'
+
+/** Read a required environment variable; throws when it is unset or empty. */
+function must(name: string): string {
+  const value = process.env[name]
+  if (value) return value
+  throw new Error(`env ${name} is required`)
+}
+
+/**
+ * Fetch up to `n` rows of the EnterpriseOps-Gym `itsm` split (config `oracle`) from the
+ * Hugging Face datasets-server, starting at `offset`, and convert each row with
+ * `eopsTaskFromRow`.
+ *
+ * @throws when the HTTP response is not ok.
+ */
+async function loadItsmTasks(n: number, offset = 0): Promise<AgenticTask[]> {
+  type Row = Parameters<typeof eopsTaskFromRow>[0]
+  const dataset = encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')
+  const response = await fetch(
+    `https://datasets-server.huggingface.co/rows?dataset=${dataset}&config=oracle&split=itsm&offset=${offset}&length=${n}`,
+  )
+  if (!response.ok) throw new Error(`EOPS HF rows HTTP ${response.status}`)
+  const payload = (await response.json()) as { rows?: Array<{ row: Row }> }
+  // A body without `rows` is treated as an empty page rather than an error.
+  const entries = payload.rows ?? []
+  return entries.slice(0, n).map((entry) => eopsTaskFromRow(entry.row))
+}
+
+/** Tags passed as `corpusTags` when the primed arm hands the corpus to runAgentic. */
+const tags = ['eops', 'itsm', 'corpus-ab']
+/** Fraction → whole-number percent string, e.g. 0.5 → "50%". */
+const pct = (x: number) => {
+  const percent = (x * 100).toFixed(0)
+  return `${percent}%`
+}
+/** Signed fraction → percentage-point string with one decimal, e.g. 0.164 → "+16.4pp". */
+const pp = (x: number) => {
+  const sign = x >= 0 ? '+' : ''
+  return `${sign}${(x * 100).toFixed(1)}pp`
+}
+
+/**
+ * Run the primed-vs-cold A/B over the ITSM task stream, then the frozen holdout, and
+ * print the report to stderr.
+ *
+ * Configuration comes from the environment: N, HOLDOUT, K_FACTS, MAXSHOTS (or MAX_SHOTS),
+ * INNER_TURNS, WORKER_MODEL, CORPUS, ROUTER_BASE, plus the required TANGLE_API_KEY and
+ * EOPS_GYM_DBS_DIR.
+ *
+ * @throws when a required variable is missing, a numeric variable is not a non-negative
+ *   integer, or the task rows cannot be fetched.
+ */
+async function main(): Promise<void> {
+  /** First variable among `names` that is set, parsed as a non-negative integer; else `fallback`. */
+  const envInt = (fallback: number, ...names: string[]): number => {
+    for (const name of names) {
+      const raw = process.env[name]
+      if (raw === undefined) continue
+      const parsed = Number(raw)
+      // Fail fast: a typo would otherwise become NaN and flow silently into budgets and counts.
+      if (!Number.isInteger(parsed) || parsed < 0) {
+        throw new Error(`env ${name} must be a non-negative integer, got "${raw}"`)
+      }
+      return parsed
+    }
+    return fallback
+  }
+
+  const n = envInt(16, 'N')
+  const holdoutN = envInt(4, 'HOLDOUT')
+  const kFacts = envInt(3, 'K_FACTS')
+  // MAXSHOTS is this script's original name and wins when set; MAX_SHOTS is the spelling
+  // the agentic-run.mts usage line documents, accepted here as a fallback.
+  const maxShots = envInt(3, 'MAXSHOTS', 'MAX_SHOTS')
+  const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro'
+  const corpusPath = process.env.CORPUS ?? `/tmp/eops-corpus-ab-${Date.now()}.jsonl`
+  const opts: AgenticOptions = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey: must('TANGLE_API_KEY'),
+    model,
+    innerTurns: envInt(4, 'INNER_TURNS'),
+    temperature: 0.7,
+  }
+  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
+  const corpus = new FileCorpus(corpusPath)
+
+  const stream = await loadItsmTasks(n)
+  console.error(`=== corpus A/B · primed-vs-cold · stream n=${stream.length} + holdout ${holdoutN} · ${model} · k=${kFacts} facts ===`)
+  console.error(`    corpus: ${corpusPath}\n`)
+
+  /** Top-k trace-derived facts → a prime block for the worker's system prompt. */
+  async function primeBlock(): Promise<{ text: string; count: number }> {
+    const facts = await corpus.query({ tags: ['audience:agent'], limit: kFacts })
+    if (facts.length === 0) return { text: '', count: 0 }
+    const lines = facts.map((f) => `- ${f.claim}${f.rationale ? ` (${f.rationale.slice(0, 120)})` : ''}`)
+    return {
+      text: `\n\nLEARNINGS FROM PRIOR RUNS (apply where relevant):\n${lines.join('\n')}`,
+      count: facts.length,
+    }
+  }
+
+  /**
+   * One arm of one task. A cold run never touches the corpus. A primed run folds the
+   * prime block into the system prompt; with `writeCorpus` the corpus is also handed to
+   * runAgentic (the stream), without it the corpus is only queried (the holdout).
+   * A failed run is logged and returned as null so the caller can drop the pair.
+   */
+  async function runArm(
+    task: AgenticTask,
+    primed: boolean,
+    writeCorpus = true,
+  ): Promise<{ score: number; facts: number } | null> {
+    try {
+      if (!primed) {
+        const r = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots })
+        return { score: r.score, facts: 0 }
+      }
+      const prime = await primeBlock()
+      const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` }
+      const corpusOpts = writeCorpus ? { corpus, corpusTags: tags } : {}
+      const r = await runAgentic({ ...opts, ...corpusOpts, surface, task: primedTask, mode: 'depth', budget: maxShots })
+      return { score: r.score, facts: prime.count }
+    } catch (e) {
+      console.error(`     SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 60) : e})`)
+      return null
+    }
+  }
+
+  // The stream: per task, cold first (no corpus contact), then primed (reads + writes).
+  const rows: Array<{ cold: number; primed: number; facts: number }> = []
+  for (let i = 0; i < stream.length; i += 1) {
+    const task = stream[i] as AgenticTask
+    const cold = await runArm(task, false)
+    const primed = await runArm(task, true)
+    if (!cold || !primed) continue
+    rows.push({ cold: cold.score, primed: primed.score, facts: primed.facts })
+    console.error(`  [${i + 1}/${stream.length}] ${task.id.slice(-12)}: cold ${pct(cold.score)}  primed ${pct(primed.score)}  (facts injected: ${primed.facts})`)
+  }
+
+  const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0)
+  // Same >=2-pair floor the holdout applies before bootstrapping.
+  const lift = rows.length >= 2 ? pairedLift(rows.map((r) => r.cold), rows.map((r) => r.primed)) : undefined
+  const half = Math.floor(rows.length / 2)
+  const firstHalf = mean(rows.slice(0, half).map((r) => r.primed - r.cold))
+  const secondHalf = mean(rows.slice(half).map((r) => r.primed - r.cold))
+
+  // Frozen holdout: fresh tasks, primed from the ACCUMULATED corpus (read-only) vs cold.
+  let holdout: { lift: PairedLift; n: number } | undefined
+  if (holdoutN > 0) {
+    console.error(`\n▶ holdout (${holdoutN} disjoint tasks, corpus read-only)…`)
+    const htasks = await loadItsmTasks(holdoutN, stream.length)
+    const hrows: Array<{ cold: number; primed: number }> = []
+    for (const task of htasks) {
+      const cold = await runArm(task, false)
+      if (!cold) continue
+      // Read-only priming: query + inject, but do NOT pass the corpus (no writes).
+      // Failures are logged by runArm instead of being dropped silently.
+      const primed = await runArm(task, true, false)
+      if (!primed) continue
+      hrows.push({ cold: cold.score, primed: primed.score })
+      console.error(`   ${task.id.slice(-12)}: cold ${pct(cold.score)}  primed ${pct(primed.score)}`)
+    }
+    if (hrows.length >= 2) holdout = { lift: pairedLift(hrows.map((r) => r.cold), hrows.map((r) => r.primed)), n: hrows.length }
+  }
+
+  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.')
+  console.error(`\n${'='.repeat(74)}`)
+  console.error(`CORPUS A/B RESULT · stream n=${rows.length} · ${model} · k=${kFacts}`)
+  console.error('='.repeat(74))
+  console.error(`  cold   ${pct(mean(rows.map((r) => r.cold)))}    primed ${pct(mean(rows.map((r) => r.primed)))}`)
+  if (lift) {
+    console.error(`  primed − cold (paired, B=10000)  ${pp(lift.point)}  CI [${pp(lift.low)}, ${pp(lift.high)}]  disc=${lift.discordant}  ${sig(lift)}`)
+  } else {
+    console.error(`  primed − cold: n/a (need at least 2 completed pairs, got ${rows.length})`)
+  }
+  console.error(`  SLOPE: first-half lift ${pp(firstHalf)} → second-half lift ${pp(secondHalf)}  ${secondHalf > firstHalf ? '(growing — the flywheel signature)' : '(not growing)'}`)
+  if (holdout) console.error(`  HOLDOUT (${holdout.n} fresh tasks, accumulated corpus): ${pp(holdout.lift.point)}  CI [${pp(holdout.lift.low)}, ${pp(holdout.lift.high)}]  ${sig(holdout.lift)}`)
+  console.error(`  corpus facts accumulated: see ${corpusPath}`)
+}
+
+// Entry point: any failure escaping main() is reported on stderr and exits with status 1.
+main().catch((err: unknown) => {
+  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
+  console.error(`eops-corpus-ab: ${detail}`)
+  process.exit(1)
+})
diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts
@@ -18,8 +18,7 @@
  */
 import { readFileSync, writeFileSync } from 'node:fs'
 import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval'
-import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops'
-import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { type AgenticOptions, type AgenticTask, defaultAnalystInstruction, runAgentic } from '@tangle-network/agent-runtime/loops'
 import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
 import { type RouterConfig, routerChatWithUsage } from './router-client'
 

diff --git a/bench/src/examples/strategy-demo.mts b/bench/src/examples/strategy-demo.mts
@@ -14,8 +14,7 @@
  *   2. pick strategies — pass [sample, refine, adaptiveRefine].
  *   3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony.
  */
-import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic'
-import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts'
+import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, type Environment, printBenchmarkReport, refine, runBenchmark, sample } from '@tangle-network/agent-runtime/loops'
 
 // ── 1. Implement an Environment (the only thing a new domain writes) ──────────────
 // A toy: the agent must drive a counter to exactly the target using the increment tool.

diff --git a/src/runtime/index.ts b/src/runtime/index.ts
@@ -152,6 +152,14 @@ export type {
   WidenSpec,
 } from './personify/wave-types'
 export { reportLoopUsage, type UsageSink } from './report-usage'
+export {
+  type BenchmarkConfig,
+  type BenchmarkLift,
+  type BenchmarkReport,
+  type Environment,
+  printBenchmarkReport,
+  runBenchmark,
+} from './run-benchmark'
 export type { RunLoopOptions } from './run-loop'
 export { createSandboxForSpec, defaultSelectWinner, runLoop } from './run-loop'
 export { acquireSandbox } from './sandbox-acquire'
@@ -176,6 +184,31 @@ export {
   type SandboxRun,
   type TurnResult,
 } from './sandbox-run'
+// The optimization suite: a domain = an Environment (5 hooks); a Strategy = how the
+// budget is spent to beat its check. Built-ins `sample`/`refine`; author your own with
+// `defineStrategy` (compose shot() + critique(), zero Supervisor ceremony); compare
+// with runBenchmark. The depth/breadth drivers are the reference implementations.
+export {
+  type AgenticOptions,
+  type AgenticRunResult,
+  type AgenticSurface,
+  type AgenticTask,
+  type AgenticTool,
+  type ArtifactHandle,
+  adaptiveRefine,
+  breadthDriver,
+  defineStrategy,
+  depthDriver,
+  type RunAgenticOptions,
+  refine,
+  runAgentic,
+  type ShotSpec,
+  type Strategy,
+  type StrategyCtx,
+  type StrategyResult,
+  type SurfaceScore,
+  sample,
+} from './strategy'
 export {
   type BudgetPool,
   type BudgetReadout,

diff --git a/bench/src/run-benchmark.mts → src/runtime/run-benchmark.ts b/bench/src/run-benchmark.mts → src/runtime/run-benchmark.ts
@@ -1,32 +1,33 @@
 /**
  * runBenchmark — the packaged optimization suite. Define a domain by implementing an
- * Environment (open / tools / call / score / close); get the optimization strategies
- * compared, scored by your own deployable check, with a paired-bootstrap report — FREE.
+ * `Environment` (open / tools / call / score / close); get the optimization strategies
+ * compared, scored by your own deployable check, with a paired-bootstrap report — free.
  *
- * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An
- * optimization STRATEGY is how you spend the budget to beat the check. Two primitives:
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
+ * is how you spend the budget to beat the check. Two built-ins:
  *
- *   sample  — N independent attempts, keep the best-verifying one.   ("best-of-N" / resample)
+ *   sample  — N independent attempts, keep the best-verifying one.   (best-of-N / resample)
  *   refine  — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
  *
- * Both run at equal budget; the headline is the paired lift of refine over sample.
- * (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.)
- *
- * Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic
- * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the
- * worker (the model), and can drop to runAgentic / the Supervisor for new strategies.
+ * Both run at equal budget through the Supervisor's conserved pool; the headline is the
+ * paired lift of refine over sample. Author your own strategy with `defineStrategy`.
  */
-import { type AgenticOptions, type AgenticSurface, type AgenticTask, refine, runAgentic, sample, type Strategy } from './agentic'
-import { type PairedLift, pairedLift, pool } from './stats.mts'
+
+import { pairedBootstrap } from '@tangle-network/agent-eval'
+import {
+  type AgenticOptions,
+  type AgenticSurface,
+  type AgenticTask,
+  refine,
+  runAgentic,
+  type Strategy,
+  sample,
+} from './strategy'
 
 /** A checkable task domain — implement these 5 hooks and the suite does the rest. The
  *  same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
 export type Environment = AgenticSurface
 
-// Strategy is the OPEN extension point (re-exported from agentic): pass the built-ins or
-// author your own (implement Strategy.driver returning an Agent). See `refine`/`sample`.
-export { refine, sample, type Strategy } from './agentic'
-
 export interface BenchmarkConfig {
   /** The task domain (5 hooks). */
   environment: Environment
@@ -43,17 +44,47 @@ export interface BenchmarkConfig {
   concurrency?: number
 }
 
+/** Paired-bootstrap summary that runBenchmark copies from `pairedBootstrap(sample, refine)`. */
+export interface BenchmarkLift {
+  /** Mean of paired deltas (refine − sample). */
+  mean: number
+  /** Lower bound of the interval printed as `CI [low, high]`; above 0 is reported as "SIGNIF +". */
+  low: number
+  /** Upper bound of the interval printed as `CI [low, high]`; below 0 is reported as "SIGNIF -". */
+  high: number
+  /** `n` as returned by pairedBootstrap — presumably the number of task pairs; confirm against agent-eval. */
+  n: number
+}
+
 export interface BenchmarkReport {
   n: number
   excluded: number
   /** Mean verifier score per strategy (keyed by strategy.name, 0..1). */
   perStrategy: Record<string, number>
-  /** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */
-  refineVsSample?: PairedLift
+  /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
+  refineVsSample?: BenchmarkLift
 }
 
-/** Run the requested strategies over the tasks, scored by the Environment's own check,
- *  and return the per-strategy means + the paired-bootstrap lift of refine over sample.
+/**
+ * Bounded-concurrency map preserving order; a worker that throws resolves its slot to null.
+ *
+ * @param items  Inputs, mapped in order; `out[i]` corresponds to `items[i]`.
+ * @param limit  Maximum number of concurrent calls. Clamped to `1..items.length`; a NaN
+ *               limit falls back to 1 (serial) instead of spawning no workers at all.
+ * @param fn     Async mapper; a rejection or a `null` result both leave the slot `null`.
+ * @returns One entry per input, `null` where the mapper failed or returned `null`.
+ */
+async function pool<T, R>(
+  items: readonly T[],
+  limit: number,
+  fn: (item: T, i: number) => Promise<R | null>,
+): Promise<Array<R | null>> {
+  const out: Array<R | null> = new Array(items.length).fill(null)
+  // Math.min/Math.max propagate NaN and Array.from({ length: NaN }) is empty, so without
+  // this guard a NaN limit would run nothing and report every item as failed.
+  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit)
+  const width = Math.max(1, Math.min(requested, items.length))
+  let next = 0
+  const workers = Array.from({ length: width }, async () => {
+    // `next` is claimed synchronously before the first await, so no index is run twice.
+    while (next < items.length) {
+      const i = next
+      next += 1
+      try {
+        out[i] = await fn(items[i] as T, i)
+      } catch {
+        // Deliberate best-effort: the caller counts null slots as excluded tasks.
+        out[i] = null
+      }
+    }
+  })
+  await Promise.all(workers)
+  return out
+}
+
+/** Run the requested strategies over the tasks, scored by the Environment's own check.
  *  Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */
 export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> {
   const strategies = cfg.strategies ?? [sample, refine]
@@ -62,15 +93,17 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 
   const rows = await pool(cfg.tasks, concurrency, async (task) => {
     const scores: Record<string, number> = {}
-    try {
-      for (const s of strategies) {
-        const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget })
-        scores[s.name] = r.score
-      }
-      return scores
-    } catch {
-      return null // transient infra on this task — exclude it
+    for (const s of strategies) {
+      const r = await runAgentic({
+        ...cfg.worker,
+        surface: cfg.environment,
+        task,
+        strategy: s,
+        budget,
+      })
+      scores[s.name] = r.score
     }
+    return scores
   })
 
   const ok = rows.filter((r): r is Record<string, number> => r !== null)
@@ -80,8 +113,12 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 
   const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy }
   const names = strategies.map((s) => s.name)
-  if (names.includes('refine') && names.includes('sample')) {
-    report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0))
+  if (names.includes('refine') && names.includes('sample') && ok.length >= 2) {
+    const b = pairedBootstrap(
+      ok.map((r) => r.sample ?? 0),
+      ok.map((r) => r.refine ?? 0),
+    )
+    report.refineVsSample = { mean: b.mean, low: b.low, high: b.high, n: b.n }
   }
   return report
 }
@@ -90,11 +127,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
 export function printBenchmarkReport(report: BenchmarkReport): void {
   const pct = (x: number) => `${(x * 100).toFixed(1)}%`
   const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
-  console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`)
-  for (const [s, v] of Object.entries(report.perStrategy)) console.log(`  ${s.padEnd(8)} ${pct(v ?? 0)}`)
+  console.log(
+    `\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`,
+  )
+  for (const [s, v] of Object.entries(report.perStrategy))
+    console.log(`  ${s.padEnd(8)} ${pct(v ?? 0)}`)
   const l = report.refineVsSample
   if (l) {
     const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.'
-    console.log(`  refine − sample: ${pp(l.point)}  CI [${pp(l.low)}, ${pp(l.high)}]  (${sig})`)
+    console.log(`  refine − sample: ${pp(l.mean)}  CI [${pp(l.low)}, ${pp(l.high)}]  (${sig})`)
   }
 }