From 5f29f543605f7070e34728d0a2606c10941fd941 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 17:16:02 -0600 Subject: [PATCH 1/2] feat(bench): corpus flywheel wiring + the primed-vs-cold A/B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AgenticOptions gains corpus/corpusTags — the analyst's observe() pass now appends trace-derived facts (the flywheel write side) with zero extra LLM calls; priming (the read side) is the caller folding corpus.query() facts into the task systemPrompt. eops-corpus-ab.mts: THE across-run experiment (layer-across-run.md). Two arms over the same task stream, same order, canonical depth: cold (fresh every run) vs primed (query top-k facts before, observe-append after). Equal compute by construction. Reports paired lift, the SLOPE (first-half vs second-half — the flywheel signature is a growing advantage), fact-injection counts, and a frozen holdout (fresh tasks, corpus read-only). Smoke verified: run 1 wrote 3 facts, run 2 injected them. --- bench/src/agentic.ts | 16 +++- bench/src/eops-corpus-ab.mts | 150 +++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 bench/src/eops-corpus-ab.mts diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts index cb8eb7e..ed66544 100644 --- a/bench/src/agentic.ts +++ b/bench/src/agentic.ts @@ -30,6 +30,7 @@ import { } from '@tangle-network/agent-runtime/loops' import type { Agent, + Corpus, AgentSpec, Budget, ExecutorContext, @@ -92,6 +93,12 @@ export interface AgenticOptions { /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a * prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */ analystInstruction?: string + /** Across-run learning: when set, the analyst's observe() pass appends trace-derived + * facts here (the flywheel write side). Priming (the read side) is the caller's move — + * query the corpus and fold facts into the task's systemPrompt before runAgentic. */ + corpus?: Corpus + /** Tags written onto learned facts (and used by the caller's priming query). */ + corpusTags?: string[] } // ── The unit: one agentic shot (a bounded tool loop) over a handle ─────────────── @@ -189,8 +196,13 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions) .slice(0, 7000) const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model }) const obs = await observe( - { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed' }, - { chat, model: opts.model, ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}) }, + { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed', runId: task.id }, + { + chat, + model: opts.model, + ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}), + ...(opts.corpus ? { corpus: opts.corpus, tags: opts.corpusTags ?? [] } : {}), + }, ) // The steer = the analyst's recommended actions for the agent. Empty ⇒ nothing left to do. const steer = obs.findings diff --git a/bench/src/eops-corpus-ab.mts b/bench/src/eops-corpus-ab.mts new file mode 100644 index 0000000..78234c3 --- /dev/null +++ b/bench/src/eops-corpus-ab.mts @@ -0,0 +1,150 @@ +/** + * The corpus flywheel A/B — primed-vs-cold at equal compute. THE across-run experiment + * (docs/research/layer-across-run.md): does run N+1 improve because the system learned + * from run N? + * + * Two arms over the SAME task stream, same order, canonical depth (Supervisor + observe): + * cold — every run fresh (the baseline as measured at +16.4pp vs breadth). + * primed — before each run, query the corpus (trace-derived facts accumulated by the + * analyst's observe() pass on PRIOR runs) and fold the top-k into the task's + * systemPrompt; after each run, observe() appends new facts. Zero extra LLM + * calls (the analyst already runs); priming is prompt text — equal compute. + * + * Reported: per-position scores, paired lift, the SLOPE (first-half vs second-half lift — + * the flywheel signature is a GROWING advantage), fact uptake counts, and a frozen + * HOLDOUT: a disjoint slice run primed-from-the-accumulated-corpus vs cold (does the + * learned knowledge transfer to fresh tasks?). + * + * Falsifiers designed in (layer-across-run.md): context pollution (cap k, report dose), + * stale/instance facts (the gym DB resets per task — only PROCEDURAL facts can help), + * judge leakage (observe() is structurally trace-only), worker disregard (uptake column). + * + * docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest + * EOPS_GYM_DBS_DIR=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts + */ +import { FileCorpus } from '@tangle-network/agent-runtime/loops' +import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic' +import { createEopsSurface, eopsTaskFromRow } from './agentic-eops' +import { type PairedLift, pairedLift } from './stats.mts' + +function must(name: string): string { + const v = process.env[name] + if (!v) throw new Error(`env ${name} is required`) + return v +} + +async function loadItsmTasks(n: number, offset = 0): Promise { + const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')}&config=oracle&split=itsm&offset=${offset}&length=${n}` + const res = await fetch(url) + if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}`) + const body = (await res.json()) as { rows?: Array<{ row: Parameters[0] }> } + return (body.rows ?? []).slice(0, n).map(({ row }) => eopsTaskFromRow(row)) +} + +const tags = ['eops', 'itsm', 'corpus-ab'] +const pct = (x: number) => `${(x * 100).toFixed(0)}%` +const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` + +async function main(): Promise { + const n = Number(process.env.N ?? 16) + const holdoutN = Number(process.env.HOLDOUT ?? 4) + const kFacts = Number(process.env.K_FACTS ?? 3) + const maxShots = Number(process.env.MAXSHOTS ?? 3) + const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro' + const corpusPath = process.env.CORPUS ?? `/tmp/eops-corpus-ab-${Date.now()}.jsonl` + const opts: AgenticOptions = { + routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', + routerKey: must('TANGLE_API_KEY'), + model, + innerTurns: Number(process.env.INNER_TURNS ?? 4), + temperature: 0.7, + } + const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR')) + const corpus = new FileCorpus(corpusPath) + + const stream = await loadItsmTasks(n) + console.error(`=== corpus A/B · primed-vs-cold · stream n=${stream.length} + holdout ${holdoutN} · ${model} · k=${kFacts} facts ===`) + console.error(` corpus: ${corpusPath}\n`) + + /** Top-k trace-derived facts → a prime block for the worker's system prompt. */ + async function primeBlock(): Promise<{ text: string; count: number }> { + const facts = await corpus.query({ tags: ['audience:agent'], limit: kFacts }) + if (facts.length === 0) return { text: '', count: 0 } + const lines = facts.map((f) => `- ${f.claim}${f.rationale ? ` (${f.rationale.slice(0, 120)})` : ''}`) + return { + text: `\n\nLEARNINGS FROM PRIOR RUNS (apply where relevant):\n${lines.join('\n')}`, + count: facts.length, + } + } + + async function runArm(task: AgenticTask, primed: boolean): Promise<{ score: number; facts: number } | null> { + try { + if (!primed) { + const r = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots }) + return { score: r.score, facts: 0 } + } + const prime = await primeBlock() + const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` } + const r = await runAgentic({ ...opts, corpus, corpusTags: tags, surface, task: primedTask, mode: 'depth', budget: maxShots }) + return { score: r.score, facts: prime.count } + } catch (e) { + console.error(` SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 60) : e})`) + return null + } + } + + // The stream: per task, cold first (no corpus contact), then primed (reads + writes). + const rows: Array<{ cold: number; primed: number; facts: number }> = [] + for (let i = 0; i < stream.length; i += 1) { + const task = stream[i] as AgenticTask + const cold = await runArm(task, false) + const primed = await runArm(task, true) + if (!cold || !primed) continue + rows.push({ cold: cold.score, primed: primed.score, facts: primed.facts }) + console.error(` [${i + 1}/${stream.length}] ${task.id.slice(-12)}: cold ${pct(cold.score)} primed ${pct(primed.score)} (facts injected: ${primed.facts})`) + } + + const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0) + const lift = pairedLift(rows.map((r) => r.cold), rows.map((r) => r.primed)) + const half = Math.floor(rows.length / 2) + const firstHalf = mean(rows.slice(0, half).map((r) => r.primed - r.cold)) + const secondHalf = mean(rows.slice(half).map((r) => r.primed - r.cold)) + + // Frozen holdout: fresh tasks, primed from the ACCUMULATED corpus (read-only) vs cold. + let holdout: { lift: PairedLift; n: number } | undefined + if (holdoutN > 0) { + console.error(`\n▶ holdout (${holdoutN} disjoint tasks, corpus read-only)…`) + const htasks = await loadItsmTasks(holdoutN, stream.length) + const hrows: Array<{ cold: number; primed: number }> = [] + for (const task of htasks) { + const cold = await runArm(task, false) + if (!cold) continue + // read-only priming: query + inject, but do NOT pass the corpus (no writes). + try { + const prime = await primeBlock() + const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` } + const r = await runAgentic({ ...opts, surface, task: primedTask, mode: 'depth', budget: maxShots }) + hrows.push({ cold: cold.score, primed: r.score }) + console.error(` ${task.id.slice(-12)}: cold ${pct(cold.score)} primed ${pct(r.score)}`) + } catch { + /* skip */ + } + } + if (hrows.length >= 2) holdout = { lift: pairedLift(hrows.map((r) => r.cold), hrows.map((r) => r.primed)), n: hrows.length } + } + + const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.') + console.error(`\n${'='.repeat(74)}`) + console.error(`CORPUS A/B RESULT · stream n=${rows.length} · ${model} · k=${kFacts}`) + console.error('='.repeat(74)) + console.error(` cold ${pct(mean(rows.map((r) => r.cold)))} primed ${pct(mean(rows.map((r) => r.primed)))}`) + console.error(` primed − cold (paired, B=10000) ${pp(lift.point)} CI [${pp(lift.low)}, ${pp(lift.high)}] disc=${lift.discordant} ${sig(lift)}`) + console.error(` SLOPE: first-half lift ${pp(firstHalf)} → second-half lift ${pp(secondHalf)} ${secondHalf > firstHalf ? '(growing — the flywheel signature)' : '(not growing)'}`) + if (holdout) console.error(` HOLDOUT (${holdout.n} fresh tasks, accumulated corpus): ${pp(holdout.lift.point)} CI [${pp(holdout.lift.low)}, ${pp(holdout.lift.high)}] ${sig(holdout.lift)}`) + console.error(` corpus facts accumulated: see ${corpusPath}`) +} + +main().catch((e) => { + console.error(`eops-corpus-ab: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`) + process.exit(1) +}) From d879ac93bb2ecba27d10f14a0f8cbc2c95f61072 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 17:20:57 -0600 Subject: [PATCH 2/2] =?UTF-8?q?feat(runtime):=20publish=20the=20optimizati?= =?UTF-8?q?on=20suite=20=E2=80=94=20Environment/Strategy/defineStrategy/ru?= =?UTF-8?q?nBenchmark?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes packaging gap G1 (docs/research/product-integration-playbook.md): the suite graduates from bench/ R&D into the published package, importable by any product. - src/runtime/strategy.ts — the domain-blind core moved from bench/src/agentic.ts: the AgenticSurface seam (Environment), the canonical depth/breadth drivers (Supervisor + observe() — the +16.4pp configuration), open Strategy + sample/refine, defineStrategy with the shot()/critique() steps, adaptiveRefine, runAgentic, and the corpus flywheel threading (AgenticOptions.corpus → the analyst's observe() appends trace-derived facts). - src/runtime/run-benchmark.ts — runBenchmark/printBenchmarkReport over an Environment; the paired lift now uses agent-eval's pairedBootstrap (the substrate's stats) instead of a bench-local copy. - Exported via /loops. bench consumers (agentic-eops, agentic-run, eops-gepa, eops-corpus-ab, examples/strategy-demo) now import from the package; bench/src/ agentic.ts + run-benchmark.mts deleted (-1000 LOC of R&D duplication). Verified: typecheck clean both packages; 680 tests pass; strategy-demo runs all four strategies end-to-end through the published exports. --- bench/src/agentic-eops.ts | 2 +- bench/src/agentic-run.mts | 2 +- bench/src/eops-corpus-ab.mts | 3 +- bench/src/eops-gepa.mts | 3 +- bench/src/examples/strategy-demo.mts | 3 +- src/runtime/index.ts | 33 +++ .../runtime/run-benchmark.ts | 108 +++++--- .../src/agentic.ts => src/runtime/strategy.ts | 243 +++++++++++++----- 8 files changed, 287 insertions(+), 110 deletions(-) rename bench/src/run-benchmark.mts => src/runtime/run-benchmark.ts (53%) rename bench/src/agentic.ts => src/runtime/strategy.ts (79%) diff --git a/bench/src/agentic-eops.ts b/bench/src/agentic-eops.ts index 86c82b9..1cd7870 100644 --- a/bench/src/agentic-eops.ts +++ b/bench/src/agentic-eops.ts @@ -7,7 +7,7 @@ * AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change. */ -import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic' +import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops' import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent' interface EopsMeta { diff --git a/bench/src/agentic-run.mts b/bench/src/agentic-run.mts index d3a7e17..b5f4fba 100644 --- a/bench/src/agentic-run.mts +++ b/bench/src/agentic-run.mts @@ -7,7 +7,7 @@ * TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts */ -import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic' +import { type AgenticOptions, type AgenticTask, runAgentic } from '@tangle-network/agent-runtime/loops' import { createEopsSurface, eopsTaskFromRow } from './agentic-eops' const must = (k: string): string => { diff --git a/bench/src/eops-corpus-ab.mts b/bench/src/eops-corpus-ab.mts index 78234c3..2b48daa 100644 --- a/bench/src/eops-corpus-ab.mts +++ b/bench/src/eops-corpus-ab.mts @@ -22,8 +22,7 @@ * docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest * EOPS_GYM_DBS_DIR=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts */ -import { FileCorpus } from '@tangle-network/agent-runtime/loops' -import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic' +import { type AgenticOptions, type AgenticTask, FileCorpus, runAgentic } from '@tangle-network/agent-runtime/loops' import { createEopsSurface, eopsTaskFromRow } from './agentic-eops' import { type PairedLift, pairedLift } from './stats.mts' diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts index 2306e06..f0eb591 100644 --- a/bench/src/eops-gepa.mts +++ b/bench/src/eops-gepa.mts @@ -18,8 +18,7 @@ */ import { readFileSync, writeFileSync } from 'node:fs' import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval' -import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops' -import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic' +import { type AgenticOptions, type AgenticTask, defaultAnalystInstruction, runAgentic } from '@tangle-network/agent-runtime/loops' import { createEopsSurface, eopsTaskFromRow } from './agentic-eops' import { type RouterConfig, routerChatWithUsage } from './router-client' diff --git a/bench/src/examples/strategy-demo.mts b/bench/src/examples/strategy-demo.mts index 63edcb0..22e3fd5 100644 --- a/bench/src/examples/strategy-demo.mts +++ b/bench/src/examples/strategy-demo.mts @@ -14,8 +14,7 @@ * 2. pick strategies — pass [sample, refine, adaptiveRefine]. * 3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony. */ -import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic' -import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts' +import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, type Environment, printBenchmarkReport, refine, runBenchmark, sample } from '@tangle-network/agent-runtime/loops' // ── 1. Implement an Environment (the only thing a new domain writes) ────────────── // A toy: the agent must drive a counter to exactly the target using the increment tool. diff --git a/src/runtime/index.ts b/src/runtime/index.ts index 21e779f..c94d7a0 100644 --- a/src/runtime/index.ts +++ b/src/runtime/index.ts @@ -152,6 +152,14 @@ export type { WidenSpec, } from './personify/wave-types' export { reportLoopUsage, type UsageSink } from './report-usage' +export { + type BenchmarkConfig, + type BenchmarkLift, + type BenchmarkReport, + type Environment, + printBenchmarkReport, + runBenchmark, +} from './run-benchmark' export type { RunLoopOptions } from './run-loop' export { createSandboxForSpec, defaultSelectWinner, runLoop } from './run-loop' export { acquireSandbox } from './sandbox-acquire' @@ -176,6 +184,31 @@ export { type SandboxRun, type TurnResult, } from './sandbox-run' +// The optimization suite: a domain = an Environment (5 hooks); a Strategy = how the +// budget is spent to beat its check. Built-ins `sample`/`refine`; author your own with +// `defineStrategy` (compose shot() + critique(), zero Supervisor ceremony); compare +// with runBenchmark. The depth/breadth drivers are the reference implementations. +export { + type AgenticOptions, + type AgenticRunResult, + type AgenticSurface, + type AgenticTask, + type AgenticTool, + type ArtifactHandle, + adaptiveRefine, + breadthDriver, + defineStrategy, + depthDriver, + type RunAgenticOptions, + refine, + runAgentic, + type ShotSpec, + type Strategy, + type StrategyCtx, + type StrategyResult, + type SurfaceScore, + sample, +} from './strategy' export { type BudgetPool, type BudgetReadout, diff --git a/bench/src/run-benchmark.mts b/src/runtime/run-benchmark.ts similarity index 53% rename from bench/src/run-benchmark.mts rename to src/runtime/run-benchmark.ts index 4027c05..8dd8c9b 100644 --- a/bench/src/run-benchmark.mts +++ b/src/runtime/run-benchmark.ts @@ -1,32 +1,33 @@ /** * runBenchmark — the packaged optimization suite. Define a domain by implementing an - * Environment (open / tools / call / score / close); get the optimization strategies - * compared, scored by your own deployable check, with a paired-bootstrap report — FREE. + * `Environment` (open / tools / call / score / close); get the optimization strategies + * compared, scored by your own deployable check, with a paired-bootstrap report — free. * - * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An - * optimization STRATEGY is how you spend the budget to beat the check. Two primitives: + * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy + * is how you spend the budget to beat the check. Two built-ins: * - * sample — N independent attempts, keep the best-verifying one. ("best-of-N" / resample) + * sample — N independent attempts, keep the best-verifying one. (best-of-N / resample) * refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback) * - * Both run at equal budget; the headline is the paired lift of refine over sample. - * (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.) - * - * Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic - * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the - * worker (the model), and can drop to runAgentic / the Supervisor for new strategies. + * Both run at equal budget through the Supervisor's conserved pool; the headline is the + * paired lift of refine over sample. Author your own strategy with `defineStrategy`. */ -import { type AgenticOptions, type AgenticSurface, type AgenticTask, refine, runAgentic, sample, type Strategy } from './agentic' -import { type PairedLift, pairedLift, pool } from './stats.mts' + +import { pairedBootstrap } from '@tangle-network/agent-eval' +import { + type AgenticOptions, + type AgenticSurface, + type AgenticTask, + refine, + runAgentic, + type Strategy, + sample, +} from './strategy' /** A checkable task domain — implement these 5 hooks and the suite does the rest. The * same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */ export type Environment = AgenticSurface -// Strategy is the OPEN extension point (re-exported from agentic): pass the built-ins or -// author your own (implement Strategy.driver returning an Agent). See `refine`/`sample`. -export { refine, sample, type Strategy } from './agentic' - export interface BenchmarkConfig { /** The task domain (5 hooks). */ environment: Environment @@ -43,17 +44,47 @@ export interface BenchmarkConfig { concurrency?: number } +export interface BenchmarkLift { + /** Mean of paired deltas (refine − sample). */ + mean: number + low: number + high: number + n: number +} + export interface BenchmarkReport { n: number excluded: number /** Mean verifier score per strategy (keyed by strategy.name, 0..1). */ perStrategy: Record - /** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */ - refineVsSample?: PairedLift + /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */ + refineVsSample?: BenchmarkLift } -/** Run the requested strategies over the tasks, scored by the Environment's own check, - * and return the per-strategy means + the paired-bootstrap lift of refine over sample. +/** Bounded-concurrency map preserving order; a worker that throws resolves its slot to null. */ +async function pool( + items: readonly T[], + limit: number, + fn: (item: T, i: number) => Promise, +): Promise> { + const out: Array = new Array(items.length).fill(null) + let next = 0 + const workers = Array.from({ length: Math.max(1, Math.min(limit, items.length)) }, async () => { + while (next < items.length) { + const i = next + next += 1 + try { + out[i] = await fn(items[i] as T, i) + } catch { + out[i] = null + } + } + }) + await Promise.all(workers) + return out +} + +/** Run the requested strategies over the tasks, scored by the Environment's own check. * Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */ export async function runBenchmark(cfg: BenchmarkConfig): Promise { const strategies = cfg.strategies ?? [sample, refine] @@ -62,15 +93,17 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise { const scores: Record = {} - try { - for (const s of strategies) { - const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget }) - scores[s.name] = r.score - } - return scores - } catch { - return null // transient infra on this task — exclude it + for (const s of strategies) { + const r = await runAgentic({ + ...cfg.worker, + surface: cfg.environment, + task, + strategy: s, + budget, + }) + scores[s.name] = r.score } + return scores }) const ok = rows.filter((r): r is Record => r !== null) @@ -80,8 +113,12 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise s.name) - if (names.includes('refine') && names.includes('sample')) { - report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0)) + if (names.includes('refine') && names.includes('sample') && ok.length >= 2) { + const b = pairedBootstrap( + ok.map((r) => r.sample ?? 0), + ok.map((r) => r.refine ?? 0), + ) + report.refineVsSample = { mean: b.mean, low: b.low, high: b.high, n: b.n } } return report } @@ -90,11 +127,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise `${(x * 100).toFixed(1)}%` const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` - console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`) - for (const [s, v] of Object.entries(report.perStrategy)) console.log(` ${s.padEnd(8)} ${pct(v ?? 0)}`) + console.log( + `\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`, + ) + for (const [s, v] of Object.entries(report.perStrategy)) + console.log(` ${s.padEnd(8)} ${pct(v ?? 0)}`) const l = report.refineVsSample if (l) { const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.' - console.log(` refine − sample: ${pp(l.point)} CI [${pp(l.low)}, ${pp(l.high)}] (${sig})`) + console.log(` refine − sample: ${pp(l.mean)} CI [${pp(l.low)}, ${pp(l.high)}] (${sig})`) } } diff --git a/bench/src/agentic.ts b/src/runtime/strategy.ts similarity index 79% rename from bench/src/agentic.ts rename to src/runtime/strategy.ts index ed66544..98f0af1 100644 --- a/bench/src/agentic.ts +++ b/src/runtime/strategy.ts @@ -22,27 +22,24 @@ */ import { createChatClient } from '@tangle-network/agent-eval' -import { - createSupervisor, - InMemoryResultBlobStore, - InMemorySpawnJournal, - observe, -} from '@tangle-network/agent-runtime/loops' +import { InMemoryResultBlobStore, InMemorySpawnJournal } from '../durable/spawn-journal' +import { observe } from './observe' +import type { Outcome } from './personify/types' +import type { Corpus } from './personify/wave-types' +import { createSupervisor } from './supervise/supervisor' import type { Agent, - Corpus, AgentSpec, Budget, - ExecutorContext, - ExecutorRegistry, Executor, + ExecutorContext, ExecutorFactory, + ExecutorRegistry, ExecutorResult, - Outcome, Scope, Settled, Spend, -} from '@tangle-network/agent-runtime/loops' +} from './supervise/types' // ── The general surface seam (the only thing a new benchmark implements) ───────── @@ -132,7 +129,7 @@ const taskNudge = * `surface.call`, carrying `messages`. Returns the updated conversation + counts. */ async function runShot( surface: AgenticSurface, - task: AgenticTask, + _task: AgenticTask, handle: ArtifactHandle, tools: AgenticTool[], messages: Msg[], @@ -146,15 +143,27 @@ async function runShot( const res = await fetch(`${opts.routerBaseUrl.replace(/\/$/, '')}/chat/completions`, { method: 'POST', headers: { 'content-type': 'application/json', authorization: `Bearer ${opts.routerKey}` }, - body: JSON.stringify({ model: opts.model, messages, tools, tool_choice: 'auto', temperature: opts.temperature ?? 0.7 }), + body: JSON.stringify({ + model: opts.model, + messages, + tools, + tool_choice: 'auto', + temperature: opts.temperature ?? 0.7, + }), }) if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`) completions += 1 - const data = (await res.json()) as { choices?: Array<{ message?: { content?: string; tool_calls?: ToolCall[] } }> } + const data = (await res.json()) as { + choices?: Array<{ message?: { content?: string; tool_calls?: ToolCall[] } }> + } const msg = data.choices?.[0]?.message if (!msg) break const calls = msg.tool_calls ?? [] - messages.push({ role: 'assistant', content: msg.content ?? '', ...(calls.length ? { tool_calls: calls } : {}) }) + messages.push({ + role: 'assistant', + content: msg.content ?? '', + ...(calls.length ? { tool_calls: calls } : {}), + }) if (calls.length === 0) break for (const call of calls) { toolCalls += 1 @@ -189,14 +198,27 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions) .filter((m) => m.role === 'assistant' || m.role === 'tool') .map((m) => { if (m.role === 'tool') return `RESULT ${String(m.content).slice(0, 280)}` - const calls = (m.tool_calls as ToolCall[] | undefined)?.map((c) => `${c.function.name}(${c.function.arguments})`).join(', ') + const calls = (m.tool_calls as ToolCall[] | undefined) + ?.map((c) => `${c.function.name}(${c.function.arguments})`) + .join(', ') return calls ? `CALL ${calls}` : `SAY ${String(m.content).slice(0, 200)}` }) .join('\n') .slice(0, 7000) - const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model }) + const chat = createChatClient({ + transport: 'router', + apiKey: opts.routerKey, + baseUrl: opts.routerBaseUrl, + defaultModel: opts.model, + }) const obs = await observe( - { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed', runId: task.id }, + { + task: task.userPrompt, + output: trajectory, + trace: messages, + outcome: 'failed', + runId: task.id, + }, { chat, model: opts.model, @@ -224,7 +246,12 @@ interface ShotResult { toolErrors: number } -const spend = (iterations: number): Spend => ({ iterations, tokens: { input: 0, output: 0 }, usd: 0, ms: 0 }) +const spend = (iterations: number): Spend => ({ + iterations, + tokens: { input: 0, output: 0 }, + usd: 0, + ms: 0, +}) /** Resolve a shot: if `handle` given, operate on the SHARED artifact (depth); else open+score+close * an OWN artifact (breadth). Always scores the artifact's final state as the deployable verdict. */ @@ -246,7 +273,14 @@ function shotExecutor(surface: AgenticSurface, opts: AgenticOptions): Executor 0 ? s.passes / s.total : 0 - const out: ShotResult = { messages: shot.messages, score, passes: s.passes, total: s.total, completions: shot.completions, toolErrors: shot.toolErrors } + const out: ShotResult = { + messages: shot.messages, + score, + passes: s.passes, + total: s.total, + completions: shot.completions, + toolErrors: shot.toolErrors, + } artifact = { outRef: `shot:${handle.id}:${shot.completions}:${s.passes}/${s.total}`, out, @@ -330,10 +364,18 @@ export interface AgenticRunResult { shots: number } -const perChild = (innerTurns: number): Budget => ({ maxIterations: innerTurns + 1, maxTokens: 1_000_000 }) +const perChild = (innerTurns: number): Budget => ({ + maxIterations: innerTurns + 1, + maxTokens: 1_000_000, +}) /** DEPTH: one persistent artifact, carried across analyst-steered shots. */ -export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent> { +export function depthDriver( + surface: AgenticSurface, + task: AgenticTask, + opts: AgenticOptions, + cfg: { maxShots: number }, +): Agent> { const innerTurns = opts.innerTurns ?? 4 let pendingSteer: string | undefined // analyst-derived steer carried between shots return { @@ -348,7 +390,10 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag for (shots = 0; shots < cfg.maxShots; shots += 1) { const child = leaf(`shot:${shots}`, 'shot') const steer = shots === 0 ? undefined : pendingSteer - const res = scope.spawn(child, { task, handle, messages, steer } as ShotTask, { budget: perChild(innerTurns), label: `shot:${shots}` }) + const res = scope.spawn(child, { task, handle, messages, steer } as ShotTask, { + budget: perChild(innerTurns), + label: `shot:${shots}`, + }) if (!res.ok) break const settled = await drainOne(scope) if (settled.kind === 'down') break @@ -359,7 +404,11 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag if (out.score >= 1 || shots === cfg.maxShots - 1) break // Analyst reads the trajectory (firewalled) → steer the resumed session. const aChild = leaf(`analyst:${shots}`, 'analyst') - const aRes = scope.spawn(aChild, { task, messages }, { budget: perChild(1), label: `analyst:${shots}` }) + const aRes = scope.spawn( + aChild, + { task, messages }, + { budget: perChild(1), label: `analyst:${shots}` }, + ) if (!aRes.ok) break const aSettled = await drainOne(scope) completions += 1 @@ -370,7 +419,17 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag } const final = await surface.score(task, handle) const score = final.total > 0 ? final.passes / final.total : 0 - return { kind: 'done', deliverable: { mode: 'depth', score, resolved: final.total > 0 && final.passes === final.total, completions, progression, shots: shots + 1 } } + return { + kind: 'done', + deliverable: { + mode: 'depth', + score, + resolved: final.total > 0 && final.passes === final.total, + completions, + progression, + shots: shots + 1, + }, + } } finally { await surface.close(handle) } @@ -379,14 +438,22 @@ export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: Ag } /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */ -export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent> { +export function breadthDriver( + _surface: AgenticSurface, + task: AgenticTask, + opts: AgenticOptions, + cfg: { width: number }, +): Agent> { const innerTurns = opts.innerTurns ?? 4 return { name: 'breadth', async act(_t, scope): Promise> { let opened = 0 for (let k = 0; k < cfg.width; k += 1) { - const res = scope.spawn(leaf(`rollout:${k}`, 'shot'), { task } as ShotTask, { budget: perChild(innerTurns), label: `rollout:${k}` }) + const res = scope.spawn(leaf(`rollout:${k}`, 'shot'), { task } as ShotTask, { + budget: perChild(innerTurns), + label: `rollout:${k}`, + }) if (res.ok) opened += 1 } if (opened === 0) return { kind: 'blocked', blockers: ['breadth: pool admitted no rollout'] } @@ -403,7 +470,17 @@ export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: progression.push(best) } if (best < 0) return { kind: 'blocked', blockers: ['breadth: every rollout went down'] } - return { kind: 'done', deliverable: { mode: 'breadth', score: best, resolved: bestResolved, completions, progression, shots: opened } } + return { + kind: 'done', + deliverable: { + mode: 'breadth', + score: best, + resolved: bestResolved, + completions, + progression, + shots: opened, + }, + } }, } } @@ -420,7 +497,12 @@ export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: */ export interface Strategy { readonly name: string - driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent> + driver( + surface: AgenticSurface, + task: AgenticTask, + opts: AgenticOptions, + budget: number, + ): Agent> } export const sample: Strategy = { @@ -468,7 +550,10 @@ export interface StrategyCtx { } /** Author a Strategy from the composable steps — the open, compact way. */ -export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise): Strategy { +export function defineStrategy( + name: string, + run: (ctx: StrategyCtx) => Promise, +): Strategy { return { name, driver: (surface, task, opts, budget) => ({ @@ -485,7 +570,16 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise< async shot(spec) { const child = leaf(`shot:${seq}`, 'shot') seq += 1 - const res = scope.spawn(child, { task, handle: spec?.handle, messages: spec?.messages, steer: spec?.steer } as ShotTask, { budget: perChild(innerTurns), label: child.name }) + const res = scope.spawn( + child, + { + task, + handle: spec?.handle, + messages: spec?.messages, + steer: spec?.steer, + } as ShotTask, + { budget: perChild(innerTurns), label: child.name }, + ) if (!res.ok) return null const settled = await drainOne(scope) return settled.kind === 'down' ? null : (settled.out as unknown as ShotResult) @@ -493,7 +587,11 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise< async critique(messages) { const child = leaf(`analyst:${seq}`, 'analyst') seq += 1 - const res = scope.spawn(child, { task, messages }, { budget: perChild(1), label: child.name }) + const res = scope.spawn( + child, + { task, messages }, + { budget: perChild(1), label: child.name }, + ) if (!res.ok) return null const settled = await drainOne(scope) if (settled.kind === 'down') return null @@ -513,42 +611,45 @@ export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise< * — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best * checkpoint across all lines), the deployable metric. This is the "experts build BETTER * optimizations" path: a new technique, compact, with zero Supervisor ceremony. */ -export const adaptiveRefine = defineStrategy('adaptiveRefine', async ({ surface, task, budget, shot, critique }) => { - let handle = await surface.open(task) - const progression: number[] = [] - let messages: Msg[] | undefined - let steer: string | undefined - let completions = 0 - let best = -1 - let shots = 0 - try { - for (shots = 0; shots < budget; shots += 1) { - const out = await shot({ handle, messages, steer }) - if (!out) break - completions += out.completions - progression.push(out.score) - if (out.score >= 1) break - if (out.score <= best) { - // Stuck: steering isn't improving this line — abandon it, restart fresh. - await surface.close(handle) - handle = await surface.open(task) - messages = undefined - steer = undefined - continue +export const adaptiveRefine = defineStrategy( + 'adaptiveRefine', + async ({ surface, task, budget, shot, critique }) => { + let handle = await surface.open(task) + const progression: number[] = [] + let messages: Msg[] | undefined + let steer: string | undefined + let completions = 0 + let best = -1 + let shots = 0 + try { + for (shots = 0; shots < budget; shots += 1) { + const out = await shot({ handle, messages, steer }) + if (!out) break + completions += out.completions + progression.push(out.score) + if (out.score >= 1) break + if (out.score <= best) { + // Stuck: steering isn't improving this line — abandon it, restart fresh. + await surface.close(handle) + handle = await surface.open(task) + messages = undefined + steer = undefined + continue + } + best = out.score + messages = out.messages + const findings = await critique(out.messages) + completions += 1 + if (!findings) break + steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.` } - best = out.score - messages = out.messages - const findings = await critique(out.messages) - completions += 1 - if (!findings) break - steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.` + const score = progression.length ? Math.max(...progression) : 0 + return { score, resolved: score >= 1, completions, progression, shots } + } finally { + await surface.close(handle) } - const score = progression.length ? Math.max(...progression) : 0 - return { score, resolved: score >= 1, completions, progression, shots } - } finally { - await surface.close(handle) - } -}) + }, +) export interface RunAgenticOptions extends AgenticOptions { surface: AgenticSurface @@ -567,7 +668,10 @@ export async function runAgentic(opts: RunAgenticOptions): Promise>() - const root: Budget = opts.rootBudget ?? { maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2), maxTokens: 1_000_000_000 } + const root: Budget = opts.rootBudget ?? { + maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2), + maxTokens: 1_000_000_000, + } const result = await supervisor.run(driver, undefined, { budget: root, runId: `agentic:${strategy.name}:${opts.task.id}`, @@ -577,7 +681,10 @@ export async function runAgentic(opts: RunAgenticOptions): Promise